
2022 China Undergraduate Mathematical Contest in Modeling (CUMCM), Problem C, Question 2(1): Python Code

Source: https://blog.csdn.net/weixin_45481473/article/details/142317924

Contents

    • Problem 2
      • 2.1 Analyzing the classification patterns of high-potassium and lead-barium glass from the attachment data
        • Categorical data encoding
        • Handling imbalanced data
        • Classification models
          • Decision tree classification
          • Random forest classification
          • XGBoost classification
          • LightGBM classification
          • CatBoost classification
          • Histogram-based gradient boosting
          • Gradient boosting tree
          • Logistic regression
          • Naive Bayes
          • Support vector machine (SVM)
          • Neural network

Problem 2

2.1 Analyzing the classification patterns of high-potassium and lead-barium glass from the attachment data


Categorical data encoding
# Drop the helper column left over from the previous step
d12 = d12.drop('rowSum', axis=1)

import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Encode the categorical (object-dtype) columns as integers
label_encoder = LabelEncoder()
x_categorical = d12.select_dtypes(include=['object']).apply(label_encoder.fit_transform)
x_numerical = d12.select_dtypes(exclude=['object']).values
df_encode = pd.concat([pd.DataFrame(x_numerical), x_categorical], axis=1)

# Rename the columns: the 15 numeric columns take their original names from d12,
# and the 6 encoded categorical columns keep the names carried over by x_categorical
colnames = list(d12.columns[i] for i in ([0] + list(range(6, 20)))) + \
           list(df_encode.columns[i] for i in range(15, 21))
df_encode.columns = colnames
df_encode.head()
文物编号 | 二氧化硅(SiO2) | 氧化钠(Na2O) | 氧化钾(K2O) | 氧化钙(CaO) | 氧化镁(MgO) | 氧化铝(Al2O3) | 氧化铁(Fe2O3) | 氧化铜(CuO) | 氧化铅(PbO) | ... | 五氧化二磷(P2O5) | 氧化锶(SrO) | 氧化锡(SnO2) | 二氧化硫(SO2) | 纹饰 | 类型 | 颜色 | 表面风化 | 文物采样点 | 风化标记
1.0 | 69.33 | 0.0 | 9.99 | 6.32 | 0.87 | 3.93 | 1.74 | 3.87 | 0.00 | ... | 1.17 | 0.00 | 0.0 | 0.39 | 2 | 1 | 6 | 0 | 0 | 1
2.0 | 36.28 | 0.0 | 1.05 | 2.34 | 1.18 | 5.73 | 1.86 | 0.26 | 47.43 | ... | 3.57 | 0.19 | 0.0 | 0.00 | 0 | 0 | 1 | 1 | 1 | 1
3.0 | 87.05 | 0.0 | 5.19 | 2.01 | 0.00 | 4.06 | 0.00 | 0.78 | 0.25 | ... | 0.66 | 0.00 | 0.0 | 0.00 | 0 | 1 | 6 | 0 | 2 | 1
3.0 | 61.71 | 0.0 | 12.37 | 5.87 | 1.11 | 5.50 | 2.16 | 5.09 | 1.41 | ... | 0.70 | 0.10 | 0.0 | 0.00 | 0 | 1 | 6 | 0 | 3 | 1
4.0 | 65.88 | 0.0 | 9.67 | 7.12 | 1.56 | 6.44 | 2.06 | 2.18 | 0.00 | ... | 0.79 | 0.00 | 0.0 | 0.36 | 0 | 1 | 6 | 0 | 4 | 1

5 rows × 21 columns
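A caveat on the encoding above: applying `label_encoder.fit_transform` column by column re-fits the same LabelEncoder each time, so the fitted encoders are discarded and the integer codes cannot be decoded back to their original labels later. A minimal sketch of a variant that keeps one encoder per column; the `encoders` dict and `df_cat` copy are my own names, not part of the original code:

from sklearn.preprocessing import LabelEncoder

# Keep one fitted encoder per categorical column so codes can be decoded later
df_cat = d12.select_dtypes(include=['object']).copy()
encoders = {}
for col in df_cat.columns:
    encoders[col] = LabelEncoder()
    df_cat[col] = encoders[col].fit_transform(df_cat[col])

# Recover the label-to-code mapping for the target column;
# given the value counts shown below, this should be {'铅钡': 0, '高钾': 1}
print(dict(zip(encoders['类型'].classes_, range(len(encoders['类型'].classes_)))))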

from sklearn.model_selection import train_test_split

X = df_encode.drop('类型', axis=1)
y = df_encode['类型']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
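Note that this 80/20 split is not stratified, which is why the test set below ends up with only 2 high-potassium samples. A small sketch of a stratified variant (my addition, not in the original; the `_tr`/`_te` names are kept distinct so the original split above is untouched):

from sklearn.model_selection import train_test_split

# stratify=y keeps the 49:18 class ratio in both the train and test subsets
X_tr, X_te, y_tr, y_te = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y)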
Handling imbalanced data
d12['类型'].value_counts()
类型
铅钡    49
高钾    18
Name: count, dtype: int64
df_encode['类型'].value_counts()
类型
0    49
1    18
Name: count, dtype: int64
from imblearn.over_sampling import SMOTE

oversample = SMOTE()
X_train_smote, y_train_smote = oversample.fit_resample(X_train, y_train)
y_train_smote.value_counts()
类型
0    37
1    37
Name: count, dtype: int64
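SMOTE balances the classes by synthesizing new minority-class rows in feature space. For comparison, most scikit-learn classifiers can instead reweight the classes in the loss via `class_weight`; a hedged sketch, not from the original post:

from sklearn.tree import DecisionTreeClassifier

# 'balanced' weights each class inversely to its frequency, so the 18
# high-potassium samples count as much as the 49 lead-barium ones.
clf_weighted = DecisionTreeClassifier(class_weight='balanced', random_state=0)
clf_weighted.fit(X_train, y_train)  # note: the original, non-oversampled training set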
Classification models

Decision tree classification

Model evaluation: https://www.statology.org/sklearn-classification-report/

# Load libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier  # decision tree classifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve

# Create a decision tree classifier object
clf = DecisionTreeClassifier()

# Train it on the SMOTE-balanced training set
clf = clf.fit(X_train_smote, y_train_smote)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

cf_mat = confusion_matrix(y_test, y_pred)
print('混淆矩阵')
print(cf_mat)

disp = ConfusionMatrixDisplay(confusion_matrix=cf_mat, display_labels=clf.classes_)
disp.plot()
plt.show()
混淆矩阵
[[12  0]
 [ 0  2]]

[Figure: confusion matrix plot]

Decision tree visualization
https://mljar.com/blog/visualize-decision-tree/

from sklearn import tree
text_representation = tree.export_text(clf)
print(text_representation)
|--- feature_9 <= 5.46
|   |--- class: 1
|--- feature_9 >  5.46
|   |--- class: 0
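`export_text` labels features by position, so `feature_9` is the tenth column of `X`; with this dataset's column order that is the PbO content. A small lookup snippet (my addition):

# feature_9 is the tenth column of X, i.e. 氧化铅(PbO)
print(X.columns[9])

# Or have export_text print the real names directly:
print(tree.export_text(clf, feature_names=list(X.columns)))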
fig = plt.figure(figsize=(25, 20))
# class_names must follow clf.classes_ = [0, 1], i.e. [铅钡, 高钾]
my_plot = tree.plot_tree(clf, feature_names=list(X.columns), class_names=['铅钡', '高钾'], filled=True)

[Figure: decision tree plot]

# Define metrics
y_pred_proba = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

plt.rcParams['figure.dpi'] = 300
plt.rcParams['savefig.dpi'] = 300
fig = plt.figure(figsize=(10, 8))

# Create the ROC curve
plt.plot(fpr, tpr, label="AUC=" + str(auc))
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc=4)
plt.rc('font', size=20)
plt.rc('figure', titlesize=20)
plt.show()

[Figure: ROC curve with AUC]
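A caution on these results: with only 14 test samples (2 of them high-potassium), a perfect score says little about generalization. Stratified cross-validation with SMOTE re-fit inside each fold gives a steadier estimate; a minimal sketch using imblearn's pipeline (my own arrangement, not in the original):

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier

# Re-fit SMOTE inside each fold so synthetic rows never leak
# into that fold's validation split
pipe = Pipeline([('smote', SMOTE(random_state=0)),
                 ('tree', DecisionTreeClassifier(random_state=0))])
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(pipe, X, y, cv=cv, scoring='f1')
print("F1 per fold:", scores, "mean:", scores.mean())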

Random forest classification
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=500, random_state=0)
rf.fit(X_train_smote, y_train_smote)

# Predict the response for the test dataset
# (note: the original mistakenly reused the earlier clf here)
y_pred = rf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
fn = list(X_train_smote.columns)
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=800)
# Plot one of the 500 trees; class_names follow rf.classes_ = [0, 1] = [铅钡, 高钾]
my_plot = tree.plot_tree(rf.estimators_[200], feature_names=fn, class_names=['铅钡', '高钾'], filled=True)

[Figure: a single tree from the random forest]
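Since the question asks for the classification patterns of the two glass types, the forest's impurity-based feature importances show which chemical components drive the separation; a short sketch (my addition):

import pandas as pd

# Rank the chemical components by impurity-based importance
importances = pd.Series(rf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(10))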

XGBoost classification
import xgboost as xgb

# Use "hist" for constructing the trees, with early stopping enabled.
# The estimator is renamed xgb_clf so it does not shadow the xgboost module,
# and early stopping needs an eval_set passed to fit().
xgb_clf = xgb.XGBClassifier(tree_method="hist", early_stopping_rounds=2)

# Fit the model; the test set is used for early stopping (as in the original comment)
xgb_clf.fit(X_train_smote, y_train_smote, eval_set=[(X_test, y_test)], verbose=False)

# Predict the response for the test dataset
y_pred = xgb_clf.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
# Plot a single boosted tree (requires the graphviz package):
# from xgboost import plot_tree
# plot_tree(xgb_clf)
# plt.show()
LightGBM classification

https://machinelearningmastery.com/gradient-boosting-with-scikit-learn-xgboost-lightgbm-and-catboost/

from lightgbm import LGBMClassifier

gbm = LGBMClassifier()
gbm.fit(X_train_smote, y_train_smote)
[LightGBM] [Info] Number of positive: 37, number of negative: 37
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003718 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 292
[LightGBM] [Info] Number of data points in the train set: 74, number of used features: 16
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Warning] No further splits with positive gain, best gain: -inf
(the warning above is repeated once per boosting iteration; duplicates omitted)
LGBMClassifier()
# Predict the response for the test dataset
y_pred = gbm.predict(X_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
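The long run of "No further splits with positive gain" warnings is expected here: with 74 training rows, LightGBM's default `min_child_samples=20` leaves almost no room to split. A hedged sketch of settings better matched to a tiny dataset (parameter choices are my own, not from the original):

from lightgbm import LGBMClassifier
from sklearn import metrics

# Loosen the minimum-leaf constraints so splits are possible on 74 rows,
# and silence the per-iteration log output
gbm_small = LGBMClassifier(min_child_samples=5, num_leaves=15, verbose=-1)
gbm_small.fit(X_train_smote, y_train_smote)
print("Accuracy:", metrics.accuracy_score(y_test, gbm_small.predict(X_test)))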
CatBoost classification
from catboost import CatBoostClassifier

cat = CatBoostClassifier(verbose=0, n_estimators=100)
cat.fit(X_train_smote, y_train_smote)

# Predict the response for the test dataset
y_pred = cat.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Histogram-based gradient boosting
from sklearn.ensemble import HistGradientBoostingClassifier

hbg = HistGradientBoostingClassifier()
hbg.fit(X_train_smote, y_train_smote)

# Predict the response for the test dataset
y_pred = hbg.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Gradient boosting tree
from sklearn.ensemble import GradientBoostingClassifier
gb = GradientBoostingClassifier()
gb.fit(X_train_smote, y_train_smote)

y_pred = gb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Logistic regression
from sklearn import linear_model
import numpy

logr = linear_model.LogisticRegression()
logr.fit(X_train_smote, y_train_smote)

# Predict with the logistic model (the original mistakenly reused gb here)
y_pred = logr.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
print('Model regression coefficients (exponentiated to odds ratios):')
log_odds = logr.coef_
odds = numpy.exp(log_odds)
odds
Model regression coefficients (exponentiated to odds ratios):
array([[0.82909358, 1.21679202, 1.00751889, 1.15839779, 1.06619743,
        0.99198439, 0.96850246, 0.99970948, 1.01100367, 0.75457187,
        0.91357586, 0.99337601, 0.99650844, 1.00080449, 1.00000987,
        1.020977  , 1.02869626, 0.99088894, 0.8292335 , 0.98475949]])
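These odds ratios are easier to read when paired with feature names: values above 1 raise the odds of class 1 (高钾) and values below 1 lower them. A small sketch (my addition):

import pandas as pd

# Pair each odds ratio with its feature; e.g. a PbO ratio below 1 means
# higher PbO content lowers the odds of the high-potassium class (1)
odds_by_feature = pd.Series(odds[0], index=X.columns).sort_values()
print(odds_by_feature)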
Naive Bayes
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train_smote, y_train_smote)

y_pred = gnb.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
Support vector machine (SVM)
from sklearn import svm

# Name the estimator svc so it does not shadow the svm module
svc = svm.SVC()
svc.fit(X_train_smote, y_train_smote)

y_pred = svc.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
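SVC is sensitive to feature scale, and the oxide percentages here range from roughly 0 to 90, so a scaler is usually placed in front of it; a minimal sketch, not part of the original post:

from sklearn import metrics
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Standardize each component column before the RBF kernel computes distances
svm_scaled = make_pipeline(StandardScaler(), SVC())
svm_scaled.fit(X_train_smote, y_train_smote)
print("Accuracy:", metrics.accuracy_score(y_test, svm_scaled.predict(X_test)))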
Neural network
from sklearn.neural_network import MLPClassifier

nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1)
nn.fit(X_train_smote, y_train_smote)

y_pred = nn.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        12
           1       1.00      1.00      1.00         2

    accuracy                           1.00        14
   macro avg       1.00      1.00      1.00        14
weighted avg       1.00      1.00      1.00        14
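Each model above repeats the same fit/predict/report pattern; a compact loop can compare several of them side by side. A sketch under the same variables (the model selection and arrangement are mine, not from the original):

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, f1_score

models = {
    'DecisionTree': DecisionTreeClassifier(random_state=0),
    'RandomForest': RandomForestClassifier(n_estimators=500, random_state=0),
    'GradientBoosting': GradientBoostingClassifier(random_state=0),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'GaussianNB': GaussianNB(),
}

# Fit each model on the SMOTE-balanced training set and score it on the test set
for name, model in models.items():
    model.fit(X_train_smote, y_train_smote)
    y_hat = model.predict(X_test)
    print(f"{name}: acc={accuracy_score(y_test, y_hat):.3f}, "
          f"f1={f1_score(y_test, y_hat):.3f}")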

