Machine Learning项目业务标准流程-记录个人总结

📅 2026/6/25 17:38:13
Machine Learning项目业务标准流程-记录个人总结
# Standard machine-learning project workflow: simplified personal summary code.
#
# End-to-end template: load data -> clean -> engineer features -> split ->
# scale -> select features with L1 (LassoCV) -> fit an L2 model (RidgeCV) ->
# evaluate -> persist -> predict on new data.
#
# NOTE(review): "data.csv", "new_data.csv" and the column names "column",
# "feature1", "feature2", "category" and "target" are placeholders taken from
# the original notes; replace them with the real file paths and column names.

# Standard library
import warnings

# Model persistence
import joblib
# Base libraries
import numpy as np
import pandas as pd
# Feature selection (L1 regularisation) and regression model (L2 regularisation)
from sklearn.linear_model import LassoCV, RidgeCV
# Model evaluation
from sklearn.metrics import mean_squared_error, r2_score
# Dataset splitting and cross-validation
from sklearn.model_selection import cross_val_score, train_test_split
# Data preprocessing
from sklearn.preprocessing import StandardScaler

warnings.filterwarnings("ignore")  # Silence warning messages

# 1. Get the data
#    - Read it from a database / file / API
#    - Take a first look at its structure
df = pd.read_csv("data.csv")
print(df.head())
print(df.info())

# 2. Data preprocessing
# 2.1 Data cleaning
#    - Handle missing values (drop here; filling is the alternative)
df.dropna(inplace=True)

#    - Handle outliers
#      The original notes used lower_bound / upper_bound without defining them.
#      The 1.5 * IQR rule below is one common choice -- adjust to the data.
q1, q3 = df["column"].quantile([0.25, 0.75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
# Bounds are inclusive so a constant column (IQR == 0) is not wiped out.
# .copy() gives an independent frame, so the in-place edits below do not act
# on a slice of the original.
df = df[(df["column"] >= lower_bound) & (df["column"] <= upper_bound)].copy()

#    - Handle duplicate rows
df.drop_duplicates(inplace=True)

# 2.2 Feature engineering
#    - Create new features
df["new_feature"] = df["feature1"] * df["feature2"]
#    - Encode categorical variables
df = pd.get_dummies(df, columns=["category"])

# 2.3 Data transformation
#    - Separate features and target
X = df.drop("target", axis=1)
y = df["target"]

#    - Split into training and test sets (important: do this BEFORE scaling
#      and feature selection so the test set never influences the fit)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

#    - Standardise: fit on the training set only, then apply to both
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Feature selection (optional but recommended)
# 3.1 Select features with L1 regularisation
lasso = LassoCV(alphas=[0.001, 0.01, 0.1, 1.0], cv=5)  # candidate alphas
lasso.fit(X_train_scaled, y_train)  # the best alpha is chosen during fit

# Keep the features whose Lasso coefficient is non-zero
selected_mask = lasso.coef_ != 0
selected_indices = np.where(selected_mask)[0]
print(f"原始特征数: {X_train.shape[1]}")
print(f"选中特征数: {len(selected_indices)}")

# Fail early with a clear message instead of letting the downstream model
# choke on a matrix with zero columns.
if selected_indices.size == 0:
    raise ValueError(
        "LassoCV set every coefficient to zero; no features were selected. "
        "Try smaller alphas or skip the feature-selection step."
    )

# Extract the selected features
X_train_selected = X_train_scaled[:, selected_indices]
X_test_selected = X_test_scaled[:, selected_indices]

# 4. Model training
# 4.1 Model choice and parameters
#     (other models -- RandomForest, GBDT, SVM, ... -- can be used instead)
# 4.2 Train the model on the selected features
model = RidgeCV(alphas=[0.01, 0.1, 1.0, 10.0], cv=5)
model.fit(X_train_selected, y_train)

# 4.3 Check performance on the training set
train_score = model.score(X_train_selected, y_train)
print(f"训练集得分: {train_score:.4f}")

# 5. Model evaluation
# Evaluate on the held-out test set
y_pred = model.predict(X_test_selected)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"测试集 MSE: {mse:.4f}")
print(f"测试集 R²: {r2:.4f}")

# Cross-validated evaluation
# NOTE(review): the features were selected using the whole training set, so
# these fold scores may be slightly optimistic; wrap selection + model in a
# Pipeline if an unbiased estimate is needed.
cv_scores = cross_val_score(
    model, X_train_selected, y_train, cv=5, scoring="r2"
)
print(f"交叉验证 R²: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# 6. Save the model
# Persist the model together with every object needed to reproduce the
# preprocessing at prediction time.
model_package = {
    "model": model,
    "scaler": scaler,
    "lasso": lasso,
    "selected_indices": selected_indices,
    "feature_names": (
        X.columns[selected_indices] if hasattr(X, "columns") else None
    ),
    # Full training column order, so new data can be aligned before scaling.
    "input_columns": list(X.columns),
}
joblib.dump(model_package, "model.pkl")
print("模型已保存")

# 7. Predict on new data
# Load the model
# SECURITY: joblib.load unpickles the file and can execute arbitrary code --
# only load model files from a trusted source.
loaded_package = joblib.load("model.pkl")

# Apply the same preprocessing to the new data
new_data = pd.read_csv("new_data.csv")
# ... same cleaning and feature engineering as in step 2 ...

# Align to the training columns (same set, same order). A KeyError here means
# the preprocessing above did not reproduce all of the training features.
new_data = new_data[loaded_package["input_columns"]]

# Standardise with the scaler fitted on the training set
new_data_scaled = loaded_package["scaler"].transform(new_data)

# Extract the selected features
new_data_selected = new_data_scaled[:, loaded_package["selected_indices"]]

# Predict
predictions = loaded_package["model"].predict(new_data_selected)
print(f"预测结果: {predictions}")