阅读量:0
#!/usr/bin/env python
# coding: utf-8
#
# Train and persist a random-forest classifier for the 'Default' column of
# credit_default.csv.
#
# Steps: load the CSV, mean-impute missing values, drop duplicate rows, plot
# each feature's distribution, split 70/30, oversample the TRAINING split
# with SMOTE, fit a random forest, report metrics on the untouched test
# split, and pickle the fitted model to model.pkl.

import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import pickle

# One seed shared by the split, the oversampler and the forest so that a
# rerun reproduces the same metrics.
RANDOM_STATE = 42

df = pd.read_csv('credit_default.csv')
# Bare expressions only display inside a notebook; print explicitly so the
# preview also appears when this file runs as a script.
print(df.head())
df.info()

# Fill missing values with the column mean.
# Assign the result back instead of calling fillna(..., inplace=True) on
# df[column]: that chained in-place form may act on a temporary copy and
# leave df unchanged (it always does under pandas Copy-on-Write).
for column in list(df.columns[df.isnull().sum() > 0]):
    mean_val = df[column].mean()
    df[column] = df[column].fillna(mean_val)

# Drop duplicate rows.
df.drop_duplicates(inplace=True)

# Plot a bar chart of the value distribution of every feature.
columns = list(df.columns)
columns.remove('Default')
for feature in columns:
    # Count the feature's own values; counting 'Default' here would draw the
    # same target chart under every feature title.
    df[feature].value_counts().plot(kind='bar')
    plt.title(f'Distribution of {feature}')
    plt.xlabel(feature)
    plt.ylabel('Frequency')
    plt.show()

# Show how many rows fall in each class.
normal = df[df['Default'] == 0]
fraud = df[df['Default'] == 1]
print(normal.shape, fraud.shape)

X = df.drop('Default', axis=1)
y = df['Default']

# 70/30 split BEFORE any resampling, stratified so both parts keep the
# original class ratio. Oversampling first would leak information derived
# from test rows into training and score the model on synthetic samples.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE, stratify=y
)

# The classes are imbalanced: oversample the training split only.
X_res, y_res = SMOTE(random_state=RANDOM_STATE).fit_resample(X_train, y_train)

# Class counts after oversampling.
print(y_res.value_counts())

# Train the model on the balanced training data.
clf = RandomForestClassifier(n_estimators=50, random_state=RANDOM_STATE)
clf.fit(X_res, y_res)

# Evaluate on the held-out, non-resampled test split.
y_pred = clf.predict(X_test)
print(f"\n Accuaracy: {accuracy_score(y_test, y_pred)}")
print(f"\n Precision: {precision_score(y_test, y_pred)}")
print(f"\n Recall: {recall_score(y_test, y_pred)}")
print(f"\n F1 Score: {f1_score(y_test, y_pred)}")

# Confusion matrix.
print(confusion_matrix(y_test, y_pred))

print(classification_report(y_test, y_pred))

# Save the fitted model.
# NOTE: only unpickle model.pkl from a trusted location; pickle.load can
# execute arbitrary code.
with open('model.pkl', 'wb') as f:
    pickle.dump(clf, f)