完善资料让更多小伙伴认识你,还能领取20积分哦, 立即完善>
|
|
相关推荐
1个回答
|
|
# Steam ("zhengqi") regression pipeline: load boiler sensor data, remove
# outliers, scale, inspect correlations/multicollinearity, reduce with PCA,
# then compare RandomForest / LogisticRegression / SVM models and export
# predictions.

# ---- Imports ---------------------------------------------------------------
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import svm                      # support vector machines
from sklearn import preprocessing
from sklearn.decomposition import PCA        # principal component analysis
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
# NOTE: sklearn.grid_search was removed in scikit-learn 0.20;
# GridSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import (
    GridSearchCV,
    KFold, LeaveOneOut, LeavePOut, ShuffleSplit,          # subset splitters
    StratifiedKFold, StratifiedShuffleSplit,              # stratified splits
    train_test_split, cross_val_score, cross_validate,    # CV helpers
)
from statsmodels.stats.outliers_influence import variance_inflation_factor

# ---- Load data -------------------------------------------------------------
# Raw strings prevent '\U...' escape errors in Windows paths, and the
# separator must be a real tab character (sep='t' would split on letter 't').
train_data0 = pd.read_table(
    r'C:\Users\Administrator\Desktop\zhengqi_train.txt', sep='\t')
test_data0 = pd.read_table(
    r'C:\Users\Administrator\Desktop\zhengqi_test.txt', sep='\t')

# Overview of the training data.
train_data0.info()

# ---- Outlier inspection ----------------------------------------------------
plt.figure(figsize=(18, 10))
plt.boxplot(x=train_data0.values, labels=train_data0.columns)
plt.hlines([-7.5, 7.5], 0, 40, colors='r')
plt.show()

# Drop rows where V9 falls below the visually chosen outlier threshold.
train_data0 = train_data0[train_data0['V9'] > -7.5]
train_data0.describe()

# ---- Min-max scaling -------------------------------------------------------
# Fit the scaler on the training FEATURES only and reuse it on the test set:
# refitting on the test data (as the original did) both leaks information and
# fails because the test file has no 'target' column.
feature_cols = test_data0.columns
min_max_scaler = preprocessing.MinMaxScaler()
train_data1 = pd.DataFrame(
    min_max_scaler.fit_transform(train_data0[feature_cols]),
    columns=feature_cols,
)
test_data1 = pd.DataFrame(
    min_max_scaler.transform(test_data0),
    columns=feature_cols,
)
# Keep the regression target in its original units (MSE stays interpretable).
train_data1['target'] = train_data0['target'].to_numpy()

# ---- Feature distributions -------------------------------------------------
plt.figure(figsize=(18, 18))
for column_index, column in enumerate(train_data1.columns):
    plt.subplot(10, 4, column_index + 1)
    g = sns.kdeplot(train_data1[column])
    g.set_xlabel(column)
    g.set_ylabel('Frequency')

# ---- Feature correlation heatmap -------------------------------------------
plt.figure(figsize=(20, 16))
colnm = train_data1.columns.tolist()
mcorr = train_data1[colnm].corr(method="spearman")
# np.bool was removed in NumPy 1.24 -- use the builtin bool dtype.
mask = np.zeros_like(mcorr, dtype=bool)
mask[np.triu_indices_from(mask)] = True      # hide the redundant upper triangle
cmap = sns.diverging_palette(220, 10, as_cmap=True)
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True,
                fmt='0.2f')
plt.show()

# ---- Feature screening -----------------------------------------------------
# Keep features whose absolute Spearman correlation with the target > 0.1.
mcorr = mcorr.abs()
numerical_corr = mcorr[mcorr['target'] > 0.1]['target']
print(numerical_corr.sort_values(ascending=False))
index0 = numerical_corr.sort_values(ascending=False).index
print(train_data1[index0].corr('spearman'))

# ---- Multicollinearity (variance inflation factors) ------------------------
new_numerical = ['V0', 'V2', 'V3', 'V4', 'V5', 'V6', 'V10', 'V11', 'V13',
                 'V15', 'V16', 'V18', 'V19', 'V20', 'V22', 'V24', 'V30',
                 'V31', 'V37']
# np.matrix is deprecated; a plain 2-D ndarray works with VIF.
X = np.asarray(train_data1[new_numerical])
VIF_list = [variance_inflation_factor(X, i) for i in range(X.shape[1])]
print(VIF_list)

# ---- PCA dimensionality reduction ------------------------------------------
# Keep 90% of the variance.  The SAME fitted PCA must transform the test set:
# the original refit a fresh PCA(n_components=16) on the test data, which
# yields components incompatible with the training projection.
pca = PCA(n_components=0.9)
train = pd.DataFrame(pca.fit_transform(train_data1.iloc[:, 0:-1]))
train.describe()
test = pca.transform(test_data1)
target = train_data1.iloc[:, -1]

# ---- Train / validation split ----------------------------------------------
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0)

# ---- Random forest regression ----------------------------------------------
m1 = RandomForestRegressor()
m1.fit(train_data, train_target)
score1 = mean_squared_error(test_target, m1.predict(test_data))
print(score1)

# ---- Logistic regression baseline ------------------------------------------
# Logistic regression is a classifier, so the continuous target must be
# discretised; the features themselves should NOT be cast to int (the
# original's .astype('int') zeroed out the scaled features).
m2 = LogisticRegression(C=1000.0, random_state=0)
m2.fit(train_data, train_target.astype('int'))
score2 = mean_squared_error(test_target, m2.predict(test_data))
print(score2)

# ---- Support vector machine baseline ---------------------------------------
m3 = svm.SVC(kernel='poly')
m3.fit(train_data, train_target.astype('int'))
score3 = mean_squared_error(test_target, m3.predict(test_data))
print(score3)

# ---- Stability of the random-forest score over repeated splits -------------
model_accuracies = []
for repetition in range(100):
    # Vary the split each repetition: the original reused random_state=0, so
    # all 100 iterations saw the identical split.  Local names avoid
    # clobbering the train_data/test_data used by the grid search below.
    tr_d, te_d, tr_t, te_t = train_test_split(
        train, target, test_size=0.2, random_state=repetition)
    rf = RandomForestRegressor()
    rf.fit(tr_d, tr_t)
    model_accuracies.append(mean_squared_error(te_t, rf.predict(te_d)))
# sns.distplot is deprecated; histplot with a KDE overlay is the replacement.
sns.histplot(model_accuracies, kde=True)

# ---- Grid search over hyper-parameters -------------------------------------
# '[***]' in the original was an unfilled placeholder; 'auto' was removed as a
# max_features option for forests in scikit-learn 1.3 (1.0 is the equivalent).
param_grid = {'n_estimators': [50, 100, 200],
              'max_features': (1.0, 'sqrt', 'log2')}
m = GridSearchCV(RandomForestRegressor(), param_grid)
m = m.fit(train_data, train_target)
score = mean_squared_error(test_target, m.predict(test_data))
print(score)
print(m.best_score_)
print(m.best_params_)

# ---- Cross-validation ------------------------------------------------------
# 'mean_squared_error' is not a valid scorer name; scikit-learn scorers are
# maximised, hence 'neg_mean_squared_error'.
m = RandomForestRegressor(n_estimators=100, max_features=1.0)
scores = cross_validate(m, train, target,
                        scoring='neg_mean_squared_error', cv=5)
print(scores)

# ---- Final predictions on the test set -------------------------------------
m = RandomForestRegressor(n_estimators=100, max_features=1.0)
m.fit(train_data, train_target)
predict = m.predict(test)
np.savetxt(r'C:\Users\Administrator\Desktop\predict.txt', predict)
|
|
|
只有小组成员才能发言,加入小组>>
2480 浏览 0 评论
9376 浏览 4 评论
37077 浏览 19 评论
5079 浏览 0 评论
25117 浏览 34 评论
1682浏览 2评论
1938浏览 1评论
2400浏览 1评论
1705浏览 0评论
699浏览 0评论
小黑屋| 手机版| Archiver| 电子发烧友 ( 湘ICP备2023018690号 )
GMT+8, 2025-2-2 23:28 , Processed in 1.250628 second(s), Total 42, Slave 36 queries .
Powered by 电子发烧友网
© 2015 bbs.elecfans.com
关注我们的微信
下载发烧友APP
电子发烧友观察
版权所有 © 湖南华秋数字科技有限公司
电子发烧友 (电路图) 湘公网安备 43011202000918 号 电信与信息服务业务经营许可证:合字B2-20210191 工商网监 湘ICP备2023018690号