```python
# Import the dataset generator
from sklearn.datasets import make_blobs
# Import the KNN classifier
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
```
```python
# Generate a dataset with 200 samples and 2 centers
data = make_blobs(n_samples=200, centers=2, random_state=8)
X, y = data
```
```python
# Visualize the data
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.spring, edgecolors='k')
plt.show()
```
```python
import numpy as np

clf = KNeighborsClassifier()
clf.fit(X, y)
```
```
KNeighborsClassifier()
```
```python
# Plotting code: build a grid that covers the data
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
# shading='auto' avoids the MatplotlibDeprecationWarning raised since matplotlib 3.3
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel1, shading='auto')
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.spring, edgecolors='k')
# Add a new test point
plt.scatter(6.5, 4.3, marker='*', c='red', s=200)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('Classifier: KNN')
plt.show()
```
print("新数据点的分类:", clf.predict([[6.5,4.3]]))
```
Predicted class of the new point: [1]
```
```python
# Generate a dataset with 500 samples and 5 centers
data = make_blobs(n_samples=500, centers=5, random_state=8)
X, y = data
```
```python
# Visualize the data
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.spring, edgecolors='k')
plt.show()
```
```python
clf = KNeighborsClassifier()
clf.fit(X, y)
# Plotting code: build a grid that covers the data
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel1, shading='auto')
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.spring, edgecolors='k')
# Add a new test point
plt.scatter(6.5, 4.3, marker='*', c='red', s=200)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('Classifier: KNN')
plt.show()
```
print("模型正确率:",clf.score(X,y))
```
Model accuracy: 0.956
```
"""KNN做线性回归"""# 导入make_regression数据集生成器from sklearn.datasets import make_regression# 生成特征数为1,噪声为50的数据集X,y = make_regression(n_features=1,n_informative=1,noise=50,random_state=8)# 用散点图进行可视化plt.scatter(X,y,c='orange',edgecolors='k')plt.show()
```python
# Import the KNN model for regression analysis
from sklearn.neighbors import KNeighborsRegressor

reg = KNeighborsRegressor()
reg.fit(X, y)
plt.scatter(X, y, c='orange', edgecolors='k')
z = np.linspace(-3, 3, 200).reshape(-1, 1)
plt.plot(z, reg.predict(z), c='k', linewidth=3)
plt.title('KNN Regressor')
plt.show()
```
print("模型评分:",reg.score(X,y))
```
Model score: 0.7721167832505298
```
```python
# Reduce the model's n_neighbors to 2
reg2 = KNeighborsRegressor(n_neighbors=2)
reg2.fit(X, y)
plt.scatter(X, y, c='orange', edgecolors='k')
plt.plot(z, reg2.predict(z), c='k', linewidth=3)
plt.title('KNN Regressor')
plt.show()
```
print("模型评分:",reg2.score(X,y))
```
Model score: 0.8581798802065704
```
```python
# Import the diabetes dataset
from sklearn.datasets import load_diabetes
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

X, y = load_diabetes().data, load_diabetes().target
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = LinearRegression().fit(x_train, y_train)
y_hat = model.predict(x_test)
plt.figure()
plt.scatter(np.arange(len(y_test)), y_test, c='b', s=80)
plt.plot(np.arange(len(y_test)), y_hat, c='r')
plt.title('Linear Regression')
plt.show()
print("model.coef_:%s\nmodel.intercept_:%s\n" % (model.coef_, model.intercept_))
print("Training set score:%s\nTest set score:%s\n" % (model.score(x_train, y_train), model.score(x_test, y_test)))
```
```
model.coef_:[ -14.75505679 -241.43303314  526.17137532  309.18178014 -1212.52478411
  775.29031634  369.07778164  292.96840184  933.01606171   71.1684786 ]
model.intercept_:154.04782481274086

Training set score:0.5047765530203177
Test set score:0.5503967724591143
```
In practice, plain linear regression overfits very easily, which motivates Ridge regression (linear regression plus a penalty term on the coefficients).
The larger $\alpha$ is, the smaller the coefficients in model.coef_ become, which makes the model less prone to overfitting. Note: if the dataset is large enough, whether you regularize matters far less.
```python
# Import the diabetes dataset
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import numpy as np

X, y = load_diabetes().data, load_diabetes().target
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
alpha = np.logspace(-3, 2, 10)
model = Ridge()
model = GridSearchCV(model, param_grid={'alpha': alpha}, cv=5)
model.fit(x_train, y_train)
model = model.best_estimator_
y_hat = model.predict(x_test)
plt.figure()
plt.scatter(np.arange(len(y_test)), y_test, c='b', s=80)
plt.plot(np.arange(len(y_test)), y_hat, c='r')
plt.title('Ridge Regression')
plt.show()
print("model.coef_:%s\nmodel.intercept_:%s\n" % (model.coef_, model.intercept_))
print("Training set score:%s\nTest set score:%s\n" % (model.score(x_train, y_train), model.score(x_test, y_test)))
```
```
model.coef_:[   7.66936383 -229.35690411  507.13340775  320.64869666 -136.63000277
  -30.12558503 -183.03638164  111.35921077  443.23198819   55.44890572]
model.intercept_:153.8951274627954

Training set score:0.4957238036650097
Test set score:0.5478414342339397
```
Lasso applies L1 regularization, whereas Ridge applies L2 regularization.
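Concretely, the two penalized objectives (as implemented in scikit-learn) are:

$$\text{Ridge:}\quad \min_w \; \lVert Xw - y\rVert_2^2 + \alpha \lVert w\rVert_2^2$$

$$\text{Lasso:}\quad \min_w \; \frac{1}{2n_{\text{samples}}} \lVert Xw - y\rVert_2^2 + \alpha \lVert w\rVert_1$$

The L1 penalty drives many coefficients exactly to zero, which is why Lasso doubles as a feature selector.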
When there are many features and only a few of them actually matter, use Lasso regression.

```python
# Import the diabetes dataset
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import numpy as np

X, y = load_diabetes().data, load_diabetes().target
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
alpha = np.logspace(-3, 2, 10)
model = Lasso()
model = GridSearchCV(model, param_grid={'alpha': alpha}, cv=5)
model.fit(x_train, y_train)
model = model.best_estimator_
y_hat = model.predict(x_test)
plt.figure()
plt.scatter(np.arange(len(y_test)), y_test, c='b', s=80)
plt.plot(np.arange(len(y_test)), y_hat, c='r')
plt.title('Lasso Regression')
plt.show()
print("Total number of features:", X.shape[1])
print("Features used by Lasso:%s\n" % (np.sum(model.coef_ != 0)))
print("Training set score:%s\nTest set score:%s\n" % (model.score(x_train, y_train), model.score(x_test, y_test)))
```
```
Total number of features: 10
Features used by Lasso:6

Training set score:0.5359093939208899
Test set score:0.40774356697454894
```
Random forests require no special data preprocessing, and you need not fuss much over parameter tuning.
```python
# Import the diabetes dataset
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import matplotlib.pyplot as plt
import numpy as np

X, y = load_diabetes().data, load_diabetes().target
print(len(X))
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = RandomForestRegressor(random_state=1, n_jobs=-1)
param_grid = {"n_estimators": [100, 200, 500], "max_depth": [3, 5, 10, 20]}
model = GridSearchCV(model, param_grid=param_grid, cv=5)
model.fit(x_train, y_train)
model = model.best_estimator_
print("Best model:", model)
y_hat = model.predict(x_test)
plt.figure()
plt.scatter(np.arange(len(y_test)), y_test, c='b', s=80)
plt.plot(np.arange(len(y_test)), y_hat, c='r')
plt.title('Random Forest Regression')
plt.show()
print("Training set score:%s\nTest set score:%s\n" % (model.score(x_train, y_train), model.score(x_test, y_test)))
```
```
442
Best model: RandomForestRegressor(max_depth=3, n_estimators=500, n_jobs=-1, random_state=1)

Training set score:0.5957864136004931
Test set score:0.42539072690529023
```
Two caveats about SVMs:

1. They suit datasets of modest size, say up to about 10,000 samples.
2. They are very demanding about data preprocessing and parameter tuning, whereas random forests are not.

```python
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR

# Import the Boston house-price dataset
# (note: load_boston was removed in scikit-learn 1.2)
boston = load_boston()
# Split the dataset
X, y = boston.data, boston.target
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Try a linear kernel and an RBF kernel
for kernel in ['linear', 'rbf']:
    svr = SVR(kernel=kernel)
    svr.fit(x_train, y_train)
    print("Training set score:%s\nTest set score:%s\n" % (svr.score(x_train, y_train), svr.score(x_test, y_test)))
```
```
Training set score:0.705886179377861
Test set score:0.6754116930707594

Training set score:0.1938033042097428
Test set score:0.2603372256573472
```
The RBF kernel's scores are poor. A likely cause is that the features differ in magnitude by orders of magnitude, so we plot each feature's minimum and maximum to check.
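A minimal sketch of that check (reconstructed, since the original plotting cell is missing here): plot the per-feature minimum and maximum of the training set on a log scale.

```python
# Widely separated markers indicate features on very different scales
plt.plot(x_train.min(axis=0), 'v', label='min')
plt.plot(x_train.max(axis=0), '^', label='max')
plt.yscale('log')
plt.xlabel('feature index')
plt.ylabel('feature magnitude')
plt.legend(loc='best')
plt.show()
```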
This points to data preprocessing. (You can see that SVMs are quite demanding about how the data are prepared.)
```python
# Import the preprocessing tools
from sklearn.preprocessing import StandardScaler

# Standardize the training and test sets
scaler = StandardScaler()
scaler.fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)
# Try a linear kernel and an RBF kernel again
for kernel in ['linear', 'rbf']:
    svr = SVR(kernel=kernel)
    svr.fit(x_train_scaled, y_train)
    print("Training set score:%s\nTest set score:%s\n" % (svr.score(x_train_scaled, y_train), svr.score(x_test_scaled, y_test)))

# Tune C and gamma for the RBF kernel
svr = SVR(C=100, gamma=0.1)
svr.fit(x_train_scaled, y_train)
y_hat = svr.predict(x_test_scaled)
print("Training set score:%s\nTest set score:%s\n" % (svr.score(x_train_scaled, y_train), svr.score(x_test_scaled, y_test)))
```
```
Training set score:0.7092659913398127
Test set score:0.6771114835234099

Training set score:0.6654487747233812
Test set score:0.7237752269094007

Training set score:0.9713986175310146
Test set score:0.8467577918219455
```
```python
# Visualize the predictions
plt.figure()
plt.scatter(np.arange(len(y_test)), y_test, c='b', s=80)
plt.plot(y_hat, c='r')
plt.title('SVR Predictions')
plt.show()
```
The multilayer perceptron (MLP) is also known as a feed-forward neural network. Two caveats:
- It only suits small datasets.
- It is demanding about the input features, which need to be normalized.
A rule of thumb: make the number of hidden-layer nodes roughly equal to the number of features in the training set, but generally no more than 500. When you start training, make the model as complex as feasible, then tune the regularization parameter alpha to improve performance, as sketched below.
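A minimal sketch of that advice (my own illustration, not code from the original post; the dataset, layer size, and alpha grid are assumptions): fix a fairly large hidden layer, then grid-search alpha.

```python
from sklearn.datasets import make_regression
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor

# Toy regression data, chosen only for illustration
X_demo, y_demo = make_regression(n_features=10, noise=30, random_state=8)
# Start with a fairly complex model, then tune the regularization strength
search = GridSearchCV(
    MLPRegressor(hidden_layer_sizes=(100,), max_iter=2000, random_state=8),
    param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1.0]},
    cv=5)
search.fit(X_demo, y_demo)
print(search.best_params_)
```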
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor

# Generate a random sequence
rnd = np.random.RandomState(38)
x = rnd.uniform(-5, 5, size=50)
# Add noise to the data
y_no_noise = (np.cos(6 * x) + x)
X = x.reshape(-1, 1)
y = (y_no_noise + rnd.normal(size=len(x))) / 2
# Generate an evenly spaced sequence for plotting
line = np.linspace(-5, 5, 1000, endpoint=False).reshape(-1, 1)
model = MLPRegressor().fit(X, y)
plt.plot(line, model.predict(line), label='MLP', c='k')
plt.scatter(X, y)
plt.legend(loc='best')
plt.show()
```
```python
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs

X, y = make_blobs(n_samples=40, centers=2, random_state=50, cluster_std=2)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.cool)
plt.show()
```
StandardScaler transforms with the mean and standard deviation: every feature is rescaled to zero mean and unit variance, $z = (x - \mu) / \sigma$, putting all features on a comparable scale, which helps model training.
```python
# Preprocess the data
from sklearn.preprocessing import StandardScaler

X_1 = StandardScaler().fit_transform(X)
plt.scatter(X_1[:, 0], X_1[:, 1], c=y, cmap=plt.cm.cool)
plt.show()
```
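As a quick sanity check (my addition, not in the original post), the transformed features should now have roughly zero mean and unit variance:

```python
# Each column should be approximately 0 and 1 respectively
print(X_1.mean(axis=0))
print(X_1.std(axis=0))
```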
MinMaxScaler squeezes every feature value into the range 0 to 1: $x' = (x - x_{\min}) / (x_{\max} - x_{\min})$.
```python
from sklearn.preprocessing import MinMaxScaler

X_2 = MinMaxScaler().fit_transform(X)
plt.scatter(X_2[:, 0], X_2[:, 1], c=y, cmap=plt.cm.cool)
plt.show()
```
RobustScaler transforms with the median and the quartiles, so outliers in the data have essentially no influence on the scaling.
```python
from sklearn.preprocessing import RobustScaler

X_3 = RobustScaler().fit_transform(X)
plt.scatter(X_3[:, 0], X_3[:, 1], c=y, cmap=plt.cm.cool)
plt.show()
```
Normalizer rescales every sample to unit Euclidean norm, projecting the data onto a circle (or sphere) of radius 1. Use it when only the direction of the feature vector matters and its magnitude should be ignored.
```python
from sklearn.preprocessing import Normalizer

X_4 = Normalizer().fit_transform(X)
plt.scatter(X_4[:, 0], X_4[:, 1], c=y, cmap=plt.cm.cool)
plt.show()
```
Dimensionality reduction is useful when there are too many features, or when the features are strongly correlated.
```python
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_wine

# Standardize the wine dataset
scaler = StandardScaler()
wine_dataset = load_wine()
X, y = wine_dataset['data'], wine_dataset['target']
X_scaled = scaler.fit_transform(X)
# Print the shape of the processed dataset
print(X_scaled.shape)

# Import PCA
from sklearn.decomposition import PCA

# Keep 2 principal components for easy visualization
pca = PCA(n_components=2)
pca.fit(X_scaled)
X_pca = pca.transform(X_scaled)
print(X_pca.shape)
```
```
(178, 13)
(178, 2)
```
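A quick check worth adding (my own, not from the post): how much of the total variance the two retained components capture.

```python
# Fraction of the variance explained by each retained component
print(pca.explained_variance_ratio_)
```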
```python
# Visualize the projection
X0 = X_pca[y == 0]
X1 = X_pca[y == 1]
X2 = X_pca[y == 2]
plt.scatter(X0[:, 0], X0[:, 1], c='b', s=60, edgecolors='k')
plt.scatter(X1[:, 0], X1[:, 1], c='g', s=60, edgecolors='k')
plt.scatter(X2[:, 0], X2[:, 1], c='r', s=60, edgecolors='k')
plt.legend(wine_dataset['target_names'])
plt.xlabel('component 1')
plt.ylabel('component 2')
plt.show()
```
When the dataset is very complex, we turn to feature extraction.
PCA also offers whitening. In image data, neighboring pixels are strongly correlated, so the raw features are redundant. Whitening aims to reduce that redundancy: it lowers the correlation between features and gives them all the same variance.
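A small hedged example (my addition) of turning whitening on for the wine features scaled above:

```python
# whiten=True decorrelates the output components and rescales them
# to (approximately) unit variance
pca_white = PCA(n_components=2, whiten=True)
X_white = pca_white.fit_transform(X_scaled)
print(X_white.std(axis=0))  # roughly [1. 1.]
```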
```python
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as ds
from sklearn.cluster import KMeans, DBSCAN

"""Make our own dataset"""
# x has shape (400, 2), i.e. 2 features; y holds 400 labels in 4 classes
x, y = ds.make_blobs(400, n_features=2, centers=4, random_state=2)

"""K-means clustering"""
model = KMeans(n_clusters=4, init='k-means++')
y_pred = model.fit_predict(x, y)

"""Density-based clustering"""
model_1 = DBSCAN(eps=0.75, min_samples=10)
y_pred_1 = model_1.fit_predict(x, y)

"""Plot the results"""
plt.figure()
# Original labels
plt.subplot(2, 2, 1)
plt.scatter(x[:, 0], x[:, 1], c=y, edgecolors='k', alpha=0.7)
plt.title('initial picture')
plt.grid()
plt.subplot(2, 2, 2)
plt.scatter(x[:, 0], x[:, 1], c=y_pred, edgecolors='k', alpha=0.7)
plt.title('k-means picture')
plt.grid()
plt.subplot(2, 2, 3)
plt.scatter(x[:, 0], x[:, 1], c=y_pred_1, edgecolors='k', alpha=0.7)
plt.title('DBSCAN picture')
plt.grid()
plt.show()
```
```python
import pandas as pd

fruits = pd.DataFrame({'numeric feature': [5, 6, 7, 8, 9],
                       'fruit': ['watermelon', 'banana', 'orange', 'apple', 'grape']})
display(fruits)
```
|   | numeric feature | fruit |
|---|---|---|
| 0 | 5 | watermelon |
| 1 | 6 | banana |
| 2 | 7 | orange |
| 3 | 8 | apple |
| 4 | 9 | grape |
```python
# Convert the string column in the table into numeric dummy columns
fruits_dum = pd.get_dummies(fruits)
# fruits_dum = pd.get_dummies(fruits, columns=['fruit'])
display(fruits_dum)
```
|   | numeric feature | fruit_apple | fruit_banana | fruit_grape | fruit_orange | fruit_watermelon |
|---|---|---|---|---|---|---|
| 0 | 5 | 0 | 0 | 0 | 0 | 1 |
| 1 | 6 | 0 | 1 | 0 | 0 | 0 |
| 2 | 7 | 0 | 0 | 0 | 1 | 0 |
| 3 | 8 | 1 | 0 | 0 | 0 | 0 |
| 4 | 9 | 0 | 0 | 1 | 0 | 0 |
Different models can behave very differently on the same features, as the following comparison of MLP and KNN regression shows.
```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.neighbors import KNeighborsRegressor

# Generate a random sequence
rnd = np.random.RandomState(38)
x = rnd.uniform(-5, 5, size=50)
# Add noise to the data
y_no_noise = (np.cos(6 * x) + x)
X = x.reshape(-1, 1)
y = (y_no_noise + rnd.normal(size=len(x))) / 2
# Generate an evenly spaced sequence for plotting
line = np.linspace(-5, 5, 1000, endpoint=False).reshape(-1, 1)
model = MLPRegressor().fit(X, y)
knn = KNeighborsRegressor().fit(X, y)
plt.plot(line, model.predict(line), label='MLP', c='k')
plt.plot(line, knn.predict(line), label='KNN', c='r')
plt.scatter(X, y)
plt.legend(loc='best')
plt.show()
```
```python
# Use 11 bin edges, i.e. 10 bins
bins = np.linspace(-5, 5, 11)
# Bin the data
target_bin = np.digitize(X, bins=bins)
print("Bin edges:\n", bins)
print('Features of the first ten points:\n', X[:10])
print('Bins of the first ten points:\n', target_bin[:10])
```
```
Bin edges:
 [-5. -4. -3. -2. -1.  0.  1.  2.  3.  4.  5.]
Features of the first ten points:
 [[-1.1522688 ]
 [ 3.59707847]
 [ 4.44199636]
 [ 2.02824894]
 [ 1.33634097]
 [ 1.05961282]
 [-2.99873157]
 [-1.12612112]
 [-2.41016836]
 [-4.25392719]]
Bins of the first ten points:
 [[ 4]
 [ 9]
 [10]
 [ 8]
 [ 7]
 [ 7]
 [ 3]
 [ 4]
 [ 3]
 [ 1]]
```
Next, re-express the binned data in a new form with one-hot encoding.
```python
from sklearn.preprocessing import OneHotEncoder

# sparse=False returns a dense array
# (the parameter was renamed to sparse_output in scikit-learn 1.2)
onehot = OneHotEncoder(sparse=False)
onehot.fit(target_bin)
x_in_bin = onehot.transform(target_bin)
print("Shape of the binned data: {}".format(x_in_bin.shape))
print('One-hot features of the first ten binned points:', x_in_bin[:10])
```
```
Shape of the binned data: (50, 10)
One-hot features of the first ten binned points:
 [[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]
```
```python
# After binning, compare KNN and MLP again
new_line = onehot.transform(np.digitize(line, bins=bins))
# More iterations so the optimizer converges without a ConvergenceWarning
model = MLPRegressor(max_iter=1000).fit(x_in_bin, y)
knn = KNeighborsRegressor().fit(x_in_bin, y)
plt.plot(line, model.predict(new_line), label='MLP', c='k')
plt.plot(line, knn.predict(new_line), label='KNN', c='r')
plt.scatter(X, y)
plt.legend(loc='best')
plt.show()
```
Linear models perform well on high-dimensional datasets but poorly on low-dimensional ones, so we can expand the features, for example by concatenating the one-hot features with the original ones. np.hstack does the concatenation:
```python
array_1 = [1, 2, 3, 4, 5]
array_2 = [6, 7, 8, 9, 10]
array_3 = np.hstack((array_1, array_2))
print('Concatenated features:', array_3)
```
```
Concatenated features: [ 1  2  3  4  5  6  7  8  9 10]
```
```python
# Stack the original feature onto the binned features
x_stack = np.hstack([X, x_in_bin])
print(x_stack.shape)
```
```
(50, 11)
```
```python
line_stack = np.hstack([line, new_line])
model = MLPRegressor().fit(x_stack, y)
plt.plot(line, model.predict(line_stack), label='MLP', c='k')
plt.scatter(X, y)
for vline in bins:
    plt.plot([vline, vline], [-5, 5], ':', c='k')
plt.legend(loc='best')
plt.show()
```
```python
# A different stacking: interact the raw feature with each bin indicator
x_multi = np.hstack([x_in_bin, X * x_in_bin])
print(x_multi.shape)
print(x_multi[0])
```
```
(50, 20)
[ 0.         0.         0.         1.         0.         0.
  0.         0.         0.         0.        -0.        -0.
 -0.        -1.1522688 -0.        -0.        -0.        -0.
 -0.        -0.       ]
```
```python
line_multi = np.hstack([new_line, line * new_line])
model = MLPRegressor().fit(x_multi, y)
plt.plot(line, model.predict(line_multi), label='MLP', c='k')
plt.scatter(X, y)
for vline in bins:
    plt.plot([vline, vline], [-5, 5], ':', c='k')
plt.legend(loc='best')
plt.show()
```
Adding polynomial features can mitigate underfitting to some extent.
```python
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(degree=20, include_bias=False)
X_poly = poly.fit_transform(X)
print(X_poly.shape)
print('First sample of the original dataset:', X[0])
print('First sample after adding polynomial features:', X_poly[0])
# get_feature_names was replaced by get_feature_names_out in scikit-learn 1.0+
print('Features generated by PolynomialFeatures:', poly.get_feature_names())
```
```
(50, 20)
First sample of the original dataset: [-1.1522688]
First sample after adding polynomial features:
 [ -1.1522688    1.3277234   -1.52989425   1.76284942  -2.0312764
   2.34057643  -2.6969732    3.10763809  -3.58083443   4.1260838
  -4.75435765   5.47829801  -6.3124719    7.27366446  -8.38121665
   9.65741449 -11.12793745  12.82237519 -14.77482293  17.02456756]
Features generated by PolynomialFeatures: ['x0', 'x0^2', 'x0^3', 'x0^4', 'x0^5', 'x0^6', 'x0^7', 'x0^8', 'x0^9', 'x0^10', 'x0^11', 'x0^12', 'x0^13', 'x0^14', 'x0^15', 'x0^16', 'x0^17', 'x0^18', 'x0^19', 'x0^20']
```
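As a hedged follow-up (not shown in the post above), fitting a plain linear regression on the expanded features shows how the polynomial terms counter underfitting:

```python
from sklearn.linear_model import LinearRegression

# Fit a linear model on the degree-20 polynomial features
lnr_poly = LinearRegression().fit(X_poly, y)
line_poly = poly.transform(line)
plt.plot(line, lnr_poly.predict(line_poly), label='Linear Regression', c='k')
plt.scatter(X, y)
plt.legend(loc='best')
plt.show()
```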
## 9.4 Automatic feature selection

### 9.4.1 Univariate feature selection

On very noisy datasets, model scores improve after feature selection. With univariate selection, the data are processed the same way no matter which model you pick afterwards.

### 9.4.2 Model-based feature selection

Decision-tree-based models have a built-in feature_importances_ attribute, which lets SelectFromModel read feature importances directly from the fitted model. threshold='median' means roughly half of the features are kept.

### 9.4.3 Iterative feature selection

Feature selection based on a sequence of models, each refit on the features kept so far (recursive feature elimination, RFE, is the typical example). A minimal sketch of all three approaches follows.
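None of the original code for this section survives here, so below is a hedged sketch of the three approaches on the diabetes data; the dataset, estimator, and thresholds are my own assumptions.

```python
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFE, SelectFromModel, SelectPercentile, f_regression

X, y = load_diabetes(return_X_y=True)

# 1. Univariate selection: keep the top 50% of features by F-statistic
univariate = SelectPercentile(score_func=f_regression, percentile=50).fit(X, y)
print(univariate.get_support())

# 2. Model-based selection: threshold='median' keeps roughly half the features,
#    ranked by the forest's feature_importances_
model_based = SelectFromModel(RandomForestRegressor(random_state=0),
                              threshold='median').fit(X, y)
print(model_based.get_support())

# 3. Iterative selection (RFE): repeatedly drop the weakest feature
#    until n_features_to_select remain
rfe = RFE(RandomForestRegressor(random_state=0), n_features_to_select=5).fit(X, y)
print(rfe.get_support())
```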
To assess a model's generalization performance, we start with k-fold cross-validation.
```python
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC

wine = load_wine()
svc = SVC(kernel='linear')
# Score the SVC with cross-validation (the default is cv=5; here cv=6)
scores = cross_val_score(svc, wine.data, wine.target, cv=6)
print("Cross-validation scores:", scores)
```
```
Cross-validation scores: [0.86666667 0.9        0.93333333 0.96666667 1.         1.        ]
```
```python
# The model's score is the mean of the fold scores
print('Mean cross-validation score:', scores.mean())
```
```
Mean cross-validation score: 0.9444444444444445
```
The idea behind shuffle-split: randomly draw part of the dataset as the training set, randomly draw part of the remainder as the test set, score the model, then repeat until the requested number of splits has been run.
```python
from sklearn.model_selection import ShuffleSplit

# Split 10 times
shuffle_split = ShuffleSplit(test_size=.2, train_size=.7, n_splits=10)
scores = cross_val_score(svc, wine.data, wine.target, cv=shuffle_split)
print('Shuffle-split cross-validation scores:\n{}'.format(scores))
```
```
Shuffle-split cross-validation scores:
[0.94444444 0.97222222 0.91666667 1.         0.97222222 0.97222222
 1.         0.91666667 0.94444444 1.        ]
```
Leave-one-out is cross-validation with k equal to the number of samples in the dataset.
```python
from sklearn.model_selection import LeaveOneOut

cv = LeaveOneOut()
scores = cross_val_score(svc, wine.data, wine.target, cv=cv)
print('Number of iterations:', len(scores))
print('Mean score:', scores.mean())
```
```
Number of iterations: 178
Mean score: 0.9550561797752809
```
```python
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(wine.data, wine.target, random_state=0)
# Define the parameters to sweep as a dict
params = {'alpha': [0.01, 0.1, 1.0, 10.0],
          'max_iter': [100, 1000, 5000, 10000]}
# Set up the grid search with the model and the parameter grid
grid_search = GridSearchCV(Lasso(), params, cv=6)
grid_search.fit(X_train, y_train)
print('Best model score:', grid_search.score(X_test, y_test))
print('Best parameters:', grid_search.best_params_)
```
```
Best model score: 0.819334891919453
Best parameters: {'alpha': 0.01, 'max_iter': 100}
```
```python
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt

X, y = make_blobs(n_samples=200, random_state=1, centers=2, cluster_std=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=68)
model = GaussianNB()
model.fit(X_train, y_train)
# Predicted class probabilities
predict_proba = model.predict_proba(X_test)
print('Shape of the predicted probabilities:', predict_proba.shape)
print(predict_proba[:5])
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.cool, edgecolors='k')
plt.show()
```
```
Shape of the predicted probabilities: (50, 2)
[[0.98849996 0.01150004]
 [0.0495985  0.9504015 ]
 [0.01648034 0.98351966]
 [0.8168274  0.1831726 ]
 [0.00282471 0.99717529]]
```
In binary classification, the sign of the decision_function value indicates the predicted class: a negative value assigns the point to one class (classes_[0] in scikit-learn) and a positive value to the other (classes_[1]).
```python
from sklearn.svm import SVC

svc = SVC().fit(X_train, y_train)
# Get the SVC's decision function values for the test set
dec_func = svc.decision_function(X_test)
print(dec_func[:5])
```
```
[-1.36071347  1.53694862  1.78825594 -0.96133081  1.81826853]
```
Regression problems are scored with the $R^2$ (coefficient of determination) metric: $R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$.
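For illustration (the toy numbers here are my own), this is exactly what score() returns for regressors, and the same value as sklearn.metrics.r2_score:

```python
import numpy as np
from sklearn.metrics import r2_score

y_true = np.array([3.0, 2.5, 4.0, 7.1])
y_pred = np.array([2.8, 2.7, 4.2, 6.8])
# R^2 = 1 - sum((y - y_hat)^2) / sum((y - mean(y))^2)
print(r2_score(y_true, y_pred))
```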
A Pipeline chains several algorithm steps together, for example feature extraction, normalization, and classification, into a single estimator representing a typical machine-learning workflow. This brings two main benefits:
1. A single call to fit and predict trains or applies every step in the pipeline.
2. It can be combined with grid search to tune parameters across all steps.
```python
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

X, y = make_blobs(n_samples=200, centers=2, cluster_std=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=38)
# Build a pipeline with preprocessing, feature selection, and a neural network
pipe = Pipeline([('scaler', StandardScaler()),
                 ('select_model', SelectFromModel(RandomForestRegressor(random_state=38))),
                 ('mlp', MLPClassifier(max_iter=2000, random_state=38))])
# Grid search over the MLP's parameters
params = {'mlp__hidden_layer_sizes': [(50,), (100,), (100, 100)],
          'mlp__alpha': [0.0001, 0.001, 0.01, 0.1]}
# Note: the original code passed an undefined name 'pipeline' here; it must be 'pipe'
grid = GridSearchCV(pipe, param_grid=params, cv=3)
# Fit the data
grid.fit(X_train, y_train)
print("Best cross-validation score:", grid.best_score_)
print("Best model:", grid.best_estimator_)
print('Test set score:', grid.score(X_test, y_test))
print(pipe.steps)
```
```
Best cross-validation score: 0.8466666666666667
Best model: Pipeline(steps=[('scaler', StandardScaler()),
                ('mlp', MLPClassifier(max_iter=2000, random_state=38))])
Test set score: 0.76
[('scaler', StandardScaler()), ('select_model', SelectFromModel(estimator=RandomForestRegressor(random_state=38))), ('mlp', MLPClassifier(max_iter=2000, random_state=38))]
```
```python
# Use a pipeline for model selection
from sklearn.neural_network import MLPRegressor

params = [{'reg': [MLPRegressor(random_state=38)],
           'scaler': [StandardScaler(), None]},
          {'reg': [RandomForestRegressor(random_state=38)],
           'scaler': [None]}]
# Instantiate the pipeline
pipe = Pipeline([('scaler', StandardScaler()), ('reg', MLPRegressor())])
# Grid search
grid = GridSearchCV(pipe, params, cv=3)
grid.fit(X, y)
print("Best cross-validation score:", grid.best_score_)
print("Best parameters:", grid.best_params_)
```
```
Best cross-validation score: 0.4566285156223768
Best parameters: {'reg': MLPRegressor(random_state=38), 'scaler': None}
```
```python
# Model selection and parameter tuning in one grid search
params = [{'reg': [MLPRegressor(random_state=38, max_iter=2000)],
           'scaler': [StandardScaler(), None],
           'reg__hidden_layer_sizes': [(50,), (100,), (100, 100)]},
          {'reg': [RandomForestRegressor(random_state=38)],
           'scaler': [None],
           'reg__n_estimators': [10, 50, 100]}]
# Instantiate the pipeline
pipe = Pipeline([('scaler', StandardScaler()), ('reg', MLPRegressor())])
# Grid search
grid = GridSearchCV(pipe, params, cv=3)
grid.fit(X, y)
print("Best cross-validation score:", grid.best_score_)
print("Best parameters:", grid.best_params_)
```
```
Best cross-validation score: 0.4882498100109817
Best parameters: {'reg': MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=2000, random_state=38), 'reg__hidden_layer_sizes': (100, 100), 'scaler': StandardScaler()}
```