机器学习总结-sklearn参数解释

本文转自：lytforgood

机器学习总结-sklearn参数解释

实验数据集选取：

1. 分类数据选取：load_iris 鸢尾花数据集

from sklearn.datasets import load_iris

# Load the iris classification dataset.
data = load_iris()

# Peek at three sample rows and their class labels. In an interactive
# session these bare expressions are echoed; in a script they are no-ops.
sample_rows = [10, 25, 50]
data.data[sample_rows]
data.target[sample_rows]

# Class names and feature (column) names as plain Python lists.
list(data.target_names)
list(data.feature_names)

2. 回归数据选取：load_boston 波士顿房价数据集

from sklearn.datasets import load_boston

# Load the Boston house-price regression dataset.
# NOTE(review): load_boston was deprecated and later removed from
# scikit-learn (1.2+) — confirm the installed version before running.
boston = load_boston()

# Shape of the feature matrix as (n_samples, n_features).
print(boston.data.shape)

# Names of the feature columns.
boston.feature_names

数据集-切分为训练集-验证集

GBDT

参数说明参考

GradientBoostingClassifier 支持二分类和多分类

from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier

# Synthetic binary classification data; the first 2000 rows are used for
# training and the remainder for testing.
X, y = make_hastie_10_2(random_state=0)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]

clf = GradientBoostingClassifier(
    # Loss function to optimise. 'deviance' (the default in the releases
    # these notes target) is the deviance for classification with
    # probabilistic outputs.
    # NOTE(review): newer scikit-learn releases call this 'log_loss' and no
    # longer accept 'deviance' — confirm against the installed version.
    loss='deviance',
    # Number of boosting stages, i.e. regression trees / weak learners
    # (default 100).
    n_estimators=100,
    # Step size in (0.0, 1.0] that shrinks the contribution of each tree,
    # which is fitted to the residual of the previous stage (default 0.1).
    learning_rate=0.1,
    # Depth of each regression tree (default 3). Tree size can also be
    # limited through max_leaf_nodes instead.
    max_depth=3,
    # Fraction of samples drawn to fit each tree. Values below 1.0 reduce
    # variance and increase bias.
    subsample=1.0,
    # Minimum number of samples required to split an internal node; a float
    # is interpreted as a fraction of the samples.
    min_samples_split=2,
    # Minimum number of samples required in a leaf; a float is interpreted
    # as a fraction of the samples.
    min_samples_leaf=1,
    # Number of features considered when searching for the best split:
    # None = all features, 'sqrt', 'log2', an int count or a float fraction.
    max_features=None,
    # Maximum number of leaf nodes per tree; None means unlimited.
    max_leaf_nodes=None,
    # Impurity threshold below which a node is no longer split.
    # NOTE(review): deprecated in favour of min_impurity_decrease and removed
    # in newer scikit-learn releases — confirm against the installed version.
    min_impurity_split=1e-7,
    # Verbosity; values greater than 1 print progress and performance for
    # every tree.
    verbose=0,
    # True: reuse the previous fit and add more estimators (incremental
    # training). False (default): discard the previous fit and retrain.
    warm_start=False,
    # Random seed, fixed for reproducibility.
    random_state=0,
).fit(X_train, y_train)  # for problems with many classes a random forest is suggested instead

# score() of a classifier is the mean accuracy on the given data
# (not precision).
print(clf.score(X_test, y_test))

# Hard class predictions ...
test_y = clf.predict(X_test)
# ... then overwritten with the predicted probability of the second class.
test_y = clf.predict_proba(X_test)[:, 1]

# Per-feature importances.
print(clf.feature_importances_)

# Training score recorded after each boosting iteration.
print(clf.train_score_)

# Optional precision example (left disabled, as in the original notes):
# from sklearn.metrics import precision_score
# test_y = clf.predict(X_test)
# precision_score(y_test, test_y, average='micro')  # tp / (tp + fp)

from sklearn import metrics

# Hard class predictions and predicted probability of the second class for
# the held-out data, using the classifier fitted earlier in these notes.
y_pre = clf.predict(X_test)
y_pro = clf.predict_proba(X_test)[:, 1]

# AUC, variant 1: build the ROC curve explicitly and integrate it.
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pro)
print("auc : %.4g" % metrics.auc(fpr, tpr))

# AUC, variant 2: roc_auc_score yields the same value in a single call.
# The score is computed on the test split, so it is labelled accordingly.
print("AUC Score (Test): %f" % metrics.roc_auc_score(y_test, y_pro))

# Accuracy; equivalent to clf.score(X_test, y_test).
print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pre))

sklearn.ensemble.GradientBoostingRegressor

import numpy as np
from sklearn.datasets import make_friedman1
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Synthetic regression data: the first 200 rows train the model, the
# remaining 1000 rows are held out for evaluation.
X, y = make_friedman1(n_samples=1200, random_state=0, noise=1.0)
n_train = 200
X_train, y_train = X[:n_train], y[:n_train]
X_test, y_test = X[n_train:], y[n_train:]

est = GradientBoostingRegressor(
    # Loss function: 'ls' (default here) is least-squares regression, 'lad'
    # is least absolute deviation, 'huber' combines the two.
    # NOTE(review): newer scikit-learn releases rename 'ls'/'lad' to
    # 'squared_error'/'absolute_error' — confirm against the installed version.
    loss='ls',
    # Number of boosting stages / regression trees (default 100).
    n_estimators=100,
    # Step size in (0.0, 1.0] applied to each tree's contribution (default 0.1).
    learning_rate=0.1,
    # Depth of each regression tree (default 3); max_leaf_nodes is the
    # alternative way to bound tree size.
    max_depth=3,
    # Fraction of samples used to fit each base learner; below 1.0 lowers
    # variance and raises bias.
    subsample=1,
    # Minimum samples needed to split a node (a float means a fraction).
    min_samples_split=2,
    # Minimum samples needed in a leaf (a float means a fraction).
    min_samples_leaf=1,
    # Features considered per split: None = all, 'sqrt', 'log2', int count
    # or float fraction.
    max_features=None,
    # Maximum number of leaf nodes; None means unlimited.
    max_leaf_nodes=None,
    # Impurity threshold below which a node is no longer split.
    # NOTE(review): removed in newer scikit-learn releases — confirm.
    min_impurity_split=1e-7,
    # Verbosity; values above 1 report progress for every tree.
    verbose=0,
    # True continues from the previous fit by adding trees; False (default)
    # discards it and retrains.
    warm_start=False,
    # Random seed for reproducibility.
    random_state=0
)
est.fit(X_train, y_train)

# Mean squared error on the held-out rows (echoed in an interactive session).
mean_squared_error(y_test, est.predict(X_test))

import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn import ensemble
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.utils import shuffle

# NOTE(review): load_boston is removed from recent scikit-learn releases —
# confirm the installed version before running.
boston = datasets.load_boston()

# Shuffle the rows reproducibly and cast the features to float32.
X, y = shuffle(boston.data, boston.target, random_state=13)
X = X.astype(np.float32)

# Use the first 90% of the shuffled rows for training, the rest for testing.
offset = int(X.shape[0] * 0.9)
X_train, X_test = X[:offset], X[offset:]
y_train, y_test = y[:offset], y[offset:]

# Hyper-parameters can be collected in a dict and unpacked into the estimator.
params = {
    'n_estimators': 500,
    'max_depth': 4,
    'min_samples_split': 2,
    'learning_rate': 0.01,
    'loss': 'ls',
}
clf = ensemble.GradientBoostingRegressor(**params)
clf.fit(X_train, y_train)

# Evaluate on the held-out rows.
test_pred = clf.predict(X_test)
mse = mean_squared_error(y_test, test_pred)
r2 = r2_score(y_test, test_pred)
print("MSE: %.4f" % mse)  # mean squared error
# R^2 (coefficient of determination):
# 1 - sum((y_true - y_pred)^2) / sum((y_true - mean(y_true))^2)
print("r^2 on test data : %f" % r2)

# Test-set loss after every boosting stage, for the learning-curve plot.
n_stages = params['n_estimators']
stages = np.arange(n_stages) + 1
test_score = np.array(
    [clf.loss_(y_test, y_pred) for y_pred in clf.staged_predict(X_test)],
    dtype=np.float64,
)

fig, (ax_dev, ax_imp) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: training vs. test deviance per boosting iteration.
ax_dev.set_title('Deviance')
ax_dev.plot(stages, clf.train_score_, 'b-', label='Training Set Deviance')
ax_dev.plot(stages, test_score, 'r-', label='Test Set Deviance')
ax_dev.legend(loc='upper right')
ax_dev.set_xlabel('Boosting Iterations')
ax_dev.set_ylabel('Deviance')

# Right panel: feature importances scaled so the largest equals 100.
feature_importance = clf.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())
sorted_idx = np.argsort(feature_importance)  # indices in ascending order of importance
pos = np.arange(sorted_idx.shape[0]) + .5
ax_imp.barh(pos, feature_importance[sorted_idx], align='center')
ax_imp.set_yticks(pos)
ax_imp.set_yticklabels(boston.feature_names[sorted_idx])
ax_imp.set_xlabel('Relative Importance')
ax_imp.set_title('Variable Importance')

plt.show()

网格搜索调整超参数

from sklearn.model_selection import GridSearchCV

# Annotated signature template, not runnable as-is: `estimator` and
# `param_grid` are placeholders for a real model and parameter grid.
# NOTE(review): the keyword list matches an older scikit-learn release;
# fit_params and iid no longer exist in newer releases — confirm.
clf=GridSearchCV(

estimator, # the model to tune

param_grid, # dict, or list of dicts, of parameter names -> candidate values

scoring=None,  # how candidate models are scored

fit_params=None, # dict of extra arguments forwarded to fit

n_jobs=1, # number of parallel jobs; -1 uses every available core

iid=True,  # treat the data as identically distributed across the CV folds

refit=True,  # refit the best estimator on the whole dataset

cv=None,   # number of cross-validation folds; None means the default (3 per these notes)

verbose=0, # verbosity: the higher, the more messages

pre_dispatch='2*n_jobs',  # number of jobs dispatched during parallel execution

error_score='raise',  # score assigned when fitting fails; 'raise' re-raises the error

return_train_score=True   # if False, cv_results_ will not include training scores

)

clf.cv_results_  # result table; mean_test_score and std_test_score are the usual columns to look at

clf.cv_results_.keys()  # e.g. clf.cv_results_['mean_test_score']

clf.best_estimator_  # best model found

clf.best_score_  # best score

clf.best_params_  # best parameter combination

from sklearn import metrics
from sklearn.datasets import make_hastie_10_2
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

X, y = make_hastie_10_2(random_state=0)

# test_size is the fraction of the data held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Candidate values for every hyper-parameter that is searched.
tuned_parameters = [{
    'n_estimators': range(20, 81, 10),
    'max_depth': range(3, 14, 2),
    'learning_rate': [0.1, 0.5, 1.0],
    'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
}]

# Metrics to optimise in turn: precision, then recall ('roc_auc' is another
# option).
scores = ['precision', 'recall']

for score in scores:
    print("评测选择 %s" % score)

    clf = GridSearchCV(
        GradientBoostingClassifier(),
        tuned_parameters,
        cv=5,
        scoring='%s_macro' % score,
    )
    clf.fit(X_train, y_train)
    print(clf.best_params_)

    # Mean and spread of the cross-validated score (the metric selected by
    # `scoring`) for every candidate parameter combination.
    cv_results = clf.cv_results_
    for mean, std, params in zip(cv_results['mean_test_score'],
                                 cv_results['std_test_score'],
                                 cv_results['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    # Predict on the held-out data with the refitted best model
    # (clf.predict_proba(X_test) would give probabilities instead).
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    # Accuracy could be reported as well:
    # print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))

XGBoost

xgb原始

import time

import xgboost as xgb
from sklearn import metrics
from sklearn.datasets import make_hastie_10_2
from sklearn.model_selection import train_test_split

# Record the start time so the total run time can be reported at the end.
start_time = time.time()

X, y = make_hastie_10_2(random_state=0)
# make_hastie_10_2 produces -1/+1 labels; map them to 0/1 so they match the
# binary:logistic objective and the 0.5 probability threshold used below.
y = (y > 0).astype(int)

# test_size is the fraction of the data held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

# Wrap the arrays in xgboost's DMatrix container.
xgb_train = xgb.DMatrix(X_train, label=y_train)
xgb_test = xgb.DMatrix(X_test, label=y_test)

# Booster parameters.
params = {
    'booster': 'gbtree',
    # Binary classification with probability output, so that thresholding the
    # predictions at 0.5 is meaningful.
    'objective': 'binary:logistic',
    # 1 suppresses run-time messages; 0 is the better choice while experimenting.
    # NOTE(review): newer xgboost releases use 'verbosity' instead — confirm.
    'silent': 1,
    # 'nthread': 7,  # number of CPU threads; defaults to the maximum
    'eta': 0.007,  # plays the role of the learning rate
    # Minimum sum of the second-order gradient (h) required in a leaf;
    # default 1. For imbalanced 0-1 classification where h is around 0.01,
    # a value of 1 means a leaf needs roughly 100 samples. This parameter
    # strongly affects results: the smaller it is, the easier it is to overfit.
    'min_child_weight': 3,
    'max_depth': 6,  # tree depth; larger values overfit more easily
    # Minimum loss reduction required to split a leaf further; larger is more
    # conservative, values like 0.1 or 0.2 are typical.
    'gamma': 0.1,
    'subsample': 0.7,  # row subsampling ratio of the training samples
    'colsample_bytree': 0.7,  # column subsampling ratio when building a tree
    # L2 regularisation weight; larger values make overfitting less likely.
    'lambda': 2,
    # 'alpha': 0,  # L1 regularisation weight
    # 'scale_pos_weight': 1,  # values > 0 help convergence with unbalanced classes
    # 'objective': 'multi:softmax',  # multi-class problems
    # 'num_class': 10,  # number of classes, used together with multi:softmax
    'seed': 1000,  # random seed
    # 'eval_metric': 'auc'
}
plst = list(params.items())
num_rounds = 100  # number of boosting iterations
watchlist = [(xgb_train, 'train'), (xgb_test, 'val')]

# Train the model. With a large number of rounds, early_stopping_rounds stops
# training once the validation metric has not improved for that many rounds.
# (xgb.train has no `pred_margin` argument, so it is not passed.)
model = xgb.train(plst, xgb_train, num_rounds, watchlist,
                  early_stopping_rounds=100)
# model.save_model('./model/xgb.model')  # persist the trained model

# NOTE(review): best_ntree_limit / ntree_limit belong to older xgboost
# releases; newer ones use best_iteration / iteration_range — confirm.
print("best best_ntree_limit %s" % model.best_ntree_limit)

y_pred = model.predict(xgb_test, ntree_limit=model.best_ntree_limit)

# Fraction of test samples whose thresholded probability disagrees with the
# 0/1 label.
error = ((y_pred > 0.5).astype(int) != y_test).mean()
print('error=%f' % error)

# Report the elapsed run time.
cost_time = time.time() - start_time
print("xgboost success! \ncost time: %s (s)......" % cost_time)

xgb使用sklearn接口(推荐)

官方

使用 sklearn 接口时，名称会发生变化的参数有：

eta -> learning_rate

lambda -> reg_lambda

alpha -> reg_alpha

from sklearn import metrics
from sklearn.datasets import make_hastie_10_2
from sklearn.model_selection import train_test_split
from xgboost.sklearn import XGBClassifier

X, y = make_hastie_10_2(random_state=0)
# make_hastie_10_2 produces -1/+1 labels; use 0/1 class labels instead
# (recent XGBClassifier releases reject labels outside 0..n_classes-1).
y = (y > 0).astype(int)

# test_size is the fraction of the data held out as the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)

clf = XGBClassifier(
    # Whether to print messages while boosting; 1 silences them, 0 is
    # recommended. NOTE(review): newer releases use `verbosity` — confirm.
    silent=0,
    # nthread=4,  # number of CPU threads; defaults to the maximum
    learning_rate=0.3,  # learning rate (called eta in the native API)
    # Minimum sum of the second-order gradient (h) required in a leaf;
    # default 1. For imbalanced 0-1 classification where h is around 0.01,
    # a value of 1 means a leaf needs roughly 100 samples. This parameter
    # strongly affects results: the smaller it is, the easier it is to overfit.
    min_child_weight=1,
    max_depth=6,  # tree depth; larger values overfit more easily
    # Minimum loss reduction required to split a leaf further; larger is more
    # conservative, values like 0.1 or 0.2 are typical.
    gamma=0,
    subsample=1,  # row subsampling ratio of the training instances
    max_delta_step=0,  # maximum delta step allowed for each tree's weight estimate
    colsample_bytree=1,  # column subsampling ratio when building a tree
    # L2 regularisation weight; larger values make overfitting less likely.
    reg_lambda=1,
    # reg_alpha=0,  # L1 regularisation weight
    # scale_pos_weight=1,  # balances positive/negative weights for unbalanced classes
    # objective='multi:softmax',  # learning task / objective for multi-class problems
    # num_class=10,  # number of classes, used together with multi:softmax
    n_estimators=100,  # number of trees
    seed=1000  # random seed
    # eval_metric='auc'
)

# NOTE(review): eval_metric is only evaluated on an eval_set, and newer
# xgboost releases take it in the constructor rather than fit() — confirm.
clf.fit(X_train, y_train, eval_metric='auc')

y_true, y_pred = y_test, clf.predict(X_test)
print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))

# Regression counterpart:
# m_regress = xgb.XGBRegressor(n_estimators=1000, seed=0)

网格搜索

可以先固定一个参数最优化后继续调整

第一步：确定学习速率，并为基于树的参数（tree-based parameters）设定常见的初始值，再根据类别是否不平衡进行调节：

max_depth,min_child_weight,gamma,subsample,scale_pos_weight

max_depth：起始值取 4-6 之间都是不错的选择（例如 max_depth=5）。

min_child_weight = 1：选用较小的值，用于处理类别极不平衡的分类问题。

subsample, colsample_bytree = 0.8: 这个是最常见的初始值了

scale_pos_weight = 1: 这个值是因为类别十分不平衡。

第二步：调整 max_depth 和 min_child_weight，这两个参数对最终结果有很大的影响

'max_depth':range(3,10,2),

'min_child_weight':range(1,6,2)

先大范围地粗调参数，然后再小范围地微调。

第三步：gamma参数调优

'gamma':[i/10.0 for i in range(0,5)]

第四步：调整subsample 和 colsample_bytree 参数

'subsample':[i/100.0 for i in range(75,90,5)],

'colsample_bytree':[i/100.0 for i in range(75,90,5)]

第五步：正则化参数调优

'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]

'reg_lambda'

第六步：降低学习速率

learning_rate =0.01,

from sklearn.model_selection import GridSearchCV

# A broad grid over several parameters at once ...
tuned_parameters = [{'n_estimators': [100, 200, 500],
                     'max_depth': [3, 5, 7],  # or range(3, 10, 2)
                     'learning_rate': [0.5, 1.0],
                     'subsample': [0.75, 0.8, 0.85, 0.9]
                     }]

# ... which is overridden here: only n_estimators is searched in this run,
# with the other parameters fixed on the estimator below.
tuned_parameters = [{'n_estimators': [100, 200, 500, 1000]
                     }]

# NOTE(review): `iid` (GridSearchCV) and `silent`/`nthread` (XGBClassifier)
# are arguments of older library releases — confirm the installed versions.
clf = GridSearchCV(
    XGBClassifier(silent=0, nthread=4, learning_rate=0.5, min_child_weight=1,
                  max_depth=3, gamma=0, subsample=1, colsample_bytree=1,
                  reg_lambda=1, seed=1000),
    param_grid=tuned_parameters,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5)

clf.fit(X_train, y_train)

# Also worth inspecting: clf.cv_results_ (grid_scores_ in very old
# scikit-learn releases), clf.best_params_, clf.best_score_.
print(clf.best_params_)

y_true, y_pred = y_test, clf.predict(X_test)
print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))

# Probability of the second class; the AUC is computed on the test split,
# so it is labelled accordingly.
y_proba = clf.predict_proba(X_test)[:, 1]
print("AUC Score (Test): %f" % metrics.roc_auc_score(y_true, y_proba))

from sklearn.model_selection import GridSearchCV

# Search the learning rate jointly with the number of trees while the other
# parameters stay fixed on the estimator.
parameters = [{'learning_rate': [0.01, 0.1, 0.3],
               'n_estimators': [1000, 1200, 1500, 2000, 2500]}]

clf = GridSearchCV(
    XGBClassifier(
        max_depth=3,
        min_child_weight=1,
        gamma=0.5,
        subsample=0.6,
        colsample_bytree=0.6,
        objective='binary:logistic',  # logistic-regression loss for binary classification
        scale_pos_weight=1,
        reg_alpha=0,
        reg_lambda=1,
        seed=27
    ),
    param_grid=parameters,
    scoring='roc_auc')

clf.fit(X_train, y_train)
print(clf.best_params_)

# Hard predictions and probability of the second class on the test split.
y_pre = clf.predict(X_test)
y_pro = clf.predict_proba(X_test)[:, 1]

print("AUC Score : %f" % metrics.roc_auc_score(y_test, y_pro))
print("Accuracy : %.4g" % metrics.accuracy_score(y_test, y_pre))

输出特征重要性

import pandas as pd
import matplotlib.pylab as plt

# `clf` is the GridSearchCV wrapper fitted above, which has no booster
# accessor of its own; the underlying booster lives on the refitted best
# estimator.
# NOTE(review): get_booster() is the accessor in current xgboost releases
# (very old ones named it booster()) — confirm the installed version.
booster = clf.best_estimator_.get_booster()

# Split counts per feature, most frequently used feature first.
feat_imp = pd.Series(booster.get_fscore()).sort_values(ascending=False)

feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
plt.show()

GBDT输出新特征+blending/stacking/级联森林

R生成新特征

library(xgboost)

# Copy iris and replace the Species column with a numeric class label x
# (0, 1, 2): multi-class xgboost labels start at 0.
# This relies on the iris rows being ordered as 50 rows per species.
training <- iris
x1 = rep(0, 50)
x2 = rep(1, 50)
x3 = rep(2, 50)
x = c(x1, x2, x3)
d = training[, c(1:4)]
training = data.frame(d, x)

# Randomly assign each row to one of two parts: about 70% training data and
# about 30% test data.
ind <- sample(2, nrow(training), replace = TRUE, prob = c(0.7, 0.3))
traindata <- training[ind == 1, ]  # training set
testdata <- training[ind == 2, ]   # test set

# Feature matrices (columns 1-4) and label vectors (column 5).
traindatax = as.matrix(traindata[, c(1:4)])
traindatay = as.matrix(traindata[, 5])
testdatax = as.matrix(testdata[, c(1:4)])
testdatay = as.matrix(testdata[, 5])

# Multi-class model; class labels start at 0 by default.
bst <- xgboost(data = traindatax, label = traindatay, max.depth = 3, eta = 0.1,
               nrounds = 1000, objective = "multi:softmax", num_class = 3)

pred <- predict(bst, testdatax)

# predleaf = TRUE returns, for every sample, the index of the leaf it falls
# into in each tree; these indices are used as new features.
new_feature_train <- predict(bst, traindatax, predleaf = TRUE)
new_feature_test <- predict(bst, testdatax, predleaf = TRUE)

# Original features + leaf-index features + label, side by side.
t_train = cbind(traindatax, new_feature_train, traindatay)
t_test = cbind(testdatax, new_feature_test, testdatay)

python生成GBDT特征

# Leaf indices of every training sample in each tree, usable as new features.
# NOTE(review): assumes clf is a fitted GradientBoostingClassifier or
# XGBClassifier (not a GridSearchCV wrapper, which has no apply()) — confirm.
clf.apply(X_train)