1. 载入数据

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from collections import Counter from sklearn.datasets import load_boston
sns.set_style('darkgrid') boston_dataset = load_boston()
dataset = pd.DataFrame(boston_dataset.data, columns = boston_dataset.feature_names)
dataset.head()

输出:

      CRIM    ZN  INDUS  CHAS    NOX  ...  RAD    TAX  PTRATIO       B  LSTAT
0 0.00632 18.0 2.31 0.0 0.538 ... 1.0 296.0 15.3 396.90 4.98
1 0.02731 0.0 7.07 0.0 0.469 ... 2.0 242.0 17.8 396.90 9.14
2 0.02729 0.0 7.07 0.0 0.469 ... 2.0 242.0 17.8 392.83 4.03
3 0.03237 0.0 2.18 0.0 0.458 ... 3.0 222.0 18.7 394.63 2.94
4 0.06905 0.0 2.18 0.0 0.458 ... 3.0 222.0 18.7 396.90 5.33 [5 rows x 13 columns]

列解释Columns:

  1. CRIM: 按城镇划分的人均犯罪率
  2. ZN: 大于25000平方英尺的住宅用地比例
  3. INDUS: 每个城镇非零售业务英亩比例
  4. CHAS : 查尔斯河流 哑变量 (靠近河流为1; 否则为0)
  5. NOX: 一氧化氮浓度 (百万分之)
  6. RM: 每个住宅的房间数
  7. AGE: 1940年之前建造的自由单位
  8. DIS: 与5个波士顿就业中心的加权距离
  9. RAD: 高速公里同行能力指数
  10. PTRATIO: 按城镇划分的师生比例
  11. B: 1000(Bk — 0.63)², 按城镇划分的非裔人口结构比例
  12. LSTAT: 低收入人口百分比
  13. MEDV: 自有住房的中位数价值(单位:1000美元)

上图看到没有MEDV这个我们要预测的列,先加进来。

dataset['MEDV'] = boston_dataset.target

现在再看就有了。

2. 数据分析

2.1 预处理

看看有没有缺失值

dataset.isnull().sum()

设置特征和标签

X = dataset.iloc[:, 0:13].values
y = dataset.iloc[:, 13].values.reshape(-1,1)

分割训练和测试

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 25)

看看分割结果

print("Shape of X_train: ",X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_train: ",y_train.shape)
print("Shape of y_test",y_test.shape)
输出:
Shape of X_train: (354, 13)
Shape of X_test: (152, 13)
Shape of y_train: (354, 1)
Shape of y_test (152, 1)

2.2 可视化

#相关系数矩阵,即给出了任意两个变量之间的相关系数
corr = dataset.corr()
import matplotlib.pyplot as plt
import seaborn as sns
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr, cmap='RdBu', annot=True, fmt=".2f")
plt.xticks(range(len(corr.columns)), corr.columns)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.show()

sns.pairplot(dataset)
plt.show()

3. 训练模型

3.1 线性拟合

from sklearn.linear_model import LinearRegression
regressor_linear = LinearRegression()
regressor_linear.fit(X_train, y_train)

看看此时的预测得分

from sklearn.metrics import r2_score
# 交叉验证:将数据集分为10折,做一次交叉验证,实际上它是计算了十次,将每一折都当做一次测试集,其余九折当做训练集,这样循环十次。通过传入的模型,训练十次,最后将十次结果求平均值。
cv_linear = cross_val_score(estimator = regressor_linear, X = X_train, y = y_train, cv = 10) # R2 score,即决定系数,反映因变量的全部变异能通过回归关系被自变量解释的比例
y_pred_linear_train = regressor_linear.predict(X_train)
r2_score_linear_train = r2_score(y_train, y_pred_linear_train) y_pred_linear_test = regressor_linear.predict(X_test)
r2_score_linear_test = r2_score(y_test, y_pred_linear_test) # RMSE一般指均方根误差。均方根误差亦称标准误差。
rmse_linear = (np.sqrt(mean_squared_error(y_test, y_pred_linear_test))) print("CV: ", cv_linear.mean())
print('R2_score (train): ', r2_score_linear_train)
print('R2_score (test): ', r2_score_linear_test)
print("RMSE: ", rmse_linear)

输出:

CV:  0.6984854476156042
R2_score (train): 0.7435787589010061
R2_score (test): 0.7133593313710366
RMSE: 4.6472797457242

3.2 多项式回归(二次)

from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree = 2)
X_poly = poly_reg.fit_transform(X_train)
poly_reg.fit(X_poly, y_train)
regressor_poly2 = LinearRegression()
regressor_poly2.fit(X_poly, y_train)

看预测得分

from sklearn.metrics import r2_score
cv_poly2 = cross_val_score(estimator = regressor_poly2, X = X_train, y = y_train, cv = 10) y_pred_poly2_train = regressor_poly2.predict(poly_reg.fit_transform(X_train))
r2_score_poly2_train = r2_score(y_train, y_pred_poly2_train) y_pred_poly2_test = regressor_poly2.predict(poly_reg.fit_transform(X_test))
r2_score_poly2_test = r2_score(y_test, y_pred_poly2_test) rmse_poly2 = (np.sqrt(mean_squared_error(y_test, y_pred_poly2_test)))
print('CV: ', cv_poly2.mean())
print('R2_score (train): ', r2_score_poly2_train)
print('R2_score (test): ', r2_score_poly2_test)
print("RMSE: ", rmse_poly2)

输出略,最后汇总

3.3 脊回归(Ridge Regression),又叫岭回归

lasso 回归和岭回归(ridge regression)其实就是在标准线性回归的基础上分别加入 L1 和 L2 正则化(regularization)

from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures steps = [
('scalar', StandardScaler()),
('poly', PolynomialFeatures(degree=2)),
('model', Ridge(alpha=3.8, fit_intercept=True))
] ridge_pipe = Pipeline(steps)
ridge_pipe.fit(X_train, y_train)

评估

from sklearn.metrics import r2_score

cv_ridge = cross_val_score(estimator = ridge_pipe, X = X_train, y = y_train.ravel(), cv = 10)

y_pred_ridge_train = ridge_pipe.predict(X_train)
r2_score_ridge_train = r2_score(y_train, y_pred_ridge_train) y_pred_ridge_test = ridge_pipe.predict(X_test)
r2_score_ridge_test = r2_score(y_test, y_pred_ridge_test) rmse_ridge = (np.sqrt(mean_squared_error(y_test, y_pred_ridge_test)))
print('CV: ', cv_ridge.mean())
print('R2_score (train): ', r2_score_ridge_train)
print('R2_score (test): ', r2_score_ridge_test)
print("RMSE: ", rmse_ridge)

3.4 Lasso 回归

from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures steps = [
('scalar', StandardScaler()),
('poly', PolynomialFeatures(degree=2)),
('model', Lasso(alpha=0.012, fit_intercept=True, max_iter=3000))
] lasso_pipe = Pipeline(steps)
lasso_pipe.fit(X_train, y_train)

评估

from sklearn.metrics import r2_score

# Predicting Cross Validation Score
cv_lasso = cross_val_score(estimator = lasso_pipe, X = X_train, y = y_train, cv = 10) # Predicting R2 Score the Test set results
y_pred_lasso_train = lasso_pipe.predict(X_train)
r2_score_lasso_train = r2_score(y_train, y_pred_lasso_train) # Predicting R2 Score the Test set results
y_pred_lasso_test = lasso_pipe.predict(X_test)
r2_score_lasso_test = r2_score(y_test, y_pred_lasso_test) # Predicting RMSE the Test set results
rmse_lasso = (np.sqrt(mean_squared_error(y_test, y_pred_lasso_test)))
print('CV: ', cv_lasso.mean())
print('R2_score (train): ', r2_score_lasso_train)
print('R2_score (test): ', r2_score_lasso_test)
print("RMSE: ", rmse_lasso)

3.5 支持向量回归 Support Vector Regression

支持向量分类的方法能被推广到解决回归问题,称为支持向量回归

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()
X_scaled = sc_X.fit_transform(X_train)
y_scaled = sc_y.fit_transform(y_train.reshape(-1,1)) # Fitting the SVR Model to the dataset
from sklearn.svm import SVR
regressor_svr = SVR(kernel = 'rbf', gamma = 'scale')
regressor_svr.fit(X_scaled, y_scaled.ravel())

评估

from sklearn.metrics import r2_score

# Predicting Cross Validation Score
cv_svr = cross_val_score(estimator = regressor_svr, X = X_scaled, y = y_scaled.ravel(), cv = 10) # Predicting R2 Score the Train set results
y_pred_svr_train = sc_y.inverse_transform(regressor_svr.predict(sc_X.transform(X_train)))
r2_score_svr_train = r2_score(y_train, y_pred_svr_train) # Predicting R2 Score the Test set results
y_pred_svr_test = sc_y.inverse_transform(regressor_svr.predict(sc_X.transform(X_test)))
r2_score_svr_test = r2_score(y_test, y_pred_svr_test) # Predicting RMSE the Test set results
rmse_svr = (np.sqrt(mean_squared_error(y_test, y_pred_svr_test)))
print('CV: ', cv_svr.mean())
print('R2_score (train): ', r2_score_svr_train)
print('R2_score (test): ', r2_score_svr_test)
print("RMSE: ", rmse_svr)

3.6 决策树回归 Decision Tree Regression

from sklearn.tree import DecisionTreeRegressor
regressor_dt = DecisionTreeRegressor(random_state = 0)
regressor_dt.fit(X_train, y_train)

评估

from sklearn.metrics import r2_score

# Predicting Cross Validation Score
cv_dt = cross_val_score(estimator = regressor_dt, X = X_train, y = y_train, cv = 10) # Predicting R2 Score the Train set results
y_pred_dt_train = regressor_dt.predict(X_train)
r2_score_dt_train = r2_score(y_train, y_pred_dt_train) # Predicting R2 Score the Test set results
y_pred_dt_test = regressor_dt.predict(X_test)
r2_score_dt_test = r2_score(y_test, y_pred_dt_test) # Predicting RMSE the Test set results
rmse_dt = (np.sqrt(mean_squared_error(y_test, y_pred_dt_test)))
print('CV: ', cv_dt.mean())
print('R2_score (train): ', r2_score_dt_train)
print('R2_score (test): ', r2_score_dt_test)
print("RMSE: ", rmse_dt)

3.7 随机森林回归 Random Forest Regression

from sklearn.ensemble import RandomForestRegressor
regressor_rf = RandomForestRegressor(n_estimators = 500, random_state = 0)
regressor_rf.fit(X_train, y_train.ravel())

评估

from sklearn.metrics import r2_score

# Predicting Cross Validation Score
cv_rf = cross_val_score(estimator = regressor_rf, X = X_scaled, y = y_train.ravel(), cv = 10) # Predicting R2 Score the Train set results
y_pred_rf_train = regressor_rf.predict(X_train)
r2_score_rf_train = r2_score(y_train, y_pred_rf_train) # Predicting R2 Score the Test set results
y_pred_rf_test = regressor_rf.predict(X_test)
r2_score_rf_test = r2_score(y_test, y_pred_rf_test) # Predicting RMSE the Test set results
rmse_rf = (np.sqrt(mean_squared_error(y_test, y_pred_rf_test)))
print('CV: ', cv_rf.mean())
print('R2_score (train): ', r2_score_rf_train)
print('R2_score (test): ', r2_score_rf_test)
print("RMSE: ", rmse_rf)

4. 评估结果汇总

models = [('Linear Regression', rmse_linear, r2_score_linear_train, r2_score_linear_test, cv_linear.mean()),
('Polynomial Regression (2nd)', rmse_poly2, r2_score_poly2_train, r2_score_poly2_test, cv_poly2.mean()),
('Ridge Regression', rmse_ridge, r2_score_ridge_train, r2_score_ridge_test, cv_ridge.mean()),
('Lasso Regression', rmse_lasso, r2_score_lasso_train, r2_score_lasso_test, cv_lasso.mean()),
('Support Vector Regression', rmse_svr, r2_score_svr_train, r2_score_svr_test, cv_svr.mean()),
('Decision Tree Regression', rmse_dt, r2_score_dt_train, r2_score_dt_test, cv_dt.mean()),
('Random Forest Regression', rmse_rf, r2_score_rf_train, r2_score_rf_test, cv_rf.mean())
]

看看表格

predict = pd.DataFrame(data = models, columns=['Model', 'RMSE', 'R2_Score(training)', 'R2_Score(test)', 'Cross-Validation'])
predict

输出:

Model RMSE R2_Score(training) R2_Score(test) Cross-Validation
0 Linear Regression 4.647280 0.743579 0.713359
1 Polynomial Regression (2nd) 4.194313 0.930656 0.766513
2 Ridge Regression 2.853062 0.922818 0.891965
3 Lasso Regression 2.811451 0.923402 0.895094
4 Support Vector Regression 3.838898 0.874272 0.804407
5 Decision Tree Regression 5.723785 1.000000 0.565183
6 Random Forest Regression 3.211470 0.976717 0.863118

5. 可视化评估结果

f, axe = plt.subplots(1,1, figsize=(18,6))

predict.sort_values(by=['Cross-Validation'], ascending=False, inplace=True)

sns.barplot(x='Cross-Validation', y='Model', data = predict, ax = axe)
#axes[0].set(xlabel='Region', ylabel='Charges')
axe.set_xlabel('Cross-Validaton Score', size=16)
axe.set_ylabel('Model')
axe.set_xlim(0,1.0)
plt.show()

f, axes = plt.subplots(2,1, figsize=(14,10))

predict.sort_values(by=['R2_Score(training)'], ascending=False, inplace=True)

sns.barplot(x='R2_Score(training)', y='Model', data = predict, palette='Blues_d', ax = axes[0])
#axes[0].set(xlabel='Region', ylabel='Charges')
axes[0].set_xlabel('R2 Score (Training)', size=16)
axes[0].set_ylabel('Model')
axes[0].set_xlim(0,1.0) predict.sort_values(by=['R2_Score(test)'], ascending=False, inplace=True) sns.barplot(x='R2_Score(test)', y='Model', data = predict, palette='Reds_d', ax = axes[1])
#axes[0].set(xlabel='Region', ylabel='Charges')
axes[1].set_xlabel('R2 Score (Test)', size=16)
axes[1].set_ylabel('Model')
axes[1].set_xlim(0,1.0) plt.show()

predict.sort_values(by=['RMSE'], ascending=False, inplace=True)

f, axe = plt.subplots(1,1, figsize=(18,6))
sns.barplot(x='Model', y='RMSE', data=predict, ax = axe)
axe.set_xlabel('Model', size=16)
axe.set_ylabel('RMSE', size=16) plt.show()

[Python] 波士顿房价的7种模型(线性拟合、二次多项式、Ridge、Lasso、SVM、决策树、随机森林)的训练效果对比的更多相关文章

  1. 算法岗面试题:模型的bias和variance是什么?用随机森林举例

    校招在即,准备准备一些面试可能会用到的东西吧.希望这次面试不会被挂. 基本概念 说到机器学习模型的误差,主要就是bias和variance. Bias:如果一个模型的训练错误大,然后验证错误和训练错误 ...

  2. Python 实现的随机森林

    随机森林是一个高度灵活的机器学习方法,拥有广泛的应用前景,从市场营销到医疗保健保险. 既可以用来做市场营销模拟的建模,统计客户来源,保留和流失.也可用来预测疾病的风险和病患者的易感性. 随机森林是一个 ...

  3. Python机器学习笔记——随机森林算法

    随机森林算法的理论知识 随机森林是一种有监督学习算法,是以决策树为基学习器的集成学习算法.随机森林非常简单,易于实现,计算开销也很小,但是它在分类和回归上表现出非常惊人的性能,因此,随机森林被誉为“代 ...

  4. H2O中的随机森林算法介绍及其项目实战(python实现)

    H2O中的随机森林算法介绍及其项目实战(python实现) 包的引入:from h2o.estimators.random_forest import H2ORandomForestEstimator ...

  5. 机器学习之路:python 集成分类器 随机森林分类RandomForestClassifier 梯度提升决策树分类GradientBoostingClassifier 预测泰坦尼克号幸存者

    python3 学习使用随机森林分类器 梯度提升决策树分类 的api,并将他们和单一决策树预测结果做出对比 附上我的git,欢迎大家来参考我其他分类器的代码: https://github.com/l ...

  6. Python数据科学手册-机器学习: 决策树与随机森林

    无参数 算法 随机森林 随机森林是一种集成方法,集成多个比较简单的评估器形成累计效果. 导入标准程序库 随机森林的诱因: 决策树 随机森林是建立在决策树 基础上 的集成学习器 建一颗决策树 二叉决策树 ...

  7. 笔记+R︱风控模型中变量粗筛(随机森林party包)+细筛(woe包)

    每每以为攀得众山小,可.每每又切实来到起点,大牛们,缓缓脚步来俺笔记葩分享一下吧,please~ --------------------------- 本内容来源于CDA-DSC课程内容,原内容为& ...

  8. 机器学习之路:python 集成回归模型 随机森林回归RandomForestRegressor 极端随机森林回归ExtraTreesRegressor GradientBoostingRegressor回归 预测波士顿房价

    python3 学习机器学习api 使用了三种集成回归模型 git: https://github.com/linyi0604/MachineLearning 代码: from sklearn.dat ...

  9. AdaBoost 算法-分析波士顿房价数据集

    公号:码农充电站pro 主页:https://codeshellme.github.io 在机器学习算法中,有一种算法叫做集成算法,AdaBoost 算法是集成算法的一种.我们先来看下什么是集成算法. ...

随机推荐

  1. Google Developer Days 2019 & GDD

    Google Developer Days 2019 2019 Google 开发者大会 GDD Google Developer Days https://events.google.cn/intl ...

  2. rename github

    rename GitHub github repo rename xgqfrms 2012-2020 www.cnblogs.com 发布文章使用:只允许注册用户才可以访问!

  3. cookie & session & token compare

    cookie & session & token compare cookie.session.token 区别和优缺点 存储位置 cookie 存在 client 端 session ...

  4. 微信小程序 UI 组件库

    微信小程序 UI 组件库 Vant Weapp 需要注意的是 package.json 和 node_modules 必须在 miniprogram 目录下 $ yarn add @vant/weap ...

  5. Sketch & UI & PS

    Sketch & UI & PS app ui https://sketchapp.com/learn https://www.sketch.com/docs/ https://ske ...

  6. 软件工程中的CI&CD

    wiki 在软件工程中,CI/CD或CICD通常是指持续集成以及持续交付或持续部署的组合实践 持续集成 在软件工程中,持续集成(CI)是每天将所有开发人员的工作副本合并到共享主线中的一种做法.[1] ...

  7. django学习-5.获取url参数和name的作用

    1.前言 假如我们要打开这两个博客园地址:[https://www.cnblogs.com/xiamen-momo/archive/2020/11.html].[https://www.cnblogs ...

  8. 工具类:每次随机生成有销售库存有实际库存的1个店铺商品和对应的2个店铺商品sku

    # coding:utf-8 # @fileName :2.每次随机生成有销售库存有实际库存的1个店铺商品和对应的2个店铺商品sku.py # @createTime :2020/4/4 10:33 ...

  9. 发布Jar包到maven中央仓库

    什么是maven中央仓库 maven是java世界最流行的构建工具,构建内容囊括了一个java项目的整个生命周期.其中最重要的功能就是依赖管理,maven通过一个类似云的ftp站点统一管理所有java ...

  10. 1053 Path of Equal Weight——PAT甲级真题

    1053 Path of Equal Weight 给定一个非空的树,树根为 RR. 树中每个节点 TiTi 的权重为 WiWi. 从 RR 到 LL 的路径权重定义为从根节点 RR 到任何叶节点 L ...