Linear Regression and Ridge Regression
Source: https://www.cnblogs.com/pinard/p/6023000.html
Linear regression with cross-validation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets,linear_model
Read the data from the Excel file
data = pd.read_excel(r"F:\data\CCPP\Folds5x2_pp.xlsx")
x = data[['AT', 'V', 'AP', 'RH']]
y = data[['PE']]
Split into training and test sets
from sklearn.model_selection import train_test_split
x is the feature matrix to split and y the labels; after train_test_split, x_train and x_test hold the training and test features, and y_train and y_test the corresponding training and test labels
x_train,x_test,y_train,y_test = train_test_split(x, y, random_state=1)  # use test_size to set the split ratio
Import the linear model
from sklearn.linear_model import LinearRegression
linreg = LinearRegression()  # linear regression model
Fit the model
linreg.fit(x_train,y_train)
print("linreg.intercept_",linreg.intercept_,"linreg.coef_",linreg.coef_)
Evaluate the model on the test set
y_pred = linreg.predict(x_test)
from sklearn import metrics
Compute MSE with scikit-learn
print("MSE:",metrics.mean_squared_error(y_test, y_pred))
Compute RMSE with scikit-learn
print("RMSE:",np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
Use cross-validation to evaluate the model
from sklearn.model_selection import cross_val_predict
cv sets the number of cross-validation folds (here 100-fold)
predicted = cross_val_predict(linreg, x, y, cv=100)
print("predicted:",predicted.shape)
Compute MSE with scikit-learn
print("MSE:",metrics.mean_squared_error(y, predicted))
Compute RMSE with scikit-learn
print ("RMSE:",np.sqrt(metrics.mean_squared_error(y, predicted)))
Plot predicted vs. measured values
fig, ax = plt.subplots()
ax.scatter(y, predicted)
ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.show()
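An alternative worth noting (a minimal sketch, not from the original post) is cross_val_score, which scores each fold separately instead of pooling all out-of-fold predictions:
from sklearn.model_selection import cross_val_score
# scikit-learn reports negated MSE for this scoring string, so flip the sign
scores = cross_val_score(linreg, x, y, cv=10, scoring='neg_mean_squared_error')
print("per-fold MSE:", -scores)
print("mean RMSE:", np.sqrt(-scores.mean()))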
Ridge regression: selecting the hyperparameter α with scikit-learn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets,linear_model
from sklearn import metrics
Read the data from the Excel file
data = pd.read_excel(r"F:\data\CCPP\Folds5x2_pp.xlsx")
x = data[['AT', 'V', 'AP', 'RH']]
y = data[['PE']]
Split into training and test sets
from sklearn.model_selection import train_test_split
x is the feature matrix to split and y the labels; after train_test_split, x_train and x_test hold the training and test features, and y_train and y_test the corresponding training and test labels
x_train,x_test,y_train,y_test = train_test_split(x, y, random_state=1)  # use test_size to set the split ratio
n_alphas = 200
alphas = np.logspace(-10,-2,n_alphas)
print("alphas:",alphas)
clf = linear_model.Ridge(fit_intercept=False)
coefs = []
for a in alphas:
    # set this iteration's hyperparameter
    clf.set_params(alpha=a)
    # fit a ridge regression for each alpha; ravel y so coef_ is 1-D and plots cleanly
    clf.fit(x_train, y_train.values.ravel())
    y_predict = clf.predict(x_test)
    error = metrics.mean_squared_error(y_test, y_predict)  # compute the MSE
    print("error:", error)
    # store the coefficients theta for each alpha
    coefs.append(clf.coef_)
print("coefs:", coefs)
ax = plt.gca()
ax.plot(alphas, coefs)
# use a log scale for the alpha axis to make the plot readable
ax.set_xscale('log')
# reverse the x-axis so alpha runs from large to small
ax.set_xlim(ax.get_xlim()[::-1])
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
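Instead of looping over alphas by hand, scikit-learn's RidgeCV can select α by cross-validation directly — a minimal sketch over the same grid and data as above:
from sklearn.linear_model import RidgeCV
# search the same alpha grid; RidgeCV uses efficient leave-one-out CV by default
ridge_cv = RidgeCV(alphas=np.logspace(-10, -2, 200), fit_intercept=False)
ridge_cv.fit(x_train, y_train.values.ravel())
print("best alpha:", ridge_cv.alpha_)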
Hand-written logistic regression code
A scikit-learn version, kept commented out in the original:
# from sklearn import datasets
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import StandardScaler
# from sklearn.metrics import accuracy_score
# import matplotlib.pyplot as plt
# from matplotlib.colors import ListedColormap
# from mlxtend.plotting import plot_decision_regions
# import numpy as np
# iris = datasets.load_iris()
# x = iris.data[:,[2,3]]
# y = iris.target
# print("y:",y)
# X_train,X_test,y_train,y_test = train_test_split(x , y, test_size=0.3, random_state = 1)
# print("x_train:",X_test.shape)
# print("y_train;",y_test.shape)
# # sigmoid function
# def sigmoid(z):
# return 1.0/(1.0+np.exp(-z))
# sc = StandardScaler()
# sc.fit(X_train)
# X_train_std = sc.transform(X_train)
# X_test_std = sc.transform(X_test)
# print("X_test:",X_test)
# lr = LogisticRegression(C=1000.0, random_state=1)
# lr.fit(X_train_std, y_train)
# y_pred = lr.predict(X_test_std)
# print("y_test:",y_test)
# print("y_pred:",y_pred)
# print("score:",accuracy_score(y_test, y_pred))
# print("lr.coef_:",lr.coef_)
# X_combined_std = np.vstack((X_train_std,X_test_std))
# y_combined = np.hstack((y_train,y_test))
# plot_decision_regions(X=X_combined_std,y=y_combined,clf=lr,legend = 2)
# plt.xlabel('petal length [standardized]')
# plt.ylabel('petal width [standardized]')
# plt.legend(loc='upper right')
# plt.savefig('Iris.png')
# plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Define the LR model (hand-written; named Scratch to avoid clashing with sklearn's LogisticRegression)
class LogisticRegressionScratch:
    def __init__(self, max_iter=200, learning_rate=0.01):
        self.max_iter = max_iter
        self.learning_rate = learning_rate

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def data_matrix(self, X):
        # prepend a constant 1.0 to each sample for the bias term
        data_mat = []
        for d in X:
            data_mat.append([1.0, *d])
        return data_mat

    # training: per-sample gradient updates
    def train(self, X, y):
        data_mat = self.data_matrix(X)  # m*n
        self.weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32)
        for iter_ in range(self.max_iter):
            for i in range(len(X)):
                result = self.sigmoid(np.dot(data_mat[i], self.weights))
                error = y[i] - result
                self.weights += self.learning_rate * error * np.transpose([data_mat[i]])
        print('LR model: learning_rate={}, max_iter={}'.format(self.learning_rate, self.max_iter))

    # accuracy on a test set
    def accuracy(self, X_test, y_test):
        right = 0
        X_test = self.data_matrix(X_test)
        for x, y in zip(X_test, y_test):
            result = np.dot(x, self.weights)
            if (result > 0 and y == 1) or (result < 0 and y == 0):
                right += 1
        return right / len(X_test)

# Build the dataset: the first 100 iris samples (two classes), using two features
def create_data():
    iris = load_iris()
    df = pd.DataFrame(iris.data, columns=iris.feature_names)
    df['label'] = iris.target
    df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
    data = np.array(df.iloc[:100, [0, 1, -1]])
    return data[:, :2], data[:, -1]

X, y = create_data()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Train the model
LR = LogisticRegressionScratch()
LR.train(X_train, y_train)
# Compute test accuracy
score = LR.accuracy(X_test, y_test)
print("score:", score)
# Plot the decision boundary: w0 + w1*x1 + w2*x2 = 0  =>  x2 = -(w1*x1 + w0)/w2
x_points = np.arange(3, 9)
y_ = -(LR.weights[1] * x_points + LR.weights[0]) / LR.weights[2]
plt.plot(x_points, y_)
# Plot the two classes
plt.scatter(X[:50, 0], X[:50, 1], label='0')
plt.scatter(X[50:, 0], X[50:, 1], label='1')
plt.legend()
plt.show()
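For comparison (my own sketch, not the author's code), the same update can be written as vectorized batch gradient descent, replacing the per-sample Python loop with one matrix step per iteration; it assumes the X_train and y_train produced above:
def train_batch(X, y, max_iter=200, learning_rate=0.01):
    # prepend a bias column and update all samples at once
    Xb = np.hstack([np.ones((len(X), 1)), np.asarray(X, dtype=float)])
    w = np.zeros(Xb.shape[1])
    yv = np.asarray(y, dtype=float)
    for _ in range(max_iter):
        p = 1 / (1 + np.exp(-Xb @ w))         # predicted probabilities
        w += learning_rate * Xb.T @ (yv - p)  # batch gradient step
    return w
w_batch = train_batch(X_train, y_train)
print("batch weights:", w_batch)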
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import seaborn as sn
import matplotlib.pyplot as plt
Step 1: build the data
candidates = {'gmat': [780,750,690,710,680,730,690,720,740,690,610,690,710,680,770,610,580,650,540,590,620,600,550,550,570,670,660,580,650,660,640,620,660,660,680,650,670,580,590,690],
'gpa': [4,3.9,3.3,3.7,3.9,3.7,2.3,3.3,3.3,1.7,2.7,3.7,3.7,3.3,3.3,3,2.7,3.7,2.7,2.3,3.3,2,2.3,2.7,3,3.3,3.7,2.3,3.7,3.3,3,2.7,4,3.3,3.3,2.3,2.7,3.3,1.7,3.7],
'work_experience': [3,4,3,5,4,6,1,4,5,1,3,5,6,4,3,1,4,6,2,3,2,1,4,1,2,6,4,2,6,5,1,2,4,6,5,1,2,1,4,5],
'admitted': [1,1,1,1,1,1,0,1,1,0,0,1,1,1,1,0,0,1,0,0,0,0,0,0,0,1,1,0,1,1,0,0,1,1,1,0,0,0,0,1] }
df = pd.DataFrame(candidates,columns= ['gmat', 'gpa','work_experience','admitted'])
X = df[['gmat', 'gpa','work_experience']]
y = df['admitted']
75% of the data is used for training, 25% for testing
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.25,random_state=0)
logistic_regression= LogisticRegression()
Train
logistic_regression.fit(X_train,y_train)
Predict
y_pred=logistic_regression.predict(X_test)
Plot the confusion matrix as a heatmap
confusion_matrix = pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted'])
sn.heatmap(confusion_matrix, annot=True)
plt.show()
print('Accuracy: ',metrics.accuracy_score(y_test, y_pred))
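Beyond overall accuracy, a per-class breakdown is often more informative — a small add-on sketch using the y_test and y_pred computed above:
from sklearn.metrics import classification_report, confusion_matrix
# numeric confusion matrix plus per-class precision, recall and F1
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))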
<?php //函数阶乘 函数调用自身,函数在执行的时候每次都会开辟一个空间,如 /** * $a =3的话,首先判断 3>1 为真 $r=3*demo(3-1) 开辟一个空间调用自身. ...