03.Regression
01.regression
# -*- coding: utf-8 -*-
"""
scipy 패키지 선형 회귀분석
"""
from scipy import stats #선형 회귀분석 모듈
import pandas as pd
# Load the score/IQ data set from a path relative to the working directory.
score_df = pd.read_csv("../data/score_iq.csv")

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
score_df.info()  # 150 rows x 6 columns
"""
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
sid 150 non-null int64
score 150 non-null int64
iq 150 non-null int64
academy 150 non-null int64
game 150 non-null int64
tv 150 non-null int64
dtypes: int64(6)
"""

# Preview the first five rows.
print(score_df.head())
"""
sid score iq academy game tv
0 10001 90 140 2 1 0
1 10002 75 125 1 3 3
2 10003 77 120 1 0 4
3 10004 83 135 2 3 2
4 10005 65 105 0 4 4
"""
# 1) Simple linear regression
# One independent variable (x) -> one dependent variable (y).

# Select the modelling variables.
x = score_df["iq"]     # predictor
y = score_df["score"]  # response

# Fit the simple linear regression model.
model = stats.linregress(x, y)

# Fitted result.
print('model=', model)
"""
model= LinregressResult(
slope=0.6514309527270075,      -> slope
intercept=-2.8564471221974657, -> intercept
rvalue=0.8822203446134699,     -> correlation coefficient r (closer to 1 is better; r**2 is the explained variance)
pvalue=2.8476895206683644e-50, -> significance of the slope (not meaningful if larger than 0.05)
stderr=0.028577934409305443)   -> standard error of the slope
"""

# Regression equation (a first-degree function): Y = a*X + b
# (a: slope, b: intercept).
# First row of the data: score = 90, iq = 140.
# The intercept is ADDED; subtracting it (as the earlier version did) gives
# 94.06 instead of the 88.34 shown below.
Y = model.slope * 140 + model.intercept
print("점수 예측치=", Y)  # predicted score = 88.34388625958358

# Residual for that row: observed score minus predicted score.
err = 90 - Y
print("모델 오차=", err)  # model error = 1.6561137404164157

# Individual fields of the fitted result.
print('x 기울기=', model.slope)      # slope = 0.6514309527270075
print('x 절편=', model.intercept)    # intercept = -2.8564471221974657
print('x 설명력=', model.rvalue)     # r value = 0.8822203446134699
print('x 유의성=', model.pvalue)     # p-value = 2.8476895206683644e-50
print('x 표준오차=', model.stderr)   # standard error = 0.028577934409305443
# 2) Multiple linear regression
#    - two or more independent variables (X)
import statsmodels.formula.api as sm

# Correlation matrix of all columns, used to pick the predictors.
corr = score_df.corr()
print("상관 계수 행렬")
print(corr)
"""
sid score iq academy game tv
sid 1.000000 -0.014399 -0.007048 -0.004398 0.018806 0.024565
score -0.014399 1.000000 0.882220 0.896265 -0.298193 -0.819752
iq -0.007048 0.882220 1.000000 0.671783 -0.031516 -0.585033
academy -0.004398 0.896265 0.671783 1.000000 -0.351315 -0.948551
game 0.018806 -0.298193 -0.031516 -0.351315 1.000000 0.239217
tv 0.024565 -0.819752 -0.585033 -0.948551 0.239217 1.000000
"""

# Model specification: X (iq, academy) -> y (score).
model = sm.ols(formula="score ~ iq + academy", data=score_df).fit()
print("model", model)  # object info
# model <statsmodels.regression.linear_model.RegressionResultsWrapper object at 0x000000000CEAC588>

# Model parameters: slopes and intercept.
print(model.params)
"""
Intercept 25.229141 -> intercept
iq 0.376966         -> slope of X1
academy 2.992800    -> slope of X2
dtype: float64
"""

# Multiple linear regression equation, applied to the first row below.
print(score_df.head())
"""
sid score iq academy game tv
0 10001 90 140 2 1 0
1 10002 75 125 1 3 3
2 10003 77 120 1 0 4
3 10004 83 135 2 3 2
4 10005 65 105 0 4 4
"""
# Use the fitted coefficients instead of hand-copied rounded values, so the
# prediction stays correct if the data or the formula changes.
coef = model.params
Y = coef["iq"] * 140 + coef["academy"] * 2 + coef["Intercept"]
print("예측치=", Y)  # prediction is about 83.99 (83.989981 with the rounded coefficients)

# Full model report.
print(model.summary())
"""
OLS Regression Results
==============================================================================
Dep. Variable: score R-squared: 0.946
Model: OLS Adj. R-squared: 0.946
Method: Least Squares F-statistic: 1295.
Date: Sat, 16 Feb 2019 Prob (F-statistic): 4.50e-94
Time: 11:23:48 Log-Likelihood: -275.05
No. Observations: 150 AIC: 556.1
Df Residuals: 147 BIC: 565.1
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 25.2291 2.187 11.537 0.000 20.907 29.551
iq 0.3770 0.019 19.786 0.000 0.339 0.415
academy 2.9928 0.140 21.444 0.000 2.717 3.269
==============================================================================
Omnibus: 36.342 Durbin-Watson: 1.913
Prob(Omnibus): 0.000 Jarque-Bera (JB): 54.697
Skew: 1.286 Prob(JB): 1.33e-12
Kurtosis: 4.461 Cond. No. 2.18e+03
==============================================================================
Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.18e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
"""
# How to read the report:
# 1. Prob (F-statistic): 4.50e-94 -> significance of the model (should be below 0.05)
# 2. Adj. R-squared: 0.946        -> explanatory power (the closer to 1 the better)
# 3. P>|t|                        -> significance test of each X (below 0.05 is good)
02.dot_regression
# -*- coding: utf-8 -*-
"""
회귀모형 예측에 행렬곱(dot) 적용예
"""
import pandas as pd
import numpy as np
# 1. Load the data set
score_df = pd.read_csv("../data/score_iq.csv")
print(score_df.head())  # 6 columns
"""
sid score iq academy game tv
0 10001 90 140 2 1 0
1 10002 75 125 1 3 3
2 10003 77 120 1 0 4
3 10004 83 135 2 3 2
4 10005 65 105 0 4 4
"""

# 2. Build the subset used for modelling
score_arr = score_df[['score', 'iq', 'academy']]  # 3 columns
print(score_arr.shape)  # (150, 3)
# info() prints its own report and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
score_arr.info()
"""
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 3 columns):
score 150 non-null int64
iq 150 non-null int64
academy 150 non-null int64
dtypes: int64(3)
memory usage: 3.6 KB
"""

# 3. Select the X and y variables
# Positional selection uses .iloc; the old .ix indexer no longer exists in
# current pandas releases.
score_X = score_arr.iloc[:, 1:]  # iq, academy -> (150, 2), two-dimensional
score_y = score_arr.iloc[:, 0]   # score       -> (150,), one-dimensional
print(score_X.shape)  # (150, 2)
print(score_y.shape)  # (150,)
# 4. Slopes and intercept (taken from the fitted multiple regression model)
"""
Intercept 25.229141 -> intercept
iq 0.376966         -> slope of X1
academy 2.992800    -> slope of X2
dtype: float64
"""
# Slopes as a (2, 1) column vector so they can be matrix-multiplied with X.
slope = np.array([[0.376966], [2.992800]])  # two-dimensional
intercept = 25.229141                       # scalar

# Y = (a1*x1 + a2*x2) + b
# (a1*x1 + a2*x2) is a matrix product.

# 5. Apply the matrix product (dot)
print(score_X.shape)  # (150, 2)
print(slope.shape)    # (2, 1)
# (150, 2) . (2, 1) = (150, 1)
matmul = np.dot(score_X, slope)
Y = matmul + intercept
print(Y)
"""
[[83.989981]
[75.342691]
...
[73.457861]]
"""

# 6. Evaluate the model (observed values vs. predictions)
# Y       = predictions
# score_y = observed values
print(Y.shape)        # (150, 1) two-dimensional
print(score_y.shape)  # (150,)   one-dimensional

# Flatten the predictions to one dimension. reshape(-1) infers the length, so
# this does not depend on the data set having exactly 150 rows.
Y_fitted = Y.reshape(-1)  # (150,)
df = pd.DataFrame({"Y_fitted": Y_fitted, 'score': score_y})
print(df)  # (150, 2)

# Correlation between predictions and observed values
print(df.head())
"""
Y_fitted score
0 83.989981 90
1 75.342691 75
2 73.457861 77
3 82.105151 83
4 64.810571 65
"""
cor = df.Y_fitted.corr(df.score)
print('corr=', cor)  # corr= 0.9727792069594755
03.sklearn_Dataset
# -*- coding: utf-8 -*-
"""
Data sets bundled with scikit-learn.
"""
from sklearn import datasets
import numpy as np

# 1. Data sets suited to linear regression
# 1) iris
iris = datasets.load_iris()
print(iris)
iris_x = iris.data    # x
iris_y = iris.target  # y
print(type(iris_x))      # <class 'numpy.ndarray'>
print(np.shape(iris_x))  # (150, 4)
print(np.shape(iris_y))  # (150,)
print(iris_x)
"""
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]]
"""
print(iris_y)
"""
[0 0 ... 0 0]
"""
# Categories of y
print(list(iris.target_names))  # ['setosa', 'versicolor', 'virginica'] -> 0, 1, 2

# 2) diabetes data set
diabetes = datasets.load_diabetes()
diabetes_x = diabetes.data    # x
diabetes_y = diabetes.target  # y
print(diabetes_x.shape)  # (442, 10)
print(diabetes_y.shape)  # (442,)
print(diabetes_y)

# 3) Boston housing data set
# NOTE(review): newer scikit-learn releases no longer ship load_boston; the
# call is guarded so the rest of the script still runs there.
try:
    boston = datasets.load_boston()
except (ImportError, AttributeError):
    print("load_boston is not available in this scikit-learn version")
else:
    boston_x = boston.data
    boston_y = boston.target
    print(boston_x.shape)  # (506, 13)
    print(boston_y.shape)  # (506,)
    print(boston.feature_names)
    # ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO' 'B' 'LSTAT']

# 2. Data sets suited to classification
# 4) wine data set: multi-class classification (softmax function)
#    e.g. class_0: 0.98 + class_1: 0.01 + class_2: 0.01 = 1
wine = datasets.load_wine()
wine_x = wine.data    # (178, 13)
wine_y = wine.target  # (178,)
print(wine.target_names)  # ['class_0' 'class_1' 'class_2']
print(wine_x.shape)       # (178, 13)
print(wine_y)
"""
[0 0 0 ... 1 1 1 ... 2 2 2]
"""

# 5) breast cancer data set: binary classification (sigmoid function)
#    YES: above 0.5, NO: below 0.5
breast = datasets.load_breast_cancer()
print(breast.data.shape)    # (569, 30)
print(breast.target.shape)  # (569,)
print(breast.target_names)  # ['malignant' 'benign']
print(breast)
04.sklearn_Regression
# -*- coding: utf-8 -*-
"""
sklearn 관련 Regressin모델
- y변수가 연속인 경우
"""
import pandas as pd
from sklearn import datasets
from sklearn.linear_model import LinearRegression #model
from sklearn.model_selection import train_test_split #train set VS test set
from sklearn.metrics import mean_squared_error #MES (평균제곱 오차)
# 1. Load the data set
iris = pd.read_csv("../data/iris.csv")
# info() prints its own report and returns None, so it is not wrapped in
# print() (that would add a stray "None" line).
iris.info()
"""
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
Sepal.Length 150 non-null float64
Sepal.Width 150 non-null float64
Petal.Length 150 non-null float64
Petal.Width 150 non-null float64
Species 150 non-null object
dtypes: float64(4), object(1)
"""
print(iris.head())
"""
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa
"""

# 2. Select the variables (x, y)
cols = list(iris.columns)
print(cols)
# ['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species']
x_cols = cols[1:4]  # 'Sepal.Width', 'Petal.Length', 'Petal.Width'
y_cols = cols[0]    # 'Sepal.Length'

# Subset: the four numeric columns only (Species is dropped).
data_df = iris[cols[:4]]
print(data_df.shape)  # (150, 4)

# 3. Train set (70%) / test set (30%)
# The split is random; a fixed random_state reproduces the same split.
iris_train, iris_test = train_test_split(
    data_df, test_size=0.3, random_state=123)
print(iris_train.shape)  # (105, 4) used to build the model
print(iris_test.shape)   # (45, 4)  used to validate the model
# 4. Build the model
# help(LinearRegression)
# class -> object (default settings)
lr_model = LinearRegression()

# Split each partition into predictors and response once, then reuse them.
train_x, train_y = iris_train[x_cols], iris_train[y_cols]
test_x, test_y = iris_test[x_cols], iris_test[y_cols]

# fit(train_x, train_y): learn the model from the training set.
lr_model.fit(train_x, train_y)

# Regression coefficients (slopes) and intercept.
print("기울기=", lr_model.coef_)       # slopes = [ 0.63924286 0.75744562 -0.68796484]
print("절편=", lr_model.intercept_)    # intercept = 1.8609363992411732

# 5. Evaluate the model
# 1) score on the training set
model_socre1 = lr_model.score(train_x, train_y)
# 2) score on the test set
model_socre2 = lr_model.score(test_x, test_y)

# 1. score
print('train_model score=', model_socre1)  # train_model score= 0.8581515699458577
print('test_model score=', model_socre2)   # test_model score= 0.854680765745176

# Model predictions vs. observed values on the test set.
pred = lr_model.predict(test_x)  # predictions
Y = test_y                       # observed values

# 2. Mean squared error (MSE)
MSE = mean_squared_error(Y, pred)  # (observed, predicted)
print('MSE=', MSE)  # MSE= 0.11633863200224713
######################
### load_iris()
######################
from sklearn.datasets import load_iris

# 1. Load the data
iris = load_iris()

# 2. Select the variables
X = iris.data    # x
y = iris.target  # y: class labels 0-2, used here as a numeric response
print(X.shape)  # (150, 4)
print(y.shape)  # (150,)

# 3. Train / test split (7:3)
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=123)
print(x_train.shape)  # (105, 4) - columns 1 to 4
print(x_test.shape)   # (45, 4)
print(y_train.shape)  # (105,)   - column 5
print(y_test.shape)   # (45,)

# 4. Build the model from the training set
lr_model2 = LinearRegression()
lr_model2.fit(x_train, y_train)  # train -> model
print(lr_model2.coef_)       # slopes [-0.12591445 -0.0481559 0.24484363 0.57025678]
print(lr_model2.intercept_)  # intercept 0.2537496076784179

# 5. Evaluate the model on the test set
# 1) score
model_score = lr_model2.score(x_test, y_test)
print(model_score)  # 0.9427868501294299

# 2) MSE (predictions vs. observed values)
pred = lr_model2.predict(x_test)
Y = y_test
MSE = mean_squared_error(Y, pred)  # (observed, predicted)
print('MSE=', MSE)  # MSE= 0.04447086315865546

# The same value computed by hand: E = pred - Y, squared = E^2, then the mean.
import numpy as np
manual_mse = np.mean((pred - Y) ** 2)
# Print the hand-computed value (the earlier version re-printed MSE here, so
# the manual calculation was never actually shown).
print('MSE=', manual_mse)  # MSE= 0.04447086315865546

# 3) Visual evaluation: predictions and observed values on one chart
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(20, 5))
chart = fig.add_subplot(1, 1, 1)
chart.plot(pred, color='r', label="pred")
chart.plot(Y, color='b', label="y")
plt.legend(loc='best')
plt.show()
05.LogisticRegression
# -*- coding: utf-8 -*-
"""
sklearn logistic Regreesion
- y변수가 범주인 경우
"""
from sklearn.datasets import load_iris           # multi-class classification data
from sklearn.datasets import load_breast_cancer  # binary classification data
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
# pandas must be bound to `pd`: the script calls pd.crosstab below. It was
# previously imported under the alias `np`, which left `pd` undefined.
import pandas as pd

#####################################
## 1. load_breast_cancer : binary classification
#####################################
# 1. Load the data
breast = load_breast_cancer()

# 2. Select the variables
X = breast.data
y = breast.target
print(X.shape, y.shape)  # (569, 30) (569,)

# 3. Build the model
# help(LogisticRegression)
# 1. random_state : random seed
# 2. solver       : optimisation algorithm
#    {'newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'} default: 'liblinear'
#    NOTE(review): the default solver depends on the installed scikit-learn
#    version - confirm with help(LogisticRegression).
#    small data sets       : 'liblinear'
#    large data sets       : 'sag', 'saga'
#    multi-class problems  : 'newton-cg', 'lbfgs'
#    multinomial classification : 'multinomial'
# Usage examples:
# 1. ordinary data set, binary classification      : default
# 2. ordinary data set, multi-class classification : solver='lbfgs', multi_class="multinomial"
# 3. big data set, binary classification           : solver='sag'

# Model object
lr_model = LogisticRegression(random_state=0)
lr_model.fit(X, y)  # fit the model

# Predictions
pred = lr_model.predict(X)
print('prdict=', pred[:5])  # predictions    = [0 0 0 1 0]
print('y정답=', y[:5])       # observed labels = [0 0 0 0 0]

# Model evaluation: score = classification accuracy
score = lr_model.score(X, y)
print(score)  # 0.9595782073813708

# Confusion matrix as a cross-tabulation
tab = pd.crosstab(y, pred)  # crosstab(rows: observed, columns: predicted)
print(tab)
"""
col_0 0 1
row_0
0 198 14
1 9 348
"""
# Accuracy = correctly classified (diagonal of the table) / all samples.
# Read the counts from the table rather than hard-coding one run's numbers.
correct = tab.iloc[0, 0] + tab.iloc[1, 1]
acc = correct / len(y)
print('accuracy=', acc)  # accuracy= 0.9595782073813708

#################################
## 2. load_iris : multi-class classification
#################################
# 1. Load the data
X, y = load_iris(return_X_y=True)

# 2. Build the model
# NOTE(review): the multi_class argument may be deprecated in newer
# scikit-learn releases - confirm against the installed version.
lr_model2 = LogisticRegression(random_state=123,
                               solver='lbfgs',
                               multi_class="multinomial")
lr_model2.fit(X, y)
print(lr_model2)  # model information
"""
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='multinomial',
n_jobs=1, penalty='l2', random_state=123, solver='lbfgs',
tol=0.0001, verbose=0, warm_start=False)
"""

# Predictions
pred = lr_model2.predict(X)  # predictions
Y = y                        # observed labels
score = lr_model2.score(X, y)
print('accuracy=', score)  # accuracy= 0.9733333333333334

tab = pd.crosstab(Y, pred)
print(tab)
"""
col_0 0 1 2
row_0
0 50 0 0
1 0 47 3
2 0 1 49
"""
print(type(tab))  # <class 'pandas.core.frame.DataFrame'>

# Sum the diagonal by position with .iloc; the old .ix indexer no longer
# exists in current pandas releases.
acc = sum(tab.iloc[k, k] for k in range(min(tab.shape))) / len(y)
print('accuracy=', acc)  # accuracy= 0.9733333333333334

# Visualise the classification accuracy
import seaborn as sn  # heatmap - Accuracy Score

# Confusion matrix heatmap
plt.figure(figsize=(6, 6))  # chart size
# Optional: cmap='Blues_r' selects the colour map.
sn.heatmap(tab, annot=True, fmt=".3f", linewidths=.5, square=True)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
plt.title(all_sample_title, size=18)
plt.show()
03.Regression的更多相关文章
- 线性回归 Linear Regression
成本函数(cost function)也叫损失函数(loss function),用来定义模型与观测值的误差.模型预测的价格与训练集数据的差异称为残差(residuals)或训练误差(test err ...
- Multivariance Linear Regression练习
%% 方法一:梯度下降法 x = load('E:\workstation\data\ex3x.dat'); y = load('E:\workstation\data\ex3y.dat'); x = ...
- SparkMLlib之 logistic regression源码分析
最近在研究机器学习,使用的工具是spark,本文是针对spark最新的源码Spark1.6.0的MLlib中的logistic regression, linear regression进行源码分析,其 ...
- Linear regression with multiple variables(多特征的线型回归)算法实例_梯度下降解法(Gradient DesentMulti)以及正规方程解法(Normal Equation)
,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, ,, , ...
- PRML读书会第三章 Linear Models for Regression(线性基函数模型、正则化方法、贝叶斯线性回归等)
主讲人 planktonli planktonli(1027753147) 18:58:12 大家好,我负责给大家讲讲 PRML的第3讲 linear regression的内容,请大家多多指教,群 ...
- Logistic Regression and Gradient Descent
Logistic Regression and Gradient Descent Logistic regression is an excellent tool to know for classi ...
- 机器学习之多变量线性回归(Linear Regression with multiple variables)
1. Multiple features(多维特征) 在机器学习之单变量线性回归(Linear Regression with One Variable)我们提到过的线性回归中,我们只有一个单一特征量 ...
- 转载 Deep learning:三(Multivariance Linear Regression练习)
前言: 本文主要是来练习多变量线性回归问题(其实本文也就3个变量),参考资料见网页:http://openclassroom.stanford.edu/MainFolder/DocumentPage. ...
- 机器学习笔记-1 Linear Regression with Multiple Variables(week 2)
1. Multiple Features note:X0 is equal to 1 2. Feature Scaling Idea: make sure features are on a simi ...
随机推荐
- python之函数初识
一.函数的初识1.如何定义函数:def 函数名(): 代码... 例如:def my_len(): l = [1, 2, 3, 4] count = 0 for i in l: count += 1 ...
- Go语言中的Iota
一.复习常量 提到Iota这个关键字,就必须要复习一下Go语言的常量. 1.Go语言的常量一般使用const声明 2.Go语言的常量只能是布尔型.数字型(整数型.浮点型和复数)和字符串型 3.Go语言 ...
- [十二省联考2019]字符串问题——后缀自动机+parent树优化建图+拓扑序DP+倍增
题目链接: [十二省联考2019]字符串问题 首先考虑最暴力的做法就是对于每个$B$串存一下它是哪些$A$串的前缀,然后按每组支配关系连边,做一遍拓扑序DP即可. 但即使忽略判断前缀的时间,光是连边的 ...
- nowcoder16450 托米的简单表示法
题目链接 思路 仔细理解一下题意可以发现. 对于每个完整的括号序列都是独立的,然后就想到分治.高度是序列中所有括号序列的最大值,宽度是所有括号序列宽度和\(+1\). 然后仔细想了一下,这种分治应该是 ...
- Tomcat 部署java web项目直接ip地址访问项目
正常情况下,在访问在Tomcat中部署的项目是 http://localhost:8080/demo 方式 其中,IP,端口,项目名(Demo)都是必须的. 那么,怎么样才能通过 http://loc ...
- 来了解质量管理工具——质量屋(HOQ)
质量屋(The House Of Quality),又名HOQ,它是质量功能配置(QFD)的核心.一般QFD的学习会涉及到.同时HOQ也是项目管理十大知识领域领域中质量管理工具中的一种,今天我们就来了 ...
- kafka 基础知识梳理及集群环境部署记录
一.kafka基础介绍 Kafka是最初由Linkedin公司开发,是一个分布式.支持分区的(partition).多副本的(replica),基于zookeeper协调的分布式消息系统,它的最大的特 ...
- 20165232 实现pwd
20165232 实现mypwd 题目要求 学习pwd命令 研究pwd实现需要的系统调用(man -k; grep),写出伪代码 实现mypwd 测试mypwd 学习pwd命令 用man pwd 查看 ...
- 【Sql Server】SQL SERVER 收缩日志
事务日志记录着在相关数据库上的操作,同时还存储数据库恢复(recovery)的相关信息. 收缩日志的原因有很多种,有些是考虑空间不足,有些则是应用程序限制导致的. 下面介绍的是在简单模式下,进行收缩操 ...
- java matlab 混合编程 Failed to find the required library mclmcrrt9_2.dll on java.library.path.
问题描述: Exception in thread "main" java.lang.UnsatisfiedLinkError: Failed to find the requir ...