Ensemble
RM
# -*- coding: utf-8 -*-
"""
RandomForestClassifier 예
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_wine
from sklearn import metrics #model 평가 도구
# 1. Load the wine dataset and split it into features and labels.
wine = load_wine()
wine_x, wine_y = wine.data, wine.target  # labels fall into 3 categories
# Peek at the first five feature rows.
print(wine_x[:5])
"""
[[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]
[1.320e+01 1.780e+00 2.140e+00 1.120e+01 1.000e+02 2.650e+00 2.760e+00
2.600e-01 1.280e+00 4.380e+00 1.050e+00 3.400e+00 1.050e+03]
[1.316e+01 2.360e+00 2.670e+00 1.860e+01 1.010e+02 2.800e+00 3.240e+00
3.000e-01 2.810e+00 5.680e+00 1.030e+00 3.170e+00 1.185e+03]
[1.437e+01 1.950e+00 2.500e+00 1.680e+01 1.130e+02 3.850e+00 3.490e+00
2.400e-01 2.180e+00 7.800e+00 8.600e-01 3.450e+00 1.480e+03]
[1.324e+01 2.590e+00 2.870e+00 2.100e+01 1.180e+02 2.800e+00 2.690e+00
3.900e-01 1.820e+00 4.320e+00 1.040e+00 2.930e+00 7.350e+02]]
"""
# Spot-check the labels near both ends of the dataset.
# Sample run: [0 0 0 0 0] followed by [2 2 2 2 2].
for label_slice in (slice(0, 5), slice(170, 175)):
    print(wine_y[label_slice])
# 2. Hold out 30% of the samples for testing (unseeded, so the split varies per run).
X_train, X_test, y_train, y_test = train_test_split(
    wine_x, wine_y, test_size=0.3)
# 3. Build a random forest with default settings and fit it on the training part.
obj = RandomForestClassifier()
obj.fit(X_train, y_train)
model = obj  # fit() returns the estimator itself
print(model)
"""
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
"""
# 4. Evaluate the fitted model on the held-out split.
Y = y_test
pred = model.predict(X_test)
# Overall accuracy first, then the per-class precision/recall report.
acc = metrics.accuracy_score(y_true=Y, y_pred=pred)
print(acc)  # sample run: 0.9629629629629629
report = metrics.classification_report(y_true=Y, y_pred=pred)
print(report)
"""
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.90 1.00 0.95 18
2 1.00 0.91 0.95 22
avg / total 0.97 0.96 0.96 54
"""
##############################################
# RF model Tuning
##############################################
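"""
n_estimators=10     : number of trees (10 is the default shown above); 400~500 usually works best
min_samples_split=2 : minimum number of samples a node must hold before it may be split
max_features        : number of variables tried at each split (about sqrt(n) of the n variables)
"""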
# 2. Draw a fresh 70/30 train/test split for the tuned model.
X_train, X_test, y_train, y_test = train_test_split(
    wine_x, wine_y, test_size=0.3)
print(wine_x.shape)  # (178, 13)
print(np.sqrt(13))  # 3.605551275463989 (rounded up to 4 in the original notes)
# 3. Build the tuned forest: 400 trees and min_samples_split=3.
tuning = {"n_estimators": 400, "min_samples_split": 3}
obj2 = RandomForestClassifier(**tuning)
obj2.fit(X_train, y_train)
model2 = obj2  # fit() returns the estimator itself
print(model2)
"""
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=4,
min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
oob_score=False, random_state=None, verbose=0,
warm_start=False)
"""
# 4. Evaluate the tuned model on the current held-out split.
# Predict with model2 (the tuned forest). The untuned `model` was fitted on an
# earlier, different split, so scoring it here would evaluate the wrong model
# and could include samples it had already seen during training.
pred2 = model2.predict(X_test)
Y = y_test
# Overall accuracy, then the per-class precision/recall report.
acc = metrics.accuracy_score(Y, pred2)
print(acc)
report = metrics.classification_report(Y, pred2)
print(report)
"""
precision recall f1-score support
0 1.00 1.00 1.00 14
1 0.95 1.00 0.97 18
2 1.00 0.95 0.98 22
avg / total 0.98 0.98 0.98 54
"""
cross_validation
# -*- coding: utf-8 -*-
"""
교차 검정예
"""
import pandas as pd
from sklearn.model_selection import cross_validate # 교차검정
from sklearn.ensemble import RandomForestClassifier # RM
# 1. Load the iris CSV and print its structure.
iris = pd.read_csv("../data/iris.csv")
print(iris.info())
"""
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
Sepal.Length 150 non-null float64
Sepal.Width 150 non-null float64
Petal.Length 150 non-null float64
Petal.Width 150 non-null float64
Species 150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.9+ KB
None
"""
# The first four columns are the features; the last column is the label.
cols = iris.columns.tolist()
feature_cols, label_col = cols[:4], cols[-1]
x_data = iris[feature_cols]
y_data = iris[label_col]
# 2. Create the model.
# NOTE: cross_validate works on fresh clones of the estimator that it refits
# per fold, so this fit on the full data does not influence the scores below.
obj = RandomForestClassifier()
model = obj.fit(x_data, y_data)
# 3. 5-fold cross-validation.
# return_train_score=True keeps the 'train_score' entry in the result;
# recent scikit-learn releases leave it out by default.
score = cross_validate(model, x_data, y_data, cv=5, return_train_score=True)
print(score)
"""
{'fit_time': array([0.01000023, 0.01000023, 0.00900006, 0.00999999, 0.01000023]),
'score_time': array([0.00099993, 0.00099993, 0.00099993, 0.00100017, 0.00099993]),
'test_score': array([0.96666667, 0.96666667, 0.9 , 0.93333333, 1. ]),
'train_score': array([1., 1., 1., 1., 1.])}
"""
# Pull out the per-fold test scores and report their mean.
import numpy as np
fold_scores = score['test_score']
test_score = list(fold_scores)
print(test_score)  # sample run: [0.966, 0.966, 0.933, 0.9, 1.0]
score_arr = np.array(test_score)
mean_score = score_arr.mean()
print(mean_score)  # sample run: 0.9533333333333334
RM_regression
# -*- coding: utf-8 -*-
"""
RandomForestRegressor 예
"""
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston # data set
from sklearn import metrics # model 평가 도구
# 1. Load the Boston housing dataset.
# NOTE(review): load_boston has been dropped from recent scikit-learn
# releases - confirm the installed version still provides it.
boston = load_boston()
boston_x, boston_y = boston.data, boston.target  # target is continuous
# Show the dimensions of the features and the target.
for arr in (boston_x, boston_y):
    print(arr.shape)  # (506, 13) then (506,)
# 2. Seeded 70/30 train/test split.
split_options = {"test_size": 0.3, "random_state": 123}
x_train, x_test, y_train, y_test = train_test_split(
    boston_x, boston_y, **split_options)
# 3. Build a seeded random forest regressor and fit it on the training part.
obj = RandomForestRegressor(random_state=234)
obj.fit(x_train, y_train)
model = obj  # fit() returns the estimator itself
print(model)
"""
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
oob_score=False, random_state=234, verbose=0, warm_start=False)
"""
# 4. Evaluate the regressor on the held-out split.
y_pred = model.predict(x_test)
y_real_value = y_test
# Score the predictions computed above instead of discarding them.
# For a regressor, model.score(x_test, y_test) reports the same R^2 value
# but has to run predict() a second time to get it.
model_score = metrics.r2_score(y_real_value, y_pred)
print(model_score)
# sample run: 0.7998066141697237
xgboost_test
# -*- coding: utf-8 -*-
"""
xgboot분류분석
"""
import pandas as pd
from xgboost import XGBClassifier #model
from xgboost import plot_importance #중요변수 시각화
from xgboost import plot_tree# tree 시각화
from sklearn.model_selection import train_test_split
# 1. Load the iris CSV: the first four columns are features, the last is the label.
iris = pd.read_csv("../data/iris.csv")
cols = iris.columns.tolist()
feature_cols, label_col = cols[:4], cols[-1]
iris_x = iris[feature_cols]
iris_y = iris[label_col]
# 2. Seeded 70/30 train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    iris_x, iris_y, test_size=0.3, random_state=123)
# 3. Fit an XGBoost classifier with default settings.
# NOTE(review): the label column is passed to fit() as loaded from the CSV;
# newer xgboost releases may require integer-encoded labels - confirm
# against the installed version.
obj = XGBClassifier()
obj.fit(x_train, y_train)
model = obj  # fit() returns the estimator itself
print(model)
"""
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=True, subsample=1)
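• objective = "binary:logistic" : use when the y variable is binary
• max_depth = 2 : 2 is enough when the tree structure is simple
• nthread = 2 : number of CPU threads to use
• nrounds = 2 : number of training iterations used to shrink the gap between actual and predicted values
• eta = 1 : variable that controls the learning rate (Default: 0.3); guards against overfitting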
"""
# 4. Predict on the held-out rows; keep the true labels as Y for scoring later.
Y = y_test
y_pred = model.predict(x_test)
print(y_pred)
"""
['versicolor' 'virginica' 'virginica' 'versicolor' 'setosa' 'versicolor'
'versicolor' 'setosa' 'setosa' 'versicolor' 'virginica' 'setosa'
'versicolor' 'virginica' 'virginica' 'virginica' 'setosa' 'setosa'
'versicolor' 'setosa' 'setosa' 'versicolor' 'setosa' 'virginica' 'setosa'
'setosa' 'setosa' 'virginica' 'virginica' 'setosa' 'virginica'
'versicolor' 'setosa' 'setosa' 'versicolor' 'versicolor' 'virginica'
'setosa' 'setosa' 'versicolor' 'versicolor' 'setosa' 'virginica'
'virginica' 'virginica']
"""
# Plot the feature importances of the fitted model.
import matplotlib.pyplot as plt
plot_importance(model)
plt.show()
# Read the per-feature importance scores from the underlying booster.
booster = model.get_booster()
score = booster.get_fscore()
print('x 중요변수=', score)
# sample run: {'Petal.Length': 255, 'Petal.Width': 135, 'Sepal.Width': 64, 'Sepal.Length': 118}
# Model evaluation
from sklearn import metrics
# accuracy_score takes (y_true, y_pred): pass the true labels first, the same
# order used for classification_report below.
acc = metrics.accuracy_score(Y, y_pred)
print("acc=", acc)  # sample run: acc= 0.9333333333333333
report = metrics.classification_report(Y, y_pred)
print(report)
"""
precision recall f1-score support
setosa 1.00 1.00 1.00 18
versicolor 0.77 1.00 0.87 10
virginica 1.00 0.82 0.90 17
avg / total 0.95 0.93 0.93
"""
# Visualize a tree from the fitted model.
plot_tree(model)
plt.show()
xgboost_regression
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 24 15:18:35 2019
@author: 502-03
"""
import pandas as pd
from xgboost import XGBRegressor #model (회귀모델)
from xgboost import plot_importance #중요변수 시각화
from xgboost import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_boston# dataset
import matplotlib.pyplot as plt
# 1. Load the Boston housing dataset.
# NOTE(review): load_boston has been dropped from recent scikit-learn
# releases - confirm the installed version still provides it.
boston = load_boston()
x, y = boston.data, boston.target
print(x.shape)  # (506, 13)
print(y.shape)  # (506,)
# 2. Seeded 70/30 train/test split.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=123)
# 3. Build the regressor with 400 trees of depth up to 6 and fit it.
obj = XGBRegressor(n_estimators=400, max_depth=6)
obj.fit(x_train, y_train)
model = obj  # fit() returns the estimator itself
print(model)
"""
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=True, subsample=1)
"""
# Per-feature importance scores from the underlying booster.
booster = model.get_booster()
score = booster.get_fscore()
print(score)
"""
{'f5': 83, 'f12': 78, 'f0': 91, 'f4': 42, 'f7': 110, 'f10': 32,
'f6': 46, 'f9': 38, 'f3': 1, 'f8': 16, 'f11': 51, 'f1': 2, 'f2': 15}
"""
# Importance chart first, then a tree plot; each gets its own figure window.
plot_importance(model)
plt.show()
plot_tree(model)
plt.show()
# Print the dataset's column names for reading the f0..f12 keys above.
print(boston.feature_names)
"""
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD'
'TAX' 'PTRATIO' 'B' 'LSTAT']
"""
xgboost_freeze
# -*- coding: utf-8 -*-
"""
수도사업소 주관 - big 콘테스트 dataset
동파유무(0 or 1) 분류하는 위한 dataset
"""
import pandas as pd
from xgboost import XGBClassifier # model(분류모델)
from xgboost import plot_importance # 중요변수 시각화
from sklearn.datasets import load_boston # dataset
from sklearn.model_selection import train_test_split
from matplotlib import font_manager, rc
# Register the Malgun Gothic font file as the plotting font family.
# NOTE(review): the path is Windows-specific - confirm before running elsewhere.
font_props = font_manager.FontProperties(fname="c:/Windows/Fonts/malgun.ttf")
font_name = font_props.get_name()
rc('font', family=font_name)
import matplotlib.pyplot as plt # 중요변수 시각화
# Load the freeze dataset (MS949-encoded CSV) and inspect it.
freeze = pd.read_csv("../data/freeze_dataset.csv", encoding="MS949")
print(freeze.info())
'''
RangeIndex: 37089 entries, 0 to 37088
Data columns (total 95 columns):
'''
print(freeze.head())
# Rename columns: replace spaces with underscores.
renamed_columns = freeze.columns.str.replace(' ', '_')
freeze.columns = renamed_columns
print(freeze.info())
# Class counts of the target column (0 or 1).
target_counts = freeze['동파유무'].value_counts()
print(target_counts)
'''
0.0 34130 : 90%
1.0 2959 : 10%
'''
# The first column is the target; every remaining column is a feature.
cols = list(freeze.columns)  # 95 columns
y_cols, *x_cols = cols
print(y_cols)  # the target column name
# 60/40 train/test split (unseeded); only the training part is used below.
train_set, test_set = train_test_split(freeze, test_size=0.4)
# Fit an XGBoost classifier with default settings on the training rows.
obj = XGBClassifier()
obj.fit(train_set[x_cols], train_set[y_cols])
model = obj  # fit() returns the estimator itself
# Importance scores: print them, then plot them.
booster = model.get_booster()
score = booster.get_fscore()
print(score)
plot_importance(model)
plt.show()
Ensemble的更多相关文章
- 推荐系统resys小组线下活动见闻2009-08-22
http://www.tuicool.com/articles/vUvQVn 时间2009-08-30 15:13:22 不周山原文 http://www.wentrue.net/blog/?p= ...
- 自适应注意力机制在Image Caption中的应用
在碎片化阅读充斥眼球的时代,越来越少的人会去关注每篇论文背后的探索和思考. 在这个栏目里,你会快速 get 每篇精选论文的亮点和痛点,时刻紧跟 AI 前沿成果. 点击本文底部的「阅读原文」即刻加入社区 ...
- 我的第一次面试 —— 腾讯 AI安全 一面总结
前言 在校两年半,没经历过面试的毒打,第一次面试给了腾讯,周二晚上学长帮推的简历周三下午就打电话来问周四晚上有没有空面试.那天下午还在赶着数据库的实验报告,脑子有点转不过来就说了有空,然后仔细一看好像 ...
随机推荐
- (十六)qt-udp,组播
基本流程 简单聊天程序 #include "server.h" #include <QApplication> #include "client.h" ...
- DNS Tunnel隧道隐蔽通信实验 && 尝试复现特征向量化思维方式检测
1. DNS隧道简介 DNS隧道技术是指利用 DNS协议建立隐蔽信 道,实现隐蔽数据传输.最早是在2004年 DanKaminsky 在 Defcon大会上发布的基于 NSTX 的 DNS隐蔽 隧道工 ...
- Intellij IDEA 4种配置热部署的方法【转】【补】
热加载 热加载可以使代码修改后无须重启服务器,就可以加载更改的代码.(其实分java和非java代码,本处可以让java代码立即生效且不重启服务) 第1种:修改服务器配置,使得IDEA窗口失去焦点时, ...
- [Reinforcement Learning] Model-Free Prediction
上篇文章介绍了 Model-based 的通用方法--动态规划,本文内容介绍 Model-Free 情况下 Prediction 问题,即 "Estimate the value funct ...
- iview-admin安装
桌面创建project文件夹. 文件夹内右键选择gitbash here,输入git init.文件夹内会生成.git文件夹. 再输入git config --global user.name &qu ...
- python-类内置属性和内置方法
class A(): ''' 这是一个类 ''' banji=1 def __init__(self,name,age): self.name=name self.age=age def AA(sel ...
- box-shaw四边阴影详解
四边设置: /*设置四边不同颜色外阴影*/ .element{ box-shadow:-10px 0 10px red, /*左边阴影*/ 10px 0 10px yellow, /*右边阴影*/ 0 ...
- golang _下划线占位符代替需要释放的资源的问题
golang中_有两种作用,一种用在import中,比如这样 import _ "github.com/go-sql-driver/mysql" 表示并不需要导入整个包,只是执行这 ...
- Mail.Ru Cup 2018 Round 3 B. Divide Candies
题目链接 分析一下题意可以得到题目要求的是满足下面这个公式的不同的 $i, j$ 的方案数; 即 $(i^2 + j^2) \bmod m = 0$ (n ≤ ...
- (四)ORBSLAM运动估计
ORBSLAM2的运动估计简介 ORBSLAM2中的运动估计核心方法就是3D-2D的PNP,而在跟踪过程主要分为三种类型: 无运动模型的跟踪,即基于参考帧的跟踪: 基于匀速运动模型的跟踪: 重定位: ...