RM

 # -*- coding: utf-8 -*-
 """
 RandomForestClassifier 예

 """
 import pandas as pd
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_wine
 from sklearn import metrics #model 평가 도구

 # 1. Load the wine dataset bundled with scikit-learn.
 wine = load_wine()
 # Feature matrix and class labels (three classes per the original note).
 wine_x, wine_y = wine.data, wine.target

 # Peek at the data: the first five feature rows ...
 print(wine_x[:5])
 # Recorded output from the author's run:
 # [[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
 #   2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]
 #  [1.320e+01 1.780e+00 2.140e+00 1.120e+01 1.000e+02 2.650e+00 2.760e+00
 #   2.600e-01 1.280e+00 4.380e+00 1.050e+00 3.400e+00 1.050e+03]
 #  [1.316e+01 2.360e+00 2.670e+00 1.860e+01 1.010e+02 2.800e+00 3.240e+00
 #   3.000e-01 2.810e+00 5.680e+00 1.030e+00 3.170e+00 1.185e+03]
 #  [1.437e+01 1.950e+00 2.500e+00 1.680e+01 1.130e+02 3.850e+00 3.490e+00
 #   2.400e-01 2.180e+00 7.800e+00 8.600e-01 3.450e+00 1.480e+03]
 #  [1.324e+01 2.590e+00 2.870e+00 2.100e+01 1.180e+02 2.800e+00 2.690e+00
 #   3.900e-01 1.820e+00 4.320e+00 1.040e+00 2.930e+00 7.350e+02]]

 # ... and a few labels from the start and the end of the target vector.
 print(wine_y[:5])       # recorded: [0 0 0 0 0]
 print(wine_y[170:175])  # recorded: [2 2 2 2 2]

 # 2. Hold out 30% of the rows for testing.
 #    No random_state is passed, so the split differs on every run.
 X_train, X_test, y_train, y_test = train_test_split(
     wine_x, wine_y, test_size=0.3)

 # 3. Fit a random forest with default hyper-parameters.
 obj = RandomForestClassifier()
 model = obj.fit(X_train, y_train)
 print(model)
 # Recorded repr from the author's run:
 # RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
 #             max_depth=None, max_features='auto', max_leaf_nodes=None,
 #             min_impurity_decrease=0.0, min_impurity_split=None,
 #             min_samples_leaf=1, min_samples_split=2,
 #             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
 #             oob_score=False, random_state=None, verbose=0,
 #             warm_start=False)

 # 4. Evaluate on the held-out rows.
 Y = y_test                     # ground-truth labels
 pred = model.predict(X_test)   # predicted labels

 # Overall accuracy.
 acc = metrics.accuracy_score(Y, pred)
 print(acc)  # recorded: 0.9629629629629629

 # Per-class precision / recall / f1.
 report = metrics.classification_report(Y, pred)
 print(report)
 # Recorded report from the author's run:
 #              precision    recall  f1-score   support
 #
 #           0       1.00      1.00      1.00        14
 #           1       0.90      1.00      0.95        18
 #           2       1.00      0.91      0.95        22
 #
 # avg / total       0.97      0.96      0.96        54

 ##############################################
 # RF model tuning
 ##############################################
 # Author's tuning notes (translated):
 #   n_estimators (10 in the recorded default repr): number of trees;
 #       400~500 was noted as working best.
 #   min_samples_split (2 in the recorded default repr): noted as
 #       "number of variables, sqrt(n)".
 # NOTE(review): in scikit-learn the number of features tried per split is
 # `max_features`; `min_samples_split` is the minimum number of samples a
 # node needs before it may be split. The sqrt(n_features) rule of thumb
 # therefore looks like it belongs to `max_features` -- confirm the intent.

 # 2. train/test: draw a fresh random 70/30 split (no random_state, so the
 #    rows differ from the split used for the baseline model).
 X_train, X_test, y_train, y_test = train_test_split(
     wine_x, wine_y, test_size=0.3)
 print(wine_x.shape)  # recorded: (178, 13)
 print(np.sqrt(13))   # recorded: 3.605551275463989 (rounded up to 4 in the notes)

 # 3. Build the tuned random forest.
 obj2 = RandomForestClassifier(n_estimators=400,
                               min_samples_split=3)
 model2 = obj2.fit(X_train, y_train)
 print(model2)
 # Recorded repr from the author's run.
 # NOTE(review): it shows min_samples_split=4 while the code above passes 3,
 # so the recording appears to come from a different revision.
 # RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
 #             max_depth=None, max_features='auto', max_leaf_nodes=None,
 #             min_impurity_decrease=0.0, min_impurity_split=None,
 #             min_samples_leaf=1, min_samples_split=4,
 #             min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
 #             oob_score=False, random_state=None, verbose=0,
 #             warm_start=False)

 # 4. Evaluate the tuned model.
 # Fix: the original predicted with `model` (the baseline forest fitted on
 # the earlier split), so the tuned model was never evaluated and the score
 # was inflated by test rows the baseline had already seen during training.
 pred2 = model2.predict(X_test)
 Y = y_test

 # Overall accuracy.
 acc = metrics.accuracy_score(Y, pred2)
 print(acc)  # recorded (before the fix): 0.9814814814814815

 # Per-class precision / recall / f1.
 report = metrics.classification_report(Y, pred2)
 print(report)
 # Recorded report (before the fix):
 #              precision    recall  f1-score   support
 #           0       1.00      1.00      1.00        14
 #           1       0.95      1.00      0.97        18
 #           2       1.00      0.95      0.98        22
 # avg / total       0.98      0.98      0.98        54

cross_validation

 # -*- coding: utf-8 -*-
 """
 교차 검정예
 """
 import pandas as pd
 from sklearn.model_selection import cross_validate # 교차검정
 from sklearn.ensemble import RandomForestClassifier # RM

 # 1. Read the iris CSV and show its schema.
 iris = pd.read_csv("../data/iris.csv")
 print(iris.info())
 # Recorded output from the author's run:
 # <class 'pandas.core.frame.DataFrame'>
 # RangeIndex: 150 entries, 0 to 149
 # Data columns (total 5 columns):
 # Sepal.Length    150 non-null float64
 # Sepal.Width     150 non-null float64
 # Petal.Length    150 non-null float64
 # Petal.Width     150 non-null float64
 # Species         150 non-null object
 # dtypes: float64(4), object(1)
 # memory usage: 5.9+ KB
 # None

 # First four columns are the predictors, the last column is the label.
 cols = list(iris.columns)
 x_data = iris[cols[:4]]
 y_data = iris[cols[-1]]

 # 2. Build the model on the full data set.
 # NOTE(review): cross_validate is documented to refit clones of the
 # estimator per fold, so this pre-fit likely does not affect the scores
 # below -- confirm before relying on it.
 obj = RandomForestClassifier()
 model = obj.fit(x_data, y_data)

 # 3. 5-fold cross validation (cv=5).
 score = cross_validate(model, x_data, y_data, cv=5)
 print(score)
 # Recorded output from the author's run:
 # {'fit_time': array([0.01000023, 0.01000023, 0.00900006, 0.00999999, 0.01000023]),
 # 'score_time': array([0.00099993, 0.00099993, 0.00099993, 0.00100017, 0.00099993]),
 # 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ]),
 # 'train_score': array([1., 1., 1., 1., 1.])}

 # Per-fold test accuracy as a plain list.
 test_score = list(score['test_score'])
 print(test_score)  # recorded: [0.966, 0.966, 0.933, 0.9, 1.0]

 # Mean accuracy across the five folds.
 import numpy as np
 score_arr = np.asarray(test_score)
 print(np.mean(score_arr))  # recorded: 0.9533333333333334

RM_regression

 # -*- coding: utf-8 -*-
 """
 RandomForestRegressor 예
 """

 import pandas as pd
 import numpy as np

 from sklearn.ensemble import RandomForestRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_boston # data set
 from sklearn import metrics # model 평가 도구

 # 1. Load the Boston housing data.
 # NOTE(review): load_boston is reported as removed in scikit-learn 1.2;
 # this script presumably targets an older release -- confirm the version.
 boston = load_boston()
 boston_x, boston_y = boston.data, boston.target  # target is continuous

 # Show the dimensions of the features and the target.
 for arr in (boston_x, boston_y):
     print(arr.shape)  # recorded: (506, 13) then (506,)

 # 2. Reproducible 70/30 train/test split.
 x_train, x_test, y_train, y_test = train_test_split(
     boston_x,
     boston_y,
     test_size=0.3,
     random_state=123,
 )

 # 3. Fit a random-forest regressor with a fixed seed.
 obj = RandomForestRegressor(random_state=234)
 model = obj.fit(x_train, y_train)
 print(model)
 # Recorded repr from the author's run:
 # RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
 #            max_features='auto', max_leaf_nodes=None,
 #            min_impurity_decrease=0.0, min_impurity_split=None,
 #            min_samples_leaf=1, min_samples_split=2,
 #            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
 #            oob_score=False, random_state=234, verbose=0, warm_start=False)

 # 4. Evaluate on the held-out rows.
 y_real_value = y_test            # observed targets (not used further below)
 y_pred = model.predict(x_test)   # predicted targets (not used further below)

 # Score reported by the estimator's own score() method.
 model_score = model.score(x_test, y_test)
 print(model_score)
 # recorded: 0.7998066141697237

xgboost_test

 # -*- coding: utf-8 -*-
 """
 XGBoost classification analysis
 """
 import pandas as pd
 from xgboost import XGBClassifier #model
 from xgboost import plot_importance #중요변수 시각화
 from xgboost import plot_tree# tree 시각화
 from sklearn.model_selection import train_test_split

 # 1. Read the iris CSV.
 iris = pd.read_csv("../data/iris.csv")

 # First four columns are the predictors, the last column is the label.
 cols = list(iris.columns)
 iris_x = iris[cols[:4]]
 iris_y = iris[cols[-1]]

 # 2. Reproducible 70/30 train/test split.
 x_train, x_test, y_train, y_test = train_test_split(
     iris_x,
     iris_y,
     test_size=0.3,
     random_state=123,
 )

 # 3. Fit an XGBoost classifier with default hyper-parameters.
 obj = XGBClassifier()
 model = obj.fit(x_train, y_train)
 print(model)
 # Recorded repr from the author's run:
 # XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
 #        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
 #        max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
 #        n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
 #        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
 #        silent=True, subsample=1)
 #
 # Author's parameter notes (translated):
 #   objective = "binary:logistic" : the y variable is binary
 #   max_depth = 2 : for a simple tree structure, 2
 #   nthread = 2   : number of CPUs used, 2
 #   nrounds = 2   : number of learning iterations used to shrink the gap
 #                   between actual and predicted values
 #   eta = 1       : controls the learning rate (Default: 0.3); guards
 #                   against over-fitting

 # 4. Predict the held-out rows and keep the ground truth for scoring.
 Y = y_test
 y_pred = model.predict(x_test)
 print(y_pred)
 # Recorded predictions from the author's run:
 # ['versicolor' 'virginica' 'virginica' 'versicolor' 'setosa' 'versicolor'
 #  'versicolor' 'setosa' 'setosa' 'versicolor' 'virginica' 'setosa'
 #  'versicolor' 'virginica' 'virginica' 'virginica' 'setosa' 'setosa'
 #  'versicolor' 'setosa' 'setosa' 'versicolor' 'setosa' 'virginica' 'setosa'
 #  'setosa' 'setosa' 'virginica' 'virginica' 'setosa' 'virginica'
 #  'versicolor' 'setosa' 'setosa' 'versicolor' 'versicolor' 'virginica'
 #  'setosa' 'setosa' 'versicolor' 'versicolor' 'setosa' 'virginica'
 #  'virginica' 'virginica']

 # Plot the importance of each predictor.
 import matplotlib.pyplot as plt
 plot_importance(model)
 plt.show()

 # Inspect the per-feature f-score reported by the underlying booster.
 score = model.get_booster().get_fscore()
 print('x 중요변수=', score)
 # recorded: x 중요변수= {'Petal.Length': 255, 'Petal.Width': 135,
 #                       'Sepal.Width': 64, 'Sepal.Length': 118}

 # Model evaluation.
 from sklearn import metrics
 # Fix: pass the arguments as (y_true, y_pred), matching the
 # classification_report call below; the original passed them reversed.
 # Accuracy is the same either way, so the recorded value still applies.
 acc = metrics.accuracy_score(Y, y_pred)
 print("acc=", acc)  # recorded: acc= 0.9333333333333333

 # Per-class precision / recall / f1.
 report = metrics.classification_report(Y, y_pred)
 print(report)
 # Recorded report from the author's run:
 #              precision    recall  f1-score   support
 #      setosa       1.00      1.00      1.00        18
 #  versicolor       0.77      1.00      0.87        10
 #   virginica       1.00      0.82      0.90        17
 # avg / total       0.95      0.93      0.93

 # Draw a tree of the fitted model.
 plot_tree(model)
 plt.show()

xgboost_regression

 # -*- coding: utf-8 -*-
 """
 Created on Sun Feb 24 15:18:35 2019

 @author: 502-03
 """

 import pandas as pd
 from xgboost import XGBRegressor #model (회귀모델)
 from xgboost import plot_importance #중요변수 시각화
 from xgboost import plot_tree
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_boston# dataset
 import matplotlib.pyplot as plt

 # 1. Load the Boston housing data.
 # NOTE(review): load_boston is reported as removed in scikit-learn 1.2;
 # this script presumably targets an older release -- confirm the version.
 boston = load_boston()
 x, y = boston.data, boston.target

 # Show the dimensions of the features and the target.
 for arr in (x, y):
     print(arr.shape)  # recorded: (506, 13) then (506,)

 # 2. Reproducible 70/30 train/test split.
 x_train, x_test, y_train, y_test = train_test_split(
     x,
     y,
     test_size=0.3,
     random_state=123,
 )

 # 3. Fit an XGBoost regressor with 400 trees of depth up to 6.
 obj = XGBRegressor(n_estimators=400, max_depth=6)
 model = obj.fit(x_train, y_train)
 print(model)
 # Recorded repr from the author's run.
 # NOTE(review): it shows max_depth=3 and n_estimators=100, which does not
 # match the arguments passed above, so the recording looks stale.
 # XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
 #        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
 #        max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
 #        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
 #        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
 #        silent=True, subsample=1)

 # Per-feature f-score reported by the underlying booster.
 score = model.get_booster().get_fscore()
 print(score)
 # Recorded output from the author's run (keys are f0..f12):
 # {'f5': 83, 'f12': 78, 'f0': 91, 'f4': 42, 'f7': 110, 'f10': 32,
 # 'f6': 46, 'f9': 38, 'f3': 1, 'f8': 16, 'f11': 51, 'f1': 2, 'f2': 15}

 # Importance chart.
 plot_importance(model)
 plt.show()

 # Tree diagram.
 plot_tree(model)
 plt.show()

 # Print the dataset's column names for reading the f0..f12 keys above.
 print(boston.feature_names)
 # Recorded output from the author's run:
 # ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD'
 # 'TAX' 'PTRATIO' 'B' 'LSTAT']

xgboost_freeze

 # -*- coding: utf-8 -*-
 """
 수도사업소 주관 - big 콘테스트 dataset
 동파유무(0 or  1) 분류하는 위한 dataset
 """

 import pandas as pd
 from xgboost import XGBClassifier # model(분류모델)
 from xgboost import plot_importance # 중요변수 시각화
 from sklearn.datasets import load_boston # dataset
 from sklearn.model_selection import train_test_split

 from matplotlib import font_manager, rc
 # Register the Malgun Gothic font by file path so matplotlib can render the
 # Korean column names (hard-coded Windows font location).
 font_name = font_manager.FontProperties(
     fname="c:/Windows/Fonts/malgun.ttf"
 ).get_name()
 rc('font', family=font_name)

 import matplotlib.pyplot as plt  # used for the importance chart

 # Read the freeze dataset (MS949-encoded CSV) and show its schema.
 freeze = pd.read_csv("../data/freeze_dataset.csv", encoding="MS949")
 print(freeze.info())
 # Recorded output from the author's run:
 # RangeIndex: 37089 entries, 0 to 37088
 # Data columns (total 95 columns):

 print(freeze.head())

 # Rename columns: replace blanks with '_'.
 freeze.columns = freeze.columns.str.replace(' ', '_')
 print(freeze.info())

 # Class balance of the 0/1 target column.
 print(freeze['동파유무'].value_counts())
 # Recorded output from the author's run:
 # 0.0    34130 : 90%
 # 1.0     2959 : 10%

 # The first column is the target; every remaining column is a predictor.
 cols = list(freeze.columns)  # 95 columns in the recorded run
 y_cols = cols[0]
 x_cols = cols[1:]

 print(y_cols)  # recorded: 동파유무

 # Random 60/40 split (no random_state, so it differs on every run).
 # The test part is not used further in this script.
 train_set, test_set = train_test_split(
     freeze,
     test_size=0.4,
 )

 # Fit the classifier on the training part.
 obj = XGBClassifier()
 model = obj.fit(train_set[x_cols], train_set[y_cols])

 # Per-feature f-score from the underlying booster, then the chart.
 score = model.get_booster().get_fscore()
 print(score)

 plot_importance(model)
 plt.show()

Ensemble的更多相关文章

推荐系统resys小组线下活动见闻2009-08-22
http://www.tuicool.com/articles/vUvQVn 时间2009-08-30 15:13:22 不周山原文 http://www.wentrue.net/blog/?p= ...
自适应注意力机制在Image Caption中的应用
在碎片化阅读充斥眼球的时代,越来越少的人会去关注每篇论文背后的探索和思考. 在这个栏目里,你会快速 get 每篇精选论文的亮点和痛点,时刻紧跟 AI 前沿成果. 点击本文底部的「阅读原文」即刻加入社区 ...
我的第一次面试 —— 腾讯 AI安全一面总结
前言在校两年半,没经历过面试的毒打,第一次面试给了腾讯,周二晚上学长帮推的简历周三下午就打电话来问周四晚上有没有空面试.那天下午还在赶着数据库的实验报告,脑子有点转不过来就说了有空,然后仔细一看好像 ...

随机推荐

thinkphp5 去除缓存
array_map('unlink', glob(TEMP_PATH . '/*.php')); rmdir(TEMP_PATH);
Mac 设计师必备的设计绘图软件推荐与下载
Mac设计师必备的设计绘图软件,为广大设计师推荐一些Mac上实用且强大的软件,使用好的软件,事半功倍,设计出精美的作品. Mac上优秀的设计类软件非常多,绝对不止这几款软件,看看以下内容,希望对你有帮 ...
2017-12-20python全栈9期第五天第二节之可变数据类型和不可变数据类型
LDM与STM指令详解
title: LDM与STM指令详解 date: 2019/2/26 17:58:00 toc: true --- LDM与STM指令详解指令形式如下,这里的存储方向是针对寄存器的 Load Mul ...
HBase RowKey与索引设计
1. HBase的存储形式 hbase的内部使用KeyValue的形式存储,其key时rowKey:family:column:logTime,value是其存储的内容. 其在region内大多以升序 ...
使用numpy的小惊喜
今天使用 numpy.true_divide 发现个有趣的事情, 下面的代码18.19行如果去掉,就会报下面的 RuntimeWarning def multivalue_divide(timese ...
PaginatorSet
from django.core.paginator import Paginator, EmptyPage class PaginatorSet: def __init__(self, limit= ...
Tree POJ - 1741【树分治】【一句话说清思路】
因为该博客的两位作者瞎几把乱吹("￣︶￣)人(￣︶￣")用彼此的智慧总结出了两条全新的定理(高度复杂度定理.特异根特异树定理),转载请务必说明出处.(逃 Pass:anuonei, ...
redis---------AOF文件异常导致的redis无法载入
AOF损坏时的对策1.若在写AOF文件时Server崩溃则可能导致AOF文件损坏而不能被Redis载入.可通过如下步骤修复: 创建一个AOF文件的备份: cp appendonly.aof appen ...
golang range遍历是新创建对象还是创建对象的引用
golang range遍历是新创建对象还是创建对象的引用,通俗的讲就是range对range出来的对象的修改会不会同步到被遍历的那个数组.先看如下代码: package main import ( ...

Ensemble

RM

cross_validation

RM_regression

xgboost_test

xgboost_regression

xgboost_freeze

Ensemble的更多相关文章

随机推荐

热门专题