RM

 # -*- coding: utf-8 -*-
 """
 RandomForestClassifier 예

 """
 import pandas as pd
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_wine
 from sklearn import metrics #model 평가 도구

 # 1. Load the wine dataset (features in wine_x, class labels in wine_y)
 wine=load_wine()
 wine_x=wine.data
 wine_y=wine.target  # three target categories

 # Inspect the first five feature rows
 print(wine_x[:5,:])
 """
 [[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
   2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]
  [1.320e+01 1.780e+00 2.140e+00 1.120e+01 1.000e+02 2.650e+00 2.760e+00
   2.600e-01 1.280e+00 4.380e+00 1.050e+00 3.400e+00 1.050e+03]
  [1.316e+01 2.360e+00 2.670e+00 1.860e+01 1.010e+02 2.800e+00 3.240e+00
   3.000e-01 2.810e+00 5.680e+00 1.030e+00 3.170e+00 1.185e+03]
  [1.437e+01 1.950e+00 2.500e+00 1.680e+01 1.130e+02 3.850e+00 3.490e+00
   2.400e-01 2.180e+00 7.800e+00 8.600e-01 3.450e+00 1.480e+03]
  [1.324e+01 2.590e+00 2.870e+00 2.100e+01 1.180e+02 2.800e+00 2.690e+00
   3.900e-01 1.820e+00 4.320e+00 1.040e+00 2.930e+00 7.350e+02]]
 """
 print(wine_y[:5]) #[0 0 0 0 0]
 print(wine_y[170:175]) #[2 2 2 2 2]

 # 2. Train/test split (70% / 30%). No random_state is given, so the split
 #    and every score recorded below change from run to run.
 X_train,X_test,y_train,y_test=train_test_split(wine_x,wine_y,test_size=0.3)

 # 3. Baseline random-forest model with default hyperparameters
 obj=RandomForestClassifier()
 model=obj.fit(X_train,y_train)
 print(model)
 """
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False)
 """

 # 4. Evaluate the baseline model on the held-out split
 pred=model.predict(X_test)
 Y=y_test

 # Accuracy (value recorded from the original run)
 acc=metrics.accuracy_score(Y,pred)
 print(acc) #0.9629629629629629

 # Per-class precision / recall / f1 (table recorded from the original run)
 report=metrics.classification_report(Y,pred)
 print(report)
 """
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.90      1.00      0.95        18
           2       1.00      0.91      0.95        22

 avg / total       0.97      0.96      0.96        54
 """

 ##############################################
 # RF model tuning
 ##############################################
 # Original tuning notes:
 #   n_estimators=10   : number of trees; 400~500 was found to work best
 #   min_samples_split=2 : described as "number of variables (sqrt(n))"
 # NOTE(review): the sqrt(n_features) rule of thumb normally applies to
 # max_features; min_samples_split is the minimum number of samples needed
 # to split a node. Confirm which parameter was meant to be tuned.

 # 2. Fresh train/test split for the tuned model
 X_train,X_test,y_train,y_test=train_test_split(wine_x,wine_y,test_size=0.3)
 print(wine_x.shape)#(178, 13)  -> square root of 13 features
 print(np.sqrt(13)) #3.605551275463989=>4

 # 3. Tuned random-forest model
 # NOTE(review): the code passes min_samples_split=3, while the recorded
 # repr below shows 4 - the log appears to come from a different run.
 obj2=RandomForestClassifier(n_estimators=400,
                            min_samples_split=3)
 model2=obj2.fit(X_train,y_train)
 print(model2)
 """
 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
             max_depth=None, max_features='auto', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=4,
             min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
             oob_score=False, random_state=None, verbose=0,
             warm_start=False)
 """
 # 4. Evaluate the tuned model.
 # Fix: predict with model2. The original called model.predict here, which
 # scored the baseline model (trained on the previous split, so parts of this
 # X_test may have been seen during its training) instead of the tuned one.
 pred2=model2.predict(X_test)
 Y=y_test

 # Accuracy (value recorded from the original run, before the fix above)
 acc=metrics.accuracy_score(Y,pred2)
 print(acc) #0.9814814814814815

 report=metrics.classification_report(Y,pred2)
 print(report)
 """
              precision    recall  f1-score   support
           0       1.00      1.00      1.00        14
           1       0.95      1.00      0.97        18
           2       1.00      0.95      0.98        22
 avg / total       0.98      0.98      0.98        54
 """

cross_validation

 # -*- coding: utf-8 -*-
 """
 교차 검정예
 """
 import pandas as pd
 from sklearn.model_selection import cross_validate # 교차검정
 from sklearn.ensemble import RandomForestClassifier # RM

 # 1. Load the iris data set
 iris=pd.read_csv("../data/iris.csv")
 # DataFrame.info() prints the summary itself and returns None, so it is
 # called directly instead of being wrapped in print() (which echoed "None").
 iris.info()
 """
 <class 'pandas.core.frame.DataFrame'>
 RangeIndex: 150 entries, 0 to 149
 Data columns (total 5 columns):
 Sepal.Length    150 non-null float64
 Sepal.Width     150 non-null float64
 Petal.Length    150 non-null float64
 Petal.Width     150 non-null float64
 Species         150 non-null object
 dtypes: float64(4), object(1)
 memory usage: 5.9+ KB
 """

 cols=list(iris.columns)

 x_data=iris[cols[:4]]  # first four columns: features
 y_data=iris[cols[-1]]  # last column: label

 # 2. Build the model
 obj=RandomForestClassifier()
 model=obj.fit(x_data,y_data)

 # 3. 5-fold cross-validation (cv=5).
 # return_train_score=True is requested explicitly so the result contains the
 # 'train_score' entry shown in the recorded output below; recent
 # scikit-learn versions omit it by default.
 score=cross_validate(model,x_data,y_data,cv=5,return_train_score=True)
 print(score)
 """
 {'fit_time': array([0.01000023, 0.01000023, 0.00900006, 0.00999999, 0.01000023]),
 'score_time': array([0.00099993, 0.00099993, 0.00099993, 0.00100017, 0.00099993]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ]),
 'train_score': array([1., 1., 1., 1., 1.])}
 """

 # Per-fold test accuracy as a plain list (keeps the original print format)
 test_score=list(score['test_score'])
 print(test_score) #[0.966, 0.966, 0.933, 0.9, 1.0]

 import numpy as np
 # score['test_score'] is already an ndarray (see recorded output above), so
 # the mean is taken directly without a list -> array round trip.
 score_arr=score['test_score']
 print(np.mean(score_arr))#0.9533333333333334

RM_regression

 # -*- coding: utf-8 -*-
 """
 RandomForestRegressor 예
 """

 import pandas as pd
 import numpy as np

 from sklearn.ensemble import RandomForestRegressor
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_boston # data set
 from sklearn import metrics # model 평가 도구

 # 1. Load the Boston housing data
 # NOTE(review): load_boston was deprecated and later removed from
 # scikit-learn; this script presumably needs an older release - confirm.
 boston = load_boston()
 boston_x, boston_y = boston.data, boston.target  # target is continuous

 # Show the array shapes
 for arr in (boston_x, boston_y):
     print(arr.shape)  # (506, 13) then (506,)

 # 2. Train/test split: 30% held out, fixed seed
 x_train, x_test, y_train, y_test = train_test_split(
     boston_x,
     boston_y,
     test_size=0.3,
     random_state=123,
 )

 # 3. Fit the random-forest regressor (seeded)
 obj = RandomForestRegressor(random_state=234)
 model = obj.fit(x_train, y_train)
 print(model)
 # Recorded repr from the original run:
 """
 RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=234, verbose=0, warm_start=False)
 """

 # 4. Evaluate on the held-out data
 y_pred = model.predict(x_test)  # predictions (not used further below)
 y_real_value = y_test           # ground truth (not used further below)

 # Score reported by the estimator's own score() method
 model_score = model.score(x_test, y_test)
 print(model_score)
 # 0.7998066141697237

xgboost_test

 # -*- coding: utf-8 -*-
 """
 XGBoost classification analysis
 """
 import pandas as pd
 from xgboost import XGBClassifier #model
 from xgboost import plot_importance #중요변수 시각화
 from xgboost import plot_tree# tree 시각화
 from sklearn.model_selection import train_test_split

 # 1. Load the iris data set
 iris=pd.read_csv("../data/iris.csv")

 cols=list(iris.columns)
 iris_x=iris[cols[:4]]   # first four columns: features
 iris_y=iris[cols[-1]]   # last column: species label (strings)

 # 2. Train/test split: 30% held out, fixed seed
 x_train,x_test,y_train,y_test=train_test_split(iris_x,iris_y,test_size=0.3,random_state=123)

 # 3. Build the model
 # NOTE(review): newer xgboost releases appear to require integer-encoded
 # class labels for XGBClassifier; string labels worked in the version that
 # produced the recorded output - confirm against the installed version.
 obj=XGBClassifier()
 model=obj.fit(x_train,y_train)
 print(model)
 """
 XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
        n_jobs=1, nthread=None, objective='multi:softprob', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1)
 """
 # Original parameter notes (the names follow the native / R xgboost API):
 #   objective = "binary:logistic" : use when the y variable is binary
 #   max_depth = 2 : a value of 2 suits simple tree structures
 #   nthread = 2   : number of CPU threads to use
 #   nrounds = 2   : number of boosting iterations run to reduce the gap
 #                   between actual and predicted values
 #   eta = 1       : learning-rate control (default 0.3); guards against
 #                   overfitting

 # 4. Evaluate the model
 y_pred=model.predict(x_test)
 print(y_pred)
 Y=y_test
 """
 ['versicolor' 'virginica' 'virginica' 'versicolor' 'setosa' 'versicolor'
  'versicolor' 'setosa' 'setosa' 'versicolor' 'virginica' 'setosa'
  'versicolor' 'virginica' 'virginica' 'virginica' 'setosa' 'setosa'
  'versicolor' 'setosa' 'setosa' 'versicolor' 'setosa' 'virginica' 'setosa'
  'setosa' 'setosa' 'virginica' 'virginica' 'setosa' 'virginica'
  'versicolor' 'setosa' 'setosa' 'versicolor' 'versicolor' 'virginica'
  'setosa' 'setosa' 'versicolor' 'versicolor' 'setosa' 'virginica'
  'virginica' 'virginica']
 """

 # Plot feature importance
 import matplotlib.pyplot as plt
 plot_importance(model)
 plt.show()

 # Feature-importance scores (fscore) as a dict
 score=model.get_booster().get_fscore()
 print('x 중요변수=',score)
 #x 중요변수= {'Petal.Length': 255, 'Petal.Width': 135, 'Sepal.Width': 64, 'Sepal.Length': 118}

 # Model evaluation
 from sklearn import metrics
 # Arguments are passed as (y_true, y_pred), matching the documented
 # signature and the classification_report call below; the original had them
 # swapped (the accuracy value itself is the same either way).
 acc=metrics.accuracy_score(Y,y_pred)
 print("acc=",acc) #acc= 0.9333333333333333

 report=metrics.classification_report(Y,y_pred)
 print(report)
 """
              precision    recall  f1-score   support
      setosa       1.00      1.00      1.00        18
  versicolor       0.77      1.00      0.87        10
   virginica       1.00      0.82      0.90        17
 avg / total       0.95      0.93      0.93
 """

 # Plot one of the fitted trees
 plot_tree(model)
 plt.show()

xgboost_regression

 # -*- coding: utf-8 -*-
 """
 Created on Sun Feb 24 15:18:35 2019

 @author: 502-03
 """

 import pandas as pd
 from xgboost import XGBRegressor #model (회귀모델)
 from xgboost import plot_importance #중요변수 시각화
 from xgboost import plot_tree
 from sklearn.model_selection import train_test_split
 from sklearn.datasets import load_boston# dataset
 import matplotlib.pyplot as plt

 # 1. Load the Boston housing data
 # NOTE(review): load_boston was deprecated and later removed from
 # scikit-learn; this script presumably needs an older release - confirm.
 boston=load_boston()
 x,y=boston.data,boston.target

 for arr in (x,y):
     print(arr.shape) #(506, 13) then (506,)

 # 2. Train/test split: 30% held out, fixed seed
 x_train,x_test,y_train,y_test=train_test_split(
     x,
     y,
     test_size=0.3,
     random_state=123,
 )

 # 3. Build the regression model
 # NOTE(review): the recorded repr below shows n_estimators=100 and
 # max_depth=3, which does not match the arguments passed here - the log
 # appears to come from a run with default settings.
 obj=XGBRegressor(max_depth=6,n_estimators=400)
 model=obj.fit(x_train,y_train)
 print(model)
 """
 XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
        reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
        silent=True, subsample=1)
 """
 # Feature-importance scores, keyed f0..f12 in the recorded output
 booster=model.get_booster()
 score=booster.get_fscore()
 print(score)
 """
 {'f5': 83, 'f12': 78, 'f0': 91, 'f4': 42, 'f7': 110, 'f10': 32,
 'f6': 46, 'f9': 38, 'f3': 1, 'f8': 16, 'f11': 51, 'f1': 2, 'f2': 15}
 """

 # Importance chart
 plot_importance(model)
 plt.show()

 # Tree diagram
 plot_tree(model)
 plt.show()

 # Column names of the dataset, for reading the f<N> keys above
 print(boston.feature_names)
 """
 ['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD'
 'TAX' 'PTRATIO' 'B' 'LSTAT']
 """

xgboost_freeze

 # -*- coding: utf-8 -*-
 """
 수도사업소 주관 - big 콘테스트 dataset
 동파유무(0 or  1) 분류하는 위한 dataset
 """

 import pandas as pd
 from xgboost import XGBClassifier # model(분류모델)
 from xgboost import plot_importance # 중요변수 시각화
 from sklearn.datasets import load_boston # dataset
 from sklearn.model_selection import train_test_split

 from matplotlib import font_manager, rc
 # Register a Korean-capable font so the Korean column names render in plots.
 # The path only exists on Windows; when it is missing the default font is
 # kept instead of crashing (Korean labels may then not render correctly).
 import os
 font_path = "c:/Windows/Fonts/malgun.ttf"
 if os.path.isfile(font_path):
     font_name = font_manager.FontProperties(fname=font_path).get_name()
     rc('font', family=font_name)

 import matplotlib.pyplot as plt # feature-importance plotting

 # Load the pipe-freeze dataset (MS949-encoded CSV)
 freeze = pd.read_csv("../data/freeze_dataset.csv",encoding="MS949")
 # DataFrame.info() prints the summary itself and returns None, so it is
 # called directly instead of being wrapped in print() (which echoed "None").
 freeze.info()
 '''
 RangeIndex: 37089 entries, 0 to 37088
 Data columns (total 95 columns):
 '''

 print(freeze.head())

 # Rename columns: replace spaces with '_'
 freeze.columns = freeze.columns.str.replace(' ', '_')
 freeze.info()

 # Class balance of the freeze-burst flag (0 or 1)
 print(freeze['동파유무'].value_counts())
 '''
 0.0    34130 : 90%
 1.0     2959 : 10%
 '''

 cols = list(freeze.columns) # 95 columns
 x_cols = cols[1:]   # every column after the first is used as a feature
 y_cols = cols[0]    # the first column is used as the label

 print(y_cols) # prints the label column name ('동파유무')

 # 60% / 40% split; only train_set is used below
 train_set, test_set = train_test_split(
         freeze, test_size=0.4)

 # Fit the classifier
 obj = XGBClassifier()
 model = obj.fit(train_set[x_cols], train_set[y_cols])

 # Feature-importance scores and chart
 score = model.get_booster().get_fscore()
 print(score)

 plot_importance(model)
 plt.show()

Ensemble的更多相关文章

  1. 推荐系统resys小组线下活动见闻2009-08-22

    http://www.tuicool.com/articles/vUvQVn 时间2009-08-30 15:13:22  不周山原文  http://www.wentrue.net/blog/?p= ...

  2. 自适应注意力机制在Image Caption中的应用

    在碎片化阅读充斥眼球的时代,越来越少的人会去关注每篇论文背后的探索和思考. 在这个栏目里,你会快速 get 每篇精选论文的亮点和痛点,时刻紧跟 AI 前沿成果. 点击本文底部的「阅读原文」即刻加入社区 ...

  3. 我的第一次面试 —— 腾讯 AI安全 一面总结

    前言 在校两年半,没经历过面试的毒打,第一次面试给了腾讯,周二晚上学长帮推的简历周三下午就打电话来问周四晚上有没有空面试.那天下午还在赶着数据库的实验报告,脑子有点转不过来就说了有空,然后仔细一看好像 ...

随机推荐

  1. (N叉树 BFS) leetcode429. N-ary Tree Level Order Traversal

    Given an n-ary tree, return the level order traversal of its nodes' values. (ie, from left to right, ...

  2. Jmeter工具进行一个完整的接口测试

    Jmeter工具进行一个完整的接口测试 1.创建一个线程组 通俗的讲一个线程组,,可以看做一个虚拟用户组,线程组中的每个线程都可以理解为一个虚拟用户.   2.输入线程组名字 3.添加一个cookie ...

  3. 自定义 ThreadPoolExecutor 处理线程运行时异常

    自定义 ThreadPoolExecutor 处理线程运行时异常 最近看完了ElasticSearch线程池模块的源码,感触颇深,然后也自不量力地借鉴ES的 EsThreadPoolExecutor ...

  4. 安卓ADB学习笔记

    ADB(Android Debug Bridge)可以远程调试安卓设备,包括模拟器,可以进入终端模式(安卓本身相当于一个linux) 1.配置adb环境变量 以夜神模拟器为例,将模拟器安装路径里的bi ...

  5. CPM、CPC、CPA、CPS、CPL、CPR 是什么意思 -解析互联网广告术语

    CPA CPS CPA/CPS常见的推广方式 CPA和CPSCPA,CPS CPS与CPA CPA.CPSCPA.CPS产品教  CPA CPS什么意思 CPACPS是什么 1. CPM(Cost p ...

  6. 《模式分类(原书第二版)》pdf格式下载电子书免费下载

    <模式分类(原书第二版)>pdf格式下载电子书免费下载: https://u253469.ctfile.com/fs/253469-302448505 内容简介 <模式分类>( ...

  7. CentOS下TensorFlow安装命令

    安装扩展源:sudo yum -y install epel-release 安装python-pip模块:sudo yum install python-pip 直接把whl文件download下来 ...

  8. 分布式系列十二: Redis高级主题

    持久化 Redis 支持持久化, 其持久化数据有两种方式. 两种可以同时使用. 如果同时使用, Reids 在重启时将使用 AOF 方式来还原数据. RDB 按照一定策略定时同步内存的数据到磁盘.文件 ...

  9. Java编程思想(后)

    Java编程思想(后) 持有对象 如果一个程序只包含固定数量的且其生命期都是已知的对象,那么这是一个非常简单的程序. Java中的库基本类型: List, Set, Queue和Map --- 称为集 ...

  10. ubuntu命令安装

    1.当make时,发现没有对应的命令: apt-get install build-essential 安装工具,可解决这个问题