机器学习技法笔记:Homework #7 Decision Tree&Random Forest相关习题
原文地址:https://www.jianshu.com/p/7ff6fd6fc99f
问题描述


程序实现
13-15
# coding:utf-8
# decision_tree.py
import numpy as np
def ReadData(dataFile):
    """Load a whitespace-separated numeric text file into a float array.

    Every non-blank line becomes one row and every token on it is parsed
    with ``float``.  Blank lines (for example a trailing empty line) are
    skipped so that they cannot introduce an empty, ragged row.

    Args:
        dataFile: Path of the text file to read.

    Returns:
        ``np.ndarray`` built from the parsed rows.

    Raises:
        ValueError: If a token cannot be converted to ``float``.
    """
    data_list = []
    with open(dataFile, 'r') as f:
        # Iterate the file lazily instead of materialising all lines first.
        for line in f:
            tokens = line.split()
            if not tokens:
                continue
            data_list.append([float(tok) for tok in tokens])
    return np.array(data_list)
def sign(n):
    """Return 1 when ``n`` is non-negative and -1 otherwise (zero maps to 1)."""
    return 1 if n >= 0 else -1
def GetSortedArray(dataArray, i):
    """Return the rows of ``dataArray`` sorted ascending by column ``i``.

    A stable sort is used, so rows with equal keys keep their original
    relative order (the same guarantee the built-in ``sorted`` gives).
    The input array is not modified; a new array is returned.

    Args:
        dataArray: 2-D array whose rows are to be reordered.
        i: Index of the column used as the sort key.

    Returns:
        A new array containing the same rows in sorted order.
    """
    # Sorting an index vector avoids the array -> list -> array round trip
    # and keeps the dtype and 2-D shape of the input.
    order = np.argsort(dataArray[:, i], kind="stable")
    return dataArray[order]
def GetSplitData(pred, dataArray):
    """Partition the rows of ``dataArray`` by a vector of -1/+1 predictions.

    Args:
        pred: 1-D array with one entry per row of ``dataArray``; entries
            equal to -1 send the row to the first output, entries equal
            to 1 send it to the second.
        dataArray: 2-D array of samples (one row per sample).

    Returns:
        ``(falseData, trueData)``: the rows predicted -1 and the rows
        predicted +1.  Both outputs are always 2-D, so an empty branch has
        shape ``(0, n_columns)`` and can still be sliced with ``[:, -1]``.

    Raises:
        AssertionError: If ``pred`` and ``dataArray`` disagree on the
            number of rows.
    """
    # Explicit raise (rather than ``assert``) so the check survives ``-O``.
    if pred.shape[0] != dataArray.shape[0]:
        raise AssertionError("wrong shape of prediction!")
    false_mask = pred == -1
    true_mask = pred == 1
    # Rows whose prediction is neither -1 nor 1 are dropped; report each
    # one, as the row-by-row implementation did.
    num_invalid = int((~(false_mask | true_mask)).sum())
    for _ in range(num_invalid):
        print("wrong prediction!")
    return dataArray[false_mask], dataArray[true_mask]
def _branch_gini(num_branch, num_match):
    """Gini impurity of one branch holding two classes.

    ``num_match`` samples belong to one class and the remaining
    ``num_branch - num_match`` to the other.  An empty branch has
    impurity 0.
    """
    if num_branch == 0:
        return 0
    num_other = num_branch - num_match
    return 1 - (num_match ** 2 + num_other ** 2) / num_branch ** 2


def GetWeightedImpurity(pred, dataY):
    """Size-weighted Gini impurity of the two branches induced by ``pred``.

    Args:
        pred: 1-D array of branch assignments, each entry -1 or 1.
        dataY: 1-D array of labels, same length as ``pred``.

    Returns:
        ``(n_false * gini_false + n_true * gini_true) / n`` where the
        "false" branch collects the samples with ``pred == -1`` and the
        "true" branch those with ``pred == 1``.

    Raises:
        AssertionError: If ``pred`` contains a value other than -1 or 1.
    """
    num_data = dataY.shape[0]
    false_mask = pred == -1
    true_mask = pred == 1
    num_false = false_mask.sum()
    num_true = true_mask.sum()
    # Explicit raise (rather than ``assert``) so the check survives ``-O``.
    if num_false + num_true != num_data:
        raise AssertionError("wrong prediction!")
    # In each branch count the samples whose label agrees with the branch
    # sign; everything else in the branch is treated as the other class.
    false_gini = _branch_gini(num_false, (false_mask & (dataY == -1)).sum())
    true_gini = _branch_gini(num_true, (true_mask & (dataY == 1)).sum())
    return (num_false * false_gini + num_true * true_gini) / num_data
def decision_stump(dataArray):
    """Find the single-feature threshold split with the lowest impurity.

    For every feature column the candidate thresholds are the midpoints
    between consecutive distinct values.  Each candidate is scored with
    ``GetWeightedImpurity`` for both orientations ``s`` in ``(-1.0, 1.0)``
    and the first candidate reaching the minimum is kept (features in
    column order, ``s = -1.0`` before ``s = 1.0``, thresholds ascending).

    The degenerate threshold ``-inf``, which sends every sample to the same
    branch, is used only as a fallback when no column has two distinct
    values.  Letting it compete with real thresholds would let it win
    impurity ties (e.g. XOR-like data), and a split with an empty branch
    makes the recursive tree builder fail.

    Args:
        dataArray: 2-D array; the last column holds the -1/+1 labels and
            all preceding columns are features.

    Returns:
        ``(s, d, theta, pred)``: orientation, feature index, threshold and
        the resulting per-sample branch assignment
        ``s * sign(x[d] - theta)`` as a float array.  If ``dataArray`` has
        no rows or no feature columns, ``(inf, inf, inf, zeros)`` is
        returned.
    """
    num_data = dataArray.shape[0]
    num_dim = dataArray.shape[1] - 1
    dataY = dataArray[:, -1]

    min_e = np.inf
    min_s = np.inf
    min_d = np.inf
    min_theta = np.inf
    min_pred = np.zeros((num_data,))
    if num_data == 0 or num_dim == 0:
        return min_s, min_d, min_theta, min_pred

    for d in range(num_dim):
        column = dataArray[:, d]
        # np.unique returns the distinct values in ascending order, so the
        # midpoints of neighbours are exactly the useful thresholds.
        values = np.unique(column)
        thetas = (values[:-1] + values[1:]) / 2
        for s in (-1.0, 1.0):
            for theta in thetas:
                # Vectorised s * sign(x - theta), with sign(0) == 1.
                pred = s * np.where(column - theta >= 0, 1.0, -1.0)
                now_e = GetWeightedImpurity(pred, dataY)
                if now_e < min_e:
                    min_e = now_e
                    min_s = s
                    min_d = d
                    min_theta = theta
                    min_pred = pred

    if min_e == np.inf:
        # Every feature column is constant: only the trivial stump exists.
        min_s = -1.0
        min_d = 0
        min_theta = -np.inf
        min_pred = -np.ones((num_data,))
    return min_s, min_d, min_theta, min_pred
# Split parameters ``[s, d, theta]`` of the most recently built node with a
# given id.  Every call to decision_tree() overwrites the entries for the ids
# it uses, so this table only describes the *latest* tree; it is kept because
# other code in this file (e.g. the plotting helpers) reads it.
paraDict = {}

# Key under which each internal node stores its own ``[s, d, theta]``.  It is
# inserted after the node id, so the id remains the first key of the node.
_SPLIT_KEY = "split"


def decision_tree(id, dataArray, prune=False):
    """Recursively grow a binary classification tree.

    Tree layout: a leaf is ``{node_id: label}``; an internal node is
    ``{node_id: {-1: false_subtree, 1: true_subtree}, "split": [s, d, theta]}``.
    The children of node ``k`` get the ids ``2k`` and ``2k + 1``.  Storing
    the split inside the node makes every tree self-contained, so a tree
    can still be evaluated correctly after further trees have been built
    (which overwrite ``paraDict``).

    Args:
        id: Identifier of the node being built (use 1 for the root).  The
            name shadows the builtin but is kept for keyword callers.
        dataArray: 2-D array; last column holds the -1/+1 labels, the
            other columns are features.
        prune: If true, stop after a single split and turn both branches
            into majority-vote leaves.

    Returns:
        The (sub)tree rooted at ``id`` in the layout described above.
    """
    dataX = dataArray[:, :-1]
    dataY = dataArray[:, -1]
    if dataY.min() == dataY.max():
        # All labels identical: pure leaf.
        return {id: dataY[0]}
    if (dataX == dataX[0, :]).all():
        # All feature vectors identical: cannot split, use majority vote.
        return {id: sign(np.sum(dataY))}

    s, d, theta, pred = decision_stump(dataArray)
    paraDict[id] = [s, d, theta]
    falseArray, trueArray = GetSplitData(pred, dataArray)
    if prune:
        children = {
            -1: {id * 2: sign(falseArray[:, -1].sum())},
            1: {id * 2 + 1: sign(trueArray[:, -1].sum())},
        }
    else:
        children = {
            -1: decision_tree(id * 2, falseArray),
            1: decision_tree(id * 2 + 1, trueArray),
        }
    return {id: children, _SPLIT_KEY: [s, d, theta]}


def GetZeroOneError(pred, dataY):
    """Return the fraction of entries where ``pred`` differs from ``dataY``."""
    return (pred != dataY).sum() / dataY.shape[0]


def predict(treeDict, dataX):
    """Classify every row of ``dataX`` with a tree from ``decision_tree``.

    At each internal node the branch ``s * sign(x[d] - theta)`` is followed
    (with ``sign(0) == 1``).  The split is taken from the node itself when
    it carries one; trees without embedded splits fall back to the global
    ``paraDict`` entry for the node id.

    Args:
        treeDict: Tree in the layout produced by ``decision_tree``.
        dataX: 2-D array of feature vectors, one per row.

    Returns:
        1-D float array with the predicted label of each row.
    """
    num_data = dataX.shape[0]
    pred = np.zeros((num_data,))
    for n in range(num_data):
        x = dataX[n, :]
        node = treeDict
        while True:
            # The node id is always the first key of a node dict.
            node_id = next(iter(node))
            child = node[node_id]
            if not isinstance(child, dict):
                break  # reached a leaf: ``child`` is the label
            split = node.get(_SPLIT_KEY)
            if split is None:
                split = paraDict[node_id]
            s, d, theta = split
            branch = s * (1 if x[d] - theta >= 0 else -1)
            node = child[branch]
        pred[n] = child
    return pred
def getNumLeafs(myTree):
    """Count the leaves of a tree given as nested dicts.

    A node is a dict whose first key is the node id.  If the value stored
    under that id is itself a dict, it holds the two subtrees under the
    keys -1 and 1; otherwise the node is a leaf.

    Args:
        myTree: Node dict as described above.

    Returns:
        Number of leaves in the subtree rooted at ``myTree``.
    """
    node_id = next(iter(myTree))
    children = myTree[node_id]
    if not isinstance(children, dict):
        return 1
    return getNumLeafs(children[-1]) + getNumLeafs(children[1])
def getTreeDepth(myTree):
    """Return the number of levels of a tree given as nested dicts.

    A node is a dict whose first key is the node id.  If the value stored
    under that id is itself a dict, it holds the two subtrees under the
    keys -1 and 1; otherwise the node is a leaf.  A single leaf has
    depth 1.

    Args:
        myTree: Node dict as described above.

    Returns:
        Depth of the subtree rooted at ``myTree``, counting the root level.
    """
    node_id = next(iter(myTree))
    children = myTree[node_id]
    if not isinstance(children, dict):
        return 1
    return 1 + max(getTreeDepth(children[-1]), getTreeDepth(children[1]))
import matplotlib.pyplot as plt
# Text-box style passed as ``bbox`` when drawing internal (decision) nodes.
decisionNode = dict(boxstyle="round", fc="0.8",pad=0.8)
# Text-box style passed as ``bbox`` when drawing leaf nodes.
leafNode = dict(boxstyle="circle", fc="0.8",pad=0.1)
# Arrow style passed as ``arrowprops`` for the edge between a node and its parent.
arrow_args = dict(arrowstyle="<-")
def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    """Draw one tree node and the arrow linking it to its parent.

    Args:
        nodeTxt: Text shown inside the node box.
        centerPt: ``(x, y)`` position of the node text, in axes fractions.
        parentPt: ``(x, y)`` position of the parent, in axes fractions.
        nodeType: Box style dict passed as ``bbox``.
    """
    axes = createPlot.ax1
    axes.annotate(
        nodeTxt,
        xy=parentPt,
        xycoords='axes fraction',
        xytext=centerPt,
        textcoords='axes fraction',
        va="center",
        ha="center",
        bbox=nodeType,
        arrowprops=arrow_args,
    )
    return
def plotMidText(centerPt, parentPt, txtString):
    """Write ``txtString`` halfway between a node and its parent.

    Args:
        centerPt: ``(x, y)`` position of the child node.
        parentPt: ``(x, y)`` position of the parent node.
        txtString: Label to draw on the connecting edge.
    """
    # Midpoint of the edge, computed per coordinate.
    mid = [(parentPt[k] - centerPt[k]) / 2.0 + centerPt[k] for k in (0, 1)]
    createPlot.ax1.text(
        mid[0], mid[1], txtString, va="center", ha="center", rotation=30
    )
    return
def plotTree(myTree, centerPt, parentPt, nodeTxt):
    """Recursively draw ``myTree`` on the axes stored in ``createPlot.ax1``.

    Whether a node is an internal node or a leaf is decided from the tree
    itself (the value under the node id is a dict for internal nodes)
    rather than from the global ``paraDict``, which may hold ids of other
    trees built in the same session.

    Args:
        myTree: Node dict whose first key is the node id.
        centerPt: ``(x, y)`` position of this node, in axes fractions.
        parentPt: ``(x, y)`` position of the parent node.
        nodeTxt: Label for the edge coming from the parent.

    Reads ``plotTree.xDict`` (horizontal offset per node id) and
    ``plotTree.totalD`` (tree depth), both set by ``createPlot``.
    """
    node_id = next(iter(myTree))
    children = myTree[node_id]
    is_internal = isinstance(children, dict)

    if node_id == 1:
        # Root: drawn as a decision box without an incoming arrow.
        createPlot.ax1.annotate(str(1), xy=parentPt, xycoords='axes fraction',
                                va="center", ha="center", bbox=decisionNode)
    elif is_internal:
        plotNode(str(node_id), centerPt, parentPt, decisionNode)
        plotMidText(centerPt, parentPt, nodeTxt)
    else:
        # Leaf: show the predicted label instead of the node id.
        plotNode(str(children), centerPt, parentPt, leafNode)
        plotMidText(centerPt, parentPt, nodeTxt)
        return

    if is_internal:
        # Branch key -1 goes to the left, +1 to the right; each level moves
        # one depth step down.
        for key, subtree in children.items():
            child_pt = (centerPt[0] + key * plotTree.xDict[node_id],
                        centerPt[1] - 1.0 / plotTree.totalD)
            plotTree(subtree, child_pt, centerPt, str(key))
    return
def createPlot(inTree, savename="13.png"):
    """Render ``inTree`` with matplotlib and save the picture to ``savename``.

    Sets up figure 1, stores the axes on ``createPlot.ax1`` and the layout
    data (``totalW``, ``totalD``, ``xDict``) on ``plotTree`` before
    delegating the drawing to ``plotTree``.
    """
    figure = plt.figure(1, facecolor='white', figsize=(20, 10))
    figure.clf()
    createPlot.ax1 = plt.subplot(111, frameon=False, xticks=[], yticks=[])

    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))

    # Horizontal offset between a node and its children, keyed by node id.
    # All ids of one level share the offset of the level above divided by 1.8.
    offsets = {1: 4 * 1.0 / plotTree.totalW}
    plotTree.xDict = offsets
    for level in range(2, int(plotTree.totalD) + 1):
        level_offset = offsets[2 ** (level - 2)] / 1.8
        for node_id in range(2 ** (level - 1), 2 ** level):
            offsets[node_id] = level_offset

    root_pt = (0.43, 1.0)
    plotTree(inTree, root_pt, (0.43, 1.0), '')
    plt.savefig(savename)
    return
if __name__ == "__main__":
    # Grow the full tree on the training set.
    dataArray = ReadData("hw7_train.dat")
    treeDict = decision_tree(1, dataArray)
    print(treeDict)

    # Question 13: draw the tree.
    createPlot(treeDict)

    # Question 14: in-sample 0/1 error.
    trainX, trainY = dataArray[:, :-1], dataArray[:, -1]
    pred = predict(treeDict, trainX)
    ein = GetZeroOneError(pred, trainY)
    print("the Ein of the tree:", ein)

    # Question 15: out-of-sample 0/1 error.
    testArray = ReadData("hw7_test.dat")
    testX, testY = testArray[:, :-1], testArray[:, -1]
    pred = predict(treeDict, testX)
    eout = GetZeroOneError(pred, testY)
    print("the Eout of the tree:", eout)
16-20
# coding: utf-8
# random_forest.py
from decision_tree import *
def bagging(N, dataArray):
    """Draw a bootstrap sample of ``N`` rows from ``dataArray``.

    Rows are chosen uniformly at random with replacement using
    ``np.random.randint``, so the result depends on NumPy's global random
    state.

    Args:
        N: Number of rows to draw.
        dataArray: 2-D array to sample rows from.

    Returns:
        New array of shape ``(N, dataArray.shape[1])``.
    """
    # One vectorised draw instead of N scalar draws in a Python loop.
    indices = np.random.randint(low=0, high=dataArray.shape[0], size=N)
    return dataArray[indices]
def random_forest(dataArray, iterations, prune=False):
    """Train a bagged forest of decision trees and track in-sample errors.

    Each iteration draws a bootstrap sample of the same size as
    ``dataArray``, grows one tree on it and adds that tree's predictions
    on the full training set to a running vote.

    Args:
        dataArray: 2-D training array; last column holds the -1/+1 labels.
        iterations: Number of trees to grow.
        prune: Passed to ``decision_tree``; if true each tree is a single
            split with majority-vote leaves.

    Returns:
        ``(g_list, ein_g_list, ein_G_list)``: the trees, the 0/1 training
        error of each individual tree, and the 0/1 training error of the
        majority vote of the first ``t`` trees for ``t = 1..iterations``.
    """
    num_data = dataArray.shape[0]
    dataX = dataArray[:, :-1]
    dataY = dataArray[:, -1]
    g_list = []
    ein_g_list = []
    ein_G_list = []
    pred_G = np.zeros((num_data,))  # running sum of the trees' votes
    for t in range(iterations):
        print(t + 1)  # progress indicator
        bagDataArray = bagging(num_data, dataArray)
        treeDict = decision_tree(1, bagDataArray, prune)
        pred_g = predict(treeDict, dataX)
        pred_G += pred_g
        g_list.append(treeDict)
        ein_g_list.append(GetZeroOneError(pred_g, dataY))
        # Majority vote so far; ties (sum == 0) count as +1, like sign().
        vote = np.where(pred_G >= 0, 1.0, -1.0)
        ein_G_list.append(GetZeroOneError(vote, dataY))
    return g_list, ein_g_list, ein_G_list
def plot_line_chart(X=None, Y=None, nameX="t", nameY="Ein(gt)", saveName="16.png"):
    """Plot ``Y`` against ``X`` as a line with markers and save it to a file.

    Every point whose x value is a multiple of 100 is annotated with its
    y value rounded to four decimals.

    Args:
        X: x coordinates.  Defaults to ``0 .. len(Y) - 1`` so that series
            of any length can be plotted without passing ``X``.
        Y: y coordinates.  Defaults to ``0 .. 2999``.
        nameX: Label of the x axis.
        nameY: Label of the y axis (also used in the title).
        saveName: Path of the image file to write.
    """
    if Y is None:
        Y = list(range(3000))
    if X is None:
        X = list(range(len(Y)))
    fig = plt.figure(figsize=(30, 12))
    plt.plot(X, Y, 'b')
    plt.plot(X, Y, 'ro')
    if len(X) > 0:
        plt.xlim((X[0] - 1, X[-1] + 1))
    for (x, y) in zip(X, Y):
        if x % 100 == 0:
            plt.text(x + 0.1, y, str(round(y, 4)))
    plt.xlabel(nameX)
    plt.ylabel(nameY)
    plt.title(nameY + " versus " + nameX)
    plt.savefig(saveName)
    # Release the figure; otherwise every call leaves a large figure open.
    plt.close(fig)
    return
def plot_bar_chart(X=None, Y=None, nameX="t", nameY="Ein(gt)", saveName="16.png"):
    """Plot ``Y`` against ``X`` as a bar chart and save it to a file.

    Every bar whose x value is a multiple of 100 is annotated with its
    height rounded to four decimals.  The y axis is fixed to ``[0, 1]``.

    Args:
        X: Bar positions.  Defaults to ``0 .. len(Y) - 1`` so that the
            positions always match the number of bars.
        Y: Bar heights.  Defaults to ``0 .. 299``.
        nameX: Label of the x axis.
        nameY: Label of the y axis (also used in the title).
        saveName: Path of the image file to write.
    """
    if Y is None:
        Y = list(range(300))
    if X is None:
        X = list(range(len(Y)))
    fig = plt.figure(figsize=(30, 12))
    # Positions and heights are passed positionally: the ``left=`` keyword
    # is not accepted by current matplotlib releases.
    plt.bar(X, Y, width=1, align="center", yerr=0.000001)
    for (c, w) in zip(X, Y):
        if c % 100 == 0:
            plt.text(c, w * 1.03, str(round(w, 4)))
    plt.xlabel(nameX)
    plt.ylabel(nameY)
    if len(X) > 0:
        plt.xlim(X[0] - 1, X[-1] + 1)
    plt.ylim(0, 1)
    plt.title(nameY + " versus " + nameX)
    plt.savefig(saveName)
    # Release the figure; otherwise every call leaves a large figure open.
    plt.close(fig)
    return
if __name__ == "__main__":

    def _eout_of_running_vote(trees, testArray):
        """Return Eout of the majority vote after each successive tree.

        Entry ``t`` of the result is the 0/1 test error of the vote of
        ``trees[0..t]``.  Ties in the vote count as +1, like ``sign``.
        """
        testX = testArray[:, :-1]
        testY = testArray[:, -1]
        pred_G = np.zeros((testArray.shape[0],))
        eout_G_list = []
        for t, tree in enumerate(trees):
            print(t + 1)  # progress indicator
            pred_G += predict(treeDict=tree, dataX=testX)
            vote = np.where(pred_G >= 0, 1.0, -1.0)
            eout_G_list.append(GetZeroOneError(vote, testY))
        return eout_G_list

    # NOTE(review): the Eout curves below are only meaningful if predict()
    # evaluates each stored tree with that tree's own split parameters --
    # confirm.
    num_trees = 3000
    dataArray = ReadData("hw7_train.dat")
    testArray = ReadData("hw7_test.dat")

    # Questions 16-18: forest of fully grown trees.
    g_list, ein_g_list, ein_G_list = random_forest(dataArray, num_trees)
    # 16
    plot_bar_chart(Y=ein_g_list)
    # 17
    plot_line_chart(Y=ein_G_list, nameY="Ein(Gt)", saveName="17.png")
    # 18
    eout_G_list = _eout_of_running_vote(g_list, testArray)
    plot_line_chart(Y=eout_G_list, nameY="Eout(Gt)", saveName="18.png")

    # Questions 19-20: forest of pruned trees (one split each).
    g_list, ein_g_list, ein_G_list = random_forest(dataArray, num_trees, True)
    # 19
    plot_line_chart(Y=ein_G_list, nameY="Ein(Gt)", saveName="19.png")
    # 20
    eout_G_list = _eout_of_running_vote(g_list, testArray)
    plot_line_chart(Y=eout_G_list, nameY="Eout(Gt)", saveName="20.png")
运行结果







机器学习技法笔记:Homework #7 Decision Tree&Random Forest相关习题的更多相关文章
- 机器学习技法笔记:09 Decision Tree
Roadmap Decision Tree Hypothesis Decision Tree Algorithm Decision Tree Heuristics in C&RT Decisi ...
- [ML学习笔记] 决策树与随机森林(Decision Tree&Random Forest)
[ML学习笔记] 决策树与随机森林(Decision Tree&Random Forest) 决策树 决策树算法以树状结构表示数据分类的结果.每个决策点实现一个具有离散输出的测试函数,记为分支 ...
- 机器学习技法笔记(2)-Linear SVM
从这一节开始学习机器学习技法课程中的SVM, 这一节主要介绍标准形式的SVM: Linear SVM 引入SVM 首先回顾Perceptron Learning Algorithm(感知器算法PLA)是 ...
- 机器学习技法笔记:11 Gradient Boosted Decision Tree
Roadmap Adaptive Boosted Decision Tree Optimization View of AdaBoost Gradient Boosting Summary of Ag ...
- Coursera台大机器学习技法课程笔记11-Gradient Boosted Decision Tree
将Adaboost和decision tree相结合,需要注意的地方是,训练时adaboost需要改变资料的权重,如何将有权重的资料和decision tree相结合呢?方法很类似于前面讲过的bag ...
- 机器学习技法笔记:10 Random Forest
Roadmap Random Forest Algorithm Out-Of-Bag Estimate Feature Selection Random Forest in Action Summar ...
- 机器学习算法 --- Pruning (decision trees) & Random Forest Algorithm
一.Table of Contents 在之前的文章中我们介绍了Decision Trees Algorithms,然而这个学习算法有一个很大的弊端,就是很容易出现Overfitting,为了解决此问题 ...
- 机器学习算法实践:决策树 (Decision Tree)(转载)
前言 最近打算系统学习下机器学习的基础算法,避免眼高手低,决定把常用的机器学习基础算法都实现一遍以便加深印象.本文为这系列博客的第一篇,关于决策树(Decision Tree)的算法实现,文中我将对决 ...
- 机器学习技法笔记:Homework #8 kNN&RBF&k-Means相关习题
原文地址:https://www.jianshu.com/p/1db700f866ee 问题描述 程序实现 # kNN_RBFN.py # coding:utf-8 import numpy as n ...
随机推荐
- 100、TensorFlow实现FFM Field-awared FM模型
''' Created on 2017年11月15日 @author: weizhen ''' import tensorflow as tf import pandas as pd import n ...
- 【JavaSE】运行时类型信息(RTTI、反射)
运行时类型信息使得你可以在程序运行时发现和使用类型信息.--<Think in java 4th> **** 通常我们在面向对象的程序设计中我们经常使用多态特性使得大部分代码尽可能地少了解 ...
- 使用mybatis进行一对多嵌套查询时出错:输出结果:Country{id=2, name='美国', minister=[null]}
即Minister类作为Country类的关联属性. 查询的输出结果是:Country{id=2, name='美国', minister=[null]} <!--mapper.xml内容--& ...
- Jmeter命令行 传递参数
二.参数 -J 和 -G 1.格式:-J变量名=值 -G变量名=值 2.相同之处:设置jmeter属性,例如线程数.循环次数.ramp up-time等 3.不同之处:-J是设置本地jmeter属性 ...
- Jmeter脚本如何在Linux通过no GUI的方式运行 命令行传递参数
本文主要介绍Jmeter脚本如何在Linux通过no GUI的方式运行.总共分三部分: 1.Linux下JDK的安装及环境变量的配置 2.Linux下Jmeter的安装及环境变量的配置 3.运行结果的 ...
- CentOS 7 配置SFTP
目前越来越多的FTP客户端软件开始支持SSH协议上传和下载文件,这种协议方式就是SFTP. SFTP的优势主要有两点,一是不需要再配置个FTP服务端:二是SSH协议是安全传输,上传和下载是经过加密的. ...
- python2和python3中int整型数据的不同之处
python2中的除法,结果为整型数字(int型),例如 10/5=2,10/3=3,小数位向下取整 python3中的除法,结果为浮点型数字(float型)结果小数位最多保留16位小数
- DDCTF 北京地铁
这周打了ddctf,被打成了dd 北京地铁题目给了一张北京地铁图,提示如下:Color Threshold 提示:AES ECB密钥为小写字母提示2:密钥不足位用\0补全提示3:不要光记得隐写不看图片 ...
- cordova插件值 二维码扫描
插件地址 https://github.com/gizwits/cordova-gizwits-scan-qrcode 插件安装方式 cordova plugin add https://github ...
- Java的枚举类型使用方法详解
1.背景在java语言中还没有引入枚举类型之前,表示枚举类型的常用模式是声明一组具有int常量.之前我们通常利用public final static 方法定义的代码如下,分别用1 表示春天,2表示夏 ...