Machine Learning Techniques Notes: Homework #7 Decision Tree & Random Forest Exercises
Original post: https://www.jianshu.com/p/7ff6fd6fc99f
Problem Description
Implementation
Problems 13-15: grow a C&RT decision tree (decision-stump branching, Gini impurity) on hw7_train.dat without pruning, plot it (13), and report its Ein (14) and Eout (15).
# coding:utf-8
# decision_tree.py
import numpy as np

def ReadData(dataFile):
    with open(dataFile, 'r') as f:
        lines = f.readlines()
    data_list = []
    for line in lines:
        line = line.strip().split()
        data_list.append([float(l) for l in line])
    dataArray = np.array(data_list)
    return dataArray
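# Equivalent shortcut (assuming, as ReadData does, that the .dat files are
# plain whitespace-separated numeric columns):
#   dataArray = np.loadtxt(dataFile)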
def sign(n):
    # note: ties (n == 0) are mapped to +1
    if n >= 0:
        return 1
    else:
        return -1
def GetSortedArray(dataArray, i):
    # sort the rows of dataArray in ascending order of column i
    data_list = dataArray.tolist()
    sorted_data_list = sorted(data_list, key=lambda x: x[i], reverse=False)
    sortedDataArray = np.array(sorted_data_list)
    return sortedDataArray
def GetSplitData(pred, dataArray):
    assert pred.shape[0] == dataArray.shape[0], "wrong shape of prediction!"
    falseData = []
    trueData = []
    for n in range(pred.shape[0]):
        if pred[n] == -1:
            falseData.append(dataArray[n, :])
        elif pred[n] == 1:
            trueData.append(dataArray[n, :])
        else:
            print("wrong prediction!")
    return np.array(falseData), np.array(trueData)
def GetWeightedImpurity(pred, dataY):
    # Gini impurity of the two branches induced by pred, weighted by branch size
    num_data = dataY.shape[0]
    num_false = (pred == -1).sum()
    num_true = (pred == 1).sum()
    assert num_false + num_true == num_data, "wrong prediction!"
    if num_false == 0:
        falseGini = 0
    else:
        falseFalse = ((pred + dataY) == -2).sum()  # -1 branch, label -1
        falseTrue = num_false - falseFalse
        falseGini = 1 - (falseFalse ** 2 + falseTrue ** 2) / num_false ** 2
    if num_true == 0:
        trueGini = 0
    else:
        trueTrue = ((pred + dataY) == 2).sum()  # +1 branch, label +1
        trueFalse = num_true - trueTrue
        trueGini = 1 - (trueFalse ** 2 + trueTrue ** 2) / num_true ** 2
    return (num_false * falseGini + num_true * trueGini) / num_data
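# Worked example (illustrative values, not homework data): with
# pred = [-1, -1, 1, 1] and dataY = [-1, 1, 1, 1], the -1 branch holds one -1
# and one +1, so its Gini index is 1 - (1/2)**2 - (1/2)**2 = 0.5, while the
# +1 branch holds two +1s, so its Gini index is 0; the weighted impurity is
# therefore (2 * 0.5 + 2 * 0) / 4 = 0.25.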
def decision_stump(dataArray):
    num_data = dataArray.shape[0]
    num_dim = dataArray.shape[1] - 1
    min_e = np.inf
    min_s = np.inf
    min_d = np.inf
    min_theta = np.inf
    min_pred = np.zeros((num_data,))
    for d in range(num_dim):
        sortedDataArray = GetSortedArray(dataArray, d)
        d_min_e = np.inf
        d_min_s = np.inf
        d_min_theta = np.inf
        d_min_pred = np.zeros((num_data,))
        for s in [-1.0, 1.0]:
            for i in range(num_data):
                if i == 0:
                    theta = -np.inf
                    pred = s * np.ones((num_data,))
                else:
                    if sortedDataArray[i - 1][d] == sortedDataArray[i][d]:
                        continue
                    theta = (sortedDataArray[i - 1][d] + sortedDataArray[i][d]) / 2
                    pred = np.zeros((num_data,))
                    for n in range(num_data):
                        pred[n] = s * sign(dataArray[n, d] - theta)
                d_now_e = GetWeightedImpurity(pred, dataArray[:, -1])
                if d_now_e < d_min_e:
                    d_min_e = d_now_e
                    d_min_s = s
                    d_min_theta = theta
                    d_min_pred = pred
        if d_min_e < min_e:
            min_e = d_min_e
            min_s = d_min_s
            min_d = d
            min_theta = d_min_theta
            min_pred = d_min_pred
    return min_s, min_d, min_theta, min_pred
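# The stump above implements the C&RT branching criterion b(x) = s * sign(x_d - theta):
# every midpoint between consecutive distinct sorted values of every dimension d is
# tried (theta = -inf sends all points to one side), and the (s, d, theta) giving
# the smallest Gini-weighted impurity over the two children is returned.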
paraDict = {}  # node id -> [s, d, theta]; module-level, so every tree built in this process shares it

def decision_tree(id, dataArray, prune=False):
    num_data = dataArray.shape[0]
    num_dim = dataArray.shape[1] - 1
    dataX = dataArray[:, :-1]
    dataY = dataArray[:, -1]
    if dataY.min() == dataY.max():  # all labels identical: return a leaf
        return {id: dataY[0]}
    tmpX = np.concatenate([dataX[0, :].reshape((1, num_dim))] * num_data, axis=0)
    if ((dataX - tmpX) == 0).all():  # all inputs identical: cannot split, return a majority-vote leaf
        return {id: sign(np.sum(dataY))}
    s, d, theta, pred = decision_stump(dataArray)
    paraDict[id] = [s, d, theta]
    falseArray, trueArray = GetSplitData(pred, dataArray)
    if prune:
        # pruned tree (problems 19-20): one stump whose children are majority-vote leaves
        return {id: {-1: {id * 2: sign(falseArray[:, -1].sum())},
                     1: {id * 2 + 1: sign(trueArray[:, -1].sum())}}}
    falseTree = decision_tree(id * 2, falseArray)
    trueTree = decision_tree(id * 2 + 1, trueArray)
    return {id: {-1: falseTree, 1: trueTree}}
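# Tree encoding: an internal node is {id: {-1: false_subtree, 1: true_subtree}},
# a leaf is {id: label}, and ids follow the heap convention (children of node id
# are 2*id and 2*id + 1), which predict() and the plotting code below rely on.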
def GetZeroOneError(pred, dataY):
    return (pred != dataY).sum() / dataY.shape[0]
def predict(treeDict, dataX, para=None):
    # para: the {node id: [s, d, theta]} mapping for the tree being evaluated;
    # defaults to the global paraDict, which is only valid for the most
    # recently built tree (random_forest.py passes per-tree snapshots instead)
    if para is None:
        para = paraDict
    num_data = dataX.shape[0]
    pred = np.zeros((num_data,))
    for n in range(num_data):
        x = dataX[n, :]
        id = 1
        tmp_dict = treeDict
        while True:
            tmp_dict = tmp_dict[id]
            if not isinstance(tmp_dict, dict):
                break  # reached a leaf: tmp_dict is the label
            paraList = para[id]
            tmp_res = paraList[0] * sign(x[paraList[1]] - paraList[2])
            tmp_dict = tmp_dict[tmp_res]
            id = list(tmp_dict.keys())[0]
        pred[n] = tmp_dict
    return pred
def getNumLeafs(myTree):
    numLeafs = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    if isinstance(secondDict, dict):
        numLeafs += getNumLeafs(secondDict[-1])
        numLeafs += getNumLeafs(secondDict[1])
    else:
        numLeafs += 1
    return numLeafs

def getTreeDepth(myTree):
    maxDepth = 0
    firstStr = list(myTree.keys())[0]
    secondDict = myTree[firstStr]
    if isinstance(secondDict, dict):
        thisDepth = 1 + max(getTreeDepth(secondDict[-1]), getTreeDepth(secondDict[1]))
    else:
        thisDepth = 1
    if thisDepth > maxDepth:
        maxDepth = thisDepth
    return maxDepth
import matplotlib.pyplot as plt

decisionNode = dict(boxstyle="round", fc="0.8", pad=0.8)
leafNode = dict(boxstyle="circle", fc="0.8", pad=0.1)
arrow_args = dict(arrowstyle="<-")

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt, xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

def plotMidText(centerPt, parentPt, txtString):
    xMid = (parentPt[0] - centerPt[0]) / 2.0 + centerPt[0]
    yMid = (parentPt[1] - centerPt[1]) / 2.0 + centerPt[1]
    createPlot.ax1.text(xMid, yMid, txtString, va="center", ha="center", rotation=30)

def plotTree(myTree, centerPt, parentPt, nodeTxt):
    firstStr = list(myTree.keys())[0]
    if firstStr == 1:
        createPlot.ax1.annotate(str(1), xy=parentPt, xycoords='axes fraction',
                                va="center", ha="center", bbox=decisionNode)
    elif firstStr in paraDict:
        plotNode(str(firstStr), centerPt, parentPt, decisionNode)
        plotMidText(centerPt, parentPt, nodeTxt)
    else:
        plotNode(str(myTree[firstStr]), centerPt, parentPt, leafNode)
        plotMidText(centerPt, parentPt, nodeTxt)
        return
    secondDict = myTree[firstStr]
    if isinstance(secondDict, dict):
        for key in secondDict.keys():
            plotTree(secondDict[key],
                     (centerPt[0] + key * plotTree.xDict[firstStr], centerPt[1] - 1.0 / plotTree.totalD),
                     centerPt, str(key))

def createPlot(inTree, savename="13.png"):
    fig = plt.figure(1, facecolor='white', figsize=(20, 10))
    fig.clf()
    axprops = dict(xticks=[], yticks=[])
    createPlot.ax1 = plt.subplot(111, frameon=False, **axprops)
    plotTree.totalW = float(getNumLeafs(inTree))
    plotTree.totalD = float(getTreeDepth(inTree))
    plotTree.xDict = {}
    plotTree.xDict[1] = 4 * 1.0 / plotTree.totalW
    for i in range(2, int(plotTree.totalD) + 1):
        for j in range(2 ** (i - 1), 2 ** i):
            plotTree.xDict[j] = plotTree.xDict[2 ** (i - 2)] / 1.8
    plotTree(inTree, (0.43, 1.0), (0.43, 1.0), '')
    plt.savefig(savename)
if __name__ == "__main__":
    dataArray = ReadData("hw7_train.dat")
    treeDict = decision_tree(1, dataArray)
    print(treeDict)
    # 13
    createPlot(treeDict)
    # 14
    pred = predict(treeDict, dataArray[:, :-1])
    ein = GetZeroOneError(pred, dataArray[:, -1])
    print("the Ein of the tree:", ein)
    # 15
    testArray = ReadData("hw7_test.dat")
    pred = predict(treeDict, testArray[:, :-1])
    eout = GetZeroOneError(pred, testArray[:, -1])
    print("the Eout of the tree:", eout)
Problems 16-20: build a random forest of 3000 bagged C&RT trees; plot Ein(gt) for each tree (16), then Ein(Gt) (17) and Eout(Gt) (18) as trees accumulate, and repeat with each tree pruned to a single branch (19, 20).
# coding: utf-8
# random_forest.py
from decision_tree import *

def bagging(N, dataArray):
    # bootstrap sample: draw N rows with replacement
    bagDataArray = []
    for n in range(N):
        id = np.random.randint(low=0, high=dataArray.shape[0])
        bagDataArray.append(dataArray[id, :])
    return np.array(bagDataArray)
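# Equivalent vectorized bootstrap (a sketch using NumPy fancy indexing; the
# same with-replacement sampling, just without the Python loop):
#   idx = np.random.randint(0, dataArray.shape[0], size=N)
#   return dataArray[idx, :]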
def random_forest(dataArray, iterations, prune=False):
    num_data = dataArray.shape[0]
    g_list = []
    para_list = []  # per-tree snapshot of paraDict
    ein_g_list = []
    ein_G_list = []
    pred_G = np.zeros((num_data,))
    for t in range(iterations):
        print(t + 1)
        bagDataArray = bagging(num_data, dataArray)
        # paraDict is a single global keyed by node id, so each new tree would
        # overwrite the parameters of the previous one; clear it and snapshot
        # it per tree so that every g_t can still be evaluated after training
        paraDict.clear()
        treeDict = decision_tree(1, bagDataArray, prune)
        para_list.append(dict(paraDict))
        pred_g = predict(treeDict, dataArray[:, :-1])
        pred_G += pred_g
        g_list.append(treeDict)
        ein_g_list.append(GetZeroOneError(pred_g, dataArray[:, -1]))
        tmpG = np.where(pred_G >= 0, 1.0, -1.0)  # like sign(): ties map to +1
        ein_G_list.append(GetZeroOneError(tmpG, dataArray[:, -1]))
    return g_list, para_list, ein_g_list, ein_G_list
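# Aggregation is uniform voting: G_t(x) = sign(g_1(x) + ... + g_t(x)), so
# ein_G_list[t] traces the in-sample error of the forest after t + 1 trees.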
def plot_line_chart(X=np.arange(0, 3000, 1).tolist(), Y=np.arange(0, 3000, 1).tolist(),
                    nameX="t", nameY="Ein(gt)", saveName="16.png"):
    plt.figure(figsize=(30, 12))
    plt.plot(X, Y, 'b')
    plt.plot(X, Y, 'ro')
    plt.xlim((X[0] - 1, X[-1] + 1))
    for (x, y) in zip(X, Y):
        if x % 100 == 0:
            plt.text(x + 0.1, y, str(round(y, 4)))
    plt.xlabel(nameX)
    plt.ylabel(nameY)
    plt.title(nameY + " versus " + nameX)
    plt.savefig(saveName)
def plot_bar_chart(X=np.arange(0, 3000, 1).tolist(), Y=np.arange(0, 3000, 1).tolist(),
                   nameX="t", nameY="Ein(gt)", saveName="16.png"):
    plt.figure(figsize=(30, 12))
    # matplotlib >= 2.0 takes the bar positions as the first positional
    # argument (the old `left=` keyword was removed)
    plt.bar(X, height=Y, width=1, align="center", yerr=0.000001)
    for (c, w) in zip(X, Y):
        if c % 100 == 0:
            plt.text(c, w * 1.03, str(round(w, 4)))
    plt.xlabel(nameX)
    plt.ylabel(nameY)
    plt.xlim(X[0] - 1, X[-1] + 1)
    plt.ylim(0, 1)
    plt.title(nameY + " versus " + nameX)
    plt.savefig(saveName)
if __name__ == "__main__":
    dataArray = ReadData("hw7_train.dat")
    g_list, para_list, ein_g_list, ein_G_list = random_forest(dataArray, 3000)
    # 16
    plot_bar_chart(Y=ein_g_list)
    # 17
    plot_line_chart(Y=ein_G_list, nameY="Ein(Gt)", saveName="17.png")
    testArray = ReadData("hw7_test.dat")
    num_test = testArray.shape[0]
    pred_G = np.zeros((num_test,))
    eout_G_list = []
    for t in range(3000):
        print(t + 1)
        # pass each tree's own parameter snapshot: the global paraDict now
        # only holds the last tree's parameters
        pred_g = predict(g_list[t], testArray[:, :-1], para_list[t])
        pred_G += pred_g
        tmpG = np.where(pred_G >= 0, 1.0, -1.0)
        eout_G_list.append(GetZeroOneError(tmpG, testArray[:, -1]))
    # 18
    plot_line_chart(Y=eout_G_list, nameY="Eout(Gt)", saveName="18.png")
    g_list, para_list, ein_g_list, ein_G_list = random_forest(dataArray, 3000, True)
    # 19
    plot_line_chart(Y=ein_G_list, nameY="Ein(Gt)", saveName="19.png")
    pred_G = np.zeros((num_test,))
    eout_G_list = []
    for t in range(3000):
        print(t + 1)
        pred_g = predict(g_list[t], testArray[:, :-1], para_list[t])
        pred_G += pred_g
        tmpG = np.where(pred_G >= 0, 1.0, -1.0)
        eout_G_list.append(GetZeroOneError(tmpG, testArray[:, -1]))
    # 20
    plot_line_chart(Y=eout_G_list, nameY="Eout(Gt)", saveName="20.png")
Results