吴裕雄 python 机器学习-DMT（1）

import numpy as np

import operator as op

from math import log

def createDataSet():
    """Return the toy sample set together with its feature names.

    Each sample is a list of two binary feature values followed by a class
    label ('yes' or 'no'). The second return value names the two feature
    columns in order. New list objects are built on every call.
    """
    rawRows = (
        (1, 1, 'yes'),
        (1, 1, 'yes'),
        (1, 0, 'no'),
        (0, 1, 'no'),
        (0, 1, 'no'),
    )
    featureNames = ['no surfacing', 'flippers']
    # Rows are handed out as lists so callers receive mutable samples.
    return [list(row) for row in rawRows], featureNames

# Build the toy data set and print its rows and feature names.
dataSet,labels = createDataSet()

print(dataSet)

print(labels)

def calcShannonEnt(dataSet):
    """Compute the base-2 Shannon entropy of the class labels in ``dataSet``.

    The class label of a row is its last element. An empty ``dataSet``
    yields 0.0.
    """
    tally = {}
    for row in dataSet:
        label = row[-1]
        tally[label] = tally.get(label, 0) + 1

    total = len(dataSet)
    entropy = 0.0
    # Accumulate term by term, in first-seen label order.
    for count in tally.values():
        p = float(count) / total
        entropy -= p * log(p, 2)
    return entropy

# Entropy of the class labels over the whole toy data set.
shannonEnt = calcShannonEnt(dataSet)

print(shannonEnt)

def splitDataSet(dataSet, axis, value):
    """Select the rows whose column ``axis`` equals ``value``, minus that column.

    Returns a new list of new row lists; the rows of ``dataSet`` themselves
    are left untouched.
    """
    matching = (row for row in dataSet if row[axis] == value)
    subset = []
    for row in matching:
        # Copy everything before the split column, then append what follows it.
        trimmed = row[:axis]
        trimmed.extend(row[axis + 1:])
        subset.append(trimmed)
    return subset

# Rows whose feature at index 1 equals 1, with that column removed
# (wrapped in a NumPy array only for printing).
retDataSet = splitDataSet(dataSet,1,1)

print(np.array(retDataSet))

# Same split for the value 0, printed as a plain list.
retDataSet = splitDataSet(dataSet,1,0)

print(retDataSet)

def chooseBestFeatureToSplit(dataSet):
    """Pick the feature column whose split yields the largest information gain.

    Every column except the last (the class label) is a candidate. For each
    candidate, the data set is partitioned by the column's distinct values and
    the size-weighted entropy of the partitions is subtracted from the entropy
    of the whole set.

    Returns the index of the winning column, or -1 when no column produces a
    strictly positive gain.
    """
    candidateCount = np.shape(dataSet)[1] - 1
    entropyBefore = calcShannonEnt(dataSet)
    winner, winnerGain = -1, 0.0

    for column in range(candidateCount):
        entropyAfter = 0.0
        for value in set(row[column] for row in dataSet):
            part = splitDataSet(dataSet, column, value)
            weight = len(part) / float(len(dataSet))
            entropyAfter += weight * calcShannonEnt(part)

        gain = entropyBefore - entropyAfter
        # Strict comparison: on equal gain the earlier column stays the winner.
        if gain > winnerGain:
            winner, winnerGain = column, gain

    return winner

# Index of the feature column with the highest information gain (-1 if none).
bestFeature = chooseBestFeatureToSplit(dataSet)

print(bestFeature)

def majorityCnt(classList):
    """Return the most frequent class label in ``classList``.

    When several labels share the top count, the one that appears first in
    ``classList`` wins.
    """
    tally = {}
    for label in classList:
        tally[label] = tally.get(label, 0) + 1

    ranked = list(tally.items())
    # The sort is stable even with reverse=True, so ties keep first-seen order.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    topLabel, _ = ranked[0]
    return topLabel

def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    Args:
        dataSet: non-empty list of rows; each row holds feature values
            followed by the class label as its last element.
        labels: feature names, one per feature column, in column order.
            The list is read only; it is never modified.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.

    Raises:
        IndexError: if ``dataSet`` is empty.
    """
    classList = [example[-1] for example in dataSet]

    # Every row has the same class: this node is a pure leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]

    # Only the class column is left: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataSet)
    # A negative index means no feature gives a positive information gain.
    # Using it as a column index would split on the class-label column, so
    # stop here and take the majority class instead.
    if bestFeat < 0:
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    # Build the reduced name list as a new object so the caller's list
    # keeps all of its entries.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]

    branches = {}
    for value in set(example[bestFeat] for example in dataSet):
        branches[value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels
        )
    return {bestFeatLabel: branches}

# Build the decision tree from the toy data and print the nested dict.
myTree = createTree(dataSet,labels)

print(myTree)

def classify(inputTree, featLabels, testVec):
    """Classify ``testVec`` by walking the decision tree ``inputTree``.

    Args:
        inputTree: nested dict of the form
            ``{feature_name: {feature_value: subtree_or_label}}``, or a bare
            class label when the tree is a single leaf.
        featLabels: feature names; the position of a name in this list is the
            position of that feature's value in ``testVec``.
        testVec: feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that ``testVec`` reaches.

    Raises:
        ValueError: if a tree node is an empty dict, or if a feature name in
            the tree is missing from ``featLabels``.
        KeyError: if the tree has no branch for a feature value in ``testVec``.
    """
    node = inputTree
    # Anything that is not a dict is a leaf, including a tree that consists
    # of a single class label.
    while isinstance(node, dict):
        if not node:
            raise ValueError("decision tree node has no split feature")
        # Each internal node has exactly one key: the feature it splits on.
        featLabel = next(iter(node))
        featIndex = featLabels.index(featLabel)
        node = node[featLabel][testVec[featIndex]]
    return node

# Feature names in the same order as the values of the test vector;
# classify uses this list to find each feature's position.
featLabels = ['no surfacing', 'flippers']

classLabel = classify(myTree,featLabels,[1,1])

print(classLabel)

import pickle

def storeTree(inputTree, filename):
    """Serialize ``inputTree`` to the file ``filename`` with pickle.

    The file is opened in binary write mode, so an existing file is
    overwritten. The handle is closed even if pickling fails.
    """
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)

def grabTree(filename):
    """Load and return the object pickled in the file ``filename``.

    The file is opened in binary read mode and closed before returning.
    """
    # NOTE(security): unpickling can execute arbitrary code, so only load
    # files that come from a trusted source.
    with open(filename, 'rb') as fr:
        return pickle.load(fr)

# Round-trip the tree through a pickle file, then classify a sample with the
# reloaded copy.
# NOTE(review): the path is hard-coded to a Windows drive letter (D:) —
# confirm it suits the machine this script runs on.
filename = "D:\\mytree.txt"

storeTree(myTree,filename)

mySecTree = grabTree(filename)

print(mySecTree)

# Feature names in the same order as the values of the test vector.
featLabels = ['no surfacing', 'flippers']

classLabel = classify(mySecTree,featLabels,[0,0])

print(classLabel)

吴裕雄 python 机器学习-DMT（1）的更多相关文章

吴裕雄 python 机器学习-DMT（2）
import matplotlib.pyplot as plt decisionNode = dict(boxstyle="sawtooth", fc="0.8" ...
吴裕雄 python 机器学习——分类决策树模型
import numpy as np import matplotlib.pyplot as plt from sklearn import datasets from sklearn.model_s ...
吴裕雄 python 机器学习——回归决策树模型
import numpy as np import matplotlib.pyplot as plt from sklearn import datasets from sklearn.model_s ...
吴裕雄 python 机器学习——线性判断分析LinearDiscriminantAnalysis
import numpy as np import matplotlib.pyplot as plt from matplotlib import cm from mpl_toolkits.mplot ...
吴裕雄 python 机器学习——逻辑回归
import numpy as np import matplotlib.pyplot as plt from matplotlib import cm from mpl_toolkits.mplot ...
吴裕雄 python 机器学习——ElasticNet回归
import numpy as np import matplotlib.pyplot as plt from matplotlib import cm from mpl_toolkits.mplot ...
吴裕雄 python 机器学习——Lasso回归
import numpy as np import matplotlib.pyplot as plt from sklearn import datasets, linear_model from s ...
吴裕雄 python 机器学习——岭回归
import numpy as np import matplotlib.pyplot as plt from sklearn import datasets, linear_model from s ...
吴裕雄 python 机器学习——线性回归模型
import numpy as np from sklearn import datasets,linear_model from sklearn.model_selection import tra ...

随机推荐

C语言：冒泡排序
void sort(int arr[],int len) { ; ; i<len; i++) { printf("第%d轮:\n", i); // len-i+1:新轮比上轮 ...
hadoop分布式集群的搭建
电脑如果是8G内存或者以下建议搭建3节点集群,如果是搭建5节点集群就要增加内存条了.当然实际开发中不会用虚拟机做,一些小公司刚刚起步的时候会采用云服务,因为开始数据量不大. 但随着数据量的增大才会考虑 ...
kafka的API操作
在集群的接收端启动producer 在consumer这边能接收到producer发来的数据
How Computers Boot Up.计算机的引导过程
原文标题:How Computers Boot Up 原文地址:http://duartes.org/gustavo/blog/ [注:本人水平有限,只好挑一些国外高手的精彩文章翻译一下.一来自己复习 ...
solr使用cursorMark做深度分页
深度分页深度分页是指给搜索结果指定一个很大的起始位移. 普通分页在给定一个大的起始位移时效率十分低下,例如start=1000000,rows=10的查询,搜索引擎需要找到前1000010条记录然后 ...
内存大小设置 Java heap space错误
1. 问题描述当从数据库中查询大量的数据,每个模板取出来几百万条数据,或者是频繁的刷新项目.模板时就会占用Java虚拟机JVM的大量内存,超过内存就会出现报java.lang.OutOfMemory ...
网易微专业 UI设计师
网易云课堂的UI设计师微专业,需要的留言
js 原生图片上传
<!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8&quo ...
Linux中使用python测试主机存活 Linux系统CentOS Linux release 7.3.1611 (Core) py版本Python 2.7.5
下面是最初的情况 #/usr/bin/env python # -*- coding: utf-8 -*- import os import time import subprocess import ...
centos磁盘挂载|centos虚拟机硬盘不够怎么办?|centos虚拟机硬盘的扩展
Centos6磁盘挂载添加一块磁盘分区,格式化,挂载新磁盘磁盘挂载 df -lh fdisk -l fdisk /dev/sdb 这个命令执行后依次输 n p 回车回车 w fdisk -l ...

吴裕雄 python 机器学习-DMT（1）

吴裕雄 python 机器学习-DMT（1）的更多相关文章

随机推荐

热门专题