import numpy as np
import operator as op from math import log def createDataSet():
dataSet = [[1, 1, 'yes'],
[1, 1, 'yes'],
[1, 0, 'no'],
[0, 1, 'no'],
[0, 1, 'no']]
labels = ['no surfacing','flippers']
return dataSet, labels dataSet,labels = createDataSet()
print(dataSet)
print(labels) def calcShannonEnt(dataSet):
labelCounts = {}
for featVec in dataSet:
currentLabel = featVec[-1]
if(currentLabel not in labelCounts.keys()):
labelCounts[currentLabel] = 0
labelCounts[currentLabel] += 1
shannonEnt = 0.0
rowNum = len(dataSet)
for key in labelCounts:
prob = float(labelCounts[key])/rowNum
shannonEnt -= prob * log(prob,2)
return shannonEnt shannonEnt = calcShannonEnt(dataSet)
print(shannonEnt) def splitDataSet(dataSet, axis, value):
retDataSet = []
for featVec in dataSet:
if(featVec[axis] == value):
reducedFeatVec = featVec[:axis]
reducedFeatVec.extend(featVec[axis+1:])
retDataSet.append(reducedFeatVec)
return retDataSet retDataSet = splitDataSet(dataSet,1,1)
print(np.array(retDataSet))
retDataSet = splitDataSet(dataSet,1,0)
print(retDataSet) def chooseBestFeatureToSplit(dataSet):
numFeatures = np.shape(dataSet)[1]-1
baseEntropy = calcShannonEnt(dataSet)
bestInfoGain = 0.0
bestFeature = -1
for i in range(numFeatures):
featList = [example[i] for example in dataSet]
uniqueVals = set(featList)
newEntropy = 0.0
for value in uniqueVals:
subDataSet = splitDataSet(dataSet, i, value)
prob = len(subDataSet)/float(len(dataSet))
newEntropy += prob * calcShannonEnt(subDataSet)
infoGain = baseEntropy - newEntropy
if (infoGain > bestInfoGain):
bestInfoGain = infoGain
bestFeature = i
return bestFeature bestFeature = chooseBestFeatureToSplit(dataSet)
print(bestFeature) def majorityCnt(classList):
classCount={}
for vote in classList:
if(vote not in classCount.keys()):
classCount[vote] = 0
classCount[vote] += 1
sortedClassCount = sorted(classCount.items(), key=op.itemgetter(1), reverse=True)
return sortedClassCount[0][0] def createTree(dataSet,labels):
classList = [example[-1] for example in dataSet]
if(classList.count(classList[0]) == len(classList)):
return classList[0]
if len(dataSet[0]) == 1:
return majorityCnt(classList)
bestFeat = chooseBestFeatureToSplit(dataSet)
bestFeatLabel = labels[bestFeat]
myTree = {bestFeatLabel:{}}
del(labels[bestFeat])
featValues = [example[bestFeat] for example in dataSet]
uniqueVals = set(featValues)
for value in uniqueVals:
subLabels = labels[:]
myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
return myTree myTree = createTree(dataSet,labels)
print(myTree) def classify(inputTree,featLabels,testVec):
for i in inputTree.keys():
firstStr = i
break
secondDict = inputTree[firstStr]
featIndex = featLabels.index(firstStr)
key = testVec[featIndex]
valueOfFeat = secondDict[key]
if isinstance(valueOfFeat, dict):
classLabel = classify(valueOfFeat, featLabels, testVec)
else:
classLabel = valueOfFeat
return classLabel featLabels = ['no surfacing', 'flippers']
classLabel = classify(myTree,featLabels,[1,1])
print(classLabel) import pickle def storeTree(inputTree,filename):
fw = open(filename,'wb')
pickle.dump(inputTree,fw)
fw.close() def grabTree(filename):
fr = open(filename,'rb')
return pickle.load(fr) filename = "D:\\mytree.txt"
storeTree(myTree,filename)
mySecTree = grabTree(filename)
print(mySecTree) featLabels = ['no surfacing', 'flippers']
classLabel = classify(mySecTree,featLabels,[0,0])
print(classLabel)

吴裕雄 python 机器学习-DMT(1)的更多相关文章

  1. 吴裕雄 python 机器学习-DMT(2)

    import matplotlib.pyplot as plt decisionNode = dict(boxstyle="sawtooth", fc="0.8" ...

  2. 吴裕雄 python 机器学习——分类决策树模型

    import numpy as np import matplotlib.pyplot as plt from sklearn import datasets from sklearn.model_s ...

  3. 吴裕雄 python 机器学习——回归决策树模型

    import numpy as np import matplotlib.pyplot as plt from sklearn import datasets from sklearn.model_s ...

  4. 吴裕雄 python 机器学习——线性判断分析LinearDiscriminantAnalysis

    import numpy as np import matplotlib.pyplot as plt from matplotlib import cm from mpl_toolkits.mplot ...

  5. 吴裕雄 python 机器学习——逻辑回归

    import numpy as np import matplotlib.pyplot as plt from matplotlib import cm from mpl_toolkits.mplot ...

  6. 吴裕雄 python 机器学习——ElasticNet回归

    import numpy as np import matplotlib.pyplot as plt from matplotlib import cm from mpl_toolkits.mplot ...

  7. 吴裕雄 python 机器学习——Lasso回归

    import numpy as np import matplotlib.pyplot as plt from sklearn import datasets, linear_model from s ...

  8. 吴裕雄 python 机器学习——岭回归

    import numpy as np import matplotlib.pyplot as plt from sklearn import datasets, linear_model from s ...

  9. 吴裕雄 python 机器学习——线性回归模型

    import numpy as np from sklearn import datasets,linear_model from sklearn.model_selection import tra ...

随机推荐

  1. C语言:冒泡排序

    void sort(int arr[],int len) { ; ; i<len; i++) { printf("第%d轮:\n", i); // len-i+1:新轮比上轮 ...

  2. hadoop分布式集群的搭建

    电脑如果是8G内存或者以下建议搭建3节点集群,如果是搭建5节点集群就要增加内存条了.当然实际开发中不会用虚拟机做,一些小公司刚刚起步的时候会采用云服务,因为开始数据量不大. 但随着数据量的增大才会考虑 ...

  3. kafka的API操作

    在集群的接收端 启动producer 在consumer这边能接收到producer发来的数据

  4. How Computers Boot Up.计算机的引导过程

    原文标题:How Computers Boot Up 原文地址:http://duartes.org/gustavo/blog/ [注:本人水平有限,只好挑一些国外高手的精彩文章翻译一下.一来自己复习 ...

  5. solr使用cursorMark做深度分页

    深度分页 深度分页是指给搜索结果指定一个很大的起始位移. 普通分页在给定一个大的起始位移时效率十分低下,例如start=1000000,rows=10的查询,搜索引擎需要找到前1000010条记录然后 ...

  6. 内存大小设置 Java heap space错误

    1. 问题描述 当从数据库中查询大量的数据,每个模板取出来几百万条数据,或者是频繁的刷新项目.模板时就会占用Java虚拟机JVM的大量内存,超过内存就会出现报java.lang.OutOfMemory ...

  7. 网易微专业 UI设计师

    网易云课堂的UI设计师微专业,需要的留言

  8. js 原生图片上传

    <!DOCTYPE html> <html lang="en"> <head> <meta charset="UTF-8&quo ...

  9. Linux中使用python测试主机存活 Linux系统CentOS Linux release 7.3.1611 (Core) py版本Python 2.7.5

    下面是最初的情况 #/usr/bin/env python # -*- coding: utf-8 -*- import os import time import subprocess import ...

  10. centos磁盘挂载|centos虚拟机硬盘不够怎么办?|centos虚拟机硬盘的扩展

    Centos6磁盘挂载 添加一块磁盘 分区,格式化,挂载新磁盘 磁盘挂载 df -lh fdisk -l fdisk /dev/sdb 这个命令执行后依次输 n p 回车 回车 w fdisk -l ...