import numpy as np
import operator as op from math import log def createDataSet():
dataSet = [[1, 1, 'yes'],
[1, 1, 'yes'],
[1, 0, 'no'],
[0, 1, 'no'],
[0, 1, 'no']]
labels = ['no surfacing','flippers']
return dataSet, labels dataSet,labels = createDataSet()
print(dataSet)
print(labels) def calcShannonEnt(dataSet):
labelCounts = {}
for featVec in dataSet:
currentLabel = featVec[-1]
if(currentLabel not in labelCounts.keys()):
labelCounts[currentLabel] = 0
labelCounts[currentLabel] += 1
shannonEnt = 0.0
rowNum = len(dataSet)
for key in labelCounts:
prob = float(labelCounts[key])/rowNum
shannonEnt -= prob * log(prob,2)
return shannonEnt shannonEnt = calcShannonEnt(dataSet)
print(shannonEnt) def splitDataSet(dataSet, axis, value):
retDataSet = []
for featVec in dataSet:
if(featVec[axis] == value):
reducedFeatVec = featVec[:axis]
reducedFeatVec.extend(featVec[axis+1:])
retDataSet.append(reducedFeatVec)
return retDataSet retDataSet = splitDataSet(dataSet,1,1)
print(np.array(retDataSet))
retDataSet = splitDataSet(dataSet,1,0)
print(retDataSet) def chooseBestFeatureToSplit(dataSet):
numFeatures = np.shape(dataSet)[1]-1
baseEntropy = calcShannonEnt(dataSet)
bestInfoGain = 0.0
bestFeature = -1
for i in range(numFeatures):
featList = [example[i] for example in dataSet]
uniqueVals = set(featList)
newEntropy = 0.0
for value in uniqueVals:
subDataSet = splitDataSet(dataSet, i, value)
prob = len(subDataSet)/float(len(dataSet))
newEntropy += prob * calcShannonEnt(subDataSet)
infoGain = baseEntropy - newEntropy
if (infoGain > bestInfoGain):
bestInfoGain = infoGain
bestFeature = i
return bestFeature bestFeature = chooseBestFeatureToSplit(dataSet)
print(bestFeature) def majorityCnt(classList):
classCount={}
for vote in classList:
if(vote not in classCount.keys()):
classCount[vote] = 0
classCount[vote] += 1
sortedClassCount = sorted(classCount.items(), key=op.itemgetter(1), reverse=True)
return sortedClassCount[0][0] def createTree(dataSet,labels):
classList = [example[-1] for example in dataSet]
if(classList.count(classList[0]) == len(classList)):
return classList[0]
if len(dataSet[0]) == 1:
return majorityCnt(classList)
bestFeat = chooseBestFeatureToSplit(dataSet)
bestFeatLabel = labels[bestFeat]
myTree = {bestFeatLabel:{}}
del(labels[bestFeat])
featValues = [example[bestFeat] for example in dataSet]
uniqueVals = set(featValues)
for value in uniqueVals:
subLabels = labels[:]
myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value),subLabels)
return myTree myTree = createTree(dataSet,labels)
print(myTree) def classify(inputTree,featLabels,testVec):
for i in inputTree.keys():
firstStr = i
break
secondDict = inputTree[firstStr]
featIndex = featLabels.index(firstStr)
key = testVec[featIndex]
valueOfFeat = secondDict[key]
if isinstance(valueOfFeat, dict):
classLabel = classify(valueOfFeat, featLabels, testVec)
else:
classLabel = valueOfFeat
return classLabel featLabels = ['no surfacing', 'flippers']
classLabel = classify(myTree,featLabels,[1,1])
print(classLabel) import pickle def storeTree(inputTree,filename):
fw = open(filename,'wb')
pickle.dump(inputTree,fw)
fw.close() def grabTree(filename):
fr = open(filename,'rb')
return pickle.load(fr) filename = "D:\\mytree.txt"
storeTree(myTree,filename)
mySecTree = grabTree(filename)
print(mySecTree) featLabels = ['no surfacing', 'flippers']
classLabel = classify(mySecTree,featLabels,[0,0])
print(classLabel)

吴裕雄 python 机器学习-DMT(1)的更多相关文章

  1. 吴裕雄 python 机器学习-DMT(2)

    import matplotlib.pyplot as plt decisionNode = dict(boxstyle="sawtooth", fc="0.8" ...

  2. 吴裕雄 python 机器学习——分类决策树模型

    import numpy as np import matplotlib.pyplot as plt from sklearn import datasets from sklearn.model_s ...

  3. 吴裕雄 python 机器学习——回归决策树模型

    import numpy as np import matplotlib.pyplot as plt from sklearn import datasets from sklearn.model_s ...

  4. 吴裕雄 python 机器学习——线性判断分析LinearDiscriminantAnalysis

    import numpy as np import matplotlib.pyplot as plt from matplotlib import cm from mpl_toolkits.mplot ...

  5. 吴裕雄 python 机器学习——逻辑回归

    import numpy as np import matplotlib.pyplot as plt from matplotlib import cm from mpl_toolkits.mplot ...

  6. 吴裕雄 python 机器学习——ElasticNet回归

    import numpy as np import matplotlib.pyplot as plt from matplotlib import cm from mpl_toolkits.mplot ...

  7. 吴裕雄 python 机器学习——Lasso回归

    import numpy as np import matplotlib.pyplot as plt from sklearn import datasets, linear_model from s ...

  8. 吴裕雄 python 机器学习——岭回归

    import numpy as np import matplotlib.pyplot as plt from sklearn import datasets, linear_model from s ...

  9. 吴裕雄 python 机器学习——线性回归模型

    import numpy as np from sklearn import datasets,linear_model from sklearn.model_selection import tra ...

随机推荐

  1. HDFS高级功能

    HDFS的六大高级特性: 安全模式 安全模式是HDFS所处的一种特殊状态,在这种状态下,文件系统只接受读数据请求,而不接受删除.修改等变更请求.在NameNode主节点启动时,HDFS首先进入安全模式 ...

  2. JavaScript Best Practices (w3cschool)

    JavaScript Best Practices (w3cschool) Local Variables: ·      总是在前面集中定义变量,(包括 for 的i).(strict mode) ...

  3. Java - 16 Java 方法

    在前面几个章节中我们经常使用到System.out.println(),那么它是什么呢? println()是一个方法(Method),而System是系统类(Class),out是标准输出对象(Ob ...

  4. virt-install详解

    man virt-install VIRT-INSTALL() Virtual Machine Manager VIRT-INSTALL() NAME virt-install - provision ...

  5. ScrollView嵌套RecyclerView、ScrollView嵌套Listview、ScrollView嵌套各种布局,默认不在顶部和回到顶部的解决方法;

    如果: ScrollView.scrollTo(0,0): ScrollView.fullScroll(View.FOCUS_UP) : ScrollView.smoothScrollTo(0, 0) ...

  6. RESTframwork之视图view

    一 在view.py 中: class AuthorView(APIView): def get(self, request): author_list = Author.objects.all() ...

  7. javascript中的未定义和未声明

    我们在项目中,经常会定义一些变量(很多时候,定义过多的全局变量),当我们调用这些变量的时候,就会发生各种各样的突发状况. 看一个示例: var a; typeof a; typeof b; 很简单的一 ...

  8. QT编写TCP入门+简单的实际项目(附源程序)

    我个人感觉学习QT不需要那么深入的了解,因为我就是编写一下界面来实现理想的功能而已,我不是靠这个吃饭,当然以后要是从事这个方向那就好好深入底层好好学了. 学习QT的TCP:第一步:去百度看看TCP的介 ...

  9. 自定义UITableViewCell上的delete按钮

    http://blog.csdn.net/xiaoxuan415315/article/details/7834940

  10. 四、Java web 部 分试题

    1 .Tomcat 的 优 化 经 验 答:去掉对 web.xml 的监视,把 jsp 提前编辑成 Servlet. 有富余物理内存的情况,加大 tomcat 使用的 jvm 的内存 2 .HTTP ...