Decision Trees (DecisionTree): A Plain-Language Explanation and a Simple Application
DecisionTree.py
from math import log

# Compute the Shannon entropy of a dataset. The class label is the last
# column of each row. Entropy measures label impurity:
#     H = -sum_i p_i * log2(p_i)
# where p_i is the fraction of samples in class i; a pure dataset has H = 0,
# and the more mixed the labels, the larger H becomes.
def calcShannonEnt(dataSet):
    numEntries = len(dataSet)
    labelCounts = {}
    for featVec in dataSet:
        currentLabel = featVec[-1]
        if currentLabel not in labelCounts.keys():
            labelCounts[currentLabel] = 0
        labelCounts[currentLabel] += 1
    shannonEnt = 0.0
    for key in labelCounts:
        prob = float(labelCounts[key]) / numEntries
        shannonEnt -= prob * log(prob, 2)
    return shannonEnt
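# Quick check (added here for illustration, not in the original listing): the
# toy dataset below has 2 'yes' and 3 'no' labels, so
#     H = -(2/5)*log2(2/5) - (3/5)*log2(3/5) ≈ 0.971 bits.
# >>> calcShannonEnt([[1,1,'yes'],[1,1,'yes'],[1,0,'no'],[0,1,'no'],[0,1,'no']])
# 0.9709505944546686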
# Return the rows of dataSet whose feature at index `axis` equals `value`,
# with that feature column removed from each returned row.
def splitDataSet(dataSet, axis, value):
    retDataset = []
    for featVec in dataSet:
        if featVec[axis] == value:
            reducedFeatVec = featVec[:axis]
            reducedFeatVec.extend(featVec[axis+1:])
            retDataset.append(reducedFeatVec)
    return retDataset
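# Example (added for illustration): splitting the toy dataset on feature 0
# with value 1 keeps the three rows whose first column is 1 and drops that column.
# >>> splitDataSet([[1,1,'yes'],[1,1,'yes'],[1,0,'no'],[0,1,'no'],[0,1,'no']], 0, 1)
# [[1, 'yes'], [1, 'yes'], [0, 'no']]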
# Choose the feature with the largest information gain, i.e. the split that
# most reduces the weighted entropy of the resulting subsets.
def chooseBestFeatureToSplit(dataSet):
    numFeatures = len(dataSet[0]) - 1   # the last column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestInfoGain = 0.0
    bestFeature = -1
    for i in range(numFeatures):
        featList = [example[i] for example in dataSet]
        uniqueVals = set(featList)
        newEntropy = 0.0
        for value in uniqueVals:
            subDataSet = splitDataSet(dataSet, i, value)
            prob = len(subDataSet) / float(len(dataSet))
            newEntropy += prob * calcShannonEnt(subDataSet)
        infoGain = baseEntropy - newEntropy
        if infoGain > bestInfoGain:
            bestInfoGain = infoGain
            bestFeature = i
    return bestFeature
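# Example (added for illustration): on the toy dataset the gain for feature 0
# ('no surfacing') is about 0.420 bits versus about 0.171 for feature 1, so
# feature 0 is chosen.
# >>> chooseBestFeatureToSplit([[1,1,'yes'],[1,1,'yes'],[1,0,'no'],[0,1,'no'],[0,1,'no']])
# 0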
# Majority vote: return the most frequent class label in classList.
def majorityCnt(classList):
    classCount = {}
    for vote in classList:
        if vote not in classCount.keys():
            classCount[vote] = 0
        classCount[vote] += 1
    sortedClassCount = sorted(classCount.items(), key=lambda d: d[1], reverse=True)
    return sortedClassCount[0][0]
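# Example (added for illustration): with two 'no' votes against one 'yes',
# the majority label wins.
# >>> majorityCnt(['yes', 'no', 'no'])
# 'no'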
# Recursively build the tree as nested dicts of the form
# {feature label: {feature value: subtree-or-class-label}}.
def createTree(dataSet, labels):
    classList = [example[-1] for example in dataSet]
    #print(len(classList))
    if classList.count(classList[0]) == len(classList):
        return classList[0]             # all samples agree: leaf node
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)   # no features left: majority vote
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    print(bestFeat)
    print(bestFeatLabel)
    myTree = {bestFeatLabel: {}}        # the chosen feature becomes the root key
    tmplabels = labels[:]               # copy, so the caller's labels list is untouched
    del(tmplabels[bestFeat])
    featValues = [example[bestFeat] for example in dataSet]
    uniqueVals = set(featValues)
    print(uniqueVals)
    for value in uniqueVals:
        subLabels = tmplabels[:]
        # grow one subtree per value taken by the best feature
        myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree
# Classify testVec by walking the tree: at each node, look up the test
# vector's value for that node's feature and descend until a leaf is reached.
def classify(inputTree, featLabels, testVec):
    firstStr = list(inputTree.keys())[0]
    secondDict = inputTree[firstStr]
    featIndex = featLabels.index(firstStr)
    for key in secondDict.keys():
        if testVec[featIndex] == key:
            if type(secondDict[key]).__name__ == 'dict':
                classLabel = classify(secondDict[key], featLabels, testVec)
            else:
                classLabel = secondDict[key]
    return classLabel
def creatDataSet():
    dataSet = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no']
    ]
    labels = ['no surfacing', 'flippers']
    return dataSet, labels
myData, labels = creatDataSet()
print("1",labels)
myTree = createTree(myData, labels)
print("2",labels)
print(myTree) #result = {'no surfacing': {0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}}
test1 = classify(myTree,labels,[1,0])
test2 = classify(myTree,labels,[1,1])
print("test1: ",test1) # test1: no
print("test2: ",test2) # test2: yes
A simple application that yields a classification
import ch2.DecisionTree as dTree
fr = open("lenses.txt")
lenses = [inst.strip().split('\t') for inst in fr.readlines()]
lensesLabels = ['age', 'prescript', 'astigmatic', 'tearRate']
lensesTree = dTree.createTree(lenses,lensesLabels)
print(lensesTree)
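With the lenses tree in hand, the same classify function can label a new case. The snippet below is a sketch, not part of the original post: the feature vector is a hypothetical patient whose values must come from the corresponding columns of lenses.txt, in the order given by lensesLabels (which is still intact after training, since this createTree copies the list internally).

result = dTree.classify(lensesTree, lensesLabels, ['young', 'myope', 'no', 'normal'])
print(result)   # prints the recommended lens type for this hypothetical case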
Once the decision tree has been obtained, it can be serialized and read back into memory whenever it is needed. In other words, no retraining is required, which saves time.
# Persist the tree to disk with pickle and load it back later.
def storeTree(inputTree, filename):
    import pickle
    fw = open(filename, 'wb')   # pickle writes bytes, so open in binary mode
    pickle.dump(inputTree, fw)
    fw.close()

def grabTree(filename):
    import pickle
    fr = open(filename, 'rb')   # read back in binary mode as well
    return pickle.load(fr)
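A minimal round trip with these helpers might look like the following; the filename is arbitrary and chosen here only for illustration.

storeTree(myTree, 'classifierStorage.pkl')
print(grabTree('classifierStorage.pkl'))   # same nested-dict tree as before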
test.py: a few Python exercises and points noted while studying.
# Assorted Python notes: matplotlib annotations, list methods, and dict views.
import matplotlib.pyplot as plt
'''
# Drawing decision-tree nodes with matplotlib annotations.
decisionNode = dict(boxstyle="sawtooth", fc="0.8")
leafNode = dict(boxstyle="round4", fc="0.8")
arrow_args = dict(arrowstyle="<-")

def plotNode(nodeTxt, centerPt, parentPt, nodeType):
    createPlot.ax1.annotate(nodeTxt, xy=parentPt,
                            xycoords='axes fraction',
                            xytext=centerPt, textcoords='axes fraction',
                            va="center", ha="center", bbox=nodeType, arrowprops=arrow_args)

def createPlot():
    fig = plt.figure(1, facecolor='white')
    fig.clf()
    createPlot.ax1 = plt.subplot(111, frameon=False)
    plotNode('a decision node', (0.5, 0.1), (0.1, 0.5), decisionNode)
    plotNode('a leaf node', (0.8, 0.1), (0.3, 0.8), leafNode)
    plt.show()

createPlot()
'''
'''
# The difference between list.extend and list.append.
a = [1, 2, 3]
b = [3, 4, 5]
a.append(b)
print(a)
# append nests b as a single element: [1, 2, 3, [3, 4, 5]], so len(a) == 4
c = [1, 2, 3]
d = [3, 4, 5]
c.extend(d)
print(c)
# extend splices b's elements in: [1, 2, 3, 3, 4, 5], so len(c) == 6
'''
'''
# The dict_keys view type.
# In Python 3, dict.keys(), values() and items() return view objects rather
# than lists, so wrap them in list() to index, e.g. to grab the first key.
d = {'a': {'d': 2}, 'b': 1, 'c': {}}
print(list(d.keys()))
print(list(d.keys())[0])
'''