機械学習実戦_Python3.7_kNNアルゴリズム

11259 ワード

Python 3.7 で実行します。コードは書籍のものとほぼ同じですが、Python 2 の廃止された文法を修正し、詳しい注釈を付けています。
kNN アルゴリズムの原理は非常に単純なので説明は省きます。主な欠点は実行速度が遅いことです。
ここで使う方法はすべて行列(配列)に対する全件走査であり、高度なデータ構造は使っていないため、実行効率は低いものの実装は簡単です。
Python と NumPy の基本構文(リスト、行列操作)と基本的なファイル操作を重点的に学びます。
基本アルゴリズム
from numpy import *
# Provides array, shape and tile, plus element-wise ** on arrays
import operator
# Provides itemgetter, used as the sort key when ranking the votes

def createDataSet():
    """Return a tiny hand-made training set for experimenting with kNN.

    Returns:
        A ``(group, labels)`` pair: ``group`` is a 4x2 array with one sample
        per row, and ``labels`` is the list of class names for those rows,
        in the same order.
    """
    labels = ['A', 'A', 'B', 'B']
    group = array([
        [1.0, 1.1],
        [1.0, 1.0],
        [0, 0],
        [0, 0.1],
    ])
    return group, labels

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest training rows.

    Args:
        inX: the query sample (a sequence or array of feature values).
        dataSet: NumPy array of training samples, one sample per row.
        labels: labels of the training rows, indexable by row number.
        k: how many nearest neighbours take part in the vote.

    Returns:
        The label with the most votes. On a tie, the label that entered the
        vote first (i.e. that of the closer neighbour) wins, because the sort
        is stable and dicts keep insertion order.

    Raises:
        IndexError: if ``k`` exceeds the number of training rows, or if ``k``
            is not positive (there is then no vote to return).
    """
    n_rows = dataSet.shape[0]
    # Repeat the query once per training row so both operands line up.
    offsets = tile(inX, (n_rows, 1)) - dataSet
    # Euclidean distance from the query to every training row.
    dists = ((offsets ** 2).sum(axis=1)) ** 0.5
    # Row indices ordered from nearest to farthest.
    nearest_first = dists.argsort()

    votes = {}
    for rank in range(k):
        label = labels[nearest_first[rank]]
        if label in votes:
            votes[label] += 1
        else:
            votes[label] = 1

    # Most frequent label first.
    ranked = list(votes.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[0][0]

テスト:
# Classify the point (0.1, 0.1) against the toy data set using its three
# nearest neighbours, then print the predicted label.
group, labels = createDataSet()
inX = [0.1, 0.1]
res = classify0(inX, group, labels, k=3)
print(res)

実行結果(処理の流れを理解しやすくするため、中間変数の一部も出力しています):
diffMat:
[[-0.9 -1. ]
 [-0.9 -0.9]
 [ 0.1  0.1]
 [ 0.1  0. ]]
distance:
[1.3453624  1.27279221 0.14142136 0.1       ]
sortedDistIndicies:
[3 2 1 0]
classCount:
{'B': 2, 'A': 1}
sortedClassCount:
[('B', 2), ('A', 1)]
B

Process finished with exit code 0

例1:デートサイト
from numpy import *
# Provides array, shape, tile and zeros, plus element-wise ** on arrays
import operator
# Provides itemgetter, used as the sort key when ranking the votes

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest training rows.

    Args:
        inX: the query sample (a sequence or array of feature values).
        dataSet: NumPy array of training samples, one sample per row.
        labels: labels of the training rows, indexable by row number.
        k: how many nearest neighbours take part in the vote.

    Returns:
        The label with the most votes. On a tie, the label that entered the
        vote first (i.e. that of the closer neighbour) wins, because the sort
        is stable and dicts keep insertion order.

    Raises:
        IndexError: if ``k`` exceeds the number of training rows, or if ``k``
            is not positive (there is then no vote to return).
    """
    n_rows = dataSet.shape[0]
    # Repeat the query once per training row so both operands line up.
    offsets = tile(inX, (n_rows, 1)) - dataSet
    # Euclidean distance from the query to every training row.
    dists = ((offsets ** 2).sum(axis=1)) ** 0.5
    # Row indices ordered from nearest to farthest.
    nearest_first = dists.argsort()

    votes = {}
    for rank in range(k):
        label = labels[nearest_first[rank]]
        if label in votes:
            votes[label] += 1
        else:
            votes[label] = 1

    # Most frequent label first.
    ranked = list(votes.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[0][0]

def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    On every non-blank line the first three tab-separated fields are parsed
    as float features and the last field is parsed as the integer class label.
    Blank lines are ignored.

    Args:
        filename: path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` pair: an ``N x 3`` float array
        with one row per data line, and a list of ``N`` int labels in the
        same order.

    Raises:
        ValueError: if a feature field is not a valid float or the last field
            is not a valid int.
    """
    # The context manager closes the file even if parsing fails later on.
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    # Skip blank lines (e.g. a trailing empty line) instead of failing on them.
    rows = [row for row in stripped if row]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        # Convert explicitly rather than relying on NumPy's implicit
        # string-to-float cast during assignment.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` to the range [0, 1] (min-max scaling).

    Args:
        dataSet: 2-D NumPy array with one sample per row and one feature per
            column.

    Returns:
        A new array of the same shape holding
        ``(value - column_min) / (column_max - column_min)``. A column whose
        values are all equal maps to 0. The input array is not modified.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; dividing by it would put NaN into the
    # result and poison every distance computed from it. Divide by 1 instead
    # so such a column simply becomes all zeros.
    ranges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column min and range to every row, so the
    # explicit tile() copies are not needed.
    return (dataSet - minVals) / ranges

def datingClassTest(hoRatio=0.03, filename='datingTestSet2.txt', k=3):
    """Run a hold-out evaluation of the kNN classifier on the dating data.

    The data file is loaded with ``file2matrix`` and normalised with
    ``autoNorm``. The first ``int(m * hoRatio)`` rows are then classified
    against all remaining rows. Every prediction is printed, followed by the
    overall error rate.

    Args:
        hoRatio: fraction of the rows held out as test samples.
        filename: path of the data file passed to ``file2matrix``.
        k: number of neighbours passed to ``classify0``.

    Raises:
        ZeroDivisionError: if ``int(m * hoRatio)`` is 0, i.e. there are no
            test samples to compute an error rate from.
    """
    datingDataMat, datingLabels = file2matrix(filename)
    normMat = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)
    # The reference set is the same for every test row, so slice it once
    # instead of rebuilding it on each loop iteration.
    trainMat = normMat[numTestVecs:m, :]
    trainLabels = datingLabels[numTestVecs:m]
    errorCount = 0.0
    for i in range(numTestVecs):
        classifierResult = classify0(normMat[i, :], trainMat, trainLabels, k)
        print("the classifier came back with: %d, the real answer is: %d" %(classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f" %(errorCount / float(numTestVecs)))


テスト:
# Run the hold-out evaluation: by default it reads 'datingTestSet2.txt' (a
# relative path) and prints every prediction followed by the total error rate.
datingClassTest()

実行結果:
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the total error rate is: 0.033333

Process finished with exit code 0

例2:手書き数字
from numpy import *
# Provides array, shape, tile and zeros, plus element-wise ** on arrays
import operator
# Provides itemgetter, used as the sort key when ranking the votes
from os import listdir
# listdir returns the names of the entries in a directory

def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its ``k`` nearest training rows.

    Args:
        inX: the query sample (a sequence or array of feature values).
        dataSet: NumPy array of training samples, one sample per row.
        labels: labels of the training rows, indexable by row number.
        k: how many nearest neighbours take part in the vote.

    Returns:
        The label with the most votes. On a tie, the label that entered the
        vote first (i.e. that of the closer neighbour) wins, because the sort
        is stable and dicts keep insertion order.

    Raises:
        IndexError: if ``k`` exceeds the number of training rows, or if ``k``
            is not positive (there is then no vote to return).
    """
    n_rows = dataSet.shape[0]
    # Repeat the query once per training row so both operands line up.
    offsets = tile(inX, (n_rows, 1)) - dataSet
    # Euclidean distance from the query to every training row.
    dists = ((offsets ** 2).sum(axis=1)) ** 0.5
    # Row indices ordered from nearest to farthest.
    nearest_first = dists.argsort()

    votes = {}
    for rank in range(k):
        label = labels[nearest_first[rank]]
        if label in votes:
            votes[label] += 1
        else:
            votes[label] = 1

    # Most frequent label first.
    ranked = list(votes.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[0][0]

def img2vector(filename):
    """Flatten a 32x32 text image of digit characters into a 1x1024 vector.

    The first 32 characters of each of the first 32 lines are read. The
    character at line ``i``, column ``j`` is stored at index ``32 * i + j``.

    Args:
        filename: path of the text file holding the image.

    Returns:
        A ``1 x 1024`` float array.

    Raises:
        IndexError: if the file has fewer than 32 lines or a line has fewer
            than 32 characters.
        ValueError: if one of the characters read is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager closes the file even if parsing fails. This
    # function is called once per image, so leaked handles would add up.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            returnVect[0, 32 * i:32 * (i + 1)] = [int(lineStr[j]) for j in range(32)]
    return returnVect

def handwritingClassTest(trainingDir='trainingDigits', testDir='testDigits', k=3):
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every file in ``trainingDir`` becomes one row of the training matrix.
    Every file in ``testDir`` is then classified against that matrix. Each
    prediction is printed, followed by the number of errors and the error
    rate.

    File names are expected to look like ``'<digit>_<sample>.txt'``; the part
    before the first ``'_'`` is parsed as the integer class label.

    Args:
        trainingDir: directory holding the training images.
        testDir: directory holding the test images.
        k: number of neighbours passed to ``classify0``.

    Raises:
        ValueError: if a file name does not start with an integer label.
        ZeroDivisionError: if ``testDir`` contains no files.
    """
    hwLabels = []
    trainingFileList = listdir(trainingDir)
    m = len(trainingFileList)
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # '0_13.txt' -> '0_13' -> 0
        fileStr = fileNameStr.split('.')[0]
        hwLabels.append(int(fileStr.split('_')[0]))
        trainingMat[i, :] = img2vector('%s/%s' % (trainingDir, fileNameStr))

    testFileList = listdir(testDir)
    mTest = len(testFileList)
    errorCount = 0.0
    for fileNameStr in testFileList:
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('%s/%s' % (testDir, fileNameStr))
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, k)
        print("the classifier come back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    # The leading "\n" escapes were lost in the original listing, which split
    # the string literals across lines and fused the two print calls.
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))

テスト:
# Run the digit-recognition evaluation: by default it reads the
# 'trainingDigits' and 'testDigits' directories (relative paths) and prints
# every prediction followed by the error count and error rate.
handwritingClassTest()

実行結果:
……
the classifier come back with: 1, the real answer is: 1
the classifier come back with: 5, the real answer is: 5
the classifier come back with: 4, the real answer is: 4
the classifier come back with: 3, the real answer is: 3
the classifier come back with: 3, the real answer is: 3

the total number of errors is: 11

the total error rate is: 0.011628

Process finished with exit code 0