機械学習実戦_Python3.7_kNNアルゴリズム

11259 ワード

機械学習

Python 3.7 で実行します。コードは書籍のものとほぼ同じですが、Python 2 で廃止された文法を修正し、詳しい注釈を付けています。
kNN アルゴリズムの原理は非常に単純なので説明は省きます。主な欠点は実行速度の遅さです。
ここで使う方法はすべて行列(配列)演算による全件走査で、高度なデータ構造は使っていません。実行効率は低いものの、実装は簡単です。
PythonとNumpyの基本構文(リスト、マトリクス操作)と基本的なファイル操作を重点的に学習します.
基本アルゴリズム

from numpy import *
# numpy supplies array, shape, tile and the element-wise ** operator
import operator
# operator supplies itemgetter, used as the sort key

def createDataSet():
    """Build the four-sample, two-feature toy data set used by the demo.

    Returns:
        A ``(group, labels)`` tuple: ``group`` is a 4x2 array of sample
        coordinates and ``labels`` holds the class name of each row.
    """
    # Rows 0-1 are labelled 'A', rows 2-3 are labelled 'B'.
    labels = ['A'] * 2 + ['B'] * 2
    group = array([
        (1.0, 1.1),
        (1.0, 1.0),
        (0, 0),
        (0, 0.1),
    ])
    return group, labels

def classify0(inX,dataSet,labels,k):
    """Classify ``inX`` by majority vote among its ``k`` nearest training rows.

    Args:
        inX: Feature vector of the point to classify.
        dataSet: 2-D array of training samples, one row per sample.
        labels: Class labels, where ``labels[i]`` belongs to row ``i`` of
            ``dataSet``.
        k: Number of nearest neighbours that vote.

    Returns:
        The label that received the most votes among the ``k`` nearest rows.
    """
    rowCount = dataSet.shape[0]
    # Repeat inX once per training row so the difference is taken row by row.
    deltas = tile(inX, (rowCount, 1)) - dataSet
    # Euclidean distance: square, sum across each row (axis=1), square root.
    distance = (deltas ** 2).sum(axis=1) ** 0.5
    # Row indices ordered from the nearest sample to the farthest.
    nearestFirst = distance.argsort()
    votes = {}
    for rank in range(k):
        neighbourLabel = labels[nearestFirst[rank]]
        if neighbourLabel in votes:
            votes[neighbourLabel] += 1
        else:
            votes[neighbourLabel] = 1
    # Order the (label, votes) pairs by descending vote count.  The sort is
    # stable, so tied labels stay in the order in which they were first seen.
    ranking = list(votes.items())
    ranking.sort(key=operator.itemgetter(1), reverse=True)
    return ranking[0][0]

テスト:

# Classify the point (0.1, 0.1) against the toy data set using its 3 nearest neighbours.
group, labels = createDataSet()
inX = [0.1, 0.1]
res = classify0(inX, group, labels, k=3)
print(res)

実行結果(処理の流れが分かりやすいよう、一部の中間変数の出力を追加しています):

diffMat:
[[-0.9 -1. ]
 [-0.9 -0.9]
 [ 0.1  0.1]
 [ 0.1  0. ]]
distance:
[1.3453624  1.27279221 0.14142136 0.1       ]
sortedDistIndicies:
[3 2 1 0]
classCount:
{'B': 2, 'A': 1}
sortedClassCount:
[('B', 2), ('A', 1)]
B

Process finished with exit code 0

例1:デートサイト

from numpy import *
# numpy supplies array, shape, tile and the element-wise ** operator
import operator
# operator supplies itemgetter, used as the sort key

def classify0(inX,dataSet,labels,k):
    """Classify ``inX`` by majority vote among its ``k`` nearest training rows.

    Args:
        inX: Feature vector of the point to classify.
        dataSet: 2-D array of training samples, one row per sample.
        labels: Class labels, where ``labels[i]`` belongs to row ``i`` of
            ``dataSet``.
        k: Number of nearest neighbours that vote.

    Returns:
        The label that received the most votes among the ``k`` nearest rows.
    """
    rowCount = dataSet.shape[0]
    # Repeat inX once per training row so the difference is taken row by row.
    deltas = tile(inX, (rowCount, 1)) - dataSet
    # Euclidean distance: square, sum across each row (axis=1), square root.
    distance = (deltas ** 2).sum(axis=1) ** 0.5
    # Row indices ordered from the nearest sample to the farthest.
    nearestFirst = distance.argsort()
    votes = {}
    for rank in range(k):
        neighbourLabel = labels[nearestFirst[rank]]
        if neighbourLabel in votes:
            votes[neighbourLabel] += 1
        else:
            votes[neighbourLabel] = 1
    # Order the (label, votes) pairs by descending vote count.  The sort is
    # stable, so tied labels stay in the order in which they were first seen.
    ranking = list(votes.items())
    ranking.sort(key=operator.itemgetter(1), reverse=True)
    return ranking[0][0]

def file2matrix(filename):
    """Load a tab-separated sample file into a feature matrix and a label list.

    Every non-blank line is split on tabs; its first three fields become one
    row of features and its last field becomes the integer class label.

    Args:
        filename: Path of the text file to read.

    Returns:
        A ``(returnMat, classLabelVector)`` tuple: ``returnMat`` is an
        ``n x 3`` float array and ``classLabelVector`` is the list of ``n``
        integer labels, where ``n`` is the number of non-blank lines.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a feature or label field cannot be parsed as a number.
    """
    # The context manager closes the file even when reading fails.  Blank
    # lines (typically a trailing newline) are skipped so they neither break
    # parsing nor leave all-zero rows in the matrix.
    with open(filename) as fr:
        stripped = (line.strip() for line in fr)
        records = [line for line in stripped if line]
    returnMat = zeros((len(records), 3))
    classLabelVector = []
    for index, record in enumerate(records):
        listFromLine = record.split('\t')
        # Convert the three feature fields explicitly before storing them.
        returnMat[index, :] = [float(field) for field in listFromLine[0:3]]
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector

def autoNorm(dataSet):
    """Rescale every feature column of ``dataSet`` to the range [0, 1].

    Each value is mapped to ``(value - column_min) / (column_max - column_min)``.

    Args:
        dataSet: 2-D numeric array, one row per sample, one column per feature.

    Returns:
        A new array with the same shape as ``dataSet`` holding the scaled
        values.  A column whose values are all equal is mapped to 0.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has a zero range; dividing by it would produce NaN.
    # Divide by 1 instead: the numerator is already 0 for such a column.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row, so no
    # explicit tiling is needed.
    return (dataSet - minVals) / safeRanges

def datingClassTest():
    """Run a hold-out evaluation of the kNN classifier on the dating data.

    Loads ``datingTestSet2.txt`` with ``file2matrix``, normalises it with
    ``autoNorm``, then classifies the first 3% of the rows (``k = 3``) using
    the remaining rows as training data.  Prints every prediction next to the
    true label, followed by the overall error rate.
    """
    features, targets = file2matrix('datingTestSet2.txt')
    scaled = autoNorm(features)
    total = scaled.shape[0]
    # Fraction of the rows held out for testing.
    heldOut = int(total * 0.03)
    # Everything after the held-out rows serves as the training set.
    trainRows = scaled[heldOut:total, :]
    trainTargets = targets[heldOut:total]
    misses = 0.0
    for sample, expected in zip(scaled[:heldOut], targets[:heldOut]):
        predicted = classify0(sample, trainRows, trainTargets, 3)
        print("the classifier came back with: %d, the real answer is: %d" %(predicted, expected))
        if predicted != expected:
            misses += 1.0
    print("the total error rate is: %f" %(misses / float(heldOut)))

テスト:

# Run the hold-out evaluation; it reads 'datingTestSet2.txt' via a relative path.
datingClassTest()

実行結果:

the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the classifier came back with: 3, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 3, the real answer is: 3
the classifier came back with: 2, the real answer is: 2
the classifier came back with: 1, the real answer is: 1
the total error rate is: 0.033333

Process finished with exit code 0

例2:手書き数字

from numpy import *
# numpy supplies array, shape, tile and the element-wise ** operator
import operator
# operator supplies itemgetter, used as the sort key
from os import listdir
# listdir returns the file names in a directory; used to enumerate the digit files

def classify0(inX,dataSet,labels,k):
    """Classify ``inX`` by majority vote among its ``k`` nearest training rows.

    Args:
        inX: Feature vector of the point to classify.
        dataSet: 2-D array of training samples, one row per sample.
        labels: Class labels, where ``labels[i]`` belongs to row ``i`` of
            ``dataSet``.
        k: Number of nearest neighbours that vote.

    Returns:
        The label that received the most votes among the ``k`` nearest rows.
    """
    rowCount = dataSet.shape[0]
    # Repeat inX once per training row so the difference is taken row by row.
    deltas = tile(inX, (rowCount, 1)) - dataSet
    # Euclidean distance: square, sum across each row (axis=1), square root.
    distance = (deltas ** 2).sum(axis=1) ** 0.5
    # Row indices ordered from the nearest sample to the farthest.
    nearestFirst = distance.argsort()
    votes = {}
    for rank in range(k):
        neighbourLabel = labels[nearestFirst[rank]]
        if neighbourLabel in votes:
            votes[neighbourLabel] += 1
        else:
            votes[neighbourLabel] = 1
    # Order the (label, votes) pairs by descending vote count.  The sort is
    # stable, so tied labels stay in the order in which they were first seen.
    ranking = list(votes.items())
    ranking.sort(key=operator.itemgetter(1), reverse=True)
    return ranking[0][0]

def img2vector(filename):
    """Flatten a 32x32 text image of digit characters into a 1x1024 vector.

    The first 32 characters of each of the first 32 lines are converted with
    ``int`` and stored row after row.

    Args:
        filename: Path of the text image file.

    Returns:
        A ``1 x 1024`` float array; element ``32 * i + j`` holds the value of
        character ``j`` on line ``i``.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If one of the first 32 lines has fewer than 32 characters.
        ValueError: If a character is not a digit.
    """
    returnVect = zeros((1, 1024))
    # The context manager releases the file handle even if parsing fails; the
    # original left the file open.
    with open(filename) as fr:
        for i in range(32):
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])
    return returnVect

def handwritingClassTest():
    """Evaluate the kNN classifier on the handwritten-digit text images.

    Every file in ``trainingDigits`` becomes one training row and every file
    in ``testDigits`` is classified with ``k = 3``.  The label of a file is the
    integer before the first underscore of its name (e.g. ``0_0.txt`` -> 0).
    Prints each prediction next to the true label, then the number of errors
    and the error rate.
    """
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)
    # One 1024-element row per training image.
    trainingMat = zeros((m, 1024))
    for i, fileNameStr in enumerate(trainingFileList):
        # '0_0.txt' -> '0_0' -> 0
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        hwLabels.append(classNumStr)
        trainingMat[i,:] = img2vector('trainingDigits/%s' % fileNameStr)
    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for fileNameStr in testFileList:
        # The true label is parsed from the file name, as for the training set.
        fileStr = fileNameStr.split('.')[0]
        classNumStr = int(fileStr.split('_')[0])
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print("the classifier come back with: %d, the real answer is: %d" % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    # The leading "\n" escapes were lost in the listing, which split the string
    # literals across lines (a syntax error); they are restored here.
    print("\nthe total number of errors is: %d" % errorCount)
    print("\nthe total error rate is: %f" % (errorCount / float(mTest)))

テスト:

# Run the digit evaluation; it lists the 'trainingDigits' and 'testDigits' directories via relative paths.
handwritingClassTest()

実行結果:

……
the classifier come back with: 1, the real answer is: 1
the classifier come back with: 5, the real answer is: 5
the classifier come back with: 4, the real answer is: 4
the classifier come back with: 3, the real answer is: 3
the classifier come back with: 3, the real answer is: 3

the total number of errors is: 11

the total error rate is: 0.011628

Process finished with exit code 0

pyyamlでハマったこと

Laravelは七牛クラウドオブジェクトのストレージとファイルアップロードの総括を実現する