【読書ノート】『機械学習実戦』p.19 2.1.2節(k-近傍法)

3250 ワード


from numpy import *
import operator


def createDataSet():
    """Build the tiny four-point demo data set used to try out kNN.

    Returns:
        A tuple ``(group, labels)``: a 4x2 NumPy array of sample points and
        the list of class labels, where ``labels[i]`` belongs to ``group[i]``.
    """
    # Keep each point next to its label so the pairing is obvious.
    samples = [
        ([1.0, 1.1], 'A'),
        ([1.0, 1.0], 'A'),
        ([0, 0], 'B'),
        ([0, 0.1], 'B'),
    ]
    group = array([point for point, _ in samples])
    labels = [tag for _, tag in samples]
    return group, labels


def classify0(inX,dataSet,labels,k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Distances are Euclidean, computed between ``inX`` and every row of
    ``dataSet``.

    Args:
        inX: Feature vector to classify; it must broadcast against the rows
            of ``dataSet``.
        dataSet: 2-D NumPy array of training samples, one sample per row.
        labels: Sequence of class labels; ``labels[i]`` belongs to
            ``dataSet[i]``.
        k: Number of nearest neighbours that vote.  If it exceeds the number
            of training rows, every row votes.

    Returns:
        The label that received the most votes.  On a tie the label whose
        first vote came from the closest neighbour wins (stable sort over an
        insertion-ordered dict, Python 3.7+).

    Raises:
        ValueError: If ``k`` is smaller than 1.
        IndexError: If ``dataSet`` has no rows, so nobody can vote.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Broadcasting subtracts inX from every row; the sign of the difference
    # is irrelevant because it is squared next.
    diffMat = dataSet - inX
    sqDiffMat = diffMat ** 2

    # axis=1 sums across the columns of each row: one squared distance per
    # training sample.
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5

    # argsort() yields the row indices ordered from nearest to farthest.
    sortedDistIndicies = distances.argsort()

    # Let each of the k nearest rows cast one vote for its label.
    classCount = {}
    for neighbourIndex in sortedDistIndicies[:k]:
        voteIlabel = labels[neighbourIndex]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1

    # Rank the labels only after ALL votes are in; doing this inside the loop
    # would return after the very first neighbour.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def file2matrix(filename):
    """Load a tab-separated data file into a feature matrix and a label list.

    Every non-blank line is expected to hold three numeric fields followed by
    an integer class label in the last field.  Blank lines are ignored.

    Args:
        filename: Path of the text file to read.

    Returns:
        A tuple ``(returnMat, classLabelVector)``: an ``(n, 3)`` float array
        filled from the first three fields of each line, and a list holding
        the ``n`` integer labels taken from the last field of each line.

    Raises:
        ValueError: If a field cannot be converted to a number.
    """
    # Read the file once inside a context manager so the handle is always
    # closed, even if parsing fails later.
    with open(filename) as fr:
        rows = [line.strip() for line in fr]
    # Skip blank lines (e.g. a trailing newline at the end of the file).
    rows = [row for row in rows if row]

    returnMat = zeros((len(rows), 3))
    classLabelVector = []
    for index, row in enumerate(rows):
        listFromLine = row.split('\t')
        # The first three fields are the features of this sample.
        returnMat[index, :] = [float(value) for value in listFromLine[0:3]]
        # The last field is the integer class label.
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat, classLabelVector




# Normalize feature values to the range [0, 1] (min-max scaling):
# newValue = (oldValue - min) / (max - min)
def autoNorm(dataSet):
    """Rescale every column of ``dataSet`` with min-max normalisation.

    Applies ``newValue = (oldValue - min) / (max - min)`` column by column.

    Args:
        dataSet: 2-D NumPy array with one sample per row.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)`` where ``normDataSet`` is
        the rescaled array, ``ranges`` is the per-column ``max - min`` and
        ``minVals`` is the per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has a zero range; divide by 1 there so the column
    # normalises to 0 instead of NaN.  The returned ranges stay unmodified.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column vectors to every row, so no
    # explicit tiling is needed.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals




def datingClassTest():
    """Run a hold-out evaluation of classify0 on 'datingTestSet2.txt'.

    The first half of the normalised rows is used as test samples and the
    remaining rows as the training set, with k fixed at 3.  Each prediction
    is printed next to the true label, followed by the overall error rate
    and the raw error count.
    """
    holdOutFraction = 0.50  # share of the rows reserved for testing
    features, answers = file2matrix('datingTestSet2.txt')
    scaled, _ranges, _mins = autoNorm(features)

    total = scaled.shape[0]
    numTestVecs = int(total * holdOutFraction)

    # Rows after the hold-out block form the training set for every query.
    trainRows = scaled[numTestVecs:total, :]
    trainAnswers = answers[numTestVecs:total]

    errorCount = 0.0
    for sample, expected in zip(scaled[:numTestVecs], answers[:numTestVecs]):
        predicted = classify0(sample, trainRows, trainAnswers, 3)
        print("the classifier came back with: %d, the real answer is: %d" % (predicted, expected))
        if predicted != expected:
            errorCount += 1.0

    print("the total error rate is :%f" % (errorCount / float(numTestVecs)))
    print(errorCount)
python argsort()

python sum .sum(axis=1)

python NumPy—tile

python * **

python: numpy-- shape

Python sorted operator.itemgetter

Numpy