Kaggle Digit Recognizer（手書き数字認識）


https://www.kaggle.com/c/digit-recognizer
まず、提供されたトレーニングファイル train.csv を見てみましょう。
import pandas as pd

# Load the Kaggle training CSV into a DataFrame.
trainingFile = pd.read_csv('C:/Users/Administrator/Desktop/train.csv')

# Print the first rows to inspect the column layout (label + pixel columns).
print(trainingFile.head())
'''
   label  pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  \
0      1       0       0       0       0       0       0       0       0   
1      0       0       0       0       0       0       0       0       0   
2      1       0       0       0       0       0       0       0       0   
3      4       0       0       0       0       0       0       0       0   
4      0       0       0       0       0       0       0       0       0   

   pixel8    ...     pixel774  pixel775  pixel776  pixel777  pixel778  \
0       0    ...            0         0         0         0         0   
1       0    ...            0         0         0         0         0   
2       0    ...            0         0         0         0         0   
3       0    ...            0         0         0         0         0   
4       0    ...            0         0         0         0         0   

   pixel779  pixel780  pixel781  pixel782  pixel783  
0         0         0         0         0         0  
1         0         0         0         0         0  
2         0         0         0         0         0  
3         0         0         0         0         0  
4         0         0         0         0         0  

[5 rows x 785 columns]
'''
# Print the number of rows (samples) in the training file.
print(len(trainingFile))
'''
42000
'''
データの説明によれば、label は数字のラベル、pixel0〜pixel783 は 784 個の画素値であり、全部で 42000 件のデータがあることがわかる。
まず kNN アルゴリズムを試します。
kNN.py（クリックして開く）
まず、最初の 41900 件を訓練セット、残りの 100 件をテストセットとして使い、正解率を見てみましょう。
import numpy as np
import pandas as pd

import kNN


#     
def loadDataSet(path='C:/Users/Administrator/Desktop/train.csv', numTrain=41900):
    """Load the training CSV and split it into a training and a held-out part.

    Every pixel value greater than 0 is set to 1, so each image becomes a
    0/1 vector.

    Args:
        path: Location of the CSV file. It must contain a ``label`` column;
            all remaining columns are treated as pixel features.
        numTrain: Number of leading rows used for training. The remaining
            rows form the test part.

    Returns:
        A ``(train_x, train_y, test_x, test_y)`` tuple of NumPy arrays.
    """
    print('     ...')

    # Parse the file once; both parts come from the same CSV.
    dataFile = pd.read_csv(path)
    # ``axis`` is passed by keyword: pandas 2.0 removed the positional form
    # ``drop('label', 1)``.
    pixels = np.array(dataFile.drop('label', axis=1))
    pixels[pixels > 0] = 1
    labels = np.array(dataFile['label'])

    print('     ...')

    train_x, test_x = pixels[:numTrain], pixels[numTrain:]
    train_y, test_y = labels[:numTrain], labels[numTrain:]

    return train_x, train_y, test_x, test_y

#       
def testHandWritingClass():
    """Classify the held-out samples with kNN (k=3) and print the accuracy.

    Data comes from ``loadDataSet``; each test sample is classified with
    ``kNN.kNNClassify`` against the whole training part. Nothing is returned;
    the accuracy is printed as a percentage.
    """
    print('    ...')

    train_x, train_y, test_x, test_y = loadDataSet()

    # No training work is done in this step; only the progress message is
    # printed.
    print('   ...')

    print('   ...')

    numTestSamples = len(test_x)
    matchCount = 0
    # Walk samples and their true labels in lockstep.
    for sample, label in zip(test_x, test_y):
        if kNN.kNNClassify(sample, train_x, train_y, 3) == label:
            matchCount += 1

    accuracy = float(matchCount) / numTestSamples

    print('    ...')

    print('      : %.2f%%' % (accuracy * 100))

# Run the evaluation only when this file is executed as a script.
if __name__ == '__main__':
    testHandWritingClass()
出力結果:
    ...
     ...
     ...
   ...
   ...
    ...
      : 99.00%

正解率は悪くありません。次は train.csv 全体を訓練セットとして使い、test.csv の各数字を予測して、sample_submission.csv のフォーマットに合わせて答えを result.csv に保存します。
import numpy as np
import pandas as pd

import kNN


#     
def loadDataSet(trainPath='C:/Users/Administrator/Desktop/train.csv',
                testPath='C:/Users/Administrator/Desktop/test.csv'):
    """Load the full training set and the unlabeled test set.

    Every pixel value greater than 0 is set to 1 in both sets.

    Args:
        trainPath: CSV with a ``label`` column; all other columns are
            treated as pixel features.
        testPath: CSV whose columns are all treated as pixel features.

    Returns:
        A ``(train_x, train_y, test_x, test_y)`` tuple. ``test_y`` is always
        an empty list because the test file carries no labels.
    """
    print('     ...')

    trainingFile = pd.read_csv(trainPath)
    # ``axis`` is passed by keyword: pandas 2.0 removed the positional form
    # ``drop('label', 1)``.
    train_x = np.array(trainingFile.drop('label', axis=1))
    train_x[train_x > 0] = 1
    train_y = np.array(trainingFile['label'])

    print('     ...')

    test_x = np.array(pd.read_csv(testPath))
    test_x[test_x > 0] = 1
    test_y = []

    return train_x, train_y, test_x, test_y

#       
def testHandWritingClass():
    """Predict a label for every test sample with kNN (k=4) and save a CSV.

    Data comes from ``loadDataSet``. Predictions are written to result.csv
    with the columns ``ImageId`` (1-based row number) and ``Label``.
    """
    print('    ...')

    train_x, train_y, test_x, test_y = loadDataSet()

    print('   ...')

    print('   ...')

    total = len(test_x)
    rows = []
    for index, sample in enumerate(test_x):
        label = kNN.kNNClassify(sample, train_x, train_y, 4)
        rows.append([index + 1, label])
        # Report progress once every 100 samples.
        if index % 100 == 0:
            print('  :', index, '/', total)

    print('    ...')

    submission = pd.DataFrame(rows, columns=['ImageId', 'Label'])
    submission.to_csv('C:/Users/Administrator/Desktop/result.csv', index=False)

# Run the prediction only when this file is executed as a script.
if __name__ == '__main__':
    testHandWritingClass()

最後に result.csv を提出したところ、スコアは 0.96543 でした。
scikit-learnライブラリを使用したkNN
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier


#     
def loadDataSet(trainPath='C:/Users/Administrator/Desktop/train.csv',
                testPath='C:/Users/Administrator/Desktop/test.csv'):
    """Load the full training set and the unlabeled test set, binarized.

    Both feature matrices are passed through scikit-learn's ``Binarizer``
    with its default threshold.

    Args:
        trainPath: CSV with a ``label`` column; all other columns are
            treated as pixel features.
        testPath: CSV whose columns are all treated as pixel features.

    Returns:
        A ``(train_x, train_y, test_x, test_y)`` tuple. ``test_y`` is always
        an empty list because the test file carries no labels.
    """
    print('     ...')

    trainingFile = pd.read_csv(trainPath)
    # ``axis`` is passed by keyword: pandas 2.0 removed the positional form
    # ``drop('label', 1)``.
    train_x = np.array(trainingFile.drop('label', axis=1))
    # The transformed array must be kept: calling only ``fit`` and discarding
    # the result leaves the data un-binarized.
    train_x = preprocessing.Binarizer().fit_transform(train_x)
    train_y = np.array(trainingFile['label'])

    print('     ...')

    test_x = np.array(pd.read_csv(testPath))
    test_x = preprocessing.Binarizer().fit_transform(test_x)
    test_y = []

    return train_x, train_y, test_x, test_y

#       
def testHandWritingClass():
    """Fit a scikit-learn kNN classifier, predict the test set, save a CSV.

    Data comes from ``loadDataSet``. A ``KNeighborsClassifier`` with default
    settings is fitted on the training data, and its predictions are written
    to result.csv with the columns ``ImageId`` (1-based row number) and
    ``Label``.
    """
    print('    ...')

    train_x, train_y, test_x, test_y = loadDataSet()

    print('   ...')

    classifier = KNeighborsClassifier()
    classifier.fit(train_x, train_y)

    print('   ...')

    labels = classifier.predict(test_x)

    print('    ...')

    # ImageId numbers the test rows starting from 1.
    submission = pd.DataFrame({
        'ImageId': range(1, len(labels) + 1),
        'Label': labels,
    })
    submission.to_csv('C:/Users/Administrator/Desktop/result.csv', index=False)

# Run the prediction only when this file is executed as a script.
if __name__ == '__main__':
    testHandWritingClass()
最後に result.csv を提出したところ、スコアは 0.96800 でした。