Kaggle入門: Digit Recognizer (Python + XGBoost)
# coding:utf-8
# Script metadata. NOTE: the original also assigned `__name__ = ''`, which
# clobbers the module's real dunder name (breaking any
# `if __name__ == "__main__"` guard and module introspection) — removed.
__author__ = 'WHP'
__mtime__ = '2016/5/12'

import time

import numpy
import pandas
import xgboost
# Train an XGBoost multi-class classifier on the Kaggle Digit Recognizer
# data set and write a submission CSV. Fixes applied in review:
#  - removed scraped HTML (<span ...>) that made the file a SyntaxError
#  - repaired garbled data paths ("...input\\" -> Kaggle's "../input/")
#  - pass labels as a 1-D vector (iloc[:, 0]) instead of a 2-D (n, 1) slice
#  - predict with best_ntree_limit (1-based, includes the best round) instead
#    of best_iteration (0-based, which silently drops the best tree)
now = time.time()

# First column of train.csv is the digit label; remaining 784 columns are pixels.
dataset = pandas.read_csv("../input/train.csv")
trainData = dataset.iloc[:, 1:].values
labelData = dataset.iloc[:, 0].values  # 1-D label vector for DMatrix
testData = pandas.read_csv("../input/test.csv")
test = testData.iloc[:, :].values

# Parameter reference: http://xgboost.readthedocs.io/en/latest/parameter.html
param = {"booster": "gbtree", "max_depth": 12, "eta": 0.03, "seed": 710,
         "objective": "multi:softmax", "num_class": 10, "gamma": 0.03}
offset = 35000    # rows before this index train; the rest validate
num_rounds = 500  # maximum boosting rounds (early stopping may end sooner)

# Wrap the arrays in xgboost's DMatrix format.
xgtest = xgboost.DMatrix(test)
xgtrain = xgboost.DMatrix(trainData[:offset, :], label=labelData[:offset])
xgeval = xgboost.DMatrix(trainData[offset:, :], label=labelData[offset:])
watchlist = [(xgtrain, 'train'), (xgeval, 'val')]

# API reference: http://xgboost.readthedocs.io/en/latest/python/python_api.html
model = xgboost.train(list(param.items()), xgtrain, num_rounds, watchlist,
                      early_stopping_rounds=100)

# Predict using all trees up to and including the best round found by early
# stopping. best_ntree_limit == best_iteration + 1 in this xgboost version.
preds = model.predict(xgtest, ntree_limit=model.best_ntree_limit)
numpy.savetxt('submission_xgb_MultiSoftmax.csv',
              numpy.c_[range(1, len(testData) + 1), preds],
              delimiter=',', header='ImageId,Label', comments='', fmt='%d')
print("cost time:", time.time() - now)
結果: 最初は multi:softmax(分類器)ではなくデフォルトの線形回帰を使っていたため、スコアは0.5程度と悪かった。
データを前処理し、1より大きい画素値を1に割り当てたところ、0.97の結果を得た。
参照ドキュメント:
http://blog.csdn.net/eddy_zheng/article/details/50496186