PySparkでxgboostモデルを並列実行する
1359 ワード
# Load the iris dataset and keep only its first 100 samples and labels.
# NOTE(review): in scikit-learn's iris data the first 100 rows are expected to
# cover only classes 0 and 1, which is what makes this a binary problem --
# confirm against the printed labels below.
from sklearn import datasets

iris = datasets.load_iris()
data = iris.data[:100]
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(data.shape)
# Expected shape: 100 samples, 4 features.
label = iris.target[:100]
print(label)

# Split into training and test sets.
# sklearn.cross_validation was removed in scikit-learn 0.20; prefer
# sklearn.model_selection and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

train_x, test_x, train_y, test_y = train_test_split(data, label, random_state=0)
# Wrap the train/test arrays in xgboost DMatrix objects.
import xgboost as xgb

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster parameters: tree booster, binary logistic objective, AUC metric.
# NOTE(review): newer xgboost releases appear to have replaced 'silent' with
# 'verbosity' -- confirm against the installed version.
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 4,
    'lambda': 10,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'min_child_weight': 2,
    'eta': 0.025,
    'seed': 0,
    'nthread': 8,
    'silent': 1,
}

# The training matrix itself is passed as the evaluation set.
watchlist = [(dtrain, 'train')]

# Train for 100 boosting rounds.
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
)
# Score the held-out test matrix with the freshly trained booster.
ypred = bst.predict(dtest)

# Persist the trained model to disk, then load that file into a second,
# independent Booster instance.
bst.save_model('/root/xgb2.model')
bst2 = xgb.Booster(model_file='/root/xgb2.model')
# Distribute the test rows over a Spark RDD and score them with the reloaded
# booster.
import numpy as np
from pyspark import SparkConf, SparkContext

# NOTE(review): per the Spark documentation, master "local" runs with a single
# worker thread, so the 5 partitions below would be processed one after
# another; use "local[*]" (or a cluster URL) if real parallelism is wanted.
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)

# Split the test rows into 5 partitions.
s = sc.parallelize(test_x, 5)


def _predict_partition(rows):
    """Score all rows of one partition with a single DMatrix/predict call.

    Args:
        rows: iterator over the feature rows of one RDD partition.

    Returns:
        An iterator yielding one length-1 array per input row, i.e. the same
        elements a row-by-row
        ``bst2.predict(xgb.DMatrix(np.array(x).reshape((1, -1))))`` produces,
        without building a DMatrix for every single row.
    """
    batch = np.array(list(rows))
    if batch.size == 0:
        # Empty partition: nothing to score.
        return iter(())
    scores = bst2.predict(xgb.DMatrix(batch))
    # Reshape to (n, 1) so iterating yields a length-1 array per row.
    return iter(scores.reshape((-1, 1)))


s.mapPartitions(_predict_partition).collect()