PySparkでxgboostモデルを並列実行する

1359 ワード

from sklearn import datasets

# Load the iris dataset and keep only the first 100 rows.
# NOTE(review): the first 100 iris rows are expected to contain only classes
# 0 and 1, which the binary objective used later requires -- confirm via the
# labels printed below.
iris = datasets.load_iris()
data = iris.data[:100]
# print() with a single argument behaves the same on Python 2 and Python 3.
print(data.shape)
# Expected output: (100, 4) -> 100 samples, 4 features

label = iris.target[:100]
print(label)

# Split the samples into a training set and a test set; random_state=0 makes
# the shuffle reproducible.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the legacy module.
    from sklearn.cross_validation import train_test_split
train_x, test_x, train_y, test_y = train_test_split(data, label, random_state=0)

# Wrap the train/test splits in xgboost DMatrix objects; only the training
# matrix is given labels.
import xgboost as xgb

dtrain = xgb.DMatrix(train_x, label=train_y)
dtest = xgb.DMatrix(test_x)

# Booster configuration handed to xgb.train().
# NOTE(review): 'silent' may be deprecated in newer xgboost releases in
# favour of 'verbosity' -- confirm against the installed version.
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 4,
    'lambda': 10,
    'subsample': 0.75,
    'colsample_bytree': 0.75,
    'min_child_weight': 2,
    'eta': 0.025,
    'seed': 0,
    'nthread': 8,
    'silent': 1,
}

# Report the evaluation metric on the training matrix during boosting.
watchlist = [(dtrain, 'train')]

# Train for 100 boosting rounds.
bst = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=100,
    evals=watchlist,
)

# Score the held-out rows with the freshly trained booster.
ypred = bst.predict(dtest)

# Persist the booster to disk, then load it back into a second Booster object.
model_path = '/root/xgb2.model'
bst.save_model(model_path)
bst2 = xgb.Booster(model_file=model_path)

# Create a Spark context and distribute the test rows over 5 partitions.
# NOTE(review): master "local" is believed to run with a single worker
# thread; confirm whether "local[*]" (or a cluster master) was intended.
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)
s = sc.parallelize(test_x, 5)

import numpy as np

# Ship the reloaded booster to the executors once via a broadcast variable
# instead of serialising it into every task closure.
bst2_bc = sc.broadcast(bst2)


def _predict_partition(rows):
    """Score every row of one partition with a single xgboost predict call.

    Returns one array of shape (1,) per input row, which is the same per-row
    result shape that predicting row by row on a (1, n_features) matrix gives.
    """
    batch = np.array(list(rows))
    if batch.size == 0:
        # Empty partition: nothing to score, and DMatrix cannot be built.
        return []
    scores = bst2_bc.value.predict(xgb.DMatrix(batch))
    return list(scores.reshape(-1, 1))


# Predict partition by partition and gather the results on the driver.
# The result is bound to a name and printed so that it is not silently
# discarded when this file is run as a script rather than in a REPL.
predictions = s.mapPartitions(_predict_partition).collect()
print(predictions)