xgboost parameters: optimal number of iterations


Description:
xgboost has a useful function, cv, which runs cross-validation at each boosting iteration and returns the ideal number of decision trees.
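Before the full script, here is a minimal sketch of the cv call in isolation. X and y stand in for your feature matrix and label vector, and the parameter values are placeholders, not recommendations:

import xgboost as xgb

params = {'objective': 'binary:logistic', 'eta': 0.1, 'max_depth': 5}
dtrain = xgb.DMatrix(X, label=y)   # X, y: your features and labels

# Run up to 1000 boosting rounds with 5-fold CV; stop once the test AUC
# has not improved for 50 consecutive rounds.
cvresult = xgb.cv(params, dtrain, num_boost_round=1000, nfold=5,
                  metrics='auc', early_stopping_rounds=50)

# cv returns a DataFrame with one row per surviving round, so its length
# is the optimal number of trees.
best_n_estimators = cvresult.shape[0]

The full script below applies the same idea to the Pima Indians diabetes dataset.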
# coding: utf-8
import pandas as pd
import xgboost as xgb
from numpy import loadtxt
from xgboost import XGBClassifier
from sklearn import metrics                          # additional sklearn metric functions
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss
from sklearn.model_selection import train_test_split, GridSearchCV  # GridSearchCV for performing grid search

def modelfit(alg, X, y, useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X, label=y)
        # XGBoost's cv runs cross-validation at every boosting round and stops
        # early once the test AUC stops improving, so the number of rows in
        # the result is the optimal number of trees.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'],
                          nfold=cv_folds, metrics='auc',
                          early_stopping_rounds=early_stopping_rounds)
        print("cvresult---", cvresult.shape[0])
        print(cvresult)

        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(X, y, eval_metric='auc')

    # Predict the training set
    dtrain_predictions = alg.predict(X)
    dtrain_predprob = alg.predict_proba(X)[:, 1]

    # Print model report
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(y, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob))

    # Feature importance scores, highest first
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    print("feat_imp", "*" * 30)
    print("feature_name feature_importance_score")
    print(feat_imp)

    # n_estimators was updated above when CV ran, so this is correct either way
    return alg.get_params()['n_estimators']


if __name__ == "__main__":
    dataset = loadtxt("pima-indians-diabetes.csv", delimiter=",")

    # Split the data into features X and label Y
    X = dataset[:, 0:8]
    Y = dataset[:, 8]

    seed = 7
    test_size = 0.33
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=seed)

    xgb1 = XGBClassifier(
        learning_rate=0.1,
        n_estimators=100,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27)

    best_n_estimators = modelfit(xgb1, X_train, y_train)
    print("best_n_estimators=", best_n_estimators)
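As a side note, the sklearn wrapper can find a similar stopping point without xgb.cv by validating against a held-out set. This is a sketch, not part of the script above, and it assumes an xgboost version in which fit() still accepts eval_metric and early_stopping_rounds (both moved to the constructor in xgboost 2.0); it reuses the X_train/X_test split from the script:

# Alternative sketch: early stopping on a held-out set instead of CV.
clf = XGBClassifier(learning_rate=0.1, n_estimators=1000,
                    objective='binary:logistic', seed=27)
clf.fit(X_train, y_train,
        eval_set=[(X_test, y_test)], eval_metric='auc',
        early_stopping_rounds=50, verbose=False)
print("best_iteration =", clf.best_iteration)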