XGBoost Usage Tutorial (Advanced) 3


一、Importing all the libraries
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score

二、Reading the file
We again use the mushroom dataset, taking its 22 categorical features directly from the Kaggle competition: https://www.kaggle.com/uciml/mushroom-classification
Dataset download: http://download.csdn.net/download/u011630575/10266626
# path to where the data lies
dpath = './data/'
data = pd.read_csv(dpath+"mushrooms.csv")
data.head(6)

三、Let us check if there are any null values
 
data.isnull().sum()  # count missing values in each column
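Note that isnull() reports zero missing values here, yet in the original UCI data missing entries in the stalk-root column are encoded as the string '?' rather than as NaN. A quick check (assuming the CSV keeps that convention):

(data == '?').sum()  # count '?' placeholders per column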

四、Check that we have two classes: each mushroom is either poisonous or edible
 
data['class'].unique()  # two classes: p (poisonous) and e (edible)
print(data.dtypes)

五、Check that there are 22 features (the first column is the label) and 8124 instances
 
data.shape  # (8124, 23): 8124 instances, 22 features plus the label column

六、The dataset has string values. We need to convert all the unique values to integers, so we perform label encoding on the data.
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()  # maps each category to an integer in [0, n_classes - 1]
for col in data.columns:
    data[col] = labelencoder.fit_transform(data[col])

data.head()
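Label encoding assigns arbitrary integer codes to categories, which implies an ordering these nominal features do not actually have. Tree-based models tolerate this, but for a linear model such as the logistic regression below, one-hot encoding is often safer. A minimal sketch (re-reading the raw CSV, since data has already been encoded in place):

raw = pd.read_csv(dpath + "mushrooms.csv")            # unencoded copy of the data
X_onehot = pd.get_dummies(raw.drop('class', axis=1))  # one 0/1 column per category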

Separating features and label

X = data.iloc[:, 1:23]  # columns 1-22: the 22 features
y = data.iloc[:, 0]     # column 0: the label
X.head()
y.head()

Splitting the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=4)
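The two classes here are roughly balanced, so a plain random split works; if they were skewed, train_test_split's stratify argument would preserve the class ratio in both splits:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=4, stratify=y)  # keep class proportions equal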

七、Default Logistic Regression
from sklearn.linear_model import LogisticRegression
model_LR= LogisticRegression()
model_LR.fit(X_train,y_train)
y_prob = model_LR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities  
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_LR.score(X_test, y_test)  # accuracy on the test set (score expects the true labels)

Note: np.where(condition, x, y) behaves like a ternary operator: where the condition holds, the result is x; otherwise it is y.
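For example:

np.where(np.array([0.2, 0.7, 0.5]) > 0.5, 1, 0)  # -> array([0, 1, 0])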
ROC AUC
auc_roc=metrics.roc_auc_score(y_test,y_pred)
print(auc_roc)
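Note that roc_auc_score also accepts continuous scores, and passing the predicted probabilities instead of the thresholded 0/1 labels usually gives a more informative AUC:

auc_roc = metrics.roc_auc_score(y_test, y_prob)  # AUC computed from probabilities
print(auc_roc)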

八、Logistic Regression (tuned model)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn import metrics

LR_model= LogisticRegression()

tuned_parameters = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] ,
              'penalty': ['l1','l2']
                   }
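One caveat: in recent scikit-learn versions the default lbfgs solver does not support the l1 penalty, so a grid that includes 'l1' needs a solver that does, for example liblinear:

LR_model = LogisticRegression(solver='liblinear')  # liblinear handles both l1 and l2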

九、Grid search with cross-validation (CV)
from sklearn.model_selection import GridSearchCV

LR= GridSearchCV(LR_model, tuned_parameters,cv=10)
LR.fit(X_train,y_train)
print(LR.best_params_)
y_prob = LR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities  
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
LR.score(X_test, y_test)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
print(auc_roc)
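Since GridSearchCV refits the best parameter combination on the whole training set by default (refit=True), the predict_proba and score calls above already use the best model; the refitted estimator itself is exposed as well:

best_lr = LR.best_estimator_  # the LogisticRegression refitted with the best parameters
print(best_lr.coef_.shape)    # its learned coefficient matrix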

十、Default Decision Tree model
from sklearn.tree import DecisionTreeClassifier

model_tree = DecisionTreeClassifier()
model_tree.fit(X_train, y_train)
y_prob = model_tree.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities  
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_tree.score(X_test, y_test)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc

十一、Let us tune the hyperparameters of the Decision Tree model
from sklearn.tree import DecisionTreeClassifier

model_DD = DecisionTreeClassifier()

tuned_parameters = {'max_features': ["auto", "sqrt", "log2"],
                    'min_samples_leaf': range(1, 100, 1),
                    'max_depth': range(1, 50, 1)
                    }
#tuned_parameters = {'max_features': ["auto", "sqrt", "log2"]}


# If "auto", then max_features = sqrt(n_features).
from sklearn.model_selection import GridSearchCV
DD = GridSearchCV(model_DD, tuned_parameters,cv=10)
DD.fit(X_train, y_train)
print(DD.cv_results_)  # grid_scores_ was removed in newer scikit-learn; cv_results_ replaces it
print(DD.best_score_)
print(DD.best_params_)
y_prob = DD.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
DD.score(X_test, y_test)
report = metrics.classification_report(y_test, y_pred)
print(report)
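The grid above contains 3 × 99 × 49 ≈ 14,500 parameter combinations, each fitted 10 times, which is slow. A sketch of a cheaper alternative, RandomizedSearchCV, which samples a fixed number of combinations from the same space:

from sklearn.model_selection import RandomizedSearchCV
DD_rand = RandomizedSearchCV(model_DD, tuned_parameters, n_iter=100,
                             cv=10, random_state=4)  # fit only 100 sampled combinations
DD_rand.fit(X_train, y_train)
print(DD_rand.best_params_)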

十二、Default Random Forest
 
from sklearn.ensemble import RandomForestClassifier

model_RR=RandomForestClassifier()
model_RR.fit(X_train,y_train)
y_prob = model_RR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_RR.score(X_test, y_test)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc

十三、Let us tune the parameters of the Random Forest, just for the purpose of illustration

1) max_features
2) n_estimators
3) min_samples_leaf
from sklearn.ensemble import RandomForestClassifier

model_RR=RandomForestClassifier()

tuned_parameters = {'min_samples_leaf': range(10, 100, 10),
                    'n_estimators': range(10, 100, 10),
                    'max_features': ['auto', 'sqrt', 'log2']
                    }

from sklearn.model_selection import GridSearchCV
RR = GridSearchCV(model_RR, tuned_parameters,cv=10)

RR.fit(X_train,y_train)

print(RR.cv_results_)  # grid_scores_ was removed in newer scikit-learn

print(RR.best_score_)

print(RR.best_params_)

y_prob = RR.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
RR.score(X_test, y_test)

auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc
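These grid searches run on a single core by default; GridSearchCV accepts n_jobs=-1 to parallelize the fits across all available cores:

RR = GridSearchCV(model_RR, tuned_parameters, cv=10, n_jobs=-1)  # use every CPU core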

十四、Default XGBoost
from xgboost import XGBClassifier
model_XGB=XGBClassifier()
model_XGB.fit(X_train,y_train)
y_prob = model_XGB.predict_proba(X_test)[:,1] # This will give you positive class prediction probabilities  
y_pred = np.where(y_prob > 0.5, 1, 0) # This will threshold the probabilities to give class predictions.
model_XGB.score(X_test, y_test)
auc_roc=metrics.roc_auc_score(y_test,y_pred)
auc_roc
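The earlier models were tuned, but this XGBoost model still runs with its defaults. A minimal tuning sketch in the same style (the parameter values are illustrative, not prescriptive):

from sklearn.model_selection import GridSearchCV

xgb_parameters = {'max_depth': [3, 5, 7],
                  'n_estimators': [50, 100, 200],
                  'learning_rate': [0.05, 0.1, 0.3]}
XGB = GridSearchCV(XGBClassifier(), xgb_parameters, cv=10)
XGB.fit(X_train, y_train)
print(XGB.best_params_)
print(XGB.score(X_test, y_test))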

十五、Feature importance

XGBoost computes feature importances automatically and stores them in feature_importances_.
 
print(model_XGB.feature_importances_)
from matplotlib import pyplot
pyplot.bar(range(len(model_XGB.feature_importances_)), model_XGB.feature_importances_)
pyplot.show()
# plot feature importance using built-in function
from xgboost import plot_importance
plot_importance(model_XGB)
pyplot.show()
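The two charts can rank features differently: depending on the xgboost version, feature_importances_ is typically gain-based, while plot_importance defaults to 'weight' (the number of times a feature is split on). plot_importance takes an importance_type argument to align them:

plot_importance(model_XGB, importance_type='gain')  # rank by average gain instead of split count
pyplot.show()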

Features can then be selected based on their importance:
 
from numpy import sort
from sklearn.feature_selection import SelectFromModel

# Fit a model using each importance value as a selection threshold
thresholds = sort(model_XGB.feature_importances_)
for thresh in thresholds:
    # select features whose importance is >= the threshold
    selection = SelectFromModel(model_XGB, threshold=thresh, prefit=True)
    select_X_train = selection.transform(X_train)
    # train a model on the selected features
    selection_model = XGBClassifier()
    selection_model.fit(select_X_train, y_train)
    # evaluate the model on the test set
    select_X_test = selection.transform(X_test)
    y_pred = selection_model.predict(select_X_test)
    predictions = [round(value) for value in y_pred]
    accuracy = accuracy_score(y_test, predictions)
    print("Thresh=%.3f, n=%d, Accuracy: %.2f%%" % (thresh, select_X_train.shape[1],
          accuracy * 100.0))