統合アルゴリズム — xgboost / bagging / voting
(元記事: 12896 ワード)

#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
 bagging try
'''
import pandas as pd
import numpy as np
from numpy import NaN
from dateutil.parser import parse
'''
     
'''
# dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m-%d')
# data = pd.read_csv(r'E:\7 Python\data\20170616\zhejiang-new0620.csv',
#                    dtype={'"EQUIP_ID"': object, 'FAULT_TYPE': object, 'INST_DATE':object, 'DETECT_DATE':object,
#                           'FAULT_DATE':object, 'SYNC_ORG_NO': object, 'ORG_NO': object, 'ORG_NAME': object, 'SORT_CODE':object,
#                           'SPEC_CODE': object, 'COMM_MODE': object, 'ARRIVE_BATCH_NO': object, 'MANUFACTURER': object },
# date_parser=dateparse)
#
# data.drop('ORG_NAME', axis=1, inplace=True)
# #     
# data = data.drop_duplicates()
# data.dropna(how='all')
# print data.info()
# '''
# 1.    
# '''
# #     
# data['FAULT_DATE1'] = pd.to_datetime(data['FAULT_DATE'].str.strip().str.split(' ').str[0])
# data['INST_DATE1'] = pd.to_datetime(data['INST_DATE'].str.strip().str.split(' ').str[0])
# data['DETECT_DATE1'] = pd.to_datetime(data['DETECT_DATE'].str.strip().str.split(' ').str[0], errors='coerce')
# #     
# data['FAULT_MONTH'] = [x.month for x in data['FAULT_DATE1']]
# data['INST_MONTH'] = [x.month for x in data['INST_DATE1']]
# data['DETECT_MONTH'] = [x.month for x in data['DETECT_DATE1']]
# #       
# print sum(data['FAULT_DATE1'].isnull()), sum(data['INST_DATE1'].isnull())
# data['work_days'] = data['FAULT_DATE1'] - data['INST_DATE1']
# data['work_months'] = [x.days / 30 if not pd.isnull(x) else np.nan for x in data['work_days']]
# #       
# data['save_days'] = data['INST_DATE1'] - data['DETECT_DATE1']
# data['save_months'] = [x.days / 30 if not pd.isnull(x) else np.nan for x in data['save_days']]
# '''
# 2.    /  /    
# '''
# import seaborn as sns
# import matplotlib.pyplot as plt
# # 2.1 FAULT_TYPE
# print data['FAULT_TYPE'].isnull().sum()
# print data['FAULT_TYPE'].describe()
# print data['FAULT_TYPE'].value_counts()
# fig, axis0 = plt.subplots(1, 1)
# sns.countplot(x='FAULT_TYPE', data=data, ax=axis0)
# #   401-411  
# data['FAULT_TYPE'] = data['FAULT_TYPE'].str.strip()
# data['FAULT_TYPE_2'] = [x[0:2] for x in data['FAULT_TYPE'].values.astype('str')]
# data['FAULT_TYPE_4'] = [x[0:4] for x in data['FAULT_TYPE'].values.astype('str')]
# data = data[data['FAULT_TYPE_2'] == '04']
# data = data[(data['FAULT_TYPE_4'] != '0412') & (data['FAULT_TYPE'] != '04')]
# print data['FAULT_TYPE'].value_counts()
# print data['FAULT_TYPE_4'].value_counts()
# fig, axis0 = plt.subplots(1, 1)
# sns.countplot(x='FAULT_TYPE_4', data=data, ax=axis0)
# # 2.2 SORT_CODE
# print data['SORT_CODE'].isnull().sum()
# print data['SORT_CODE'].describe()
# fig, axis0 = plt.subplots(1, 1)
# sns.countplot(x='SORT_CODE', data=data, ax=axis0)
# #   SORT_CODE  10     
# data['SORT_CODE'] = data['SORT_CODE'].str.strip()
# data = data[data['SORT_CODE'] == '10']
# # 2.3 SPEC_CODE——  
# print data['SPEC_CODE'].isnull().sum()
# print data['SPEC_CODE'].describe()
# fig, axis0 = plt.subplots(1, 1)
# sns.countplot(x='SPEC_CODE', data=data, ax=axis0)
# # 2.4 COMM_MODE——  
# print data['COMM_MODE'].isnull().sum()
# print data['COMM_MODE'].describe()
# fig, axis0 = plt.subplots(1, 1)
# sns.countplot(x='COMM_MODE', data=data, ax=axis0)
# # 2.5 ORG_NO
# print data['ORG_NO'].isnull().sum()
# print data['ORG_NO'].describe()
# data['ORG_NO1'] = [x[:5] for x in data['ORG_NO'].values.astype('str')]
# data['ORG_NO1'].value_counts()
# # 2.6 ARRIVE_BATCH_NO——  
# print data['ARRIVE_BATCH_NO'].isnull().sum()
# data['ARRIVE_BATCH_NO'].value_counts()
# # 2.7 MANUFACTURER——  
# print data['MANUFACTURER'].isnull().sum()
# data['MANUFACTURER'].value_counts()
# # 2.8 all
# print data.describe()
# '''
# 3.      
# '''
# data.drop(['FAULT_DATE','SYNC_ORG_NO', 'INST_DATE', 'DETECT_DATE',
#            'work_days', 'save_days', 'FAULT_TYPE_2'],
#           axis=1, inplace=True)
# '''
# 4.    
# '''
# data.to_csv(r'E:\7 Python\data\20170616\zhejiang-new0620-reprocess.csv', index=False)

'''
Section 2 (commented out below): numeric cleanup — presumably the original
text described replacing negative work_months/save_months with NaN and
filling them with values drawn around the column mean (original comment
text was lost in scraping; inferred from the code that follows).
'''
# # 1.      
# data = pd.read_csv(r'E:\7 Python\data\20170616\zhejiang-new0620-reprocess.csv')
# data = data.drop_duplicates()   #   
# data.dropna()   #   
# print data.info()
# print data['COMM_MODE'].value_counts()
# data.drop(['"EQUIP_ID"', 'FAULT_TYPE', 'ORG_NO', 'SORT_CODE', 'COMM_MODE',
#            'ARRIVE_BATCH_NO', 'MANUFACTURER', 'FAULT_DATE1',
#            'INST_DATE1', 'DETECT_DATE1', 'INST_MONTH',
#            'DETECT_MONTH', 'FAULT_DATE1', 'FAULT_DATE1'],axis=1, inplace=True)
# print data.info()
# # 2.       
# # work_months
# #        
# print len(data['work_months'][data['work_months']<0])
# #         na
# data['work_months'][data['work_months'] < 0] = NaN
# #          ,     
# print len(data['work_months'][data['work_months']<0])
# #     
# count_nan_work_months = data['work_months'].isnull().sum()
# print count_nan_work_months
# #        
# work_months_mean = data['work_months'].mean()
# work_months_std = data['work_months'].std()
# #   ,        ,        
# rand_1 = np.random.randint(work_months_mean - work_months_std, work_months_mean + work_months_std, size = count_nan_work_months)
# data['work_months'][data['work_months'].isnull()] = rand_1
# # save_months
# print len(data['save_months'][data['save_months']<0])
# data['save_months'][data['save_months'] < 0] = NaN
# print len(data['save_months'][data['save_months']<0])
# count_nan_save_months = data['save_months'].isnull().sum()
# print count_nan_save_months
# save_months_mean = data['save_months'].mean()
# save_months_std = data['save_months'].std()
# rand_2 = np.random.randint(save_months_std - save_months_mean, save_months_mean + save_months_std, size = count_nan_save_months)
# data['save_months'][data['save_months'].isnull()] = rand_2
# #   
# print len(data['work_months'][data['work_months']<0])
# print len(data['save_months'][data['save_months']<0])
# print data.isnull().sum().sum()
# # 3.    
# data.to_csv(r'E:\7 Python\data\20170616\zhejiang-bagging-data.csv', index=False)

'''
bagging
'''
data = pd.read_csv(r'E:\7 Python\data\20170616\zhejiang-bagging-data.csv')
print data.info()
data_X = data.drop(['FAULT_TYPE_4'], axis=1)
data_y = data['FAULT_TYPE_4']
'''
    
'''
"category     :   string,one-hot  "
# FAULT_TYPE_4
print data['FAULT_TYPE_4'].dtypes
data['FAULT_TYPE_4'] = data['FAULT_TYPE_4'].astype(str)
print data['FAULT_TYPE_4'].dtypes
print data['FAULT_TYPE_4'].value_counts()
print pd.get_dummies(data['FAULT_TYPE_4'],prefix = 'FAULT_TYPE_4').head()
# SPEC_CODE
print data['SPEC_CODE'].dtypes
data['SPEC_CODE'] = data['SPEC_CODE'].astype(str)
print data['SPEC_CODE'].dtypes
print data['SPEC_CODE'].value_counts()
print pd.get_dummies(data['SPEC_CODE'],prefix = 'SPEC_CODE').head()
# FAULT_MONTH
print data['FAULT_MONTH'].dtypes
data['FAULT_MONTH'] = data['FAULT_MONTH'].astype(str)
print data['FAULT_MONTH'].dtypes
print data['FAULT_MONTH'].value_counts()
print pd.get_dummies(data['FAULT_MONTH'],prefix = 'FAULT_MONTH').head()
# ORG_NO1
print data['ORG_NO1'].dtypes
data['ORG_NO1'] = data['ORG_NO1'].astype(str)
print data['ORG_NO1'].dtypes
print data['ORG_NO1'].value_counts()
print pd.get_dummies(data['ORG_NO1'],prefix = 'ORG_NO1').head()
# SPEC_CODE
print data['SPEC_CODE'].dtypes
data['SPEC_CODE'] = data['SPEC_CODE'].astype(str)
print data['SPEC_CODE'].dtypes
print data['SPEC_CODE'].value_counts()
print pd.get_dummies(data['SPEC_CODE'],prefix = 'SPEC_CODE').head()
# one-hot coding
data_dummy = pd.get_dummies(data)
print data_dummy.head()
"numerical  (work_months save_months)  :     ,0/1     "
#      
print data_dummy.isnull().sum()
# print data_dummy['save_months'].isnull().sum()
# save_months_mean = data_dummy['save_months'].mean()
# data_dummy['save_months']= data_dummy['save_months'].fillna(save_months_mean)
# print data_dummy.isnull().sum().sum()
# 0/1     
numeric_cols = data.columns[data.dtypes != 'object']
print numeric_cols
numeric_col_means = data_dummy.loc[:,numeric_cols].mean()
numeric_col_std = data_dummy.loc[:,numeric_cols].std()
data_dummy.loc[:,numeric_cols] = (data_dummy.loc[:,numeric_cols] - numeric_col_means) / numeric_col_std
'''
    
'''
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix
#     
train, test, train_y, test_y = train_test_split(data_X, data_y, test_size=0.33, random_state=27)

# XGBoost multi-class classifier (sklearn wrapper API).
import xgboost as xgb
xgb_clf = xgb.XGBClassifier(
    learning_rate = 0.2,
    n_estimators = 720,
    max_depth = 9,
    colsample_bytree = 0.8,
    subsample = 0.9,
    objective = 'multi:softprob',   # per-class probabilities
    min_child_weight = 1,
    gamma = 2,
    seed = 27 )
# NOTE(review): the original did `param = xgb_clf.get_xgb_params();
# param['num_class'] = 11`, mutating a returned copy that never reached the
# booster.  The sklearn wrapper sets num_class from the training labels
# automatically, so that dead code is removed.
xgb_clf.fit(train, train_y, eval_metric='merror')
xgb_pred = xgb_clf.predict(test)
print(classification_report(test_y, xgb_pred))
print(confusion_matrix(test_y, xgb_pred))

# # DecisionTree
# from sklearn.tree import DecisionTreeClassifier
# dt_clf = DecisionTreeClassifier()
# dt_clf.fit(train,train_y)
# print(dt_clf)
# dt_clf_pred = dt_clf.predict(test)
# print(classification_report(test_y, dt_clf_pred))
# print(confusion_matrix(test_y, dt_clf_pred))

# # knn
# from sklearn.neighbors import KNeighborsClassifier
# knn_clf = KNeighborsClassifier(n_neighbors=25)
# knn_clf.fit(train, train_y)
# knn_pred = knn_clf.predict(test)
# print knn_pred
# knn_pred_proba=knn_clf.predict_proba(test)
# print knn_pred_proba
# # print model report:
# # print knn_clf.score(test, test_y)
# print(classification_report(test_y, knn_pred))
# print(confusion_matrix(test_y, knn_pred))

# # bagging
# from sklearn.ensemble import BaggingClassifier
# from sklearn.tree import DecisionTreeClassifier
# dt_clf = DecisionTreeClassifier()
# bagging_clf = BaggingClassifier(base_estimator=dt_clf,
#                                 n_estimators=10,
#                                 max_samples=1.0,
#                                 max_features=1.0,
#                                 bootstrap=True )
# bagging_clf.fit(train,train_y)
# bagging_pred = bagging_clf.predict(test)
# print bagging_pred
# print(classification_report(test_y, bagging_pred))
# print(confusion_matrix(test_y, bagging_pred))

# voting
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
dt_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier(n_neighbors=25)
xgb_clf = xgb.XGBClassifier(learning_rate = 0.2,n_estimators = 720,max_depth = 9,colsample_bytree = 0.8,subsample = 0.9,
                            objective = 'multi:softprob',min_child_weight = 1,gamma = 2,seed = 27)
voting_clf=VotingClassifier(estimators=[('dt_clf', dt_clf),('knn_clf', knn_clf),('xgb_clf',xgb_clf)])
voting_clf.fit(train,train_y)
votingg_pred = voting_clf.predict(test)
print votingg_pred
print(classification_report(test_y, votingg_pred))
print(confusion_matrix(test_y, votingg_pred))