Python Machine Learning Basics — Part Two

3942 words

# coding = UTF-8

# ++++++++++++++++++++++++++++++++++++++++++++++++++++
# machine_five_ldmwp.py
# @Description : Python machine learning notes (feature extraction/selection)
# @Author      : Glen
# @Date        : 2016.8.16
# @Reference   : "Learning Data Mining with Python" (chapter on features)
# +++++++++++++++++++++++++++++++++++++++++++++++++++++

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
from sklearn.base import TransformerMixin
from sklearn.utils import as_float_array
from sklearn.pipeline import Pipeline

# ---------------------------------------
# Feature extraction and selection
# ----------------------------------------

# Selecting good features reduces noise and redundancy in the data.
# Fewer, more informative features are also easier to interpret.

# We use the "adult" census data set to study whether a person's income
# exceeds $50K per year.

# 1. Load the adult census data set (no header row in the raw file).
ADULT_COLUMNS = ["Age", "Work-Class", "fnlwgt", "Education",
                 "Education-Num", "Marital-Status", "Occupation",
                 "Relationship", "Race", "Sex", "Capital-gain",
                 "Capital-loss", "Hours-per-week", "Native-Country",
                 "Earnings-Raw"]
adult_filename = r'E:\data\bigdata\adult\adult.data'
adult = pd.read_csv(adult_filename, header=None, names=ADULT_COLUMNS)

# 2. Clean up: drop rows in which every column is missing
# (the raw file ends with blank lines).
adult.dropna(how='all', inplace=True)

# 3. Explore the data: summary statistics of a numeric column and the
# distinct values of a categorical one.
print(adult["Hours-per-week"].describe())
print(adult["Work-Class"].unique())

# 3'. Simple feature selection with scikit-learn.
# Build a toy 10x3 matrix and make the middle column constant.
X = np.arange(30).reshape((10, 3))
X[:, 1] = 1
# Note: every value in the second column of X is now 1.

# VarianceThreshold removes features whose variance does not exceed the
# threshold (zero by default) — i.e. the constant middle column here.
vt = VarianceThreshold()
Xt = vt.fit_transform(X)
# Inspect the per-feature variances to see why that column was dropped.
print(vt.variances_)

# Back to the adult data set: take a handful of numeric features.
selected_features = ["Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week"]
X = adult[selected_features].values
# Target: does this person earn more than $50K? (The raw values carry a
# leading space, hence the ' >50K' literal.)
y = (adult["Earnings-Raw"] == ' >50K').values

# Univariate selection: keep the 3 features with the best chi-squared score.
transformer = SelectKBest(score_func=chi2, k=3)
Xt_chi2 = transformer.fit_transform(X, y)

# Print each feature's chi-squared score to see which ones were kept.
print(transformer.scores_)

# Pearson correlation can also serve as a scoring function for feature
# selection, using SciPy's pearsonr() function.

# Define the score function:
def multivariate_pearsonr(X, y):
    """SelectKBest-compatible scorer based on Pearson correlation.

    Computes the Pearson correlation between each column of X and y,
    returning a tuple (scores, pvalues) where scores are the absolute
    correlation coefficients.
    """
    results = [pearsonr(X[:, col], y) for col in range(X.shape[1])]
    scores = np.array([abs(r) for r, _ in results])
    pvalues = np.array([p for _, p in results])
    return (scores, pvalues)

# Keep the 3 features with the highest absolute Pearson correlation.
transformer = SelectKBest(score_func=multivariate_pearsonr, k=3)
Xt_pearson = transformer.fit_transform(X, y)
print(transformer.scores_)

# Compare both feature subsets with a CART decision tree.
# FIX: the module-level import pulls cross_val_score from
# sklearn.cross_validation, which was removed in scikit-learn 0.20.
# Rebind it from its modern home so the evaluation keeps working.
from sklearn.model_selection import cross_val_score

clf = DecisionTreeClassifier(random_state=14)
scores_chi2 = cross_val_score(clf, Xt_chi2, y, scoring='accuracy')
scores_pearson = cross_val_score(clf, Xt_pearson, y, scoring='accuracy')

print("Chi2 performance: {0:.3f}".format(scores_chi2.mean()))
print("Pearson performance: {0:.3f}".format(scores_pearson.mean()))

# Creating your own transformer.
# The transformer API is simple: it receives data in one format and
# returns data in another format.

# A transformer must implement two key methods:
# - fit(): takes the training data and sets internal parameters.
# - transform(): performs the conversion; it accepts the training data
#   set or new data with the same format.

#      
class MeanDiscrete(TransformerMixin):
    """Binarize each feature against its column mean.

    fit() records the per-column mean of the training data; transform()
    returns a boolean array with True where a value is strictly greater
    than its column's fitted mean.
    """

    def fit(self, X, y=None):
        """Compute and store per-feature means.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : ignored; present for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        X = as_float_array(X)
        self.mean = np.mean(X, axis=0)
        return self

    def transform(self, X):
        """Return a boolean mask of values above their column mean.

        Raises
        ------
        ValueError
            If X does not have the same number of features as the data
            seen during fit().
        """
        X = as_float_array(X)
        # Raise an explicit error instead of `assert`, which is silently
        # stripped when Python runs with the -O flag.
        if X.shape[1] != self.mean.shape[0]:
            raise ValueError("X has {0} features, expected {1}".format(
                X.shape[1], self.mean.shape[0]))
        return X > self.mean

# Chain the custom discretiser with a CART classifier and cross-validate.
steps = [('mean_discrete', MeanDiscrete()),
         ('classifier', DecisionTreeClassifier(random_state=14))]
pipeline = Pipeline(steps)
scores_mean_discrete = cross_val_score(pipeline, X, y, scoring='accuracy')
print("Mean Discrete performance: {0:.3f}".format(scores_mean_discrete.mean()))