# Python machine learning basics -- part 2
# (source article length: 3942 words)
# coding = UTF-8
# ++++++++++++++++++++++++++++++++++++++++++++++++++++
# machine_five_ldmwp.py
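# @ environment: python   (field label lost in extraction; reconstructed)
# @ author: Glen
# @ date: 2016.8.16
# @ language: Python      (field label lost in extraction; reconstructed)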
# +++++++++++++++++++++++++++++++++++++++++++++++++++++
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import TransformerMixin
from sklearn.utils import as_float_array
from sklearn.pipeline import Pipeline

try:
    # cross_val_score lives in sklearn.model_selection in current
    # scikit-learn releases.
    from sklearn.model_selection import cross_val_score
except ImportError:
    # Fallback for old scikit-learn releases that only ship the legacy module.
    from sklearn.cross_validation import cross_val_score
# ---------------------------------------
# Feature selection walkthrough on the Adult data set
# ----------------------------------------
# The script loads the raw CSV file, prints a couple of summaries, and the
# statements further down compare several ways of choosing or building
# features for a decision-tree classifier.
# 1. Load the data. The file is read with header=None, so the fifteen column
#    names are supplied explicitly.
adult_filename = r'E:\data\bigdata\adult\adult.data'
adult_columns = [
    "Age", "Work-Class", "fnlwgt", "Education",
    "Education-Num", "Marital-Status", "Occupation",
    "Relationship", "Race", "Sex", "Capital-gain",
    "Capital-loss", "Hours-per-week", "Native-Country",
    "Earnings-Raw",
]
adult = pd.read_csv(adult_filename, header=None, names=adult_columns)
# 2. Clean up: drop, in place, every row in which all fields are missing.
adult.dropna(how='all', inplace=True)
# 3. Explore: summary statistics of one numeric column and the distinct
#    values of one categorical column.
hours_per_week = adult["Hours-per-week"]
print(hours_per_week.describe())
work_class = adult["Work-Class"]
print(work_class.unique())
# 3'. Variance-based feature removal with scikit-learn
# Toy matrix: the integers 0..29 arranged as 10 rows by 3 columns.
X = np.arange(30).reshape(10, 3)
# Overwrite column index 1 with the constant 1 so it carries no variance.
X[:, 1] = 1
# VarianceThreshold() is created with its default arguments and fitted on
# the toy matrix; Xt holds the transformed result.
vt = VarianceThreshold()
Xt = vt.fit_transform(X)
# Show the per-column variances the transformer computed during fitting.
print(vt.variances_)
# Build the feature matrix from five numeric columns of the Adult data and
# a boolean target that is True where Earnings-Raw equals ' >50K' (the
# leading space is part of the compared value).
numeric_columns = ["Age", "Education-Num", "Capital-gain", "Capital-loss", "Hours-per-week"]
X = adult[numeric_columns].values
y = (adult["Earnings-Raw"] == ' >50K').values
# Keep the three columns with the highest chi-squared score.
transformer = SelectKBest(score_func=chi2, k=3)
Xt_chi2 = transformer.fit_transform(X, y)
# Print the chi-squared score of every input column.
print(transformer.scores_)
# Pearson-correlation scoring.
# SciPy's pearsonr() compares two one-dimensional sequences, so this wrapper
# applies it to every column of X in turn and collects the results in the
# (scores, pvalues) form that SelectKBest is given as score_func below.
def multivariate_pearsonr(X, y):
    """Score each feature column of X by its Pearson correlation with y.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Lists, DataFrames and ndarrays are accepted; the
        data is converted to a float ndarray.
    y : array-like of shape (n_samples,)
        Target values. Boolean labels are converted to 0.0 / 1.0.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``scores`` holds the absolute correlation coefficient of each column
        (the sign is dropped so strong negative correlations rank as high as
        strong positive ones); ``pvalues`` holds the matching p-values.
        Both have one entry per column of X.
    """
    # Coerce once up front: column slicing needs an ndarray, and float data
    # keeps pearsonr away from boolean/integer arithmetic.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    # Rows of the transpose are the columns of X.
    results = [pearsonr(column, y) for column in X.T]
    scores = np.array([abs(coefficient) for coefficient, _ in results])
    pvalues = np.array([pvalue for _, pvalue in results])
    return (scores, pvalues)
# Select the three columns with the highest absolute Pearson correlation,
# using the custom score function, and print every column's score.
transformer = SelectKBest(score_func=multivariate_pearsonr, k=3)
Xt_pearson = transformer.fit_transform(X, y)
print(transformer.scores_)
# Evaluate both feature subsets with the same CART-style decision tree
# (random_state fixed at 14) and cross-validated accuracy.
clf = DecisionTreeClassifier(random_state=14)
scores_chi2 = cross_val_score(clf, Xt_chi2, y, scoring='accuracy')
scores_pearson = cross_val_score(clf, Xt_pearson, y, scoring='accuracy')
# Report the mean accuracy across folds for each subset.
chi2_mean_accuracy = scores_chi2.mean()
pearson_mean_accuracy = scores_pearson.mean()
print("Chi2 performance: {0:.3f}".format(chi2_mean_accuracy))
print("Pearson performance: {0:.3f}".format(pearson_mean_accuracy))
# Custom transformer.
# The class below provides the two methods used by the pipeline that
# follows it:
#
# - fit(): learns the parameters (here, the per-column means) from the data
# - transform(): applies the learned parameters to a data set
#
class MeanDiscrete(TransformerMixin):
    """Binarise features by comparing each value with its column mean.

    ``fit`` records the mean of every column of the training data in
    ``self.mean``; ``transform`` returns a boolean array that is True where
    a value is strictly greater than the stored mean of its column.
    """

    def fit(self, X, y=None):
        """Store the column means of X and return self; y is unused."""
        data = as_float_array(X)
        self.mean = np.mean(data, axis=0)
        return self

    def transform(self, X):
        """Return a boolean array marking values above their column mean."""
        data = as_float_array(X)
        n_features = data.shape[1]
        # The data must have as many columns as the data seen by fit().
        assert n_features == self.mean.shape[0]
        return data > self.mean
# Chain the mean-based binarisation with a decision tree (random_state fixed
# at 14) and report the mean cross-validated accuracy of the whole pipeline.
pipeline_steps = [
    ('mean_discrete', MeanDiscrete()),
    ('classifier', DecisionTreeClassifier(random_state=14)),
]
pipeline = Pipeline(pipeline_steps)
scores_mean_discrete = cross_val_score(pipeline, X, y, scoring='accuracy')
mean_discrete_accuracy = scores_mean_discrete.mean()
print("Mean Discrete performance: {0:.3f}".format(mean_discrete_accuracy))