pythonデータマイニング学習ノート19.アヤメデータセットの可視化、線形回帰、決定ツリーパターン分析


#2018-04-05 16:57:26 April Thursday the 14 week, the 095 day SZ SSMR
[Python data mining course] 19. Iris dataset visualization, linear regression, and decision tree analysis
1.           
2.                   
3.           
4.Kmeans          

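1. Introduction to the iris dataset
    This section uses the iris dataset that ships with Python's Sklearn machine learning library. Each sample is described by four features:
    sepal length, sepal width, petal length and petal width.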
# Import the iris dataset loader from scikit-learn
from sklearn.datasets import load_iris  
# Load the dataset; the returned object exposes .data and .target
iris = load_iris()  
# Uncomment to inspect the feature matrix
# print(iris.data)
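target holds the class labels, one for each row of data, so it has 150 entries; the labels take 3 distinct values, one for each of the 3 iris species:
    Iris Setosa
    Iris Versicolour
    Iris Virginica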
2. Data visualization





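Before modelling it helps to explore the data, and Pandas makes this easy.
The snippet below reads the CSV file, prints summary statistics with describe(), and draws histograms of the columns with hist().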
import pandas
# URL of the raw iris CSV file in the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Column names are passed explicitly to read_csv via `names`
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names) # read the CSV data into a DataFrame
# Print summary statistics of the DataFrame
print(dataset.describe())

# Draw histograms of the DataFrame's columns
dataset.hist() 

dataset.plot() can also draw a scatter plot: pass kind='scatter' together with the names of the x and y columns.
import pandas
# URL of the raw iris CSV file in the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Column names are passed explicitly to read_csv via `names`
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names) # read the CSV data into a DataFrame
print(dataset.describe())
# Scatter plot of sepal-width (y axis) against sepal-length (x axis)
dataset.plot(x='sepal-length', y='sepal-width', kind='scatter')

dataset.plot(kind='kde') draws a KDE plot, where KDE stands for Kernel Density Estimate, an estimate of each column's distribution.
import pandas
# URL of the raw iris CSV file in the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Column names are passed explicitly to read_csv via `names`
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names) # read the CSV data into a DataFrame
print(dataset.describe())
# Kernel density estimate plot of the DataFrame's columns
dataset.plot(kind='kde')

Calling dataset.plot() with kind='box' draws a box plot for each column, showing how its values (y axis) are distributed.
import pandas 
# URL of the raw iris CSV file in the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Column names are passed explicitly to read_csv via `names`
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names) # read the CSV data into a DataFrame
print(dataset.describe())
# NOTE(review): this KDE plot repeats the previous snippet and looks like a
# copy-paste leftover; the box plot below is the subject of this section.
dataset.plot(kind='kde')
# Box plots in a 2x2 grid of subplots, each with its own x and y axis
dataset.plot(kind='box', subplots=True, layout=(2,2), 
             sharex=False, sharey=False)
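The radviz(), andrews_curves() and parallel_coordinates() functions plot all features at once, grouped by class; pay particular attention to the petal-length feature in the resulting plots: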
import pandas
# The plotting helpers live in pandas.plotting; the old pandas.tools.plotting
# module was deprecated and later removed, so importing from it fails on
# current pandas releases.
from pandas.plotting import andrews_curves, parallel_coordinates, radviz

# URL of the raw iris CSV file in the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Column names are passed explicitly to read_csv via `names`
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)

# Each helper takes the DataFrame and the name of the column holding the
# class labels.
# NOTE(review): the three calls are issued back to back without creating a
# new figure in between, exactly as in the original snippet - confirm whether
# separate figures are wanted.
radviz(dataset, 'class')

andrews_curves(dataset, 'class')

parallel_coordinates(dataset, 'class')
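A scatter matrix plots every pair of features against each other, with the distribution of each single feature on the diagonal, which makes relationships between features easy to spot.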
import pandas
# scatter_matrix lives in pandas.plotting; the old pandas.tools.plotting
# module was deprecated and later removed, so importing from it fails on
# current pandas releases.
from pandas.plotting import scatter_matrix

# URL of the raw iris CSV file in the UCI repository
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Column names are passed explicitly to read_csv via `names`
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
dataset = pandas.read_csv(url, names=names)

# Pairwise scatter plots with a kernel density estimate on the diagonal
scatter_matrix(dataset, alpha=0.2, figsize=(6, 6), diagonal='kde')
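3. Linear regression analysis
The first two feature columns of the iris data are extracted as the x and y values for a linear regression.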


import numpy as np  # numerical arrays
from sklearn.datasets import load_iris

hua = load_iris()
# Take feature column 0 as x and column 1 as y, each copied into its own
# (n_samples, 1) column vector.
n_samples = len(hua.data)
x = np.array(hua.data[:, 0]).reshape(n_samples, 1)
y = np.array(hua.data[:, 1]).reshape(n_samples, 1)

Use Sklearn's LinearRegression to fit the model, then predict a value for every x.
from sklearn.linear_model import LinearRegression
# Fit a linear regression of y on x
clf = LinearRegression()
clf.fit(x,y)
# Predictions for the same x values the model was fitted on
pre = clf.predict(x)


Use Matplotlib to plot the data points and the fitted line.
# Plot the samples, the fitted line and the per-sample residuals
import matplotlib.pyplot as plt

plt.scatter(x, y, s=100)
plt.plot(x, pre, "r-", linewidth=4)
# One green vertical segment per sample, from the observed y to the
# predicted value at the same x.
for x_i, y_i, pre_i in zip(x, y, pre):
    plt.plot([x_i, x_i], [y_i, pre_i], 'g-')
plt.show()
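In the figure, the red line is the fitted regression line, and each green segment joins a sample to the value predicted for it.
Finally, print the model's coefficient and intercept together with a residual statistic: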
# Print the fitted coefficient (slope) and intercept of the regression.
print(u"  ", clf.coef_)
print (u"  ", clf.intercept_)
# Mean squared error: square each residual first, then average.
# The earlier form np.mean(y-pre)**2 squared the *mean* residual instead,
# which is ~0 for least squares with an intercept (it printed ~1.9e-31) and
# says nothing about the quality of the fit.
print(np.mean((y - pre) ** 2))
# Sample output for the first two lines, as recorded by the author:
#    [[-0.05726823]]
#    [ 3.38863738]

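To predict the y value for a sample whose x value is 5.0, pass it to predict() as a 2-D array, one row per sample; the predicted result is [3.10229621].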
# Predict for a single sample with x = 5.0, passed as a 1x1 nested list
print(clf.predict([[5.0]]))

4. Decision tree analysis

The following code trains a decision tree classifier on the iris data:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()

# Fit a decision tree on the whole dataset, then predict labels for the
# very same samples it was fitted on.
clf = DecisionTreeClassifier()
clf.fit(iris.data, iris.target)
predicted = clf.predict(iris.data)

# First two feature columns, used as the plot coordinates
X = iris.data
L1 = list(X[:, 0])
L2 = list(X[:, 1])

# Scatter plot coloured by the predicted label
plt.scatter(L1, L2, c=predicted, marker='x')
plt.title("DTC")
plt.show()
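The data is split into a training set and a test set: rows 0-40, 50-90 and 100-140 (120 samples, 80% of the data) are used for training, and rows 40-50, 90-100 and 140-150 (30 samples, 20%) are used for testing. The code below trains on the former, predicts on the latter, and evaluates and plots the result: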


import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()

# Row indices of the two subsets: three 40-row ranges for training and the
# three 10-row ranges that follow each of them for testing.
train_rows = np.r_[0:40, 50:90, 100:140]
test_rows = np.r_[40:50, 90:100, 140:150]

# Training set
train_data = iris.data[train_rows, :]
train_target = iris.target[train_rows]
# Test set
test_data = iris.data[test_rows, :]
test_target = iris.target[test_rows]

# Train on the training rows, predict the held-out rows
clf = DecisionTreeClassifier()
clf.fit(train_data, train_target)
predict_target = clf.predict(test_data)

# Number of test samples whose predicted label equals the true label
print(sum(predict_target == test_target))

# Classification report and confusion matrix for the test predictions
print(metrics.classification_report(test_target, predict_target))
print(metrics.confusion_matrix(test_target, predict_target))

# Plot the test samples by their first two features, coloured by the
# predicted label.
X = test_data
L1 = list(X[:, 0])
L2 = list(X[:, 1])
plt.scatter(L1, L2, c=predict_target, marker='x')
plt.title("DecisionTreeClassifier")
plt.show()

5. KMeans clustering analysis




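KMeans is an unsupervised clustering algorithm: it needs no class labels (no training targets) and groups samples by how similar they are, following the idea that "birds of a feather flock together". The code is as follows: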

# -*- coding: utf-8 -*-
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris

iris = load_iris()

# The iris data contains three species, so request three clusters; the
# KMeans default of 8 clusters would split the data into more groups than
# there are species.
clf = KMeans(n_clusters=3)
# KMeans is unsupervised: only the features are used for fitting, so the
# class labels are not passed.
clf.fit(iris.data)
# Cluster index assigned to each sample
predicted = clf.predict(iris.data)

# First two feature columns, used as the plot coordinates
X = iris.data
L1 = [row[0] for row in X]
L2 = [row[1] for row in X]

# Scatter plot coloured by cluster index. The title previously read "DTC",
# copied over from the decision tree example.
plt.scatter(L1, L2, c=predicted, marker='s', s=20, cmap=plt.cm.Paired)
plt.title("KMeans")
plt.show()