# k-meansアルゴリズム(航空分析)

# 3968 ワード

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# 1. Load the raw data and drop unusable rows.

data = pd.read_csv(r'air_data.csv', encoding='gb18030')
print(data.shape)

# Discard every row that has a missing value in either yearly-sum column.
data = data.dropna(subset=['SUM_YR_1', 'SUM_YR_2'])
print(data.shape)

# Discard rows where both yearly sums are exactly zero.  By De Morgan this is
# the same as keeping rows where at least one of the two is non-zero.
has_nonzero_sum = (data['SUM_YR_1'] != 0) | (data['SUM_YR_2'] != 0)
# .copy() gives an independent frame so later column assignments are safe.
data = data.loc[has_nonzero_sum].copy()
print(data.shape)

# Shapes printed by the original author's run:
#   (62988, 44)
#   (62299, 44)
#   (62044, 44)



# 2. Derive the five clustering features L, R, F, M and C.

# (1) L: span between FFP_DATE and LOAD_TIME, in whole 30-day periods.
# L = LOAD_TIME - FFP_DATE
# normalize() truncates each timestamp to midnight while keeping the
# datetime64 dtype.  `.dt.date` would instead yield object-dtype Series of
# datetime.date; arithmetic on those is not guaranteed to come back as
# timedelta64, in which case the `.dt.days` access below raises
# AttributeError.
FFP_DATE = pd.to_datetime(data['FFP_DATE']).dt.normalize()
LOAD_TIME = pd.to_datetime(data['LOAD_TIME']).dt.normalize()
L = (LOAD_TIME - FFP_DATE) / 30
# Sample of L from the original author's run:
#   0       90 days 04:48:00
#   1       86 days 13:36:00
#   2       87 days 04:00:00
# .dt.days keeps only the whole-day component of each scaled timedelta,
# i.e. the number of complete 30-day periods.
data['L'] = L.dt.days
# Sample of data['L'] from the original author's run:
#   0        90
#   1        86
#   2        87

# (2) R: LAST_TO_END in whole 30-day periods (LAST_TO_END is presumably a
# day count -- confirm against the data dictionary).
# R = LAST_TO_END
LAST_TO_END = data['LAST_TO_END'] // 30
data['R'] = LAST_TO_END
# Floor division is deliberate; the author's sample of the first rows:
#   with  /30 : 0.033333, 0.233333, 0.366667, 3.233333
#   with //30 : 0, 0, 0, 3

# (3) F: copied unchanged from FLIGHT_COUNT.
# F = FLIGHT_COUNT
FLIGHT_COUNT = data['FLIGHT_COUNT']
data['F'] = FLIGHT_COUNT

# (4) M: copied unchanged from SEG_KM_SUM.
# M = SEG_KM_SUM
SEG_KM_SUM = data['SEG_KM_SUM']
data['M'] = SEG_KM_SUM

# (5) C: copied unchanged from avg_discount.
# C = avg_discount
avg_discount = data['avg_discount']
data['C'] = avg_discount
# print(data)



# 3. Min-max scaling of the clustering features.
def data_scal(data):
    """Min-max scale the L, R, F, M and C columns of ``data`` into [0, 1].

    For every source column a new column whose name carries a trailing
    space ('L ', 'R ', 'F ', 'M ', 'C ') is written into ``data``; the
    source columns themselves are left untouched.

    A column whose minimum equals its maximum has zero range.  Its scaled
    values are set to 0.0 rather than the NaN that 0/0 would produce
    (entries that were already missing stay NaN).

    Args:
        data: DataFrame with numeric columns 'L', 'R', 'F', 'M' and 'C'.

    Returns:
        The same DataFrame object, modified in place.
    """
    for name in ('L', 'R', 'F', 'M', 'C'):
        column = data[name]
        lowest = column.min()
        spread = column.max() - lowest
        shifted = column - lowest
        if spread == 0:
            # Constant column: every shifted value is already zero; only the
            # dtype needs aligning with the float result of the division.
            data[name + ' '] = shifted.astype(float)
        else:
            data[name + ' '] = shifted / spread
    return data
data = data_scal(data)


# 4. K-means clustering on the scaled features.
from sklearn.cluster import KMeans

scaled_cols = ['L ', 'R ', 'F ', 'M ', 'C ']
x = data[scaled_cols]
kms = KMeans(n_clusters=5)
y = kms.fit_predict(x)
data['index1'] = y  # cluster label assigned to each row
# print(y)

# Mean of every scaled feature within each cluster, indexed by cluster label.
center = data.groupby('index1')[scaled_cols].mean()
# Duplicate the first feature as a trailing column so each row can be drawn
# as a closed outline by the plotting code that follows.
center['L 2'] = center['L ']
print(center)
# Output of the original author's run (no random_state is set, so cluster
# numbering and values may differ between runs):
#               L         R         F         M         C        L 2
# index1
# 0       0.155161  0.615177  0.010573  0.011052  0.420774  0.155161
# 1       0.775075  0.083877  0.078814  0.045059  0.450950  0.775075
# 2       0.125223  0.106254  0.046397  0.028963  0.418198  0.125223
# 3       0.440348  0.097372  0.060490  0.035447  0.435148  0.440348
# 4       0.640012  0.598472  0.012530  0.011577  0.432559  0.640012
# print(data)


# Radar chart: one closed, semi-transparent outline per row of `center`.
plt.rcParams['font.sans-serif'] = 'SimHei'  # sans-serif font used for all text
plt.rcParams['axes.unicode_minus'] = False  # draw minus signs as ASCII hyphens
plt.figure()
labels = ['L ','R ','F ','M ','C ']
# One spoke per axis label instead of a hard-coded count.
dataLength = len(labels)
angles = np.linspace(0, 2*np.pi, dataLength, endpoint=False)
# Repeat the first angle at the end so every outline closes on itself.
angles2 = np.concatenate((angles, angles[:1]))
# print(angles2)
# Draw every row of `center`, however many clusters were fitted.  Each row
# is expected to hold len(labels) + 1 values (the features followed by a
# repeat of the first one) so that it lines up with angles2.
for profile in center.values:
    plt.polar(angles2, profile)
    plt.fill(angles2, profile, alpha=0.25)  # shade the interior
plt.xticks(angles, labels)
plt.show()