# k-means algorithm (airline customer analysis)
# 3968 words
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Load the raw records; the CSV file is decoded as GB18030.
data = pd.read_csv(r'air_data.csv', encoding='gb18030')
print(data.shape)

# Discard every row that is missing SUM_YR_1 or SUM_YR_2.
required_columns = ['SUM_YR_1', 'SUM_YR_2']
data.dropna(subset=required_columns, how='any', axis=0, inplace=True)
print(data.shape)

# Discard the rows in which SUM_YR_1 and SUM_YR_2 are both exactly zero.
mask = data['SUM_YR_1'].eq(0) & data['SUM_YR_2'].eq(0)
labels = data.index[mask]
data.drop(index=labels, inplace=True)
print(data.shape)

# Presumably the three shapes printed above on the original author's data.
'''
(62988, 44)
(62299, 44)
(62044, 44)
'''
# Build the five clustering features L, R, F, M and C as new columns of `data`.

# (1) L = (LOAD_TIME - FFP_DATE) expressed in whole 30-day periods.
# normalize() truncates each timestamp to midnight but keeps the datetime64
# dtype, so the subtraction is vectorised and directly yields a timedelta64
# Series.  Converting to Python `date` objects instead would give object-dtype
# Series and make the `.dt` accessor below depend on dtype re-inference.
FFP_DATE = pd.to_datetime(data['FFP_DATE']).dt.normalize()
LOAD_TIME = pd.to_datetime(data['LOAD_TIME']).dt.normalize()
L = (LOAD_TIME - FFP_DATE) / 30
# Sample of L (a timedelta divided by 30):
'''
0 90 days 04:48:00
1 86 days 13:36:00
2 87 days 04:00:00
'''
# Keep only the whole-day component, i.e. floor(day difference / 30).
data['L'] = L.dt.days
# Sample of data['L']:
'''
0 90
1 86
2 87
'''

# (2) R = LAST_TO_END floor-divided by 30.
# NOTE(review): presumably LAST_TO_END is measured in days, which would make R
# a count of 30-day periods like L -- confirm against the data dictionary.
LAST_TO_END = data['LAST_TO_END'] // 30
data['R'] = LAST_TO_END
# Sample values with true division (/30) versus floor division (//30):
'''
/30
0 0.033333
1 0.233333
2 0.366667
3 3.233333
'''
'''
//30
0 0
1 0
2 0
3 3
'''

# (3) F = FLIGHT_COUNT, copied unchanged.
FLIGHT_COUNT = data['FLIGHT_COUNT']
data['F'] = FLIGHT_COUNT

# (4) M = SEG_KM_SUM, copied unchanged.
SEG_KM_SUM = data['SEG_KM_SUM']
data['M'] = SEG_KM_SUM

# (5) C = avg_discount, copied unchanged.
avg_discount = data['avg_discount']
data['C'] = avg_discount
# Min-max scale the L, R, F, M and C feature columns.
def data_scal(data):
    """Min-max scale the L, R, F, M and C columns into new columns.

    For every source column ``X`` a column named ``'X '`` (the source name
    followed by a single space) is written, holding
    ``(X - X.min()) / (X.max() - X.min())``.

    A column whose values are all equal has a zero range; its scaled values
    are set to 0.0 instead of dividing by zero, which would fill the column
    with NaN.  Missing values in the source column stay missing.

    Args:
        data: DataFrame with numeric columns 'L', 'R', 'F', 'M' and 'C'.
            It is modified in place.

    Returns:
        The same DataFrame object, with the five scaled columns added.
    """
    for name in ('L', 'R', 'F', 'M', 'C'):
        column = data[name]
        low = column.min()
        span = column.max() - low
        # Dividing by 1 for a zero range keeps the result float-typed and
        # maps every (value - low) == 0 to 0.0.
        data[name + ' '] = (column - low) / (span if span != 0 else 1)
    return data
# Add the min-max scaled feature columns ('L ', 'R ', 'F ', 'M ', 'C ').
data = data_scal(data)

# Cluster the rows on the five scaled features with K-means.
from sklearn.cluster import KMeans

x = data[['L ', 'R ', 'F ', 'M ', 'C ']]
# A fixed random_state makes the centroid initialisation -- and therefore the
# cluster numbering and centres -- repeatable between runs.  n_init is pinned
# explicitly so the result does not depend on the default of the installed
# scikit-learn version.
kms = KMeans(n_clusters=5, n_init=10, random_state=0)
y = kms.fit_predict(x)
data['index1'] = y  # cluster label assigned to each row

# Mean of every scaled feature within each cluster (one row per label).
center = data[['L ', 'R ', 'F ', 'M ', 'C ', 'index1']].groupby(by='index1').mean()
# Repeat the first feature as a trailing column so every row holds six values,
# matching the six angles of the closed polygon drawn on the polar plot.
center['L 2'] = center['L ']
print(center)
# Sample output from an earlier, unseeded run; the numbers and the cluster
# numbering of a seeded run will generally differ.
'''
L R F M C L 2
index1
0 0.155161 0.615177 0.010573 0.011052 0.420774 0.155161
1 0.775075 0.083877 0.078814 0.045059 0.450950 0.775075
2 0.125223 0.106254 0.046397 0.028963 0.418198 0.125223
3 0.440348 0.097372 0.060490 0.035447 0.435148 0.440348
4 0.640012 0.598472 0.012530 0.011577 0.432559 0.640012
'''
# Radar (polar) chart with one filled polygon per cluster centre.
# SimHei is a CJK-capable font; presumably the axis labels were originally
# Chinese text -- confirm before changing the font.
plt.rcParams['font.sans-serif'] = 'SimHei'
# Draw minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False
plt.figure()

labels = ['L ', 'R ', 'F ', 'M ', 'C ']
dataLength = len(labels)
# One evenly spaced angle per feature ...
angles = np.linspace(0, 2 * np.pi, dataLength, endpoint=False)
# ... plus the first angle again so that each outline closes on itself.
angles2 = np.concatenate((angles, angles[:1]))

# Iterate over whatever rows `center` actually has instead of a hard-coded
# count, and label each outline so it can be matched to its cluster id.
for cluster_id, profile in zip(center.index, center.values):
    plt.polar(angles2, profile, label='cluster {}'.format(cluster_id))
    plt.fill(angles2, profile, alpha=0.25)  # translucent interior

plt.xticks(angles, labels)
plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
plt.show()