# Day 07: k-means algorithm (part 1)

# day07-k_means

# 01-
import pandas as pd
import numpy as np

#                 
#      ---           
#     
# ---------------------------------------------------------------------------
# 01- discretizing the `amounts` column
# ---------------------------------------------------------------------------
def equal_frequency_cut(values, group_num=5):
    """Split a numeric Series into (up to) ``group_num`` equal-frequency bins.

    :param values: pandas Series of numeric values.
    :param group_num: number of quantile groups requested (>= 1).
    :return: tuple ``(binned, bins)`` where ``binned`` is the categorical
        result of ``pd.cut`` and ``bins`` is the Series of quantile edges.
    :raises ValueError: if ``group_num`` is smaller than 1.
    """
    if group_num < 1:
        raise ValueError('group_num must be >= 1, got %r' % (group_num,))
    # linspace yields exactly group_num + 1 levels ending at 1.0; the former
    # np.arange(0, 1 + 1 / group_num, 1 / group_num) depends on float rounding
    # and can overshoot 1.0, which quantile() rejects.
    quantile_levels = np.linspace(0, 1, group_num + 1)
    bins = values.quantile(q=quantile_levels)
    binned = pd.cut(
        x=values,
        bins=bins.to_numpy(),
        include_lowest=True,  # keep the minimum value inside the first bin
        duplicates='drop',  # repeated quantile edges would otherwise raise
    )
    return binned, bins


def discretize_amounts_demo(path='./meal_order_detail.xlsx', group_num=5):
    """Load the order-detail sheet and bin ``amounts`` by equal frequency.

    The notes also sketched two alternatives that were never live code:
    one-hot encoding ``dishes_name`` with ``pd.get_dummies`` and equal-width
    bins built as ``np.arange(min, max + step, step)`` with
    ``step = int(np.ceil(ptp / group_num))``.

    :param path: Excel file to read.
    :param group_num: number of equal-frequency groups.
    :return: None (results are printed).
    """
    data = pd.read_excel(path)
    print('data :\n', data.loc[:, "amounts"])
    print('data amounts :', data.loc[:, "amounts"].max())
    print('data amounts :', data.loc[:, "amounts"].min())
    print('*' * 100)

    binned, bins = equal_frequency_cut(data.loc[:, 'amounts'], group_num)
    print('bins:', bins)
    # Replace the column outright: writing a categorical through .loc[:, col]
    # tries to squeeze it into the existing float column.
    data['amounts'] = binned
    print(' :\n', data.loc[:, 'amounts'])
    print('*' * 100)

    # Number of rows per bin (Series.value_counts replaces the deprecated
    # top-level pd.value_counts).
    res = data.loc[:, 'amounts'].value_counts()
    print('res:\n', res)


# ---------------------------------------------------------------------------
# 02- correlation between columns
# ---------------------------------------------------------------------------
def correlation_demo(path='./meal_order_detail.xlsx'):
    """Print correlation matrices for the order-detail sheet.

    ``DataFrame.corr`` accepts ``method`` in
    {'pearson', 'kendall', 'spearman'}; the default is used here.

    :param path: Excel file to read.
    :return: None (results are printed).
    """
    data = pd.read_excel(path)
    print('data:\n', data)
    print('data :\n', data.columns)
    print('*' * 100)

    res = data.loc[:, ['amounts', 'counts']].corr()
    print('res:\n', res)

    # Pairwise correlation for every ordered pair of distinct columns.
    data = data.loc[:, ['detail_id', 'order_id', 'dishes_id', 'counts', 'amounts']]
    for c1 in data.columns:
        for c2 in data.columns:
            if c1 != c2:
                print('%s %s:' % (c1, c2), end='')
                print(' :', data.loc[:, [c1, c2]].corr())


# ---------------------------------------------------------------------------
# 03-k_means
# ---------------------------------------------------------------------------
def build_data(path='./test.txt'):
    """Load the tab-separated sample file as a 2-D float array.

    :param path: text file to read (defaults to the original './test.txt').
    :return: data, a numpy array with one sample per row.
    """
    # ndmin=2 keeps a single-row file two-dimensional so that row/column
    # indexing further down still works.
    data = np.loadtxt(path, delimiter='\t', ndmin=2)
    print('data:\n', data)
    print('data:\n', type(data))
    return data


def center_init(data, k):
    """Pick ``k`` distinct rows of ``data`` at random as initial centers.

    :param data: 2-D array, one sample per row.
    :param k: number of cluster centers to pick.
    :return: center, array of shape (k, n_features) copied from ``data``.
    :raises ValueError: if ``k`` is not between 1 and the number of rows.
    """
    data = np.asarray(data)
    index_num = data.shape[0]
    if not 1 <= k <= index_num:
        raise ValueError(
            'k must be between 1 and the number of samples (%d), got %r'
            % (index_num, k)
        )
    # replace=False guarantees the same row is never chosen twice, which is
    # what the earlier hand-written "retry until unique" loop achieved.
    index = np.random.choice(index_num, k, replace=False)
    print(index)
    # Fancy indexing copies, so later center updates never touch `data`.
    center = data[index, :]
    return center


def distance(v1, v2):
    """Return the Euclidean distance between two points.

    :param v1: point 1 (array-like).
    :param v2: point 2 (array-like, same shape as ``v1``).
    :return: the distance as a float.
    """
    x = np.power(np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float), 2)
    dist = np.sqrt(np.sum(x))
    return dist


def k_means_owns(data, k):
    """Hand-written k-means clustering.

    :param data: 2-D array, one sample per row (any number of features).
    :param k: number of clusters.
    :return: tuple ``(new_data, center)``; ``new_data`` has one row per
        sample holding ``[distance to its center, cluster index]`` and
        ``center`` holds the final cluster centers, shape (k, n_features).
    :raises ValueError: if ``k`` is not between 1 and the number of rows.
    """
    data = np.asarray(data, dtype=float)
    # 1. initialise the centers (float copy so that means are not truncated)
    center = np.array(center_init(data, k), dtype=float)

    index_num = data.shape[0]
    row_ids = np.arange(index_num)
    new_data = np.zeros(shape=(index_num, 2))
    # -1 marks "not assigned yet"; a zero-filled column would look like a
    # genuine assignment to cluster 0.
    new_data[:, 1] = -1

    flag = True
    while flag:
        # 2. distance from every sample to every center, shape (n, k)
        diffs = data[:, np.newaxis, :] - center[np.newaxis, :, :]
        dists = np.sqrt(np.sum(np.power(diffs, 2), axis=2))
        # argmin keeps the first center on ties, like a strict `<` scan
        labels = dists.argmin(axis=1)

        # Another pass is needed only if some sample changed cluster.
        flag = bool(np.any(new_data[:, 1] != labels))
        new_data[:, 0] = dists[row_ids, labels]
        new_data[:, 1] = labels

        if flag:
            # 3. move each center to the mean of its members
            for p in range(k):
                p_cluster = data[labels == p, :]
                # An empty cluster keeps its previous center instead of
                # turning into NaN.
                if p_cluster.shape[0] > 0:
                    center[p, :] = p_cluster.mean(axis=0)

    return new_data, center


def show_res():
    """Placeholder for displaying the clustering result; does nothing yet.

    :return: None
    """
    pass


def main():
    """Load the sample data, cluster it with k = 4 and print the result.

    :return: None
    """
    data = build_data()
    k = 4
    new_data, center = k_means_owns(data, k)
    print('new_data:\n', new_data)
    print('center:\n', center)
    show_res()


def distance_demo():
    """Scratch example: Euclidean distance between two 1x2 points.

    For n-dimensional points the distance is the square root of the summed
    squared coordinate differences, e.g. sqrt((x1-x2)**2 + (y1-y2)**2).

    :return: None (results are printed).
    """
    v1 = np.array([[1, 2]])
    v2 = np.array([[3, 4]])
    x = np.power(v1 - v2, 2)
    print('x:\n', x)
    dist = np.sqrt(np.sum(x))
    print('dist:\n', dist)


if __name__ == '__main__':
    # Run the note sections in the order they appeared in the original file.
    discretize_amounts_demo()
    correlation_demo()
    main()
    distance_demo()