day 07 k-meansアルゴリズム(上)
32898 ワード
day07-k_means
01-
import pandas as pd
import numpy as np
#
# ---
#
# Load the order-detail sheet; this script discretises its `amounts` column.
data = pd.read_excel('./meal_order_detail.xlsx')
# print('data:\n', data)
# print('data :\n', data.columns)
# print('data :\n', data.loc[:, "dishes_name"])
print('data :\n', data.loc[:, "amounts"])
print('data amounts :', data.loc[:, "amounts"].max())
print('data amounts :', data.loc[:, "amounts"].min())
print('*' * 100)

# Alternative (disabled): one-hot encode the `dishes_name` column.
# res = pd.get_dummies(
#     data=data.loc[:, 'dishes_name'],
#     prefix=' ',
#     prefix_sep=':'
# )
# print(' :\n', res)

# Alternative (disabled): equal-width binning of `amounts`.
# group_num = 5
# max_amounts = data.loc[:, "amounts"].max()
# min_amounts = data.loc[:, "amounts"].min()
# ptp = max_amounts - min_amounts
# step = int(np.ceil(ptp / group_num))
# bins = np.arange(min_amounts, max_amounts + step, step)
# print(' :', bins)

# Active approach: equal-frequency binning driven by quantiles.
group_num = 5
# linspace yields exactly group_num + 1 levels ending at 1.0; a float-step
# arange can overshoot 1.0 for some group counts, which quantile() rejects.
bins = data.loc[:, 'amounts'].quantile(q=np.linspace(0, 1, group_num + 1))
# For group_num = 5 the levels are [0, 0.2, 0.4, 0.6, 0.8, 1.0].
print('bins:', bins)
# Plain column assignment replaces the column, so it takes the categorical
# dtype returned by pd.cut instead of being written into the numeric column.
data['amounts'] = pd.cut(
    x=data.loc[:, 'amounts'],
    # bins=5,  # an int here would request equal-width bins instead
    bins=bins,
    include_lowest=True,  # make the first interval include the minimum value
)
print(' :\n', data.loc[:, 'amounts'])
print('*' * 100)
# Count the rows in each bin (Series.value_counts replaces the deprecated
# top-level pd.value_counts).
res = data.loc[:, 'amounts'].value_counts()
print('res:\n', res)
02-
import pandas as pd
# ,
# ---corr
# ---amounts counts
# Load the order-detail sheet and inspect pairwise column correlation.
data = pd.read_excel('./meal_order_detail.xlsx')
print('data:\n', data)
print('data :\n', data.columns)
print('*' * 100)

# Correlation matrix of `amounts` and `counts`.
# DataFrame.corr accepts method in {'pearson', 'kendall', 'spearman'};
# the default is used here.
res = data.loc[:, ['amounts', 'counts']].corr()
print('res:\n', res)

# Restrict to the columns of interest, then print the 2x2 correlation matrix
# for every ordered pair of distinct columns.
data = data.loc[:, ['detail_id', 'order_id', 'dishes_id', 'counts', 'amounts']]
for c1 in data.columns:
    for c2 in data.columns:
        if c1 != c2:
            print('%s %s:' % (c1, c2), end='')
            print(' :', data.loc[:, [c1, c2]].corr())
03-k_means
import numpy as np
def build_data(path='./test.txt'):
    """
    Load the sample set from a tab-delimited text file.

    Other ways to read a .txt file would be plain ``open``, or pandas
    ``read_table``; ``numpy.loadtxt`` is used here.

    :param path: location of the tab-delimited file (defaults to ./test.txt)
    :return: data -- the samples as a ``numpy.matrix``
    """
    data = np.loadtxt(path, delimiter='\t')
    # np.mat was removed in NumPy 2.0; np.asmatrix is the supported spelling
    # and returns the same matrix type.
    data = np.asmatrix(data)
    print('data:\n', data)
    print('data:\n', type(data))
    return data
def center_init(data, k):
    """
    Pick k distinct rows of ``data`` at random as the initial cluster centers.

    An earlier draft drew random row numbers one at a time and retried on
    duplicates; sampling without replacement does the same job in one call.

    :param data: 2-D sample container indexable as ``data[rows, :]``
    :param k: number of centers to draw
    :return: center -- the k selected rows (a copy, same container type as data)
    :raises ValueError: raised by ``np.random.choice`` when k exceeds the
        number of rows
    """
    index_num = data.shape[0]
    # replace=False guarantees the k row numbers are all different.
    index = np.random.choice(index_num, k, replace=False)
    print(index)
    # Indexing with an integer array copies the rows, so later updates to the
    # centers do not modify the samples.
    center = data[index, :]
    return center
def distance(v1, v2):
    """
    Euclidean distance between two points.

    :param v1: point 1 (1-D array, or a 1 x n array/matrix)
    :param v2: point 2, same shape as v1
    :return: the distance as a float scalar
    """
    # Work on float arrays so integer or unsigned inputs cannot wrap around
    # in the subtraction or overflow when squared.
    diff = np.asarray(v1, dtype=float) - np.asarray(v2, dtype=float)
    dist = np.sqrt(np.sum(np.square(diff)))
    return dist
def k_means_owns(data, k):
    """
    Hand-written k-means clustering.

    :param data: 2-D samples, one row per sample
    :param k: number of clusters
    :return: (new_data, center) -- new_data has one row per sample holding
        [distance to its nearest center, index of that center]; center holds
        the k final cluster centers
    """
    # 1. Initialise the centers from k random samples. Cast to float so that
    # mean-based updates are not truncated when the samples are integers.
    center = center_init(data, k).astype(float)
    index_num = data.shape[0]
    # Column 0: distance to the nearest center, column 1: its index.
    new_data = np.zeros(shape=(index_num, 2))
    # -1 marks "not assigned yet"; starting at 0 would look like a valid
    # cluster and could hide the very first assignment from the change check.
    new_data[:, 1] = -1
    flag = True
    while flag:
        flag = False
        # 2. Assign every sample to its nearest center.
        for i in range(index_num):
            min_dist = np.inf
            min_index = -1
            for j in range(k):
                dist = distance(data[i, :], center[j, :])
                if dist < min_dist:
                    min_dist = dist
                    min_index = j
            # Any change of membership means another iteration is needed.
            if new_data[i, 1] != min_index:
                flag = True
            # Always refresh the row so the stored distance matches the
            # current centers even when the membership is unchanged.
            new_data[i, :] = min_dist, min_index
        if flag:
            # 3. Move each center to the mean of its members.
            for p in range(k):
                bool_index = new_data[:, 1] == p
                if not bool_index.any():
                    # Empty cluster: keep the old center rather than
                    # overwrite it with the NaN mean of zero rows.
                    continue
                # Column-wise mean works for any number of features.
                center[p, :] = data[bool_index, :].mean(axis=0)
    return new_data, center
def show_res():
    """
    Placeholder for displaying the clustering result; not implemented yet.

    :return: None
    """
    pass
def main():
    """
    Load the samples, cluster them with k-means and print the outcome.

    :return: None
    """
    # Load the data set.
    data = build_data()
    # Run the hand-written k-means with a fixed number of clusters.
    k = 4
    new_data, center = k_means_owns(data, k)
    print('new_data:\n', new_data)
    print('center:\n', center)
    # Display the result (currently a stub).
    show_res()
# Run the clustering demo only when this file is executed as a script.
if __name__ == '__main__':
    main()
import numpy as np
# Worked example: Euclidean distance between two points stored as 1 x 2 arrays.
v1 = np.array([[1, 2]])  # 2-D array holding a single point
v2 = np.array([[3, 4]])
# For a higher-dimensional point v == [a, b, c, d, e, f] the same idea
# applies; in two dimensions: sqrt((x1 - x2) ** 2 + (y1 - y2) ** 2)

# Approach 1 (disabled): accumulate the squared differences in a loop.
# sum_dist = 0
# for i in range(len(v1)):
#     sum_dist += (v1[i] - v2[i]) ** 2
# dist = np.sqrt(sum_dist)

# Approach 2: vectorised -- square element-wise, sum, then take the root.
x = np.power(v1 - v2, 2)
print('x:\n', x)
dist = np.sqrt(np.sum(x))
print('dist:\n', dist)