KMeansクラスタリングのスクラッチ実装


KMeansクラスタリングを純粋なPython(numpyライブラリを使用)でスクラッチ実装しました。詳細は以下のコードとコメントを参照してください。
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 21 16:46:52 2020

@author: Lenovo
"""

import numpy as np

# Euclidean distance between two vectors
def euclidean_distance(x1, x2):
    """Return the Euclidean distance between two vectors.

    Args:
        x1: Sequence of numbers; its length decides how many components
            are compared.
        x2: Sequence of numbers, indexed at every position of ``x1``.

    Returns:
        The square root of the summed squared component differences,
        as computed by ``np.sqrt``.
    """
    # x2 is indexed (not zipped) so that a too-short x2 still raises.
    squared_total = sum(
        (component - x2[position]) ** 2
        for position, component in enumerate(x1)
    )
    return np.sqrt(squared_total)


# Initialise the centroids from randomly chosen samples
def centroids_init(k, X):
    """Pick ``k`` rows of ``X`` at random to serve as initial centroids.

    Rows are drawn without replacement whenever ``X`` has at least ``k``
    rows, so no sample index is used twice. Drawing the same row twice
    would create two identical centroids, one of which can never win an
    assignment and therefore ends up with an empty cluster.

    Args:
        k: Number of centroids to create.
        X: 2-D array of shape ``(n_samples, n_features)``.

    Returns:
        Float array of shape ``(k, n_features)`` whose rows are copies of
        rows of ``X``.
    """
    n_samples, n_features = X.shape

    # Repeats are only allowed as a fallback when there are fewer samples
    # than requested centroids; otherwise every centroid gets its own row.
    allow_repeats = k > n_samples
    chosen = np.random.choice(n_samples, size=k, replace=allow_repeats)

    # Copy into a float buffer so the result is float64 even for integer X.
    centroids = np.zeros((k, n_features))
    centroids[:] = X[chosen]
    return centroids

# Index of the centroid closest to a sample
def closest_centroid(sample, centroids):
    """Return the index of the centroid nearest to ``sample``.

    Args:
        sample: One data point.
        centroids: Iterable of centroid vectors.

    Returns:
        Index of the centroid with the smallest Euclidean distance to
        ``sample``. The first centroid wins ties, and 0 is returned when
        no centroid has a distance strictly below infinity.
    """
    best_index = 0
    # Start from infinity so any finite distance replaces it.
    best_distance = float('inf')
    for index, centroid in enumerate(centroids):
        candidate = euclidean_distance(sample, centroid)
        # Only a strictly smaller distance replaces the current best.
        if not candidate < best_distance:
            continue
        best_index, best_distance = index, candidate

    return best_index

# Assign every sample to its closest centroid
def create_clusters(centroids, k, X):
    """Group the sample indices of ``X`` by their nearest centroid.

    Args:
        centroids: Current centroid vectors.
        k: Number of clusters (one list is created per cluster).
        X: Iterable of samples.

    Returns:
        A list of ``k`` lists; list ``i`` holds the indices of the samples
        whose closest centroid is centroid ``i``. For ``k == 3`` the
        starting value is ``[[], [], []]``.
    """
    buckets = [[] for _ in range(k)]
    for row_index, row in enumerate(X):
        # Store the sample's index, not the sample itself.
        buckets[closest_centroid(row, centroids)].append(row_index)

    return buckets

# Recompute each centroid as the mean of its cluster
def calculate_centroids(clusters, k, X):
    """Compute each cluster's centroid as the mean of its member samples.

    An empty cluster has no mean (``np.mean`` would give NaN and emit a
    RuntimeWarning), so its centroid is re-seeded with a randomly chosen
    row of ``X`` instead. If ``X`` has no rows, the centroid stays zero.

    Args:
        clusters: List of lists of sample indices, one list per cluster.
        k: Number of clusters.
        X: 2-D array of shape ``(n_samples, n_features)``.

    Returns:
        Float array of shape ``(k, n_features)`` containing no NaN values
        as long as ``X`` itself contains none.
    """
    n_samples = np.shape(X)[0]
    n_features = np.shape(X)[1]
    centroids = np.zeros((k, n_features))
    for i, cluster in enumerate(clusters):
        if len(cluster) > 0:
            # Mean over the member rows, one value per feature.
            centroids[i] = np.mean(X[cluster], axis=0)
        elif n_samples > 0:
            # Empty cluster: restart it from a random sample rather than
            # propagating NaN into later distance computations.
            centroids[i] = X[np.random.randint(n_samples)]

    return centroids

# Turn the clusters into one label per sample
def get_cluster_labels(clusters, X):
    """Convert a list of clusters into a per-sample label array.

    Args:
        clusters: List of lists of sample indices, one list per cluster.
        X: Array whose first dimension is the number of samples.

    Returns:
        Float array of length ``n_samples`` where entry ``i`` is the index
        of the cluster that contains sample ``i``. Samples listed in no
        cluster keep the initial value 0.
    """
    labels = np.zeros(np.shape(X)[0])
    # Flatten to (sample index, cluster index) pairs in cluster order.
    assignments = (
        (member, label)
        for label, members in enumerate(clusters)
        for member in members
    )
    for member, label in assignments:
        labels[member] = label

    return labels


# K-Means main loop
def kmeans(X, k, max_iter):
    """Cluster the rows of ``X`` into ``k`` groups with K-Means.

    Alternates between assigning samples to their nearest centroid and
    recomputing the centroids, stopping early once an update leaves every
    centroid unchanged.

    Args:
        X: 2-D array of shape ``(n_samples, n_features)``.
        k: Number of clusters.
        max_iter: Maximum number of assign/update rounds. A value of 0 or
            less performs no update and labels the samples against the
            initial centroids.

    Returns:
        A tuple ``(labels, centroids)``: the per-sample cluster indices and
        the final centroids. The labels always refer to the returned
        centroids.
    """
    centroids = centroids_init(k, X)
    converged = False
    for _ in range(max_iter):
        clusters = create_clusters(centroids, k, X)
        prev_centroids = centroids
        centroids = calculate_centroids(clusters, k, X)
        # Converged when no centroid coordinate changed in this round.
        if not (centroids - prev_centroids).any():
            converged = True
            break

    if not converged:
        # Either the loop never ran (max_iter <= 0) or the centroids moved
        # in the last round; assign once more so the labels match the
        # centroids that are actually returned.
        clusters = create_clusters(centroids, k, X)

    return get_cluster_labels(clusters, X), centroids



# Demo: split five 2-D points into two clusters, at most 10 iterations.
X = np.array([
    [0, 2],
    [0, 0],
    [5, 0],
    [4, 5],
    [1, 1],
])
labels, centers = kmeans(X, 2, 10)
# Print the labels first, then the centroids, each starting on its own line.
print(labels, centers, sep="\n")

出力(初期重心がランダムに選ばれるため、ラベル番号は実行ごとに入れ替わる場合があります):
[1. 1. 0. 0. 1.]
[[4.5        2.5       ]
 [0.33333333 1.        ]]
参考文献:
WeChat公式アカウント「機械学習実験室」のオリジナル記事「数学導出+純Python実現機械学習アルゴリズム25:kmeansクラスタリング」