Friday 22 October 2021

How can I retrieve clustering calculations within a class?

I'm experimenting with a KM-based algorithm called ODKM, which uses the KMeans clustering algorithm.

I want to retrieve the clustering info (cluster_centers_, labels_, cluster_score, effect, dist) from within the ODKM class elegantly.

import math
from math import pow
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans


class ODKM:
    """Column-wise KMeans outlier scorer.

    ``fit`` runs an independent one-dimensional KMeans on every column of
    a DataFrame and derives one score per cluster: the share of rows in
    the cluster plus a contribution from every other cluster that shrinks
    as the distance between the two centers grows.  ``predict`` assigns
    each row to a cluster per column and sums the log10 of the matching
    cluster scores over all columns.

    Attributes:
        n_clusters: number of clusters requested from KMeans per column.
        effectiveness: base of the distance decay; the contribution of
            another cluster is ``ratio / effectiveness ** distance``.
        max_iter: ``max_iter`` forwarded to KMeans.
        kmeans: dict mapping column name -> fitted KMeans estimator, so
            ``kmeans[col].cluster_centers_`` and ``kmeans[col].labels_``
            are available after ``fit``.
        cluster_score: dict mapping column name -> pandas Series of
            scores indexed by cluster label (index name ``'cluster'``).
    """

    def __init__(self, n_clusters=15, effectiveness=500, max_iter=2):
        self.n_clusters = n_clusters
        self.effectiveness = effectiveness
        self.max_iter = max_iter
        self.kmeans = {}
        self.cluster_score = {}

    def fit(self, data):
        """Fit one KMeans per column of *data* and compute cluster scores.

        Args:
            data: pandas DataFrame; every column is clustered on its own.
        """
        length = len(data)
        for column in data.columns:
            values = data[column].values.reshape(-1, 1)
            kmeans = KMeans(n_clusters=self.n_clusters, max_iter=self.max_iter)
            self.kmeans[column] = kmeans
            kmeans.fit(values)
            labels = kmeans.predict(values)

            # bincount with minlength yields a ratio (possibly 0) for every
            # cluster; a groupby would drop clusters that received no rows
            # and the pairwise step below would then fail on the missing
            # label.
            ratio = np.bincount(labels, minlength=self.n_clusters) / length

            # Plain 1-D float vector of centers: avoids feeding 1-element
            # arrays to math.pow, which relies on deprecated implicit
            # array-to-scalar conversion.
            centers = kmeans.cluster_centers_[:, 0]
            max_distance = centers.max() - centers.min()

            if max_distance > 0:
                # Pairwise center distances, normalised to [0, 1].
                dist = np.abs(centers[:, None] - centers[None, :]) / max_distance
            else:
                # All centers coincide (e.g. constant column): treat every
                # pair as distance 0 instead of dividing by zero.
                dist = np.zeros((len(centers), len(centers)))

            # effect[i, k] is what cluster k adds to the score of cluster i.
            effect = ratio[None, :] / np.power(self.effectiveness, dist)
            np.fill_diagonal(effect, 0.0)  # a cluster does not add to itself

            scores = ratio + effect.sum(axis=1)
            self.cluster_score[column] = pd.Series(
                scores, index=pd.RangeIndex(len(scores), name='cluster')
            )

    def predict(self, data):
        """Return one score per row of *data*.

        For every column the row is assigned to a cluster by the KMeans
        fitted for that column, and log10 of that cluster's score is
        added to the row total.

        Args:
            data: pandas DataFrame whose columns were all seen by ``fit``.

        Returns:
            numpy array of length ``len(data)``.

        Raises:
            KeyError: if a column of *data* was not fitted.
        """
        score_array = np.zeros(len(data))
        for column in data.columns:
            kmeans = self.kmeans[column]
            cluster_score = self.cluster_score[column]

            assign = kmeans.predict(data[column].values.reshape(-1, 1))

            # Label-based lookup of all rows at once instead of a Python
            # loop over the rows.
            score_array += np.log10(cluster_score.loc[assign].to_numpy())

        return score_array

    def fit_predict(self, data):
        """Fit on *data* and return the scores of the same rows."""
        self.fit(data)
        return self.predict(data)

Test the results:

import pandas as pd

# Toy data set: two attributes, 25 rows each, written as runs of repeated
# values so the bulk of the data and the isolated values are easy to spot.
samples = {
    'attr1': [1] * 4 + [2] * 8 + [3, 5, 5, 6, 6] + [7] * 7 + [15],
    'attr2': [1] * 4 + [2] * 8 + [3, 5, 5, 6, 6] + [7] * 3 + [13] * 3 + [14, 15],
}
df = pd.DataFrame(data=samples)

# Fit ODKM with 3 clusters per column and score the same rows.
odkm_model = ODKM(n_clusters=3, max_iter=1)
result = odkm_model.fit_predict(df)

# Keep the scores next to the data they were computed from.
df['ODKM_Score'] = result
df

#for i in result:
#    print(round(i,2))

#results
#-0.51, -0.51 , -0.51 , -0.51, -0.51, -0.51, -0.51, -0.51, -0.51, -0.51, -0.51, -0.51, -0.51
#-0.78, -0.78, -0.78, -0.78, -0.78, -0.78, -0.78
#-0.99, -0.99, -0.99, -0.99
#-1.99

So the question is: is there an elegant way to have the ODKM class collect the clustering info and return it when I run it, so that, just as I do df['ODKM_Score'] = result, I could also fill df['Cluster_labels'], df['Cluster_centers'] and df['cluster_score'], keeping all the info within the main dataframe df and paving the way for visualization of the clustering results?

Normally, without wrapping things in a class, I would do this using km.labels_ and km.cluster_centers_:

n_clusters = 3

# Cluster the one-dimensional ODKM scores. The frame built earlier has no
# 'Score' column; the scores are stored under 'ODKM_Score'.
km = KMeans(init='k-means++', n_clusters=n_clusters).fit(df[['ODKM_Score']])

# minlength guarantees one count per cluster even when the highest-numbered
# cluster received no rows.
counts = np.bincount(km.labels_, minlength=n_clusters)

# NOTE(review): plt is presumably matplotlib.pyplot; it is not imported in
# the code shown here, so import it before running this snippet.
for label, (center, count) in enumerate(zip(km.cluster_centers_, counts)):
    print(center, count)
    plt.bar(center, count, width=0.2, label=label)

But I'm wondering whether I could collect this clustering info after fitting the model and predicting, maybe by defining a method at the end of the class named KM_summary that returns:

  • predicted cluster labels km.labels_
  • cluster centers km.cluster_centers_
  • dist, effect, cluster_score from the 2nd method, def fit(self, data)
  • score_array from the 3rd method, def predict(self, data)

Any help will be highly appreciated.



from How can I retrieve clustering calculations within a class?

No comments:

Post a Comment