I'm experimenting with a KMeans-based algorithm called ODKM, which uses the KMeans clustering algorithm internally.
I want to retrieve the clustering information (`cluster_centers_`, `labels_`, `cluster_score`, `effect`, and `dist`) from within the class ODKM in an elegant way.
import math
from math import pow
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
class ODKM:
    """Outlier scoring built on per-column, one-dimensional KMeans clustering.

    A separate ``KMeans`` model is fitted for every column of the training
    frame.  Each cluster of a column receives a score: its own share of the
    training rows plus the shares of all other clusters of that column, each
    damped by ``1 / effectiveness ** d`` where ``d`` is the distance between
    the two cluster centres divided by the largest centre-to-centre distance.
    The score of a row is the sum over columns of ``log10`` of the score of
    the cluster the row is assigned to.

    Parameters
    ----------
    n_clusters : int, default 15
        Number of clusters requested from every per-column ``KMeans``.
    effectiveness : float, default 500
        Base of the damping term; larger values make distant clusters
        contribute less to each other's score.
    max_iter : int, default 2
        ``max_iter`` forwarded to ``KMeans``.
    random_state : int or None, default None
        ``random_state`` forwarded to ``KMeans``; pass an int to make the
        clustering (and therefore the scores) reproducible.

    Attributes
    ----------
    kmeans : dict
        Column name -> fitted ``KMeans`` instance.
    cluster_score : dict
        Column name -> ``pandas.Series`` of cluster scores, indexed by
        cluster label (index name ``'cluster'``).
    cluster_centers_ : dict
        Column name -> 1-D array of cluster centres.
    labels_ : dict
        Column name -> cluster label of every training row.
    """

    def __init__(self, n_clusters=15, effectiveness=500, max_iter=2,
                 random_state=None):
        self.n_clusters = n_clusters
        self.effectiveness = effectiveness
        self.max_iter = max_iter
        self.random_state = random_state
        self.kmeans = {}
        self.cluster_score = {}
        self.cluster_centers_ = {}
        self.labels_ = {}

    @staticmethod
    def _score_clusters(centers, labels, n_clusters, effectiveness):
        """Return the array of cluster scores for one column."""
        centers = np.asarray(centers, dtype=float).ravel()
        # minlength keeps a slot (share 0) for clusters that got no rows, so
        # every label in range(n_clusters) can be looked up later.
        ratio = np.bincount(labels, minlength=n_clusters) / len(labels)
        gaps = np.abs(centers[:, None] - centers[None, :])
        spread = centers.max() - centers.min()
        if spread > 0:
            dist = gaps / spread
        else:
            # All centres coincide (e.g. a constant column): treat every
            # cluster as being at distance 0 instead of dividing by zero.
            dist = np.zeros_like(gaps)
        influence = 1.0 / np.power(float(effectiveness), dist)
        # A cluster does not add a damped copy of its own share to itself.
        np.fill_diagonal(influence, 0.0)
        return ratio + influence @ ratio

    def fit(self, data):
        """Fit one KMeans per column of *data* and compute cluster scores.

        Fills ``kmeans``, ``cluster_score``, ``cluster_centers_`` and
        ``labels_`` for every column of *data*.  Entries for columns fitted
        in an earlier call and absent from *data* are left untouched.
        """
        for column in data.columns:
            values = data[column].values.reshape(-1, 1)
            kmeans = KMeans(n_clusters=self.n_clusters,
                            max_iter=self.max_iter,
                            random_state=self.random_state)
            self.kmeans[column] = kmeans
            kmeans.fit(values)
            labels = kmeans.predict(values)
            centers = kmeans.cluster_centers_.ravel().copy()
            scores = self._score_clusters(
                centers, labels, self.n_clusters, self.effectiveness)
            self.cluster_centers_[column] = centers
            self.labels_[column] = labels
            self.cluster_score[column] = pd.Series(
                scores, index=pd.RangeIndex(self.n_clusters, name='cluster'))

    def predict(self, data):
        """Return one score per row of *data*.

        Every column of *data* must have been seen by ``fit``; an unknown
        column raises ``KeyError``.  The result is a 1-D float array holding,
        per row, the sum over columns of ``log10(cluster score)``.
        """
        score_array = np.zeros(len(data))
        for column in data.columns:
            kmeans = self.kmeans[column]
            cluster_score = self.cluster_score[column]
            assign = kmeans.predict(data[column].values.reshape(-1, 1))
            # Label-based lookup of each row's cluster score, done in one go.
            score_array += np.log10(cluster_score.loc[assign].to_numpy())
        return score_array

    def fit_predict(self, data):
        """Fit on *data* and return the scores of the same rows."""
        self.fit(data)
        return self.predict(data)

    def summary(self, data):
        """Return per-row clustering details for *data* as a new DataFrame.

        For every fitted column ``c`` (which must be present in *data*) the
        result holds ``c_cluster_label``, ``c_cluster_center`` and
        ``c_cluster_score``; the final column ``ODKM_Score`` is the score
        ``predict`` gives for the fitted columns.  Columns of *data* that
        were not fitted are ignored.  The result shares the index of *data*,
        so it can be joined back onto it.
        """
        fitted = list(self.kmeans)
        info = pd.DataFrame(index=data.index)
        for column in fitted:
            assign = self.kmeans[column].predict(
                data[column].values.reshape(-1, 1))
            info['{}_cluster_label'.format(column)] = assign
            info['{}_cluster_center'.format(column)] = (
                self.cluster_centers_[column][assign])
            info['{}_cluster_score'.format(column)] = (
                self.cluster_score[column].loc[assign].to_numpy())
        info['ODKM_Score'] = self.predict(data[fitted])
        return info
Testing the results:
import pandas as pd
# Toy data set: two integer attributes with 25 rows each.
attr1_values = [1] * 4 + [2] * 8 + [3] + [5] * 2 + [6] * 2 + [7] * 7 + [15]
attr2_values = ([1] * 4 + [2] * 8 + [3] + [5] * 2 + [6] * 2 + [7] * 3
                + [13] * 3 + [14, 15])
df = pd.DataFrame(data={'attr1': attr1_values, 'attr2': attr2_values})

# Fit ODKM on both attributes and score the very same rows.
odkm_model = ODKM(n_clusters=3, max_iter=1)
result = odkm_model.fit_predict(df)
df['ODKM_Score'] = result
df
# Printing round(score, 2) for every entry of `result` gave, in the author's
# run, the following values:
#   -0.51  (13 rows)
#   -0.78  (7 rows)
#   -0.99  (4 rows)
#   -1.99  (1 row)
So the question is: is there an elegant way to include the clustering information and return it when I run the class ODKM, so that its results are reflected in the DataFrame in the same way as `df['ODKM_Score'] = result`?
For example, I would like to write `df['Cluster_labels'] = result`, `df['Cluster_centers'] = result`, and `df['cluster_score'] = result`,
so that all the information lives in the main DataFrame `df`
and paves the way for visualizing the clustering results.
Normally, without wrapping the code in a class, I would do this using `km.labels_` and `km.cluster_centers_`:
# Plain-KMeans version: cluster one score column and draw one bar per
# cluster whose height is the number of rows assigned to that cluster.
# NOTE(review): `plt` is never imported in the visible text (presumably
# matplotlib.pyplot) and `df` must provide a 'Score' column — confirm.
n_clusters = 3
km = KMeans(init='k-means++', n_clusters=n_clusters)
km.fit(df[['Score']])
counts = np.bincount(km.labels_)
for label, (center, count) in enumerate(zip(km.cluster_centers_, counts)):
    print(center, count)
    plt.bar(center, count, width=0.2, label=label)
However, I'm wondering whether I could collect this clustering information after fitting and transforming the model, perhaps by defining a method at the end of the class named `KM_summary` that returns:
- the predicted cluster labels (`km.labels_`)
- the cluster centers (`km.cluster_centers_`)
- `dist`, `effect`, and `cluster_score` from the second method, `def fit(self, data):`
- `score_array` from the third method, `def predict(self, data):`
Any help will be highly appreciated.
from: How can I retrieve clustering calculations within a class?
No comments:
Post a Comment