-
Notifications
You must be signed in to change notification settings - Fork 50
Description
Sorry! I tried again, but the results still differ from run to run. Maybe something is wrong in my procedure. Could you help me check it?
#!/usr/bin/env python
# coding: utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gap_statistic import OptimalK
# Prefer the current public location of make_blobs. The samples_generator
# submodule was deprecated and later removed from scikit-learn, so it is kept
# only as a fallback for legacy installations.
try:
    from sklearn.datasets import make_blobs
except ImportError:
    from sklearn.datasets.samples_generator import make_blobs
from sklearn.cluster import KMeans
# Build the synthetic dataset once; every run below reuses the same X.
# random_state is fixed, so the generated points do not change between runs.
blob_params = dict(n_samples=int(1e5), n_features=2, centers=3, random_state=25)
X, y = make_blobs(**blob_params)
print('Data shape: ', X.shape)
# Test the first example
# 1st run of the gap statistic
# Fit the gap statistic with the 'rust' backend over candidate counts 1..14.
optimalK = OptimalK(parallel_backend='rust')
n_clusters = optimalK(X, cluster_array=np.arange(1, 15))
print('Optimal clusters: ', n_clusters)

# Plot the gap curve and mark the row matching the returned cluster count.
gap_df = optimalK.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()

# 2nd run of the gap statistic
# Repeat the same fit with a fresh OptimalK instance ('rust' backend,
# candidate counts 1..14) so the result can be compared with the other runs.
optimalK = OptimalK(parallel_backend='rust')
n_clusters = optimalK(X, cluster_array=np.arange(1, 15))
print('Optimal clusters: ', n_clusters)

# Plot the gap curve and mark the row matching the returned cluster count.
gap_df = optimalK.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()

# 3rd run of the gap statistic
# Repeat the same fit once more with a fresh OptimalK instance ('rust'
# backend, candidate counts 1..14).
optimalK = OptimalK(parallel_backend='rust')
n_clusters = optimalK(X, cluster_array=np.arange(1, 15))
print('Optimal clusters: ', n_clusters)

# Plot the gap curve and mark the row matching the returned cluster count.
gap_df = optimalK.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()

# Test the second example: fix random_state, define the OptimalK instance, and pass in our own clustering function.
def special_clustering_func(X, k):
    """Cluster ``X`` into ``k`` groups using KMeans with a fixed seed.

    Intended to be passed as the ``clusterer`` argument of ``OptimalK``,
    which is presumed to call it once per candidate cluster count.

    Args:
        X: Data to cluster; passed to ``KMeans.fit`` and ``KMeans.predict``.
        k: Number of clusters to fit.

    Returns:
        A ``(cluster_centers, labels)`` tuple: the fitted model's
        ``cluster_centers_`` and its predicted label for every row of ``X``.
    """
    # k must be forwarded explicitly: without n_clusters=k, KMeans falls back
    # to its default cluster count for every candidate k, so each entry of the
    # gap curve would be computed from the same clustering.
    m = KMeans(n_clusters=k, random_state=0)
    m.fit(X)
    return m.cluster_centers_, m.predict(X)
# First run of the gap statistic
# Run the gap statistic with the user-supplied clusterer (n_refs=3) over
# candidate counts 1..14.
optimalk = OptimalK(clusterer=special_clustering_func)
n_clusters = optimalk(X, n_refs=3, cluster_array=range(1, 15))

# Plot the gap curve and mark the row matching the returned cluster count.
gap_df = optimalk.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()

# Second run of the gap statistic
# Repeat the custom-clusterer fit with a fresh OptimalK instance (n_refs=3,
# candidate counts 1..14) so the result can be compared with the other runs.
optimalk = OptimalK(clusterer=special_clustering_func)
n_clusters = optimalk(X, n_refs=3, cluster_array=range(1, 15))

# Plot the gap curve and mark the row matching the returned cluster count.
gap_df = optimalk.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()

# Third run of the gap statistic
# Repeat the custom-clusterer fit once more with a fresh OptimalK instance
# (n_refs=3, candidate counts 1..14).
optimalk = OptimalK(clusterer=special_clustering_func)
n_clusters = optimalk(X, n_refs=3, cluster_array=range(1, 15))

# Plot the gap curve and mark the row matching the returned cluster count.
gap_df = optimalk.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
