Skip to content

Question #55

@nnn-niu

Description

@nnn-niu

Sorry! I tried again, but the results still differ from run to run. Maybe something is wrong in my procedure. Could you help me check it?

#!/usr/bin/env python

# coding: utf-8

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gap_statistic import OptimalK
# Try the `samples_generator` module path first and fall back to the public
# `sklearn.datasets` location when that import is unavailable.
try:
    from sklearn.datasets.samples_generator import make_blobs
except ImportError:
    from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Synthetic 2-feature data set: 100,000 samples around 3 centers, generated
# with a fixed random_state so the input data itself is the same on every run.
X, y = make_blobs(n_samples=int(1e5), n_features=2, centers=3, random_state=25)
print('Data shape: ', X.shape)

Test the first example

The 1st run about gap statistic

# Run 1: estimate the cluster count over candidates 1..14 using the 'rust' backend.
optimalK = OptimalK(parallel_backend='rust')
n_clusters = optimalK(X, cluster_array=np.arange(1, 15))
print('Optimal clusters: ', n_clusters)

# Draw the gap curve, then highlight the row for the selected cluster count in red.
gap_df = optimalK.gap_df
chosen = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(chosen.n_clusters, chosen.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
image

The 2nd run about gap statistic

# Run 2: same configuration as before, repeated to compare results across runs.
optimalK = OptimalK(parallel_backend='rust')
n_clusters = optimalK(X, cluster_array=np.arange(1, 15))
print('Optimal clusters: ', n_clusters)

# Draw the gap curve, then highlight the row for the selected cluster count in red.
gap_df = optimalK.gap_df
chosen = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(chosen.n_clusters, chosen.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
image

The 3rd run about gap statistic

# Run 3: same configuration again, repeated to compare results across runs.
optimalK = OptimalK(parallel_backend='rust')
n_clusters = optimalK(X, cluster_array=np.arange(1, 15))
print('Optimal clusters: ', n_clusters)

# Draw the gap curve, then highlight the row for the selected cluster count in red.
gap_df = optimalK.gap_df
chosen = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(chosen.n_clusters, chosen.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
image

Test the second example: I will test another example by fixing random_state, defining the OptimalK instance, and passing in our own clustering function

def special_clustering_func(X, k):
    """Cluster ``X`` into ``k`` groups with a fixed-seed KMeans.

    Intended to be passed as the ``clusterer`` argument of ``OptimalK``.

    Args:
        X: Data to cluster; passed unchanged to ``KMeans.fit`` and
            ``KMeans.predict``.
        k: Number of clusters to fit.

    Returns:
        A ``(cluster_centers, labels)`` tuple: the fitted centers and the
        cluster index predicted for each row of ``X``.
    """
    # k must be forwarded explicitly; otherwise KMeans falls back to its
    # default cluster count and every candidate k yields the same model.
    m = KMeans(n_clusters=k, random_state=0)
    m.fit(X)
    return m.cluster_centers_, m.predict(X)

The first run about gap statistic

# Run 1 with the custom clusterer: 3 reference sets, candidate counts 1..14.
optimalk = OptimalK(clusterer=special_clustering_func)
n_clusters = optimalk(X, n_refs=3, cluster_array=range(1, 15))

# Draw the gap curve, then highlight the row for the selected cluster count in red.
gap_df = optimalk.gap_df
chosen = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(chosen.n_clusters, chosen.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
image

The second run about gap statistic

# Run 2 with the custom clusterer: same settings, repeated to compare results.
optimalk = OptimalK(clusterer=special_clustering_func)
n_clusters = optimalk(X, n_refs=3, cluster_array=range(1, 15))

# Draw the gap curve, then highlight the row for the selected cluster count in red.
gap_df = optimalk.gap_df
chosen = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(chosen.n_clusters, chosen.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
image

The third run about gap statistic

# Run 3 with the custom clusterer: same settings, repeated to compare results.
optimalk = OptimalK(clusterer=special_clustering_func)
n_clusters = optimalk(X, n_refs=3, cluster_array=range(1, 15))

# Draw the gap curve, then highlight the row for the selected cluster count in red.
gap_df = optimalk.gap_df
chosen = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(chosen.n_clusters, chosen.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
image

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions