Question

Sorry! I tried again, but the results still differ from run to run. Maybe something is wrong in my procedure. Could you help check it for me?


#!/usr/bin/env python
# coding: utf-8
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gap_statistic import OptimalK
try:
    from sklearn.datasets.samples_generator import make_blobs
except ImportError:
    from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Synthetic benchmark data: 100,000 two-feature samples generated around
# three centres, with a fixed seed so the input itself is reproducible.
X, y = make_blobs(
    n_samples=100_000,
    n_features=2,
    centers=3,
    random_state=25,
)
print('Data shape: ', X.shape)

# test the first example

## Run 1 of the gap statistic (rust backend)
# NOTE(review): no seed is passed to OptimalK here, so repeated runs are not
# guaranteed to agree -- confirm whether the installed gap_statistic version
# offers a way to seed it.
optimalK = OptimalK(parallel_backend='rust')
n_clusters = optimalK(X, cluster_array=np.arange(1, 15))
print('Optimal clusters: ', n_clusters)

# Plot the gap value for every candidate cluster count and mark the row for
# the selected count with a large red dot.
gap_df = optimalK.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
![image](https://user-images.githubusercontent.com/99719999/155247184-80ea00b8-be76-4402-a852-75c49bc0afa9.png)

## Run 2 of the gap statistic (rust backend, same inputs as before)
# NOTE(review): OptimalK is constructed without any seed, so this run may
# pick a different cluster count than another run -- confirm how the
# installed gap_statistic version can be seeded.
optimalK = OptimalK(parallel_backend='rust')
n_clusters = optimalK(X, cluster_array=np.arange(1, 15))
print('Optimal clusters: ', n_clusters)

# Gap curve over all candidate cluster counts; the selected count is
# highlighted in red.
gap_df = optimalK.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
![image](https://user-images.githubusercontent.com/99719999/155247192-310efef9-95cd-49c0-a7e1-28813dc84c56.png)

## Run 3 of the gap statistic (rust backend, same inputs as before)
# NOTE(review): OptimalK is constructed without any seed, so this run may
# pick a different cluster count than another run -- confirm how the
# installed gap_statistic version can be seeded.
optimalK = OptimalK(parallel_backend='rust')
n_clusters = optimalK(X, cluster_array=np.arange(1, 15))
print('Optimal clusters: ', n_clusters)

# Gap curve over all candidate cluster counts; the selected count is
# highlighted in red.
gap_df = optimalK.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
![image](https://user-images.githubusercontent.com/99719999/155247202-bc007fba-352f-42a6-887d-5d06d4374db4.png)


# test the second example: I will test another example by fixing random_state, defining the OptimalK instance and passing in our own clustering function
def special_clustering_func(X, k):
    """Cluster ``X`` into ``k`` groups with a fixed-seed KMeans.

    Intended to be passed as the ``clusterer`` argument of ``OptimalK``.

    Args:
        X: The samples to cluster.
        k: The number of clusters to fit.

    Returns:
        A ``(cluster_centers, labels)`` tuple: the fitted cluster centres and
        the predicted cluster index for every row of ``X``.
    """
    # The requested cluster count must be forwarded to KMeans. Without
    # n_clusters=k the model uses its default cluster count for every
    # candidate k, so the results no longer depend on k at all.
    m = KMeans(n_clusters=k, random_state=0)
    m.fit(X)
    return m.cluster_centers_, m.predict(X)

## Run 1 of the gap statistic with special_clustering_func as the clusterer
# NOTE(review): only the KMeans inside special_clustering_func is seeded;
# nothing seeds OptimalK itself, so runs may still differ -- confirm how the
# installed gap_statistic version can be seeded.
optimalk = OptimalK(clusterer=special_clustering_func)
n_clusters = optimalk(X, n_refs=3, cluster_array=range(1, 15))

# Gap curve over all candidate cluster counts; the selected count is
# highlighted in red.
gap_df = optimalk.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
![image](https://user-images.githubusercontent.com/99719999/155247216-1315afad-fa72-436a-94c6-a9fc5a64f73c.png)

## Run 2 of the gap statistic with special_clustering_func as the clusterer
# NOTE(review): only the KMeans inside special_clustering_func is seeded;
# nothing seeds OptimalK itself, so runs may still differ -- confirm how the
# installed gap_statistic version can be seeded.
optimalk = OptimalK(clusterer=special_clustering_func)
n_clusters = optimalk(X, n_refs=3, cluster_array=range(1, 15))

# Gap curve over all candidate cluster counts; the selected count is
# highlighted in red.
gap_df = optimalk.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
![image](https://user-images.githubusercontent.com/99719999/155247220-61a3adfe-e868-4916-9de8-caa3c7a1adb7.png)

## Run 3 of the gap statistic with special_clustering_func as the clusterer
# NOTE(review): only the KMeans inside special_clustering_func is seeded;
# nothing seeds OptimalK itself, so runs may still differ -- confirm how the
# installed gap_statistic version can be seeded.
optimalk = OptimalK(clusterer=special_clustering_func)
n_clusters = optimalk(X, n_refs=3, cluster_array=range(1, 15))

# Gap curve over all candidate cluster counts; the selected count is
# highlighted in red.
gap_df = optimalk.gap_df
selected = gap_df[gap_df.n_clusters == n_clusters]
plt.plot(gap_df.n_clusters, gap_df.gap_value, linewidth=3)
plt.scatter(selected.n_clusters, selected.gap_value, s=250, c='r')
plt.grid(True)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
![image](https://user-images.githubusercontent.com/99719999/155247223-ca656558-dd2d-4f05-80da-8d24b4ff7a80.png)


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Question #55

coding: utf-8

test the first example

The 1st run about gap statistic

The 2nd run about gap statistic

The 3rd run about gap statistic

test the second example: I will test another example by fixing random_state, defining the OptimalK instance and passing in our own clustering function

The first run about gap statistic

The second run about gap statistic

The third run about gap statistic

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Question #55

Description

coding: utf-8

test the first example

The 1st run about gap statistic

The 2nd run about gap statistic

The 3rd run about gap statistic

test the second example: I will test another example by fixing random_state, defining the OptimalK instance and passing in our own clustering function

The first run about gap statistic

The second run about gap statistic

The third run about gap statistic

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions