Skip to content

Commit 0f8a5d8

Browse files
committed
Correct label assignment after each iteration
1 parent f273615 commit 0f8a5d8

File tree

1 file changed

+10
-15
lines changed

1 file changed

+10
-15
lines changed

source/clustering.md

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -496,8 +496,7 @@ Random initialization of labels.
496496
```{code-cell} ipython3
497497
:tags: [remove-cell]
498498
499-
from sklearn.cluster import KMeans
500-
499+
from sklearn.metrics import euclidean_distances
501500
502501
def plot_kmean_iterations(iterations, data, centroid_init):
503502
"""Plot kmeans cluster and label updates for multiple iterations"""
@@ -510,18 +509,15 @@ def plot_kmean_iterations(iterations, data, centroid_init):
510509
data['bill_centroid'] = data['label'].map(centroid_init['bill_length_standardized'])
511510
dfs.append(data.copy())
512511
513-
cluster_columns = ['flipper_length_standardized', 'bill_length_standardized']
514-
knn = KMeans(init=centroid_init[cluster_columns], n_clusters=3, max_iter=1, n_init=1)
515-
knn.fit(data[cluster_columns])
516-
517-
data['label'] = knn.labels_
518512
data['iteration'] = f'Iteration {i}'
519513
data['update_type'] = 'Label Update'
514+
cluster_columns = ['flipper_length_standardized', 'bill_length_standardized']
515+
data['label'] = np.argmin(euclidean_distances(data[cluster_columns], centroid_init), axis=1)
520516
data['flipper_centroid'] = data['label'].map(centroid_init['flipper_length_standardized'])
521517
data['bill_centroid'] = data['label'].map(centroid_init['bill_length_standardized'])
522518
dfs.append(data.copy())
523519
524-
centroid_init = data.groupby('label').mean(numeric_only=True)
520+
centroid_init = data.groupby('label')[cluster_columns].mean()
525521
526522
points = alt.Chart(
527523
pd.concat(dfs),
@@ -553,12 +549,8 @@ def plot_kmean_iterations(iterations, data, centroid_init):
553549
:tags: [remove-cell]
554550
555551
centroid_init = penguin_data.groupby('label').mean()
556-
# For some reason, this adjustment is needed to get roughly the same assignment of the points
557-
# to the clusters as in the R version
558-
centroid_init.loc[2] = [0.29, 0.36]
559-
centroid_init.loc[1] = [-0.01, -0.09]
560552
561-
glue('toy-kmeans-iter-1', plot_kmean_iterations(3, penguin_data, centroid_init), display=True)
553+
glue('toy-kmeans-iter-1', plot_kmean_iterations(3, penguin_data.copy(), centroid_init.copy()), display=True)
562554
```
563555

564556
```{index} WSSD; total
@@ -618,7 +610,7 @@ These, however, are beyond the scope of this book.
618610
619611
penguin_data = pd.read_csv("data/penguins_standardized.csv")
620612
# Set up the initial "random" label assignment the same as in the R book
621-
penguin_data['label'] = [2, 2, 0, 0, 1, 0, 1, 0, 0, 0, 2, 0, 1, 1, 1, 2, 2, 2]
613+
penguin_data['label'] = [1, 1, 2, 2, 0, 2, 0, 2, 2, 2, 1, 2, 0, 0, 0, 1, 1, 1]
622614
centroid_init = penguin_data.groupby('label').mean()
623615
624616
points_kmeans_init = alt.Chart(penguin_data).mark_point(size=75, filled=True, opacity=1).encode(
@@ -650,7 +642,7 @@ Random initialization of labels.
650642
```{code-cell} ipython3
651643
:tags: [remove-cell]
652644
653-
glue('toy-kmeans-bad-iter-1', plot_kmean_iterations(4, penguin_data, centroid_init), display=True)
645+
glue('toy-kmeans-bad-iter-1', plot_kmean_iterations(4, penguin_data.copy(), centroid_init.copy()), display=True)
654646
```
655647

656648
{numref}`toy-kmeans-bad-iter-1` shows what the iterations of K-means would look like with the unlucky random initialization shown in {numref}`toy-kmeans-bad-init-1`.
@@ -670,6 +662,9 @@ and pick the clustering that has the lowest final total WSSD.
670662
```{code-cell} ipython3
671663
:tags: [remove-cell]
672664
665+
from sklearn.cluster import KMeans
666+
667+
673668
penguin_data = pd.read_csv("data/penguins_standardized.csv")
674669
675670
dfs = []

0 commit comments

Comments
 (0)