@@ -496,8 +496,7 @@ Random initialization of labels.
496
496
``` {code-cell} ipython3
497
497
:tags: [remove-cell]
498
498
499
- from sklearn.cluster import KMeans
500
-
499
+ from sklearn.metrics import euclidean_distances
501
500
502
501
def plot_kmean_iterations(iterations, data, centroid_init):
503
502
"""Plot kmeans cluster and label updates for multiple iterations"""
@@ -510,18 +509,15 @@ def plot_kmean_iterations(iterations, data, centroid_init):
510
509
data['bill_centroid'] = data['label'].map(centroid_init['bill_length_standardized'])
511
510
dfs.append(data.copy())
512
511
513
- cluster_columns = ['flipper_length_standardized', 'bill_length_standardized']
514
- knn = KMeans(init=centroid_init[cluster_columns], n_clusters=3, max_iter=1, n_init=1)
515
- knn.fit(data[cluster_columns])
516
-
517
- data['label'] = knn.labels_
518
512
data['iteration'] = f'Iteration {i}'
519
513
data['update_type'] = 'Label Update'
514
+ cluster_columns = ['flipper_length_standardized', 'bill_length_standardized']
515
+ data['label'] = np.argmin(euclidean_distances(data[cluster_columns], centroid_init), axis=1)
520
516
data['flipper_centroid'] = data['label'].map(centroid_init['flipper_length_standardized'])
521
517
data['bill_centroid'] = data['label'].map(centroid_init['bill_length_standardized'])
522
518
dfs.append(data.copy())
523
519
524
- centroid_init = data.groupby('label').mean(numeric_only=True )
520
+ centroid_init = data.groupby('label')[cluster_columns] .mean()
525
521
526
522
points = alt.Chart(
527
523
pd.concat(dfs),
@@ -553,12 +549,8 @@ def plot_kmean_iterations(iterations, data, centroid_init):
553
549
:tags: [remove-cell]
554
550
555
551
centroid_init = penguin_data.groupby('label').mean()
556
- # For some reason, this adjustment is needed to get roughly the same assignment of the points
557
- # to the clusters as in the R version
558
- centroid_init.loc[2] = [0.29, 0.36]
559
- centroid_init.loc[1] = [-0.01, -0.09]
560
552
561
- glue('toy-kmeans-iter-1', plot_kmean_iterations(3, penguin_data, centroid_init), display=True)
553
+ glue('toy-kmeans-iter-1', plot_kmean_iterations(3, penguin_data.copy() , centroid_init.copy() ), display=True)
562
554
```
563
555
564
556
``` {index} WSSD; total
@@ -618,7 +610,7 @@ These, however, are beyond the scope of this book.
618
610
619
611
penguin_data = pd.read_csv("data/penguins_standardized.csv")
620
612
# Set up the initial "random" label assignment the same as in the R book
621
- penguin_data['label'] = [2, 2, 0 , 0, 1 , 0, 1, 0, 0, 0 , 2, 0, 1, 1 , 1, 2, 2, 2 ]
613
+ penguin_data['label'] = [1, 1, 2, 2 , 0, 2 , 0, 2, 2, 2, 1 , 2, 0, 0, 0 , 1, 1, 1 ]
622
614
centroid_init = penguin_data.groupby('label').mean()
623
615
624
616
points_kmeans_init = alt.Chart(penguin_data).mark_point(size=75, filled=True, opacity=1).encode(
@@ -650,7 +642,7 @@ Random initialization of labels.
650
642
``` {code-cell} ipython3
651
643
:tags: [remove-cell]
652
644
653
- glue('toy-kmeans-bad-iter-1', plot_kmean_iterations(4, penguin_data, centroid_init), display=True)
645
+ glue('toy-kmeans-bad-iter-1', plot_kmean_iterations(4, penguin_data.copy() , centroid_init.copy() ), display=True)
654
646
```
655
647
656
648
{numref}`toy-kmeans-bad-iter-1` shows what the iterations of K-means would look like with the unlucky random initialization shown in {numref}`toy-kmeans-bad-init-1`
@@ -670,6 +662,9 @@ and pick the clustering that has the lowest final total WSSD.
670
662
``` {code-cell} ipython3
671
663
:tags: [remove-cell]
672
664
665
+ from sklearn.cluster import KMeans
666
+
667
+
673
668
penguin_data = pd.read_csv("data/penguins_standardized.csv")
674
669
675
670
dfs = []
0 commit comments