@@ -488,14 +488,13 @@ These are beyond the scope of this book.
488
488
``` {code-cell} ipython3
489
489
:tags: [remove-cell]
490
490
491
- penguin_data = pd.read_csv("data/penguins_standardized.csv")
492
491
# Set up the initial "random" label assignment the same as in the R book
493
- penguin_data ['label'] = [
492
+ penguins_standardized ['label'] = [
494
493
2, 2, 1, 1, 0, 0, 0, 1,
495
494
2, 2, 1, 2, 1, 2,
496
495
0, 1, 2, 2
497
496
]
498
- points_kmeans_init = alt.Chart(penguin_data ).mark_point(size=75, filled=True, opacity=1).encode(
497
+ points_kmeans_init = alt.Chart(penguins_standardized ).mark_point(size=75, filled=True, opacity=1).encode(
499
498
alt.X("flipper_length_standardized").title("Flipper Length (standardized)"),
500
499
alt.Y("bill_length_standardized").title("Bill Length (standardized)"),
501
500
alt.Color('label:N').legend(None),
@@ -577,9 +576,9 @@ def plot_kmean_iterations(iterations, data, centroid_init):
577
576
``` {code-cell} ipython3
578
577
:tags: [remove-cell]
579
578
580
- centroid_init = penguin_data .groupby('label').mean()
579
+ centroid_init = penguins_standardized .groupby('label').mean()
581
580
582
- glue('toy-kmeans-iter-1', plot_kmean_iterations(3, penguin_data .copy(), centroid_init.copy()), display=True)
581
+ glue('toy-kmeans-iter-1', plot_kmean_iterations(3, penguins_standardized .copy(), centroid_init.copy()), display=True)
583
582
```
584
583
585
584
``` {index} WSSD; total
@@ -624,12 +623,11 @@ are changing, and the algorithm terminates.
624
623
``` {code-cell} ipython3
625
624
:tags: [remove-cell]
626
625
627
- penguin_data = pd.read_csv("data/penguins_standardized.csv")
628
626
# Set up the initial "random" label assignment the same as in the R book
629
- penguin_data ['label'] = [1, 1, 2, 2, 0, 2, 0, 2, 2, 2, 1, 2, 0, 0, 0, 1, 1, 1]
630
- centroid_init = penguin_data .groupby('label').mean()
627
+ penguins_standardized ['label'] = [1, 1, 2, 2, 0, 2, 0, 2, 2, 2, 1, 2, 0, 0, 0, 1, 1, 1]
628
+ centroid_init = penguins_standardized .groupby('label').mean()
631
629
632
- points_kmeans_init = alt.Chart(penguin_data ).mark_point(size=75, filled=True, opacity=1).encode(
630
+ points_kmeans_init = alt.Chart(penguins_standardized ).mark_point(size=75, filled=True, opacity=1).encode(
633
631
alt.X("flipper_length_standardized").title("Flipper Length (standardized)"),
634
632
alt.Y("bill_length_standardized").title("Bill Length (standardized)"),
635
633
alt.Color('label:N').legend(None),
@@ -659,7 +657,7 @@ Random initialization of labels.
659
657
``` {code-cell} ipython3
660
658
:tags: [remove-cell]
661
659
662
- glue('toy-kmeans-bad-iter-1', plot_kmean_iterations(4, penguin_data .copy(), centroid_init.copy()), display=True)
660
+ glue('toy-kmeans-bad-iter-1', plot_kmean_iterations(4, penguins_standardized .copy(), centroid_init.copy()), display=True)
663
661
```
664
662
665
663
{numref}`toy-kmeans-bad-iter-1` shows what the iterations of K-means would look like with the unlucky random initialization shown in {numref}`toy-kmeans-bad-init-1`
@@ -681,13 +679,12 @@ and pick the clustering that has the lowest final total WSSD.
681
679
682
680
from sklearn.cluster import KMeans
683
681
684
-
685
- penguin_data = pd.read_csv("data/penguins_standardized.csv")
682
+ penguins_standardized = penguins_standardized.drop(columns=["label"])
686
683
687
684
dfs = []
688
685
inertias = []
689
686
for i in range(1, 10):
690
- data = penguin_data .copy()
687
+ data = penguins_standardized .copy()
691
688
knn = KMeans(n_clusters=i, n_init='auto')
692
689
knn.fit(data)
693
690
data['n_clusters'] = f'{i} Cluster' + ('' if i == 1 else 's')
0 commit comments