
Commit 9d72fa8

Merge pull request #301 from UBC-DSCI/assign-apply-lambda
Handling assign, apply, and lambdas
2 parents ee599ab + 721b047 commit 9d72fa8

File tree

9 files changed: +253 -317 lines changed

source/classification1.md

Lines changed: 20 additions & 17 deletions
@@ -628,15 +628,16 @@ Scatter plot of concavity versus perimeter with new observation represented as a
 ```{code-cell} ipython3
 new_obs_Perimeter = 0
 new_obs_Concavity = 3.5
-(
-    cancer
-    [["Perimeter", "Concavity", "Class"]]
-    .assign(dist_from_new = (
+cancer["dist_from_new"] = (
     (cancer["Perimeter"] - new_obs_Perimeter) ** 2
     + (cancer["Concavity"] - new_obs_Concavity) ** 2
-    )**(1/2))
-    .nsmallest(5, "dist_from_new")
-)
+)**(1/2)
+cancer.nsmallest(5, "dist_from_new")[[
+    "Perimeter",
+    "Concavity",
+    "Class",
+    "dist_from_new"
+]]
 ```

 ```{code-cell} ipython3
@@ -751,16 +752,18 @@ three predictors.
 new_obs_Perimeter = 0
 new_obs_Concavity = 3.5
 new_obs_Symmetry = 1
-(
-    cancer
-    [["Perimeter", "Concavity", "Symmetry", "Class"]]
-    .assign(dist_from_new = (
-        (cancer["Perimeter"] - new_obs_Perimeter) ** 2
-        + (cancer["Concavity"] - new_obs_Concavity) ** 2
-        + (cancer["Symmetry"] - new_obs_Symmetry) ** 2
-    )**(1/2))
-    .nsmallest(5, "dist_from_new")
-)
+cancer["dist_from_new"] = (
+    (cancer["Perimeter"] - new_obs_Perimeter) ** 2
+    + (cancer["Concavity"] - new_obs_Concavity) ** 2
+    + (cancer["Symmetry"] - new_obs_Symmetry) ** 2
+)**(1/2)
+cancer.nsmallest(5, "dist_from_new")[[
+    "Perimeter",
+    "Concavity",
+    "Symmetry",
+    "Class",
+    "dist_from_new"
+]]
 ```

 Based on $K=5$ nearest neighbors with these three predictors we would classify
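The refactor in this file replaces an `.assign(...)` chain with a direct column assignment followed by `nsmallest`. A minimal sketch of that pattern, using a small invented data frame rather than the book's `cancer` data:

```python
import pandas as pd

# Invented stand-in for the book's `cancer` data frame
cancer = pd.DataFrame({
    "Perimeter": [0.2, -1.0, 2.5, 0.1, 3.0, -0.5],
    "Concavity": [3.0, 1.0, 4.0, 3.6, 0.5, 2.0],
    "Class": ["Malignant", "Benign", "Malignant", "Malignant", "Benign", "Benign"],
})
new_obs_Perimeter = 0
new_obs_Concavity = 3.5

# Direct column assignment, as in the updated book text
cancer["dist_from_new"] = (
    (cancer["Perimeter"] - new_obs_Perimeter) ** 2
    + (cancer["Concavity"] - new_obs_Concavity) ** 2
) ** (1 / 2)

# The 5 observations closest to the new point
nearest = cancer.nsmallest(5, "dist_from_new")[[
    "Perimeter", "Concavity", "Class", "dist_from_new"
]]
```

The direct assignment mutates `cancer` in place, which is why the updated text no longer needs the intermediate chained data frame that `.assign` produced.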

source/classification2.md

Lines changed: 14 additions & 17 deletions
@@ -606,18 +606,16 @@ knn_pipeline
 ```

 Now that we have a $K$-nearest neighbors classifier object, we can use it to
-predict the class labels for our test set. We will use the `assign` method to
-augment the original test data with a column of predictions, creating the
-`cancer_test_predictions` data frame. The `Class` variable contains the actual
+predict the class labels for our test set and
+augment the original test data with a column of predictions.
+The `Class` variable contains the actual
 diagnoses, while the `predicted` contains the predicted diagnoses from the
 classifier. Note that below we print out just the `ID`, `Class`, and `predicted`
 variables in the output data frame.

 ```{code-cell} ipython3
-cancer_test_predictions = cancer_test.assign(
-    predicted = knn_pipeline.predict(cancer_test[["Smoothness", "Concavity"]])
-)
-cancer_test_predictions[["ID", "Class", "predicted"]]
+cancer_test["predicted"] = knn_pipeline.predict(cancer_test[["Smoothness", "Concavity"]])
+cancer_test[["ID", "Class", "predicted"]]
 ```

 ### Evaluate performance
@@ -632,11 +630,11 @@ number of predictions. First we filter the rows to find the number of correct pr
 and then divide the number of rows with correct predictions by the total number of rows
 using the `shape` attribute.
 ```{code-cell} ipython3
-correct_preds = cancer_test_predictions[
-    cancer_test_predictions["Class"] == cancer_test_predictions["predicted"]
+correct_preds = cancer_test[
+    cancer_test["Class"] == cancer_test["predicted"]
 ]

-correct_preds.shape[0] / cancer_test_predictions.shape[0]
+correct_preds.shape[0] / cancer_test.shape[0]
 ```

 The `scikit-learn` package also provides a more convenient way to do this using
@@ -669,15 +667,15 @@ arguments: the actual labels first, then the predicted labels second.

 ```{code-cell} ipython3
 pd.crosstab(
-    cancer_test_predictions["Class"],
-    cancer_test_predictions["predicted"]
+    cancer_test["Class"],
+    cancer_test["predicted"]
 )
 ```

 ```{code-cell} ipython3
 :tags: [remove-cell]
-_ctab = pd.crosstab(cancer_test_predictions["Class"],
-    cancer_test_predictions["predicted"]
+_ctab = pd.crosstab(cancer_test["Class"],
+    cancer_test["predicted"]
 )

 c11 = _ctab["Malignant"]["Malignant"]
@@ -1205,15 +1203,14 @@ We will also rename the parameter name column to be a bit more readable,
 and drop the now unused `std_test_score` column.

 ```{code-cell} ipython3
+accuracies_grid["sem_test_score"] = accuracies_grid["std_test_score"] / 10**(1/2)
 accuracies_grid = (
     accuracies_grid[[
         "param_kneighborsclassifier__n_neighbors",
         "mean_test_score",
-        "std_test_score"
+        "sem_test_score"
     ]]
-    .assign(sem_test_score=accuracies_grid["std_test_score"] / 10**(1/2))
     .rename(columns={"param_kneighborsclassifier__n_neighbors": "n_neighbors"})
-    .drop(columns=["std_test_score"])
 )
 accuracies_grid
 ```
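The accuracy and confusion-matrix computations this file switches over to `cancer_test` reduce to the following sketch (toy labels, not the book's actual test set):

```python
import pandas as pd

# Invented stand-in for the book's `cancer_test` data frame with predictions
cancer_test = pd.DataFrame({
    "Class":     ["Malignant", "Benign", "Malignant", "Benign", "Benign"],
    "predicted": ["Malignant", "Benign", "Benign", "Benign", "Malignant"],
})

# Accuracy: fraction of rows where the prediction matches the true label
correct_preds = cancer_test[cancer_test["Class"] == cancer_test["predicted"]]
accuracy = correct_preds.shape[0] / cancer_test.shape[0]

# Confusion matrix: actual labels first, predicted labels second
ctab = pd.crosstab(cancer_test["Class"], cancer_test["predicted"])
```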

source/clustering.md

Lines changed: 5 additions & 5 deletions
@@ -856,14 +856,14 @@ order to do that, we first need to augment our
 original `penguins` data frame with the cluster assignments.
 We can access these using the `labels_` attribute of the clustering object
 ("labels" is a common alternative term to "assignments" in clustering), and
-add them to the data frame using `assign`.
+add them to the data frame.

 ```{code-cell} ipython3
-clustered_data = penguins.assign(cluster = penguin_clust[1].labels_)
-clustered_data
+penguins["cluster"] = penguin_clust[1].labels_
+penguins
 ```

-Now that we have the cluster assignments included in the `clustered_data` data frame, we can
+Now that we have the cluster assignments included in the `penguins` data frame, we can
 visualize them as shown in {numref}`cluster_plot`.
 Note that we are plotting the *un-standardized* data here; if we for some reason wanted to
 visualize the *standardized* data, we would need to use the `fit` and `transform` functions
@@ -874,7 +874,7 @@ will treat the `cluster` variable as a nominal/categorical variable, and
 hence use a discrete color map for the visualization.

 ```{code-cell} ipython3
-cluster_plot=alt.Chart(clustered_data).mark_circle().encode(
+cluster_plot=alt.Chart(penguins).mark_circle().encode(
     x=alt.X("flipper_length_mm").title("Flipper Length").scale(zero=False),
     y=alt.Y("bill_length_mm").title("Bill Length").scale(zero=False),
     color=alt.Color("cluster:N").title("Cluster"),
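The clustering change attaches the fitted model's `labels_` directly as a column. A sketch with hard-coded labels standing in for the output of the book's fitted clustering step (`penguin_clust[1].labels_`):

```python
import numpy as np
import pandas as pd

# Invented stand-in for the book's `penguins` data frame
penguins = pd.DataFrame({
    "flipper_length_mm": [181.0, 186.0, 217.0, 230.0],
    "bill_length_mm": [39.1, 39.5, 49.3, 59.6],
})

# Stand-in for `penguin_clust[1].labels_` (one integer label per row);
# in the book these come from a fitted scikit-learn clustering object
labels_ = np.array([0, 0, 1, 1])

# Attach the cluster assignments directly, replacing the old `.assign` call
penguins["cluster"] = labels_
```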

source/inference.md

Lines changed: 5 additions & 5 deletions
@@ -250,11 +250,11 @@ expect our sample proportions from this population to vary for samples of size 4

 We again use the `sample` method to take samples of size 40 from our
 population of Airbnb listings. But this time we use a list comprehension
-to repeat an operation multiple time (as in the previous chapter).
-In this case we are taking 20,000 samples of size 40
-and to make it clear which rows in the data frame come
-which of the 20,000 samples,
-we also add a column called `replicate` with this information.
+to repeat the operation multiple times (as we did previously in {numref}`Chapter %s <clustering>`).
+In this case we repeat the operation 20,000 times to obtain 20,000 samples of size 40.
+To make it clear which rows in the data frame come from
+which of the 20,000 samples, we also add a column called `replicate` with this information using the `assign` function,
+introduced previously in {numref}`Chapter %s <wrangling>`.
 The call to `concat` concatenates all the 20,000 data frames
 returned from the list comprehension into a single big data frame.

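The sampling prose above describes a list comprehension whose per-replicate data frames are tagged via `assign` and stacked with `concat`. A downsized sketch (3 replicates of size 4 instead of 20,000 of size 40, on an invented population):

```python
import pandas as pd

# Invented stand-in for the Airbnb listings population
population = pd.DataFrame({"price": [50, 75, 100, 125, 150, 175, 200, 225]})

# One data frame per replicate, each tagged with a `replicate` column,
# then concatenated into a single big data frame
samples = pd.concat([
    population.sample(4, random_state=i).assign(replicate=i)
    for i in range(3)
])
```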
source/intro.md

Lines changed: 13 additions & 4 deletions
@@ -646,7 +646,8 @@ ten_lang = arranged_lang.head(10)
 ten_lang
 ```

-## Adding and modifying columns using `assign`
+(ch1-adding-modifying)=
+## Adding and modifying columns

 ```{index} assign
 ```
@@ -663,7 +664,7 @@ column by the total Canadian population according to the 2016
 census&mdash;i.e., 35,151,728&mdash;and multiply it by 100. We can perform
 this computation using the code `100 * ten_lang["mother_tongue"] / canadian_population`.
 Then to store the result in a new column (or
-overwrite an existing column), we use the `assign` method. We specify the name of the new
+overwrite an existing column), we specify the name of the new
 column to create (or old column to modify), then the assignment symbol `=`,
 and then the computation to store in that column. In this case, we will opt to
 create a new column called `mother_tongue_percent`.
@@ -676,10 +677,18 @@ and do not affect how Python interprets the number. In other words,
 although the latter is much clearer!
 ```

+```{code-cell} ipython3
+:tags: [remove-cell]
+# disable the setting-with-copy warning
+# it's not important for this chapter and just distracting
+# only occurs here because we did a much earlier .loc operation that is being picked up below by the column assignment
+pd.options.mode.chained_assignment = None
+```
+
 ```{code-cell} ipython3
 canadian_population = 35_151_728
-ten_lang_percent = ten_lang.assign(mother_tongue_percent=100 * ten_lang["mother_tongue"] / canadian_population)
-ten_lang_percent
+ten_lang["mother_tongue_percent"] = 100 * ten_lang["mother_tongue"] / canadian_population
+ten_lang
 ```

 The `ten_lang` data frame shows that
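The intro-chapter change stores the percentage with a direct column assignment. A sketch with invented counts standing in for the census data in `ten_lang`:

```python
import pandas as pd

# Invented stand-in for the book's `ten_lang` data frame
ten_lang = pd.DataFrame({
    "language": ["English", "French"],
    "mother_tongue": [19_460_855, 7_166_700],
})
canadian_population = 35_151_728

# Direct column assignment replacing the old `.assign(...)` call
ten_lang["mother_tongue_percent"] = (
    100 * ten_lang["mother_tongue"] / canadian_population
)
```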

source/regression1.md

Lines changed: 30 additions & 37 deletions
@@ -294,17 +294,15 @@ of a house that is 2,000 square feet.
 ```

 ```{code-cell} ipython3
-nearest_neighbors = (
-    small_sacramento.assign(diff=(2000 - small_sacramento["sqft"]).abs())
-    .nsmallest(5, "diff")
-)
-
+small_sacramento["dist"] = (2000 - small_sacramento["sqft"]).abs()
+nearest_neighbors = small_sacramento.nsmallest(5, "dist")
 nearest_neighbors
 ```

 ```{code-cell} ipython3
 :tags: [remove-cell]

+
 nn_plot = small_plot + rule

 # plot horizontal lines perpendicular to x=2000
@@ -609,16 +607,15 @@ sacr_gridsearch.fit(
 )

 # Retrieve the CV scores
-sacr_results = pd.DataFrame(sacr_gridsearch.cv_results_)[[
-    "param_kneighborsregressor__n_neighbors",
-    "mean_test_score",
-    "std_test_score"
-]]
+sacr_results = pd.DataFrame(sacr_gridsearch.cv_results_)
+sacr_results["sem_test_score"] = sacr_results["std_test_score"] / 5**(1/2)
 sacr_results = (
-    sacr_results
-    .assign(sem_test_score=sacr_results["std_test_score"] / 5**(1/2))
+    sacr_results[[
+        "param_kneighborsregressor__n_neighbors",
+        "mean_test_score",
+        "sem_test_score"
+    ]]
     .rename(columns={"param_kneighborsregressor__n_neighbors": "n_neighbors"})
-    .drop(columns=["std_test_score"])
 )
 sacr_results
 ```
@@ -834,12 +831,10 @@ model uses a different default scoring metric than the RMSPE.
 ```{code-cell} ipython3
 from sklearn.metrics import mean_squared_error

-sacr_preds = sacramento_test.assign(
-    predicted = sacr_gridsearch.predict(sacramento_test)
-)
+sacramento_test["predicted"] = sacr_gridsearch.predict(sacramento_test)
 RMSPE = mean_squared_error(
-    y_true = sacr_preds["price"],
-    y_pred=sacr_preds["predicted"]
+    y_true = sacramento_test["price"],
+    y_pred = sacramento_test["predicted"]
 )**(1/2)
 RMSPE
 ```
@@ -890,9 +885,7 @@ sqft_prediction_grid = pd.DataFrame({
     "sqft": np.arange(sacramento["sqft"].min(), sacramento["sqft"].max(), 10)
 })
 # Predict the price for each of the sqft values in the grid
-sacr_preds = sqft_prediction_grid.assign(
-    predicted = sacr_gridsearch.predict(sqft_prediction_grid)
-)
+sqft_prediction_grid["predicted"] = sacr_gridsearch.predict(sqft_prediction_grid)

 # Plot all the houses
 base_plot = alt.Chart(sacramento).mark_circle(opacity=0.4).encode(
@@ -905,7 +898,10 @@ base_plot = alt.Chart(sacramento).mark_circle(opacity=0.4).encode(
 )

 # Add the predictions as a line
-sacr_preds_plot = base_plot + alt.Chart(sacr_preds, title=f"K = {best_k_sacr}").mark_line(
+sacr_preds_plot = base_plot + alt.Chart(
+    sqft_prediction_grid,
+    title=f"K = {best_k_sacr}"
+).mark_line(
     color="#ff7f0e"
 ).encode(
     x="sqft",
@@ -1018,25 +1014,24 @@ sacr_gridsearch = GridSearchCV(
     cv=5,
     scoring="neg_root_mean_squared_error"
 )
+
 sacr_gridsearch.fit(
     sacramento_train[["sqft", "beds"]],
     sacramento_train["price"]
 )

 # retrieve the CV scores
-sacr_results = pd.DataFrame(sacr_gridsearch.cv_results_)[[
-    "param_kneighborsregressor__n_neighbors",
-    "mean_test_score",
-    "std_test_score"
-]]
-
+sacr_results = pd.DataFrame(sacr_gridsearch.cv_results_)
+sacr_results["sem_test_score"] = sacr_results["std_test_score"] / 5**(1/2)
+sacr_results["mean_test_score"] = -sacr_results["mean_test_score"]
 sacr_results = (
-    sacr_results
-    .assign(sem_test_score=sacr_results["std_test_score"] / 5**(1/2))
+    sacr_results[[
+        "param_kneighborsregressor__n_neighbors",
+        "mean_test_score",
+        "sem_test_score"
+    ]]
     .rename(columns={"param_kneighborsregressor__n_neighbors" : "n_neighbors"})
-    .drop(columns=["std_test_score"])
 )
-sacr_results["mean_test_score"] = -sacr_results["mean_test_score"]

 # show only the row of minimum RMSPE
 sacr_results.nsmallest(1, "mean_test_score")
@@ -1069,12 +1064,10 @@ via the `predict` method of the fit `GridSearchCV` object. Finally, we will use
 to compute the RMSPE.

 ```{code-cell} ipython3
-sacr_preds = sacramento_test.assign(
-    predicted = sacr_gridsearch.predict(sacramento_test)
-)
+sacramento_test["predicted"] = sacr_gridsearch.predict(sacramento_test)
 RMSPE_mult = mean_squared_error(
-    y_true = sacr_preds["price"],
-    y_pred=sacr_preds["predicted"]
+    y_true = sacramento_test["price"],
+    y_pred = sacramento_test["predicted"]
 )**(1/2)
 RMSPE_mult

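The grid-search post-processing in this file now computes `sem_test_score` (and negates the score) up front, then selects and renames columns. A sketch on invented `cv_results_`-style data in place of the real `GridSearchCV` output:

```python
import pandas as pd

# Invented stand-in for pd.DataFrame(sacr_gridsearch.cv_results_)
sacr_results = pd.DataFrame({
    "param_kneighborsregressor__n_neighbors": [1, 2, 3],
    "mean_test_score": [-85000.0, -90000.0, -95000.0],  # negated RMSPE scores
    "std_test_score": [5000.0, 4000.0, 3000.0],
})

# Standard error of the mean over 5 CV folds; flip the sign of the score
sacr_results["sem_test_score"] = sacr_results["std_test_score"] / 5 ** (1 / 2)
sacr_results["mean_test_score"] = -sacr_results["mean_test_score"]

# Select, then rename, as in the updated code cell
sacr_results = (
    sacr_results[[
        "param_kneighborsregressor__n_neighbors",
        "mean_test_score",
        "sem_test_score"
    ]]
    .rename(columns={"param_kneighborsregressor__n_neighbors": "n_neighbors"})
)

# Row of minimum RMSPE
best = sacr_results.nsmallest(1, "mean_test_score")
```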