Merge pull request #218 from UBC-DSCI/chart-sizes_pdf-build

trevorcampbell · web-flow · commit ddf5383236bc · 2023-08-31T10:16:57.000-07:00
Fix PDF build and reduce chart sizes in HTML build
diff --git a/Dockerfile b/Dockerfile
@@ -17,10 +17,8 @@ RUN rm -rf work
 RUN pip install docutils==0.17.1 # Need to pin docutils to an old version for now, due to https://github.com/executablebooks/jupyter-book/issues/2022
 RUN pip install referencing
 RUN pip install jupyter-book
-RUN pip install numpy jinja2 altair_data_server vl-convert-python click ibis-framework ghp-import jupytext nodejs
-
-# Update altair to >= 5.1.1 get a png rendering in PDF build
-RUN pip install --upgrade altair
+# Pin pandas below 2.1 until altair 5.1.2 to avoid a FutureWarning; see https://github.com/altair-viz/altair/issues/3181
+RUN pip install numpy jinja2 pandas"<2.1" altair">=5.1.1" "vegafusion[embed]" vl-convert-python">=0.13" click ibis-framework ghp-import jupytext nodejs
 
 # forces scikit-learn to grab latest to avoid bug in 1.3.0 related to checking for c-contiguity breaking figures in classification 2. See https://github.com/scikit-learn/scikit-learn/pull/26772
 # TODO: remove this once scikit-learn 1.4.x or beyond releases and is incorporated into jupyter/scipy-notebook
diff --git a/build_pdf.sh b/build_pdf.sh
@@ -1,3 +1,3 @@
 chmod -R o+w source/
-docker run --rm -v $(pwd):/home/jovyan ubcdsci/py-intro-to-ds:202308311605146eaa0f /bin/bash -c "jupyter-book build source --builder pdflatex"
+docker run --rm -v $(pwd):/home/jovyan ubcdsci/py-intro-to-ds:202308311605146eaa0f /bin/bash -c "export BOOK_BUILD_TYPE='PDF'; jupyter-book build source --builder pdflatex"
 chmod -R o-w source/
diff --git a/source/_config.yml b/source/_config.yml
@@ -78,6 +78,8 @@ sphinx:
   local_extensions: # A list of local extensions to load by sphinx specified by "name: path" items
   config: # key-value pairs to directly over-ride the Sphinx configuration
     bibtex_reference_style: author_year
+  html_context:
+    default_mode: light
 
 #######################################################################################
 bibtex_bibfiles:
diff --git a/source/chapter_preamble.py b/source/chapter_preamble.py
@@ -0,0 +1,12 @@
+import os
+import altair as alt
+from myst_nb import glue
+
+
+# Use PNG images in the PDF version of the book to make sure that the charts render
+if 'BOOK_BUILD_TYPE' in os.environ and os.environ['BOOK_BUILD_TYPE'] == 'PDF':
+    alt.data_transformers.disable_max_rows()
+    alt.renderers.enable('png', scale_factor=0.7, ppi=300)
+else:
+    # Reduce chart sizes and allow plotting up to 100k graphical objects (not the same as rows in the data frame)
+    alt.data_transformers.enable('vegafusion')
diff --git a/source/classification1.md b/source/classification1.md
@@ -14,14 +14,10 @@ kernelspec:
 
 ```{code-cell} ipython3
 :tags: [remove-cell]
-import warnings
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-warnings.filterwarnings("ignore", category=FutureWarning)
 
-from myst_nb import glue
-import numpy as np
-from sklearn.metrics.pairwise import euclidean_distances
+from chapter_preamble import *
 from IPython.display import HTML
+from sklearn.metrics.pairwise import euclidean_distances
 ```
 
 (classification1)=
@@ -388,13 +384,10 @@ Scatter plot of concavity versus perimeter with new observation represented as a
 ```{code-cell} ipython3
 :tags: [remove-cell]
 
-near_neighbor_df = pd.concat(
-    (
-        cancer.loc[np.argmin(my_distances), attrs],
-        perim_concav_with_new_point_df.loc[len(cancer), attrs],
-    ),
-    axis=1,
-).T
+near_neighbor_df = pd.concat([
+    cancer.loc[[np.argmin(my_distances)], attrs],
+    perim_concav_with_new_point_df.loc[[cancer.shape[0]], attrs],
+])
 glue("1-neighbor_per", round(near_neighbor_df.iloc[0, :]['Perimeter'], 1))
 glue("1-neighbor_con", round(near_neighbor_df.iloc[0, :]['Concavity'], 1))
 ```
@@ -466,13 +459,10 @@ perim_concav_with_new_point2 = (
     )
 )
 
-near_neighbor_df2 = pd.concat(
-    (
-        cancer.loc[np.argmin(my_distances2), attrs],
-        perim_concav_with_new_point_df2.loc[len(cancer), attrs],
-    ),
-    axis=1,
-).T
+near_neighbor_df2 = pd.concat([
+    cancer.loc[[np.argmin(my_distances2)], attrs],
+    perim_concav_with_new_point_df2.loc[[cancer.shape[0]], attrs],
+])
 line2 = alt.Chart(near_neighbor_df2).mark_line().encode(
     x='Perimeter',
     y='Concavity',
@@ -507,20 +497,14 @@ label.
 
 # The index of 3 rows that has smallest distance to the new point
 min_3_idx = np.argpartition(my_distances2, 3)[:3]
-near_neighbor_df3 = pd.concat(
-    (
-        cancer.loc[min_3_idx[1], attrs],
-        perim_concav_with_new_point_df2.loc[len(cancer), attrs],
-    ),
-    axis=1,
-).T
-near_neighbor_df4 = pd.concat(
-    (
-        cancer.loc[min_3_idx[2], attrs],
-        perim_concav_with_new_point_df2.loc[len(cancer), attrs],
-    ),
-    axis=1,
-).T
+near_neighbor_df3 = pd.concat([
+    cancer.loc[[min_3_idx[1]], attrs],
+    perim_concav_with_new_point_df2.loc[[cancer.shape[0]], attrs],
+])
+near_neighbor_df4 = pd.concat([
+    cancer.loc[[min_3_idx[2]], attrs],
+    perim_concav_with_new_point_df2.loc[[cancer.shape[0]], attrs],
+])
 ```
 
 ```{code-cell} ipython3
@@ -1223,27 +1207,18 @@ area_smoothness_new_point = (
 
 # The index of 3 rows that has smallest distance to the new point
 min_3_idx = np.argpartition(my_distances, 3)[:3]
-neighbor1 = pd.concat(
-    (
-        unscaled_cancer.loc[min_3_idx[0], attrs],
-        new_obs[attrs].T,
-    ),
-    axis=1,
-).T
-neighbor2 = pd.concat(
-    (
-        unscaled_cancer.loc[min_3_idx[1], attrs],
-        new_obs[attrs].T,
-    ),
-    axis=1,
-).T
-neighbor3 = pd.concat(
-    (
-        unscaled_cancer.loc[min_3_idx[2], attrs],
-        new_obs[attrs].T,
-    ),
-    axis=1,
-).T
+neighbor1 = pd.concat([
+    unscaled_cancer.loc[[min_3_idx[0]], attrs],
+    new_obs[attrs],
+])
+neighbor2 = pd.concat([
+    unscaled_cancer.loc[[min_3_idx[1]], attrs],
+    new_obs[attrs],
+])
+neighbor3 = pd.concat([
+    unscaled_cancer.loc[[min_3_idx[2]], attrs],
+    new_obs[attrs],
+])
 
 line1 = (
     alt.Chart(neighbor1)
@@ -1297,27 +1272,18 @@ area_smoothness_new_point_scaled = (
     )
 )
 min_3_idx_scaled = np.argpartition(my_distances_scaled, 3)[:3]
-neighbor1_scaled = pd.concat(
-    (
-        scaled_cancer_all.loc[min_3_idx_scaled[0], attrs],
-        new_obs_scaled[attrs].T,
-    ),
-    axis=1,
-).T
-neighbor2_scaled = pd.concat(
-    (
-        scaled_cancer_all.loc[min_3_idx_scaled[1], attrs],
-        new_obs_scaled[attrs].T,
-    ),
-    axis=1,
-).T
-neighbor3_scaled = pd.concat(
-    (
-        scaled_cancer_all.loc[min_3_idx_scaled[2], attrs],
-        new_obs_scaled[attrs].T,
-    ),
-    axis=1,
-).T
+neighbor1_scaled = pd.concat([
+    scaled_cancer_all.loc[[min_3_idx_scaled[0]], attrs],
+    new_obs_scaled[attrs],
+])
+neighbor2_scaled = pd.concat([
+    scaled_cancer_all.loc[[min_3_idx_scaled[1]], attrs],
+    new_obs_scaled[attrs],
+])
+neighbor3_scaled = pd.concat([
+    scaled_cancer_all.loc[[min_3_idx_scaled[2]], attrs],
+    new_obs_scaled[attrs],
+])
 
 line1_scaled = (
     alt.Chart(neighbor1_scaled)
@@ -1499,13 +1465,10 @@ for i in range(7):
     clr = "#1f77b4"
     if rare_cancer.iloc[min_7_idx[i], :]["Class"] == "Malignant":
         clr = "#ff7f0e"
-    neighbor = pd.concat(
-        (
-            rare_cancer.iloc[min_7_idx[i], :][attrs],
-            new_point_df[attrs].T,
-        ),
-        axis=1,
-    ).T
+    neighbor = pd.concat([
+        rare_cancer.iloc[[min_7_idx[i]], :][attrs],
+        new_point_df[attrs],
+    ])
     rare_plot = rare_plot + (
         alt.Chart(neighbor)
         .mark_line(opacity=0.3)
diff --git a/source/classification2.md b/source/classification2.md
@@ -18,15 +18,7 @@ kernelspec:
 ```{code-cell} ipython3
 :tags: [remove-cell]
 
-import warnings
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-warnings.filterwarnings("ignore", category=FutureWarning)
-
-import altair as alt
-import numpy as np
-import pandas as pd
- 
-from myst_nb import glue
+from chapter_preamble import *
 ```
 
 ## Overview 
@@ -292,6 +284,7 @@ on the series of numbers, passing the argument `n = 10` to indicate that we want
 
 ```{code-cell} ipython3
 import numpy as np
+import pandas as pd
 
 np.random.seed(1)
 
diff --git a/source/clustering.md b/source/clustering.md
@@ -18,8 +18,7 @@ kernelspec:
 ```{code-cell} ipython3
 :tags: [remove-cell]
 
-import warnings
-warnings.filterwarnings("ignore", category=FutureWarning)
+from chapter_preamble import *
 ```
 
 ## Overview
@@ -174,7 +173,6 @@ Now we can load and preview the data.
 ```{code-cell} ipython3
 :tags: [remove-cell]
 
-from myst_nb import glue
 import pandas as pd
 
 data = pd.read_csv(
@@ -632,6 +630,10 @@ visualize them as shown in {numref}`cluster_plot`.
 Note that we are plotting the *un-standardized* data here; if we for some reason wanted to 
 visualize the *standardized* data, we would need to use the `fit` and `transform` functions
 on the `StandardScaler` preprocessor directly to obtain that first.
+As in Chapter {ref}`viz`,
+adding the `:N` suffix ensures that `altair`
+will treat the `cluster` variable as a nominal/categorical variable, and
+hence use a discrete color map for the visualization.
 
 ```{code-cell} ipython3
 cluster_plot=alt.Chart(clustered_data).mark_circle().encode(
diff --git a/source/inference.md b/source/inference.md
@@ -18,13 +18,7 @@ kernelspec:
 ```{code-cell} ipython3
 :tags: [remove-cell]
 
-import altair as alt
-from myst_nb import glue
-import warnings
-
-
-warnings.filterwarnings("ignore", category=FutureWarning)
-alt.data_transformers.disable_max_rows()
+from chapter_preamble import *
 ```
 
 ## Overview
@@ -732,7 +726,7 @@ glue(
         )
     ).facet(
         alt.Facet(
-            'sample_size',
+            'sample_size:N',
             header=alt.Header(
                 title='',
                 labelFontWeight='bold',
@@ -1043,7 +1037,7 @@ alt.Chart(six_bootstrap_samples, height=150).mark_bar().encode(
         .title("Price per night (dollars)"),
     y=alt.Y("count()").title("Count")
 ).facet(
-    "replicate",
+    "replicate:N",  # Recall that `:N` converts the variable to a categorical type
     columns=2
 )
 ```
diff --git a/source/intro.md b/source/intro.md
@@ -16,8 +16,9 @@ kernelspec:
 # Python and Pandas
 
 ```{code-cell} ipython3
-:tags: ["remove-cell"]
-from myst_nb import glue
+:tags: [remove-cell]
+
+from chapter_preamble import *
 ```
 
 ## Overview
diff --git a/source/regression1.md b/source/regression1.md
@@ -18,26 +18,8 @@ kernelspec:
 ```{code-cell} ipython3
 :tags: [remove-cell]
 
-import warnings
-warnings.filterwarnings("ignore", category=DeprecationWarning)
-warnings.filterwarnings("ignore", category=FutureWarning)
-
-import altair as alt
-import numpy as np
-import pandas as pd
-
-# from sklearn.model_selection import GridSearchCV, train_test_split
-# from sklearn.compose import make_column_transformer
-# from sklearn.neighbors import KNeighborsRegressor
-# from sklearn.pipeline import Pipeline, make_pipeline
-# from sklearn.preprocessing import StandardScaler
-
-# import plotly.express as px
-# import plotly.graph_objs as go
-# from plotly.offline import plot
+from chapter_preamble import *
 from IPython.display import HTML
-
-from myst_nb import glue
 ```
 
 ## Overview
@@ -150,8 +132,9 @@ We begin the analysis by loading and examining the data,
 as well as setting the seed value.
 
 ```{code-cell} ipython3
-import pandas as pd
 import altair as alt
+import numpy as np
+import pandas as pd
 from sklearn.model_selection import GridSearchCV, train_test_split
 from sklearn.compose import make_column_transformer
 from sklearn.pipeline import make_pipeline
@@ -320,13 +303,10 @@ nn_plot = small_plot + rule
 
 # plot horizontal lines which is perpendicular to x=2000
 for i in range(5):
-    h_line_df = pd.concat(
-        (
-            pd.DataFrame(nearest_neighbors.iloc[i, [4, 6]]).T,
-            pd.DataFrame({"sqft": [2000], "price": [nearest_neighbors.iloc[i, 6]]}),
-        ),
-        ignore_index=True,
-    )
+    h_line_df = pd.DataFrame({
+        "sqft": [nearest_neighbors.iloc[i, 4], 2000],
+        "price": [nearest_neighbors.iloc[i, 6]] * 2
+    })
     h_line = alt.Chart(h_line_df).mark_line(color="orange").encode(x="sqft", y="price")
     nn_plot += h_line
 
@@ -671,7 +651,7 @@ glue("cv_RMSPE", "{0:,.0f}".format(int(best_cv_RMSPE)))
 :tags: [remove-cell]
 
 sacr_tunek_plot = alt.Chart(sacr_results).mark_line(point=True).encode(
-    x=alt.X("n_neighbors", title="Neighbors"),
+    x=alt.X("n_neighbors:Q", title="Neighbors"),
     y=alt.Y("mean_test_score", scale=alt.Scale(zero=False), title="Cross-Validation RMSPE Estimate")
 )
 
diff --git a/source/regression2.md b/source/regression2.md
diff --git a/source/viz.md b/source/viz.md
diff --git a/source/wrangling.md b/source/wrangling.md