Skip to content

Commit ddf5383

Browse files
Merge pull request #218 from UBC-DSCI/chart-sizes_pdf-build
Fix PDF build and reduce chart sizes in HTML build
2 parents fe4132d + 8fb7a94 commit ddf5383

13 files changed

+91
-179
lines changed

Dockerfile

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,8 @@ RUN rm -rf work
1717
RUN pip install docutils==0.17.1 # Need to pin docutils to an old version for now, due to https://github.com/executablebooks/jupyter-book/issues/2022
1818
RUN pip install referencing
1919
RUN pip install jupyter-book
20-
RUN pip install numpy jinja2 altair_data_server vl-convert-python click ibis-framework ghp-import jupytext nodejs
21-
22-
# Update altair to >= 5.1.1 get a png rendering in PDF build
23-
RUN pip install --upgrade altair
20+
# Pinning pandas until altair 5.1.2 to avoid future warning https://github.com/altair-viz/altair/issues/3181
21+
RUN pip install numpy jinja2 pandas"<2.1" altair">=5.1.1" "vegafusion[embed]" vl-convert-python">=0.13" click ibis-framework ghp-import jupytext nodejs
2422

2523
# forces scikit-learn to grab latest to avoid bug in 1.3.0 related to checking for c-contiguity breaking figures in classification 2. See https://github.com/scikit-learn/scikit-learn/pull/26772
2624
# TODO: remove this once scikit-learn 1.4.x or beyond releases and is incorporated into jupyter/scipy-notebook

build_pdf.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
chmod -R o+w source/
2-
docker run --rm -v $(pwd):/home/jovyan ubcdsci/py-intro-to-ds:202308311605146eaa0f /bin/bash -c "jupyter-book build source --builder pdflatex"
2+
docker run --rm -v $(pwd):/home/jovyan ubcdsci/py-intro-to-ds:202308311605146eaa0f /bin/bash -c "export BOOK_BUILD_TYPE='PDF'; jupyter-book build source --builder pdflatex"
33
chmod -R o-w source/

source/_config.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,8 @@ sphinx:
7878
local_extensions: # A list of local extensions to load by sphinx specified by "name: path" items
7979
config: # key-value pairs to directly over-ride the Sphinx configuration
8080
bibtex_reference_style: author_year
81+
html_context:
82+
default_mode: light
8183

8284
#######################################################################################
8385
bibtex_bibfiles:

source/chapter_preamble.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import os
2+
import altair as alt
3+
from myst_nb import glue
4+
5+
6+
# Use PNG images in the PDF version of the books to make sure that they render
7+
if 'BOOK_BUILD_TYPE' in os.environ and os.environ['BOOK_BUILD_TYPE'] == 'PDF':
8+
alt.data_transformers.disable_max_rows()
9+
alt.renderers.enable('png', scale_factor=0.7, ppi=300)
10+
else:
11+
# Reduce chart sizes and allow plotting up to 100k graphical objects (not the same as rows in the data frame)
12+
alt.data_transformers.enable('vegafusion')

source/classification1.md

Lines changed: 46 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -14,14 +14,10 @@ kernelspec:
1414

1515
```{code-cell} ipython3
1616
:tags: [remove-cell]
17-
import warnings
18-
warnings.filterwarnings("ignore", category=DeprecationWarning)
19-
warnings.filterwarnings("ignore", category=FutureWarning)
2017
21-
from myst_nb import glue
22-
import numpy as np
23-
from sklearn.metrics.pairwise import euclidean_distances
18+
from chapter_preamble import *
2419
from IPython.display import HTML
20+
from sklearn.metrics.pairwise import euclidean_distances
2521
```
2622

2723
(classification1)=
@@ -388,13 +384,10 @@ Scatter plot of concavity versus perimeter with new observation represented as a
388384
```{code-cell} ipython3
389385
:tags: [remove-cell]
390386
391-
near_neighbor_df = pd.concat(
392-
(
393-
cancer.loc[np.argmin(my_distances), attrs],
394-
perim_concav_with_new_point_df.loc[len(cancer), attrs],
395-
),
396-
axis=1,
397-
).T
387+
near_neighbor_df = pd.concat([
388+
cancer.loc[[np.argmin(my_distances)], attrs],
389+
perim_concav_with_new_point_df.loc[[cancer.shape[0]], attrs],
390+
])
398391
glue("1-neighbor_per", round(near_neighbor_df.iloc[0, :]['Perimeter'], 1))
399392
glue("1-neighbor_con", round(near_neighbor_df.iloc[0, :]['Concavity'], 1))
400393
```
@@ -466,13 +459,10 @@ perim_concav_with_new_point2 = (
466459
)
467460
)
468461
469-
near_neighbor_df2 = pd.concat(
470-
(
471-
cancer.loc[np.argmin(my_distances2), attrs],
472-
perim_concav_with_new_point_df2.loc[len(cancer), attrs],
473-
),
474-
axis=1,
475-
).T
462+
near_neighbor_df2 = pd.concat([
463+
cancer.loc[[np.argmin(my_distances2)], attrs],
464+
perim_concav_with_new_point_df2.loc[[cancer.shape[0]], attrs],
465+
])
476466
line2 = alt.Chart(near_neighbor_df2).mark_line().encode(
477467
x='Perimeter',
478468
y='Concavity',
@@ -507,20 +497,14 @@ label.
507497
508498
# The indices of the 3 rows that have the smallest distance to the new point
509499
min_3_idx = np.argpartition(my_distances2, 3)[:3]
510-
near_neighbor_df3 = pd.concat(
511-
(
512-
cancer.loc[min_3_idx[1], attrs],
513-
perim_concav_with_new_point_df2.loc[len(cancer), attrs],
514-
),
515-
axis=1,
516-
).T
517-
near_neighbor_df4 = pd.concat(
518-
(
519-
cancer.loc[min_3_idx[2], attrs],
520-
perim_concav_with_new_point_df2.loc[len(cancer), attrs],
521-
),
522-
axis=1,
523-
).T
500+
near_neighbor_df3 = pd.concat([
501+
cancer.loc[[min_3_idx[1]], attrs],
502+
perim_concav_with_new_point_df2.loc[[cancer.shape[0]], attrs],
503+
])
504+
near_neighbor_df4 = pd.concat([
505+
cancer.loc[[min_3_idx[2]], attrs],
506+
perim_concav_with_new_point_df2.loc[[cancer.shape[0]], attrs],
507+
])
524508
```
525509

526510
```{code-cell} ipython3
@@ -1223,27 +1207,18 @@ area_smoothness_new_point = (
12231207
12241208
# The indices of the 3 rows that have the smallest distance to the new point
12251209
min_3_idx = np.argpartition(my_distances, 3)[:3]
1226-
neighbor1 = pd.concat(
1227-
(
1228-
unscaled_cancer.loc[min_3_idx[0], attrs],
1229-
new_obs[attrs].T,
1230-
),
1231-
axis=1,
1232-
).T
1233-
neighbor2 = pd.concat(
1234-
(
1235-
unscaled_cancer.loc[min_3_idx[1], attrs],
1236-
new_obs[attrs].T,
1237-
),
1238-
axis=1,
1239-
).T
1240-
neighbor3 = pd.concat(
1241-
(
1242-
unscaled_cancer.loc[min_3_idx[2], attrs],
1243-
new_obs[attrs].T,
1244-
),
1245-
axis=1,
1246-
).T
1210+
neighbor1 = pd.concat([
1211+
unscaled_cancer.loc[[min_3_idx[0]], attrs],
1212+
new_obs[attrs],
1213+
])
1214+
neighbor2 = pd.concat([
1215+
unscaled_cancer.loc[[min_3_idx[1]], attrs],
1216+
new_obs[attrs],
1217+
])
1218+
neighbor3 = pd.concat([
1219+
unscaled_cancer.loc[[min_3_idx[2]], attrs],
1220+
new_obs[attrs],
1221+
])
12471222
12481223
line1 = (
12491224
alt.Chart(neighbor1)
@@ -1297,27 +1272,18 @@ area_smoothness_new_point_scaled = (
12971272
)
12981273
)
12991274
min_3_idx_scaled = np.argpartition(my_distances_scaled, 3)[:3]
1300-
neighbor1_scaled = pd.concat(
1301-
(
1302-
scaled_cancer_all.loc[min_3_idx_scaled[0], attrs],
1303-
new_obs_scaled[attrs].T,
1304-
),
1305-
axis=1,
1306-
).T
1307-
neighbor2_scaled = pd.concat(
1308-
(
1309-
scaled_cancer_all.loc[min_3_idx_scaled[1], attrs],
1310-
new_obs_scaled[attrs].T,
1311-
),
1312-
axis=1,
1313-
).T
1314-
neighbor3_scaled = pd.concat(
1315-
(
1316-
scaled_cancer_all.loc[min_3_idx_scaled[2], attrs],
1317-
new_obs_scaled[attrs].T,
1318-
),
1319-
axis=1,
1320-
).T
1275+
neighbor1_scaled = pd.concat([
1276+
scaled_cancer_all.loc[[min_3_idx_scaled[0]], attrs],
1277+
new_obs_scaled[attrs],
1278+
])
1279+
neighbor2_scaled = pd.concat([
1280+
scaled_cancer_all.loc[[min_3_idx_scaled[1]], attrs],
1281+
new_obs_scaled[attrs],
1282+
])
1283+
neighbor3_scaled = pd.concat([
1284+
scaled_cancer_all.loc[[min_3_idx_scaled[2]], attrs],
1285+
new_obs_scaled[attrs],
1286+
])
13211287
13221288
line1_scaled = (
13231289
alt.Chart(neighbor1_scaled)
@@ -1499,13 +1465,10 @@ for i in range(7):
14991465
clr = "#1f77b4"
15001466
if rare_cancer.iloc[min_7_idx[i], :]["Class"] == "Malignant":
15011467
clr = "#ff7f0e"
1502-
neighbor = pd.concat(
1503-
(
1504-
rare_cancer.iloc[min_7_idx[i], :][attrs],
1505-
new_point_df[attrs].T,
1506-
),
1507-
axis=1,
1508-
).T
1468+
neighbor = pd.concat([
1469+
rare_cancer.iloc[[min_7_idx[i]], :][attrs],
1470+
new_point_df[attrs],
1471+
])
15091472
rare_plot = rare_plot + (
15101473
alt.Chart(neighbor)
15111474
.mark_line(opacity=0.3)

source/classification2.md

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,7 @@ kernelspec:
1818
```{code-cell} ipython3
1919
:tags: [remove-cell]
2020
21-
import warnings
22-
warnings.filterwarnings("ignore", category=DeprecationWarning)
23-
warnings.filterwarnings("ignore", category=FutureWarning)
24-
25-
import altair as alt
26-
import numpy as np
27-
import pandas as pd
28-
29-
from myst_nb import glue
21+
from chapter_preamble import *
3022
```
3123

3224
## Overview
@@ -292,6 +284,7 @@ on the series of numbers, passing the argument `n = 10` to indicate that we want
292284

293285
```{code-cell} ipython3
294286
import numpy as np
287+
import pandas as pd
295288
296289
np.random.seed(1)
297290

source/clustering.md

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@ kernelspec:
1818
```{code-cell} ipython3
1919
:tags: [remove-cell]
2020
21-
import warnings
22-
warnings.filterwarnings("ignore", category=FutureWarning)
21+
from chapter_preamble import *
2322
```
2423

2524
## Overview
@@ -174,7 +173,6 @@ Now we can load and preview the data.
174173
```{code-cell} ipython3
175174
:tags: [remove-cell]
176175
177-
from myst_nb import glue
178176
import pandas as pd
179177
180178
data = pd.read_csv(
@@ -632,6 +630,10 @@ visualize them as shown in {numref}`cluster_plot`.
632630
Note that we are plotting the *un-standardized* data here; if we for some reason wanted to
633631
visualize the *standardized* data, we would need to use the `fit` and `transform` functions
634632
on the `StandardScaler` preprocessor directly to obtain that first.
633+
As in Chapter {ref}`viz`,
634+
adding the `:N` suffix ensures that `altair`
635+
will treat the `cluster` variable as a nominal/categorical variable, and
636+
hence use a discrete color map for the visualization.
635637

636638
```{code-cell} ipython3
637639
cluster_plot=alt.Chart(clustered_data).mark_circle().encode(

source/inference.md

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,7 @@ kernelspec:
1818
```{code-cell} ipython3
1919
:tags: [remove-cell]
2020
21-
import altair as alt
22-
from myst_nb import glue
23-
import warnings
24-
25-
26-
warnings.filterwarnings("ignore", category=FutureWarning)
27-
alt.data_transformers.disable_max_rows()
21+
from chapter_preamble import *
2822
```
2923

3024
## Overview
@@ -732,7 +726,7 @@ glue(
732726
)
733727
).facet(
734728
alt.Facet(
735-
'sample_size',
729+
'sample_size:N',
736730
header=alt.Header(
737731
title='',
738732
labelFontWeight='bold',
@@ -1043,7 +1037,7 @@ alt.Chart(six_bootstrap_samples, height=150).mark_bar().encode(
10431037
.title("Price per night (dollars)"),
10441038
y=alt.Y("count()").title("Count")
10451039
).facet(
1046-
"replicate",
1040+
"replicate:N", # Recall that `:N` converts the variable to a categorical type
10471041
columns=2
10481042
)
10491043
```

source/intro.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ kernelspec:
1616
# Python and Pandas
1717

1818
```{code-cell} ipython3
19-
:tags: ["remove-cell"]
20-
from myst_nb import glue
19+
:tags: [remove-cell]
20+
21+
from chapter_preamble import *
2122
```
2223

2324
## Overview

source/regression1.md

Lines changed: 8 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -18,26 +18,8 @@ kernelspec:
1818
```{code-cell} ipython3
1919
:tags: [remove-cell]
2020
21-
import warnings
22-
warnings.filterwarnings("ignore", category=DeprecationWarning)
23-
warnings.filterwarnings("ignore", category=FutureWarning)
24-
25-
import altair as alt
26-
import numpy as np
27-
import pandas as pd
28-
29-
# from sklearn.model_selection import GridSearchCV, train_test_split
30-
# from sklearn.compose import make_column_transformer
31-
# from sklearn.neighbors import KNeighborsRegressor
32-
# from sklearn.pipeline import Pipeline, make_pipeline
33-
# from sklearn.preprocessing import StandardScaler
34-
35-
# import plotly.express as px
36-
# import plotly.graph_objs as go
37-
# from plotly.offline import plot
21+
from chapter_preamble import *
3822
from IPython.display import HTML
39-
40-
from myst_nb import glue
4123
```
4224

4325
## Overview
@@ -150,8 +132,9 @@ We begin the analysis by loading and examining the data,
150132
as well as setting the seed value.
151133

152134
```{code-cell} ipython3
153-
import pandas as pd
154135
import altair as alt
136+
import numpy as np
137+
import pandas as pd
155138
from sklearn.model_selection import GridSearchCV, train_test_split
156139
from sklearn.compose import make_column_transformer
157140
from sklearn.pipeline import make_pipeline
@@ -320,13 +303,10 @@ nn_plot = small_plot + rule
320303
321304
# plot horizontal lines which are perpendicular to x=2000
322305
for i in range(5):
323-
h_line_df = pd.concat(
324-
(
325-
pd.DataFrame(nearest_neighbors.iloc[i, [4, 6]]).T,
326-
pd.DataFrame({"sqft": [2000], "price": [nearest_neighbors.iloc[i, 6]]}),
327-
),
328-
ignore_index=True,
329-
)
306+
h_line_df = pd.DataFrame({
307+
"sqft": [nearest_neighbors.iloc[i, 4], 2000],
308+
"price": [nearest_neighbors.iloc[i, 6]] * 2
309+
})
330310
h_line = alt.Chart(h_line_df).mark_line(color="orange").encode(x="sqft", y="price")
331311
nn_plot += h_line
332312
@@ -671,7 +651,7 @@ glue("cv_RMSPE", "{0:,.0f}".format(int(best_cv_RMSPE)))
671651
:tags: [remove-cell]
672652
673653
sacr_tunek_plot = alt.Chart(sacr_results).mark_line(point=True).encode(
674-
x=alt.X("n_neighbors", title="Neighbors"),
654+
x=alt.X("n_neighbors:Q", title="Neighbors"),
675655
y=alt.Y("mean_test_score", scale=alt.Scale(zero=False), title="Cross-Validation RMSPE Estimate")
676656
)
677657

0 commit comments

Comments
 (0)