Skip to content

Commit 684a901

Browse files
author
SebastienMelo
committed
2 parents 222a3c2 + 5f74314 commit 684a901

File tree

75 files changed

+453
-383
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+453
-383
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# exclude datasets and externals
22
notebooks/datasets
33
notebooks/joblib/
4+
wrap-up/
45

56
# jupyter-book
67
jupyter-book/_build

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ $ make notebooks/02_numerical_pipeline_scaling.ipynb
3939
- when saving the notebook inside Jupyter it will actually write to the `.py` file
4040

4141
In our experience, this workflow is less convenient (Visual Studio Code is a
42-
nicer developping environment) and also it tends to add some not very important
42+
nicer developing environment) and also it tends to add some not very important
4343
(and different on everyone's machine) metadata changes in the `.py` file, for
4444
example about jupytext version, Jupyter kernel, Python version, etc ...
4545

@@ -101,7 +101,7 @@ make full-index
101101
## JupyterBook
102102

103103
JupyterBook is the tool we use to generate our .github.io website from our
104-
`.py` and `.md` files (note than `.ipynb` files are not used in our JupyterBook
104+
`.py` and `.md` files (note that `.ipynb` files are not used in our JupyterBook
105105
setup).
106106

107107
```

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
PYTHON_SCRIPTS_DIR = python_scripts
22
NOTEBOOKS_DIR = notebooks
33
JUPYTER_BOOK_DIR = jupyter-book
4+
WRAP_UP_DIR = wrap-up
45
JUPYTER_KERNEL := python3
56
MINIMAL_NOTEBOOK_FILES = $(shell ls $(PYTHON_SCRIPTS_DIR)/*.py | perl -pe "s@$(PYTHON_SCRIPTS_DIR)@$(NOTEBOOKS_DIR)@" | perl -pe "s@\.py@.ipynb@")
67

@@ -37,6 +38,10 @@ quizzes:
3738
full-index:
3839
python build_tools/generate-index.py
3940

41+
run-code-in-wrap-up-quizzes:
42+
python build_tools/generate-wrap-up.py $(GITLAB_REPO_JUPYTERBOOK_DIR) $(WRAP_UP_DIR)
43+
jupytext --execute --to notebook $(WRAP_UP_DIR)/*.py
44+
4045
$(JUPYTER_BOOK_DIR):
4146
jupyter-book build $(JUPYTER_BOOK_DIR)
4247
rm -rf $(JUPYTER_BOOK_DIR)/_build/html/{slides,figures} && cp -r slides figures $(JUPYTER_BOOK_DIR)/_build/html

build_tools/generate-wrap-up.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import sys
2+
import os
3+
import glob
4+
5+
6+
def extract_python_code_blocks(md_file_path):
    """Collect the contents of every ```python fenced block in a markdown file.

    Args:
        md_file_path (str): Path to the markdown file

    Returns:
        list: One string per fenced Python block, in order of appearance
    """
    blocks = []
    # ``buffer`` is None while outside a python fence; a list of lines inside.
    buffer = None

    with open(md_file_path, "r", encoding="utf-8") as stream:
        for raw_line in stream:
            text = raw_line.rstrip("\n")
            stripped = text.strip()

            if stripped == "```python":
                # Opening fence; a repeated opening fence restarts the block.
                buffer = []
            elif stripped == "```" and buffer is not None:
                # Closing fence: flush the accumulated lines as one block.
                blocks.append("\n".join(buffer))
                buffer = None
            elif buffer is not None:
                buffer.append(text)

    # NOTE: an unterminated ```python fence at EOF is silently dropped,
    # matching the original behavior.
    return blocks
34+
35+
36+
def write_jupyter_notebook_file(
    code_blocks, output_file="notebook_from_md.py"
):
    """Write code blocks to a ``.py`` file in jupytext percent-cell format.

    Args:
        code_blocks (list): Code blocks to emit, one notebook cell each
        output_file (str): Path of the file to create (overwritten if present)
    """
    # Assemble the whole document in memory, then write it in a single call.
    parts = [
        "# %% [markdown] \n # ## Notebook generated from Markdown file\n\n"
    ]
    parts.extend(
        f"# %% [markdown]\n# ## Cell {index}\n\n# %%\n{code}\n\n"
        for index, code in enumerate(code_blocks, 1)
    )

    with open(output_file, "w", encoding="utf-8") as out:
        out.write("".join(parts))

    print(
        f"Successfully wrote {len(code_blocks)} code cells to"
        f" {output_file}"
    )
58+
59+
60+
def process_quiz_files(input_path, output_dir):
    """
    Process all wrap_up_quiz files in the input path and convert them to
    notebooks.

    Args:
        input_path (str): Path to look for wrap_up_quiz files in subfolders
        output_dir (str): Directory to write the generated notebooks
    """
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    # Find all files containing "wrap_up_quiz" in their name in the input
    # path subfolders
    quiz_files = glob.glob(
        f"{input_path}/**/*wrap_up_quiz*.md", recursive=True
    )

    if not quiz_files:
        print(f"No wrap_up_quiz.md files found in {input_path} subfolders.")
        return

    print(f"Found {len(quiz_files)} wrap_up_quiz files to process.")

    # Process each file
    for md_file_path in quiz_files:
        print(f"\nProcessing: {md_file_path}")

        # Extract code blocks
        code_blocks = extract_python_code_blocks(md_file_path)

        # Name the output after the first path component below input_path.
        # The previous ``md_file_path.split(os.sep)[3]`` hard-coded a path
        # depth and silently picked the wrong component (or raised
        # IndexError) for any other input_path.
        rel_path = os.path.relpath(md_file_path, input_path)
        subfolder = rel_path.split(os.sep)[0]
        output_file = os.path.join(output_dir, f"{subfolder}_wrap_up_quiz.py")

        # Display results and write notebook file
        if code_blocks:
            print(f"Found {len(code_blocks)} Python code blocks")
            write_jupyter_notebook_file(code_blocks, output_file=output_file)
        else:
            print(f"No Python code blocks found in {md_file_path}.")
101+
102+
103+
if __name__ == "__main__":
    # Fail with a usage message instead of an IndexError traceback when
    # the positional arguments are missing. Extra arguments are still
    # ignored, as before.
    if len(sys.argv) < 3:
        sys.exit(
            "Usage: python generate-wrap-up.py <input_path> <output_dir>"
        )

    input_path = sys.argv[1]
    output_dir = sys.argv[2]

    process_quiz_files(input_path, output_dir)

full-index.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@
192192
"\n",
193193
"* [🎥 Intuitions on ensemble models: boosting](https://inria.github.io/scikit-learn-mooc/ensemble/boosting_slides.html)\n",
194194
"* [Adaptive Boosting (AdaBoost)](notebooks/ensemble_adaboost.ipynb)\n",
195-
"* [Gradient-boosting decision tree (GBDT)](notebooks/ensemble_gradient_boosting.ipynb)\n",
195+
"* [Gradient-boosting decision tree](notebooks/ensemble_gradient_boosting.ipynb)\n",
196196
"* [📝 Exercise M6.03](notebooks/ensemble_ex_03.ipynb)\n",
197197
"* [📃 Solution for Exercise M6.03](notebooks/ensemble_sol_03.ipynb)\n",
198198
"* [Speeding-up gradient-boosting](notebooks/ensemble_hist_gradient_boosting.ipynb)\n",

jupyter-book/appendix/glossary.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ The dataset used to train the [model](#model).
368368

369369
An [estimator](#estimator) (i.e. an object that has a `fit` method) supporting
370370
`transform` and/or `fit_transform`. Examples for transformers are
371-
`StandardScaler` or `ColumnTransformer`.
371+
`StandardScaler` or `OneHotEncoder`.
372372

373373
### underfitting
374374

notebooks/01_tabular_data_exploration.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@
102102
"cell_type": "markdown",
103103
"metadata": {},
104104
"source": [
105-
"An alternative is to omit the `head` method. This would output the intial and\n",
105+
"An alternative is to omit the `head` method. This would output the initial and\n",
106106
"final rows and columns, but everything in between is not shown by default. It\n",
107107
"also provides the dataframe's dimensions at the bottom in the format `n_rows`\n",
108108
"x `n_columns`."

notebooks/03_categorical_pipeline_column_transformer.ipynb

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,10 @@
9696
" categories.\n",
9797
"* **numerical scaling** numerical features which will be standardized.\n",
9898
"\n",
99-
"Now, we create our `ColumnTransfomer` by specifying three values: the\n",
100-
"preprocessor name, the transformer, and the columns. First, let's create the\n",
101-
"preprocessors for the numerical and categorical parts."
99+
"Now, we create our `ColumnTransfomer` using the helper function\n",
100+
"`make_column_transformer`. We specify two values: the transformer, and the\n",
101+
"columns. First, let's create the preprocessors for the numerical and\n",
102+
"categorical parts."
102103
]
103104
},
104105
{
@@ -127,13 +128,11 @@
127128
"metadata": {},
128129
"outputs": [],
129130
"source": [
130-
"from sklearn.compose import ColumnTransformer\n",
131+
"from sklearn.compose import make_column_transformer\n",
131132
"\n",
132-
"preprocessor = ColumnTransformer(\n",
133-
" [\n",
134-
" (\"one-hot-encoder\", categorical_preprocessor, categorical_columns),\n",
135-
" (\"standard_scaler\", numerical_preprocessor, numerical_columns),\n",
136-
" ]\n",
133+
"preprocessor = make_column_transformer(\n",
134+
" (categorical_preprocessor, categorical_columns),\n",
135+
" (numerical_preprocessor, numerical_columns),\n",
137136
")"
138137
]
139138
},
@@ -365,8 +364,8 @@
365364
" handle_unknown=\"use_encoded_value\", unknown_value=-1\n",
366365
")\n",
367366
"\n",
368-
"preprocessor = ColumnTransformer(\n",
369-
" [(\"categorical\", categorical_preprocessor, categorical_columns)],\n",
367+
"preprocessor = make_column_transformer(\n",
368+
" (categorical_preprocessor, categorical_columns),\n",
370369
" remainder=\"passthrough\",\n",
371370
")\n",
372371
"\n",

notebooks/03_categorical_pipeline_ex_02.ipynb

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -82,18 +82,19 @@
8282
"\n",
8383
"from sklearn.model_selection import cross_validate\n",
8484
"from sklearn.pipeline import make_pipeline\n",
85-
"from sklearn.compose import ColumnTransformer\n",
85+
"from sklearn.compose import make_column_transformer\n",
8686
"from sklearn.preprocessing import OrdinalEncoder\n",
8787
"from sklearn.ensemble import HistGradientBoostingClassifier\n",
8888
"\n",
8989
"categorical_preprocessor = OrdinalEncoder(\n",
9090
" handle_unknown=\"use_encoded_value\", unknown_value=-1\n",
9191
")\n",
92-
"preprocessor = ColumnTransformer(\n",
93-
" [(\"categorical\", categorical_preprocessor, categorical_columns)],\n",
92+
"preprocessor = make_column_transformer(\n",
93+
" (categorical_preprocessor, categorical_columns),\n",
9494
" remainder=\"passthrough\",\n",
9595
")\n",
9696
"\n",
97+
"\n",
9798
"model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n",
9899
"\n",
99100
"start = time.time()\n",
@@ -160,26 +161,7 @@
160161
"cell_type": "markdown",
161162
"metadata": {},
162163
"source": [
163-
"### Analysis\n",
164-
"\n",
165-
"From an accuracy point of view, the result is almost exactly the same. The\n",
166-
"reason is that `HistGradientBoostingClassifier` is expressive and robust\n",
167-
"enough to deal with misleading ordering of integer coded categories (which was\n",
168-
"not the case for linear models).\n",
169-
"\n",
170-
"However from a computation point of view, the training time is much longer:\n",
171-
"this is caused by the fact that `OneHotEncoder` generates more features than\n",
172-
"`OrdinalEncoder`; for each unique categorical value a column is created.\n",
173-
"\n",
174-
"Note that the current implementation `HistGradientBoostingClassifier` is still\n",
175-
"incomplete, and once sparse representation are handled correctly, training\n",
176-
"time might improve with such kinds of encodings.\n",
177-
"\n",
178-
"The main take away message is that arbitrary integer coding of categories is\n",
179-
"perfectly fine for `HistGradientBoostingClassifier` and yields fast training\n",
180-
"times.\n",
181-
"\n",
182-
"Which encoder should I use?\n",
164+
"## Which encoder should I use?\n",
183165
"\n",
184166
"| | Meaningful order | Non-meaningful order |\n",
185167
"| ---------------- | ----------------------------- | -------------------- |\n",

notebooks/03_categorical_pipeline_sol_02.ipynb

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -82,18 +82,19 @@
8282
"\n",
8383
"from sklearn.model_selection import cross_validate\n",
8484
"from sklearn.pipeline import make_pipeline\n",
85-
"from sklearn.compose import ColumnTransformer\n",
85+
"from sklearn.compose import make_column_transformer\n",
8686
"from sklearn.preprocessing import OrdinalEncoder\n",
8787
"from sklearn.ensemble import HistGradientBoostingClassifier\n",
8888
"\n",
8989
"categorical_preprocessor = OrdinalEncoder(\n",
9090
" handle_unknown=\"use_encoded_value\", unknown_value=-1\n",
9191
")\n",
92-
"preprocessor = ColumnTransformer(\n",
93-
" [(\"categorical\", categorical_preprocessor, categorical_columns)],\n",
92+
"preprocessor = make_column_transformer(\n",
93+
" (categorical_preprocessor, categorical_columns),\n",
9494
" remainder=\"passthrough\",\n",
9595
")\n",
9696
"\n",
97+
"\n",
9798
"model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n",
9899
"\n",
99100
"start = time.time()\n",
@@ -130,17 +131,12 @@
130131
"\n",
131132
"from sklearn.preprocessing import StandardScaler\n",
132133
"\n",
133-
"preprocessor = ColumnTransformer(\n",
134-
" [\n",
135-
" (\"numerical\", StandardScaler(), numerical_columns),\n",
136-
" (\n",
137-
" \"categorical\",\n",
138-
" OrdinalEncoder(\n",
139-
" handle_unknown=\"use_encoded_value\", unknown_value=-1\n",
140-
" ),\n",
141-
" categorical_columns,\n",
142-
" ),\n",
143-
" ]\n",
134+
"preprocessor = make_column_transformer(\n",
135+
" (StandardScaler(), numerical_columns),\n",
136+
" (\n",
137+
" OrdinalEncoder(handle_unknown=\"use_encoded_value\", unknown_value=-1),\n",
138+
" categorical_columns,\n",
139+
" ),\n",
144140
")\n",
145141
"\n",
146142
"model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n",
@@ -209,8 +205,8 @@
209205
"categorical_preprocessor = OneHotEncoder(\n",
210206
" handle_unknown=\"ignore\", sparse_output=False\n",
211207
")\n",
212-
"preprocessor = ColumnTransformer(\n",
213-
" [(\"one-hot-encoder\", categorical_preprocessor, categorical_columns)],\n",
208+
"preprocessor = make_column_transformer(\n",
209+
" (categorical_preprocessor, categorical_columns),\n",
214210
" remainder=\"passthrough\",\n",
215211
")\n",
216212
"\n",

0 commit comments

Comments
 (0)