Skip to content

Commit 684a901

Browse files
author
SebastienMelo
committed
2 parents 222a3c2 + 5f74314 commit 684a901

File tree

75 files changed

+453
-383
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

75 files changed

+453
-383
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# exclude datasets and externals
22
notebooks/datasets
33
notebooks/joblib/
4+
wrap-up/
45

56
# jupyter-book
67
jupyter-book/_build

CONTRIBUTING.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ $ make notebooks/02_numerical_pipeline_scaling.ipynb
3939
- when saving the notebook inside Jupyter it will actually write to the `.py` file
4040

4141
In our experience, this workflow is less convenient (Visual Studio Code is a
42-
nicer developping environment) and also it tends to add some not very important
42+
nicer developing environment) and also it tends to add some not very important
4343
(and different on everyone's machine) metadata changes in the `.py` file, for
4444
example about jupytext version, Jupyter kernel, Python version, etc ...
4545

@@ -101,7 +101,7 @@ make full-index
101101
## JupyterBook
102102

103103
JupyterBook is the tool we use to generate our .github.io website from our
104-
`.py` and `.md` files (note than `.ipynb` files are not used in our JupyterBook
104+
`.py` and `.md` files (note that `.ipynb` files are not used in our JupyterBook
105105
setup).
106106

107107
```

Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
PYTHON_SCRIPTS_DIR = python_scripts
22
NOTEBOOKS_DIR = notebooks
33
JUPYTER_BOOK_DIR = jupyter-book
4+
WRAP_UP_DIR = wrap-up
45
JUPYTER_KERNEL := python3
56
MINIMAL_NOTEBOOK_FILES = $(shell ls $(PYTHON_SCRIPTS_DIR)/*.py | perl -pe "s@$(PYTHON_SCRIPTS_DIR)@$(NOTEBOOKS_DIR)@" | perl -pe "s@\.py@.ipynb@")
67

@@ -37,6 +38,10 @@ quizzes:
3738
full-index:
3839
python build_tools/generate-index.py
3940

41+
run-code-in-wrap-up-quizzes:
42+
python build_tools/generate-wrap-up.py $(GITLAB_REPO_JUPYTERBOOK_DIR) $(WRAP_UP_DIR)
43+
jupytext --execute --to notebook $(WRAP_UP_DIR)/*.py
44+
4045
$(JUPYTER_BOOK_DIR):
4146
jupyter-book build $(JUPYTER_BOOK_DIR)
4247
rm -rf $(JUPYTER_BOOK_DIR)/_build/html/{slides,figures} && cp -r slides figures $(JUPYTER_BOOK_DIR)/_build/html

build_tools/generate-wrap-up.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
import sys
2+
import os
3+
import glob
4+
5+
6+
def extract_python_code_blocks(md_file_path):
    """Collect the contents of every ```python fenced block in a markdown file.

    Args:
        md_file_path (str): Path to the markdown file

    Returns:
        list: One string per fenced Python block, in order of appearance
    """
    blocks = []
    # ``buffer`` is None while outside a python fence; a list of lines inside.
    buffer = None

    with open(md_file_path, "r", encoding="utf-8") as stream:
        for raw_line in stream:
            text = raw_line.rstrip("\n")
            stripped = text.strip()

            if stripped == "```python":
                # Opening fence; a repeated opening fence restarts the block.
                buffer = []
            elif stripped == "```" and buffer is not None:
                # Closing fence: flush the accumulated lines as one block.
                blocks.append("\n".join(buffer))
                buffer = None
            elif buffer is not None:
                buffer.append(text)

    # NOTE: an unterminated ```python fence at EOF is silently dropped,
    # matching the original behavior.
    return blocks
34+
35+
36+
def write_jupyter_notebook_file(
    code_blocks, output_file="notebook_from_md.py"
):
    """Write code blocks to a ``.py`` file in jupytext percent-cell format.

    Args:
        code_blocks (list): Code blocks to emit, one notebook cell each
        output_file (str): Path of the file to create (overwritten if present)
    """
    # Assemble the whole document in memory, then write it in a single call.
    parts = [
        "# %% [markdown] \n # ## Notebook generated from Markdown file\n\n"
    ]
    parts.extend(
        f"# %% [markdown]\n# ## Cell {index}\n\n# %%\n{code}\n\n"
        for index, code in enumerate(code_blocks, 1)
    )

    with open(output_file, "w", encoding="utf-8") as out:
        out.write("".join(parts))

    print(
        f"Successfully wrote {len(code_blocks)} code cells to"
        f" {output_file}"
    )
58+
59+
60+
def process_quiz_files(input_path, output_dir):
    """
    Process all wrap_up_quiz files in the input path and convert them to
    notebooks.

    Args:
        input_path (str): Path to look for wrap_up_quiz files in subfolders
        output_dir (str): Directory to write the generated notebooks
    """
    # Create output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
        print(f"Created output directory: {output_dir}")

    # Find all files containing "wrap_up_quiz" in their name in the input
    # path subfolders
    quiz_files = glob.glob(
        f"{input_path}/**/*wrap_up_quiz*.md", recursive=True
    )

    if not quiz_files:
        print(f"No wrap_up_quiz.md files found in {input_path} subfolders.")
        return

    print(f"Found {len(quiz_files)} wrap_up_quiz files to process.")

    # Process each file
    for md_file_path in quiz_files:
        print(f"\nProcessing: {md_file_path}")

        # Extract code blocks
        code_blocks = extract_python_code_blocks(md_file_path)

        # Name the output after the first path component below input_path.
        # The previous ``md_file_path.split(os.sep)[3]`` hard-coded a path
        # depth and silently picked the wrong component (or raised
        # IndexError) for any other input_path.
        rel_path = os.path.relpath(md_file_path, input_path)
        subfolder = rel_path.split(os.sep)[0]
        output_file = os.path.join(output_dir, f"{subfolder}_wrap_up_quiz.py")

        # Display results and write notebook file
        if code_blocks:
            print(f"Found {len(code_blocks)} Python code blocks")
            write_jupyter_notebook_file(code_blocks, output_file=output_file)
        else:
            print(f"No Python code blocks found in {md_file_path}.")
101+
102+
103+
if __name__ == "__main__":
    # Fail with a usage message instead of an IndexError traceback when
    # the positional arguments are missing. Extra arguments are still
    # ignored, as before.
    if len(sys.argv) < 3:
        sys.exit(
            "Usage: python generate-wrap-up.py <input_path> <output_dir>"
        )

    input_path = sys.argv[1]
    output_dir = sys.argv[2]

    process_quiz_files(input_path, output_dir)

full-index.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -192,7 +192,7 @@
192192
"\n",
193193
"* [🎥 Intuitions on ensemble models: boosting](https://inria.github.io/scikit-learn-mooc/ensemble/boosting_slides.html)\n",
194194
"* [Adaptive Boosting (AdaBoost)](notebooks/ensemble_adaboost.ipynb)\n",
195-
"* [Gradient-boosting decision tree (GBDT)](notebooks/ensemble_gradient_boosting.ipynb)\n",
195+
"* [Gradient-boosting decision tree](notebooks/ensemble_gradient_boosting.ipynb)\n",
196196
"* [📝 Exercise M6.03](notebooks/ensemble_ex_03.ipynb)\n",
197197
"* [📃 Solution for Exercise M6.03](notebooks/ensemble_sol_03.ipynb)\n",
198198
"* [Speeding-up gradient-boosting](notebooks/ensemble_hist_gradient_boosting.ipynb)\n",

jupyter-book/appendix/glossary.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,7 @@ The dataset used to train the [model](#model).
368368

369369
An [estimator](#estimator) (i.e. an object that has a `fit` method) supporting
370370
`transform` and/or `fit_transform`. Examples for transformers are
371-
`StandardScaler` or `ColumnTransformer`.
371+
`StandardScaler` or `OneHotEncoder`.
372372

373373
### underfitting
374374

notebooks/01_tabular_data_exploration.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@
102102
"cell_type": "markdown",
103103
"metadata": {},
104104
"source": [
105-
"An alternative is to omit the `head` method. This would output the intial and\n",
105+
"An alternative is to omit the `head` method. This would output the initial and\n",
106106
"final rows and columns, but everything in between is not shown by default. It\n",
107107
"also provides the dataframe's dimensions at the bottom in the format `n_rows`\n",
108108
"x `n_columns`."

notebooks/03_categorical_pipeline_column_transformer.ipynb

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,10 @@
9696
" categories.\n",
9797
"* **numerical scaling** numerical features which will be standardized.\n",
9898
"\n",
99-
"Now, we create our `ColumnTransfomer` by specifying three values: the\n",
100-
"preprocessor name, the transformer, and the columns. First, let's create the\n",
101-
"preprocessors for the numerical and categorical parts."
99+
"Now, we create our `ColumnTransfomer` using the helper function\n",
100+
"`make_column_transformer`. We specify two values: the transformer, and the\n",
101+
"columns. First, let's create the preprocessors for the numerical and\n",
102+
"categorical parts."
102103
]
103104
},
104105
{
@@ -127,13 +128,11 @@
127128
"metadata": {},
128129
"outputs": [],
129130
"source": [
130-
"from sklearn.compose import ColumnTransformer\n",
131+
"from sklearn.compose import make_column_transformer\n",
131132
"\n",
132-
"preprocessor = ColumnTransformer(\n",
133-
" [\n",
134-
" (\"one-hot-encoder\", categorical_preprocessor, categorical_columns),\n",
135-
" (\"standard_scaler\", numerical_preprocessor, numerical_columns),\n",
136-
" ]\n",
133+
"preprocessor = make_column_transformer(\n",
134+
" (categorical_preprocessor, categorical_columns),\n",
135+
" (numerical_preprocessor, numerical_columns),\n",
137136
")"
138137
]
139138
},
@@ -365,8 +364,8 @@
365364
" handle_unknown=\"use_encoded_value\", unknown_value=-1\n",
366365
")\n",
367366
"\n",
368-
"preprocessor = ColumnTransformer(\n",
369-
" [(\"categorical\", categorical_preprocessor, categorical_columns)],\n",
367+
"preprocessor = make_column_transformer(\n",
368+
" (categorical_preprocessor, categorical_columns),\n",
370369
" remainder=\"passthrough\",\n",
371370
")\n",
372371
"\n",

notebooks/03_categorical_pipeline_ex_02.ipynb

Lines changed: 5 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -82,18 +82,19 @@
8282
"\n",
8383
"from sklearn.model_selection import cross_validate\n",
8484
"from sklearn.pipeline import make_pipeline\n",
85-
"from sklearn.compose import ColumnTransformer\n",
85+
"from sklearn.compose import make_column_transformer\n",
8686
"from sklearn.preprocessing import OrdinalEncoder\n",
8787
"from sklearn.ensemble import HistGradientBoostingClassifier\n",
8888
"\n",
8989
"categorical_preprocessor = OrdinalEncoder(\n",
9090
" handle_unknown=\"use_encoded_value\", unknown_value=-1\n",
9191
")\n",
92-
"preprocessor = ColumnTransformer(\n",
93-
" [(\"categorical\", categorical_preprocessor, categorical_columns)],\n",
92+
"preprocessor = make_column_transformer(\n",
93+
" (categorical_preprocessor, categorical_columns),\n",
9494
" remainder=\"passthrough\",\n",
9595
")\n",
9696
"\n",
97+
"\n",
9798
"model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n",
9899
"\n",
99100
"start = time.time()\n",
@@ -160,26 +161,7 @@
160161
"cell_type": "markdown",
161162
"metadata": {},
162163
"source": [
163-
"### Analysis\n",
164-
"\n",
165-
"From an accuracy point of view, the result is almost exactly the same. The\n",
166-
"reason is that `HistGradientBoostingClassifier` is expressive and robust\n",
167-
"enough to deal with misleading ordering of integer coded categories (which was\n",
168-
"not the case for linear models).\n",
169-
"\n",
170-
"However from a computation point of view, the training time is much longer:\n",
171-
"this is caused by the fact that `OneHotEncoder` generates more features than\n",
172-
"`OrdinalEncoder`; for each unique categorical value a column is created.\n",
173-
"\n",
174-
"Note that the current implementation `HistGradientBoostingClassifier` is still\n",
175-
"incomplete, and once sparse representation are handled correctly, training\n",
176-
"time might improve with such kinds of encodings.\n",
177-
"\n",
178-
"The main take away message is that arbitrary integer coding of categories is\n",
179-
"perfectly fine for `HistGradientBoostingClassifier` and yields fast training\n",
180-
"times.\n",
181-
"\n",
182-
"Which encoder should I use?\n",
164+
"## Which encoder should I use?\n",
183165
"\n",
184166
"| | Meaningful order | Non-meaningful order |\n",
185167
"| ---------------- | ----------------------------- | -------------------- |\n",

notebooks/03_categorical_pipeline_sol_02.ipynb

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -82,18 +82,19 @@
8282
"\n",
8383
"from sklearn.model_selection import cross_validate\n",
8484
"from sklearn.pipeline import make_pipeline\n",
85-
"from sklearn.compose import ColumnTransformer\n",
85+
"from sklearn.compose import make_column_transformer\n",
8686
"from sklearn.preprocessing import OrdinalEncoder\n",
8787
"from sklearn.ensemble import HistGradientBoostingClassifier\n",
8888
"\n",
8989
"categorical_preprocessor = OrdinalEncoder(\n",
9090
" handle_unknown=\"use_encoded_value\", unknown_value=-1\n",
9191
")\n",
92-
"preprocessor = ColumnTransformer(\n",
93-
" [(\"categorical\", categorical_preprocessor, categorical_columns)],\n",
92+
"preprocessor = make_column_transformer(\n",
93+
" (categorical_preprocessor, categorical_columns),\n",
9494
" remainder=\"passthrough\",\n",
9595
")\n",
9696
"\n",
97+
"\n",
9798
"model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n",
9899
"\n",
99100
"start = time.time()\n",
@@ -130,17 +131,12 @@
130131
"\n",
131132
"from sklearn.preprocessing import StandardScaler\n",
132133
"\n",
133-
"preprocessor = ColumnTransformer(\n",
134-
" [\n",
135-
" (\"numerical\", StandardScaler(), numerical_columns),\n",
136-
" (\n",
137-
" \"categorical\",\n",
138-
" OrdinalEncoder(\n",
139-
" handle_unknown=\"use_encoded_value\", unknown_value=-1\n",
140-
" ),\n",
141-
" categorical_columns,\n",
142-
" ),\n",
143-
" ]\n",
134+
"preprocessor = make_column_transformer(\n",
135+
" (StandardScaler(), numerical_columns),\n",
136+
" (\n",
137+
" OrdinalEncoder(handle_unknown=\"use_encoded_value\", unknown_value=-1),\n",
138+
" categorical_columns,\n",
139+
" ),\n",
144140
")\n",
145141
"\n",
146142
"model = make_pipeline(preprocessor, HistGradientBoostingClassifier())\n",
@@ -209,8 +205,8 @@
209205
"categorical_preprocessor = OneHotEncoder(\n",
210206
" handle_unknown=\"ignore\", sparse_output=False\n",
211207
")\n",
212-
"preprocessor = ColumnTransformer(\n",
213-
" [(\"one-hot-encoder\", categorical_preprocessor, categorical_columns)],\n",
208+
"preprocessor = make_column_transformer(\n",
209+
" (categorical_preprocessor, categorical_columns),\n",
214210
" remainder=\"passthrough\",\n",
215211
")\n",
216212
"\n",

0 commit comments

Comments
 (0)