Skip to content

Commit 2503a54

Browse files
committed
enh: improve bdt notebook time
1 parent 762bf9f commit 2503a54

9 files changed

+52
-23
lines changed

.pre-commit-config.yaml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -30,11 +30,10 @@ repos:
3030
args: [ --extra-keys=metadata.language_info.codemirror_mode.version metadata.kernelspec metadata.language_info.pygments_lexer metadata.language_info.version ]
3131

3232

33-
34-
# needs rust, only activate if needed
33+
# needs rust, only activate if needed
3534
# - repo: https://github.com/shssoichiro/oxipng
36-
# rev: v9.1.2
35+
# rev: v9.1.3
3736
# hooks:
3837
# - id: oxipng
39-
# args: [ --best --strip all --quiet ]
38+
# args: [ "--", "--best", "--strip", "all", "--quiet" ]
4039

advanced-python/11AdvancedPython.ipynb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,8 @@
295295
"outputs": [],
296296
"source": [
297297
"# SOLUTION\n",
298+
"\n",
299+
"\n",
298300
"@contextlib.contextmanager\n",
299301
"def func(x):\n",
300302
" yield x\n",
@@ -479,6 +481,8 @@
479481
"outputs": [],
480482
"source": [
481483
"# SOLUTION\n",
484+
"\n",
485+
"\n",
482486
"def timed_func(func):\n",
483487
" def wrapped_func(*args, **kwargs):\n",
484488
" print(args)\n",

advanced-python/20DataAndPlotting.ipynb

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,11 @@
3232
{
3333
"cell_type": "code",
3434
"execution_count": null,
35-
"metadata": {},
35+
"metadata": {
36+
"jupyter": {
37+
"is_executing": true
38+
}
39+
},
3640
"outputs": [],
3741
"source": [
3842
"import mplhep\n",

advanced-python/32BoostingToUniformity.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@
146146
"classifiers['uBoost'] = uboost.uBoostClassifier(uniform_features=uniform_features, uniform_label=1,\n",
147147
" base_estimator=base_estimator,\n",
148148
" n_estimators=n_estimators, train_features=train_features,\n",
149-
" efficiency_steps=12, n_jobs=4)\n",
149+
" efficiency_steps=12, n_threads=4)\n",
150150
"\n",
151151
"flatnessloss = ugb.KnnFlatnessLossFunction(uniform_features, fl_coefficient=3., power=1.3, uniform_label=1)\n",
152152
"classifiers['uGB+FL'] = ugb.UGradientBoostingClassifier(loss=flatnessloss, max_depth=4,\n",

advanced-python/40Histograms.ipynb

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
"%store -r mc_df\n",
4343
"%store -r data_df\n",
4444
"\n",
45-
"import boost_histogram as bh\n",
45+
"import boost_histogram as bh # usually not needed\n",
4646
"import hist\n",
4747
"import mplhep\n",
4848
"import numpy as np"
@@ -61,8 +61,8 @@
6161
"# Let's get started with a simple example\n",
6262
"\n",
6363
"# Compose axis however you like; this is a 2D histogram\n",
64-
"h = bh.Histogram(bh.axis.Regular(2, 0, 1),\n",
65-
" bh.axis.Regular(4, 0.0, 1.0))\n",
64+
"h = hist.Hist(hist.axis.Regular(2, 0, 1),\n",
65+
" hist.axis.Regular(4, 0.0, 1.0))\n",
6666
"\n",
6767
"# Filling can be done with arrays, one per dimension\n",
6868
"h.fill([.3, .5, .2],\n",
@@ -131,7 +131,7 @@
131131
"outputs": [],
132132
"source": [
133133
"start, stop = data_df['Jpsi_M'].min(), data_df['Jpsi_M'].max()\n",
134-
"axis1 = hist.axis.Regular(bins=50, start=start, stop=stop, name=\"mass\")"
134+
"axis1 = hist.axis.Regular(bins=50, start=start, stop=stop, name=\"mass\", label=r\"$m(J/\\psi)$\")"
135135
]
136136
},
137137
{
@@ -558,5 +558,5 @@
558558
}
559559
},
560560
"nbformat": 4,
561-
"nbformat_minor": 1
561+
"nbformat_minor": 4
562562
}

advanced-python/45DemoReweighting.ipynb

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
"metadata": {},
2929
"outputs": [],
3030
"source": [
31+
"from xgboost import XGBClassifier\n",
32+
"\n",
3133
"%matplotlib inline\n",
3234
"\n",
3335
"import numpy as np\n",
@@ -231,8 +233,8 @@
231233
"metadata": {},
232234
"outputs": [],
233235
"source": [
234-
"reweighter = reweight.GBReweighter(n_estimators=250, learning_rate=0.1, max_depth=3, min_samples_leaf=1000,\n",
235-
" gb_args={'subsample': 0.4})\n",
236+
"reweighter = reweight.GBReweighter(n_estimators=50, learning_rate=0.1, max_depth=3, min_samples_leaf=1000,\n",
237+
" gb_args={'subsample': 0.7})\n",
236238
"reweighter.fit(original_train, target_train)\n",
237239
"\n",
238240
"gb_weights_test = reweighter.predict_weights(original_test)\n",
@@ -331,9 +333,9 @@
331333
"metadata": {},
332334
"outputs": [],
333335
"source": [
334-
"from sklearn.ensemble import GradientBoostingClassifier\n",
335336
"from sklearn.metrics import roc_auc_score\n",
336337
"from sklearn.model_selection import train_test_split\n",
338+
"from xgboost import XGBClassifier\n",
337339
"\n",
338340
"data = np.concatenate([original_test, target_test])\n",
339341
"labels = np.array([0] * len(original_test) + [1] * len(target_test))\n",
@@ -347,7 +349,7 @@
347349
"for name, new_weights in weights.items():\n",
348350
" W = np.concatenate([new_weights / new_weights.sum() * len(target_test), [1] * len(target_test)])\n",
349351
" Xtr, Xts, Ytr, Yts, Wtr, Wts = train_test_split(data, labels, W, random_state=42, train_size=0.51)\n",
350-
" clf = GradientBoostingClassifier(subsample=0.3, n_estimators=50).fit(Xtr, Ytr, sample_weight=Wtr)\n",
352+
" clf = XGBClassifier(subsample=0.8, n_estimators=50).fit(Xtr, Ytr, sample_weight=Wtr)\n",
351353
"\n",
352354
" print(name, roc_auc_score(Yts, clf.predict_proba(Xts)[:, 1], sample_weight=Wts))"
353355
]
@@ -428,8 +430,8 @@
428430
"outputs": [],
429431
"source": [
430432
"# define base reweighter\n",
431-
"reweighter_base = reweight.GBReweighter(n_estimators=80,\n",
432-
" learning_rate=0.01, max_depth=4, min_samples_leaf=100,\n",
433+
"reweighter_base = reweight.GBReweighter(n_estimators=40,\n",
434+
" learning_rate=0.02, max_depth=4, min_samples_leaf=100,\n",
433435
" gb_args={'subsample': 0.8})\n",
434436
"reweighter = reweight.FoldingReweighter(reweighter_base, n_folds=2)\n",
435437
"# there is no need to divide the data into train/test parts; the reweighter can be trained on the whole samples\n",
@@ -467,7 +469,7 @@
467469
"for name, new_weights in weights.items():\n",
468470
" W = np.concatenate([new_weights / new_weights.sum() * len(target), [1] * len(target)])\n",
469471
" Xtr, Xts, Ytr, Yts, Wtr, Wts = train_test_split(data, labels, W, random_state=42, train_size=0.51)\n",
470-
" clf = GradientBoostingClassifier(subsample=0.6, n_estimators=30).fit(Xtr, Ytr, sample_weight=Wtr)\n",
472+
" clf = XGBClassifier(subsample=0.6, n_estimators=30).fit(Xtr, Ytr, sample_weight=Wtr)\n",
471473
"\n",
472474
" print(name, roc_auc_score(Yts, clf.predict_proba(Xts)[:, 1], sample_weight=Wts))"
473475
]

advanced-python/50LikelihoodInference.ipynb

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,24 @@
213213
"plot_fit(model, data) # before the fit"
214214
]
215215
},
216+
{
217+
"cell_type": "markdown",
218+
"metadata": {},
219+
"source": [
220+
"### A quick plot\n",
221+
"\n",
222+
"Sometimes, it's useful to just quickly make a plot of the model without controlling all of the aspects. This can be done using the `plot` attribute of the model."
223+
]
224+
},
225+
{
226+
"cell_type": "code",
227+
"execution_count": null,
228+
"metadata": {},
229+
"outputs": [],
230+
"source": [
231+
"model.plot.plotpdf()"
232+
]
233+
},
216234
{
217235
"cell_type": "markdown",
218236
"metadata": {},

advanced-python/70ScikitHEPUniverse.ipynb

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
"\n",
99
"The [Scikit-HEP project](https://scikit-hep.org/) is a well maintained collection of HEP packages in Python that are useful for analysis. It contains tools ranging from plotting helpers, PDG lookup, DecayLanguage converters over high-performance histogramming libraries and ROOT I/O up to likelihood fitting and statistics libraries.\n",
1010
"\n",
11-
"This is a minimal overview over the packages that are available and have not yet been used in the other tutorials."
11+
"This is a minimal overview of the packages that are available and have not yet been used in the other tutorials.\n",
12+
"\n",
13+
"**If you are interested in becoming part of the Scikit-HEP community, have a look at the [project website](https://scikit-hep.org/).**"
1214
]
1315
},
1416
{

environment.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@ channels:
44
- nodefaults
55
dependencies:
66
- python ~=3.11.0
7+
- aiohttp # needed for uproot http access
78
- boost-histogram
89
- hep_ml
10+
- hepstats
911
- hepunits
1012
- hist
1113
- ipython
@@ -19,16 +21,14 @@ dependencies:
1921
- particle
2022
- pandoc
2123
- pip
22-
- uv
24+
- requests # needed for uproot http access
2325
- scikit-learn
2426
- scipy
2527
- uproot >=5.0.0 # 5.0.0 breaks the httpsource argument of open; TODO: upgrade the "get_truth" function accordingly (what's the equivalent?)
26-
- aiohttp # needed for uproot http access
27-
- requests # needed for uproot http access
28+
- uv
2829
- vector
2930
- wget
3031
# - xgboost
31-
- hepstats
3232
- pip:
3333
- zfit >=0.24.0 # to have the newest version, TensorFlow is a bit stuck: https://github.com/conda-forge/tensorflow-feedstock/pull/408
3434
- zfit-physics >=0.7.0

0 commit comments

Comments
 (0)