Commit 8c83c8f

[TMVA][Python] Fix feature name handling for RBDT::LoadText
If the XGBoost model encodes feature names, they are also used in the `.txt` dump of the model. We have to use these names in `RBDT::LoadText` as well, so that the `.txt` file can be read back correctly without errors. The RBDT unit test is updated to cover this case of custom feature names, which arises when the training data comes from a pandas DataFrame. Also apply some suggestions from ruff. Closes #20267.
1 parent: 74e607e
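For context, a minimal sketch of how custom feature names arise (not part of this commit; the shapes, column names, and dump path below are illustrative): when an XGBoost sklearn estimator is fit on a pandas DataFrame, the booster records the column names, and the plain-text dump that RBDT::LoadText parses then refers to features by those names instead of the default "f0", "f1", ...

    import numpy as np
    import pandas
    import xgboost

    x = np.random.rand(100, 3)
    y = np.random.rand(100)
    df = pandas.DataFrame({f"myfeature_{i}": x[:, i] for i in range(3)})

    model = xgboost.XGBRegressor(n_estimators=1, max_depth=3)
    model.fit(df, y)

    # The booster keeps the DataFrame column names ...
    print(model.get_booster().feature_names)
    # -> ['myfeature_0', 'myfeature_1', 'myfeature_2']

    # ... and the text dump uses them in the split conditions,
    # e.g. "[myfeature_1<0.5]" rather than "[f1<0.5]".
    model.get_booster().dump_model("model_dump.txt")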

File tree: 3 files changed, +26 -18 lines changed

bindings/pyroot/pythonizations/python/ROOT/_pythonization/_tmva/_tree_inference.py

Lines changed: 7 additions & 8 deletions
@@ -8,11 +8,10 @@
 # For the list of contributors see $ROOTSYS/README/CREDITS. #
 ################################################################################
 
-from .. import pythonization
-import cppyy
-
 import json
 
+import cppyy
+
 
 def get_basescore(model):
     """Get base score from an XGBoost sklearn estimator.
@@ -60,7 +59,7 @@ def SaveXGBoost(xgb_model, key_name, output_path, num_inputs):
        "reg:squarederror": "identity",
    }
    model_objective = xgb_model.objective
-    if not model_objective in objective_map:
+    if model_objective not in objective_map:
        raise Exception(
            'XGBoost model has unsupported objective "{}". Supported objectives are {}.'.format(
                model_objective, objective_map.keys()
@@ -74,13 +73,13 @@ def SaveXGBoost(xgb_model, key_name, output_path, num_inputs):
    # Dump XGB model as json file
    xgb_model.get_booster().dump_model(output_path, dump_format="json")
 
-    with open(output_path, "r") as json_file:
-        forest = json.load(json_file)
-
    # Dump XGB model as txt file
    xgb_model.get_booster().dump_model(output_path)
 
-    features = cppyy.gbl.std.vector["std::string"]([f"f{i}" for i in range(num_inputs)])
+    if xgb_model.get_booster().feature_names is None:
+        features = cppyy.gbl.std.vector["std::string"]([f"f{i}" for i in range(num_inputs)])
+    else:
+        features = cppyy.gbl.std.vector["std::string"](xgb_model.get_booster().feature_names)
    bs = get_basescore(xgb_model)
    logistic = objective == "logistic"
    bdt = cppyy.gbl.TMVA.Experimental.RBDT.LoadText(
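With this change, the feature names passed to RBDT::LoadText match whatever the booster was trained with, so the .txt dump is parsed correctly with or without custom names. A hedged usage sketch of the round trip through the pythonized SaveXGBoost and RBDT (the file name, key name, and the final Compute call are illustrative and not taken from this diff; it assumes the numpy-aware RBDT.Compute pythonization):

    import numpy as np
    import pandas
    import ROOT
    import xgboost

    x = np.random.rand(200, 4)
    y = np.random.rand(200)
    df = pandas.DataFrame({f"myfeature_{i}": x[:, i] for i in range(4)})

    xgb = xgboost.XGBRegressor(n_estimators=1, max_depth=3)
    xgb.fit(df, y)  # the booster picks up the DataFrame column names

    # SaveXGBoost now forwards the booster's custom feature names to RBDT::LoadText.
    ROOT.TMVA.Experimental.SaveXGBoost(xgb, "myModel", "model.root", num_inputs=4)
    bdt = ROOT.TMVA.Experimental.RBDT("myModel", "model.root")

    # Assuming Compute accepts a 2D numpy array, this should agree with
    # xgb.predict(df) up to float precision.
    y_bdt = bdt.Compute(x)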

tmva/tmva/test/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
@@ -37,9 +37,12 @@ endif()
 
 if(dataframe)
   # Test uses the xgboost sklearn plugin, so we need to check for sklearn too.
+  # It also uses pandas to test the case where the training data is passed via
+  # a pandas DataFrame.
   ROOT_FIND_PYTHON_MODULE(xgboost)
+  ROOT_FIND_PYTHON_MODULE(pandas)
   ROOT_FIND_PYTHON_MODULE(sklearn)
-  if (ROOT_XGBOOST_FOUND AND ROOT_SKLEARN_FOUND)
+  if (ROOT_XGBOOST_FOUND AND ROOT_SKLEARN_FOUND AND ROOT_PANDAS_FOUND)
     ROOT_ADD_PYUNITTEST(rbdt_xgboost rbdt_xgboost.py)
   endif()
 endif()

tmva/tmva/test/rbdt_xgboost.py

Lines changed: 15 additions & 9 deletions
@@ -1,12 +1,9 @@
-# XGBoost has to be imported before ROOT to avoid crashes because of clashing
-# std::regexp symbols that are exported by cppyy.
-# See also: https://github.com/wlav/cppyy/issues/227
-import xgboost
-
 import unittest
-import ROOT
+
 import numpy as np
-import json
+import pandas
+import ROOT
+import xgboost
 
 np.random.seed(1234)
 
@@ -41,9 +38,18 @@ def _test_XGBRegression(label):
     """
     Compare response of XGB regressor and TMVA tree inference system.
     """
-    x, y = create_dataset(1000, 10, 1)
+    n_samples = 1000
+    n_features = 10
+    x, y = create_dataset(n_samples, n_features, 1)
+    # Other than in the XGBBinary test, we're passing the training features via
+    # a pandas DataFrame this time. In that case, XGBoost will define custom
+    # feature names according to the column names in the dataframe, and we can
+    # test the case where the feature names in the .txt dump are not the
+    # default "f0", "f1", "f2", etc.
+    df_x = pandas.DataFrame({f"myfeature_{i}": x[:, i] for i in range(n_features)})
+    assert len(x) == len(df_x)
     xgb = xgboost.XGBRegressor(n_estimators=1, max_depth=3)
-    xgb.fit(x, y)
+    xgb.fit(df_x, y)
     ROOT.TMVA.Experimental.SaveXGBoost(xgb, "myModel", "testXGBRegression{}.root".format(label), num_inputs=10)
     bdt = ROOT.TMVA.Experimental.RBDT("myModel", "testXGBRegression{}.root".format(label))
 
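The assertion that actually compares the two models is outside the hunks shown above. A sketch of what such a check could look like, reusing the names from the diff (the tolerance, the flattening, and the use of np.testing are assumptions, not part of the commit):

    # Evaluate both models on the same inputs and require numerical agreement.
    y_xgb = xgb.predict(df_x)
    y_bdt = np.asarray(bdt.Compute(x)).flatten()
    np.testing.assert_array_almost_equal(y_xgb, y_bdt, decimal=5)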