From 3a2a33455e9554077bb981c165bdfafef662ea20 Mon Sep 17 00:00:00 2001
From: xadupre <xadupre@microsoft.com>
Date: Thu, 6 Jun 2024 15:42:30 +0000
Subject: [PATCH] Add a unittest for investigation

Signed-off-by: xadupre <xadupre@microsoft.com>
---
 .gitignore                               |  1 +
 tests/xgboost/test_xgboost_issues_big.py | 93 ++++++++++++++++++++++++
 2 files changed, 94 insertions(+)
 create mode 100644 tests/xgboost/test_xgboost_issues_big.py

diff --git a/.gitignore b/.gitignore
index f0552d9b7..5780f770e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -23,6 +23,7 @@ build/
 *.bat
 
 # test generated files
+*.pkl
 .pytest_cache
 .cache
 tests/temp
diff --git a/tests/xgboost/test_xgboost_issues_big.py b/tests/xgboost/test_xgboost_issues_big.py
new file mode 100644
index 000000000..0ed24f15c
--- /dev/null
+++ b/tests/xgboost/test_xgboost_issues_big.py
@@ -0,0 +1,93 @@
+# SPDX-License-Identifier: Apache-2.0
+
+import unittest
+
+
+class TestXGBoostIssuesBig(unittest.TestCase):
+    def test_issue_early_stop(self):
+        import os
+        import pickle
+        import onnxruntime
+        import numpy as np
+        from numpy.testing import assert_almost_equal
+        from sklearn.model_selection import train_test_split
+        from sklearn.datasets import make_classification
+        from skl2onnx import convert_sklearn
+        from skl2onnx.common.data_types import FloatTensorType
+        from skl2onnx import update_registered_converter
+        from skl2onnx.common.shape_calculator import (
+            calculate_linear_classifier_output_shapes,
+        )
+        from onnxmltools.convert.xgboost.operator_converters.XGBoost import (
+            convert_xgboost,
+        )
+        from xgboost import XGBClassifier
+
+        update_registered_converter(
+            XGBClassifier,
+            "XGBoostXGBClassifier",
+            calculate_linear_classifier_output_shapes,
+            convert_xgboost,
+            options={"nocl": [True, False], "zipmap": [True, False, "columns"]},
+        )
+
+        filename = "test_issue_early_stop.pkl"
+        if os.path.exists(filename):
+            with open(filename, "rb") as f:
+                data = pickle.load(f)
+        else:
+            X, y = make_classification(100000, n_features=20, random_state=42)
+            X_train, X_test, y_train, y_test = train_test_split(X, y)
+            X_train, X_val, y_train, y_val = train_test_split(X_train, y_train)
+
+            print(f"training a model X_train.shape={X_train.shape}, X_test={X_test.shape}")
+
+            model = XGBClassifier(
+                n_estimators=7500, max_depth=10, early_stopping_rounds=250
+            )
+            model.fit(
+                X_train,
+                y_train,
+                eval_set=[(X_val, y_val)],
+                eval_metric="auc",
+                verbose=1,
+            )
+
+            data = dict(
+                X_train=X_train,
+                X_test=X_test,
+                X_val=X_val,
+                y_train=y_train,
+                y_test=y_test,
+                y_val=y_val,
+                model=model,
+            )
+            with open(filename, "wb") as f:
+                pickle.dump(data, f)
+
+        # Define input type (adjust shape according to your input)
+        X_test, model = data["X_test"], data["model"]
+        X_test = X_test[:10]
+        initial_type = [("float_input", FloatTensorType([None, X_test.shape[1]]))]
+        proba = model.predict_proba(X_test)
+        print(proba)
+
+        # Convert XGBoost model to ONNX
+        onnx_model = convert_sklearn(
+            model,
+            initial_types=initial_type,
+            target_opset={"": 18, "ai.onnx.ml": 3},
+            options={"zipmap": False},
+        )
+
+        sess = onnxruntime.InferenceSession(
+            onnx_model.SerializeToString(), providers=["CPUExecutionProvider"]
+        )
+        got = sess.run(None, {"float_input": X_test[:10].astype(np.float32)})
+        onnx_proba = got[1]
+        print(onnx_proba)
+        assert_almost_equal(proba, onnx_proba)
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)