diff --git a/src/model_loader/detail/xgboost.cc b/src/model_loader/detail/xgboost.cc index 52ce1e89..92e29419 100644 --- a/src/model_loader/detail/xgboost.cc +++ b/src/model_loader/detail/xgboost.cc @@ -16,6 +16,10 @@ #include #include +#include + +#include "./string_utils.h" + namespace treelite::model_loader { namespace detail::xgboost { @@ -55,6 +59,25 @@ double TransformBaseScoreToMargin(std::string const& postprocessor, double base_ } } +std::vector ParseBaseScore(std::string const& str) { + std::vector parsed_base_score; + if (StringStartsWith(str, "[")) { + // Vector base_score (from XGBoost 3.1+) + rapidjson::Document doc; + doc.Parse(str); + TREELITE_CHECK(doc.IsArray()) << "Expected an array for base_score"; + parsed_base_score.clear(); + for (auto const& e : doc.GetArray()) { + TREELITE_CHECK(e.IsFloat()) << "Expected a float array for base_score"; + parsed_base_score.push_back(e.GetFloat()); + } + } else { + // Scalar base_score (from XGBoost <3.1) + parsed_base_score = {std::stof(str)}; + } + return parsed_base_score; +} + } // namespace detail::xgboost std::string DetectXGBoostFormat(std::string const& filename) { diff --git a/src/model_loader/detail/xgboost.h b/src/model_loader/detail/xgboost.h index e936f16e..e6f4733e 100644 --- a/src/model_loader/detail/xgboost.h +++ b/src/model_loader/detail/xgboost.h @@ -28,6 +28,9 @@ std::string GetPostProcessor(std::string const& objective_name); // Transform base score from probability into margin score double TransformBaseScoreToMargin(std::string const& postprocessor, double base_score); +// Parse base score +std::vector ParseBaseScore(std::string const& str); + enum FeatureType { kNumerical = 0, kCategorical = 1 }; } // namespace treelite::model_loader::detail::xgboost diff --git a/src/model_loader/detail/xgboost_json/delegated_handler.cc b/src/model_loader/detail/xgboost_json/delegated_handler.cc index 15fcb782..b6be7a67 100644 --- a/src/model_loader/detail/xgboost_json/delegated_handler.cc +++ 
b/src/model_loader/detail/xgboost_json/delegated_handler.cc @@ -8,6 +8,7 @@ #include "./delegated_handler.h" +#include #include #include @@ -499,11 +500,10 @@ RegTreeArrayHandler::RegTreeArrayHandler(std::weak_ptr parent_delegat bool RegTreeArrayHandler::StartObject() { if (this->should_ignore_upcoming_value()) { - return this->template push_handler(); + return push_handler(); } - this->output.emplace_back(); - return this->template push_handler( - this->output.back(), model_builder); + output.emplace_back(); + return push_handler(output.back(), model_builder); } /****************************************************************************** @@ -525,7 +525,9 @@ bool GBTreeModelHandler::StartObject() { if (this->should_ignore_upcoming_value()) { return push_handler(); } - return push_key_handler("gbtree_model_param"); + return push_key_handler("gbtree_model_param") + || push_key_handler( + "cats", output.category_container); } bool GBTreeModelHandler::EndObject() { @@ -544,8 +546,124 @@ bool GBTreeModelHandler::EndObject() { } bool GBTreeModelHandler::is_recognized_key(std::string const& key) { - return (key == "trees" || key == "tree_info" || key == "gbtree_model_param" - || key == "iteration_indptr"); + return key == "trees" || key == "tree_info" || key == "gbtree_model_param" + || key == "iteration_indptr" || key == "cats"; +} + +/****************************************************************************** + * CategoryContainerHandler + * ***************************************************************************/ + +bool CategoryContainerHandler::StartArray() { + if (this->should_ignore_upcoming_value()) { + return push_handler(); + } + return push_key_handler("enc", output.enc) + || push_key_handler, std::vector>( + "feature_segments", output.feature_segments) + || push_key_handler, std::vector>( + "sorted_idx", output.sorted_idx); +} + +bool CategoryContainerHandler::is_recognized_key(std::string const& key) { + return key == "enc" || key == 
"feature_segments" || key == "sorted_idx"; +} + +/****************************************************************************** + * CategoryInfoArrayHandler + * ***************************************************************************/ +bool CategoryInfoArrayHandler::StartObject() { + if (this->should_ignore_upcoming_value()) { + return push_handler(); + } + output.emplace_back(); + return push_handler(output.back()); +} + +/****************************************************************************** + * CategoryInfoHandler + * ***************************************************************************/ +bool CategoryInfoHandler::Int64(std::int64_t i) { + if (this->should_ignore_upcoming_value()) { + return push_handler(); + } + bool got_type = check_cur_key("type"); + if (got_type) { + output.type = i; + } + return got_type; +} + +bool CategoryInfoHandler::Uint64(std::uint64_t u) { + // The "type" field can be int64 or uint64 + // Just defer to the int64 handler + return Int64(static_cast(u)); +} + +bool CategoryInfoHandler::StartArray() { + if (this->should_ignore_upcoming_value()) { + return push_handler(); + } + + bool got_offsets = check_cur_key("offsets"); + if (got_offsets) { + output.offsets = std::vector{}; + push_handler, std::vector>(output.offsets.value()); + } + + // Assumption: Either "offsets" or "type" fields have been given before "values" field. + // Only with this assumption can we infer the type of the "values" field. 
+ bool got_values = check_cur_key("values"); + if (got_values) { + if (output.offsets.has_value()) { + // String categories + output.values = std::vector{}; + push_handler, std::vector>( + std::get>(output.values)); + } else if (output.type.has_value()) { + // Numerical categories + switch (static_cast(output.type.value())) { + case ValueKind::kU8Array: + case ValueKind::kU16Array: + case ValueKind::kU32Array: + case ValueKind::kU64Array: { + output.values = std::vector{}; + push_handler, std::vector>( + std::get>(output.values)); + break; + } + case ValueKind::kI8Array: + case ValueKind::kI16Array: + case ValueKind::kI32Array: + case ValueKind::kI64Array: { + output.values = std::vector{}; + push_handler, std::vector>( + std::get>(output.values)); + break; + } + case ValueKind::kF32Array: + case ValueKind::kF64Array: { + output.values = std::vector{}; + push_handler, std::vector>( + std::get>(output.values)); + break; + } + default: + TREELITE_LOG(ERROR) << "Got invalid type for `values` array"; + return false; + } + } else { + TREELITE_LOG(ERROR) << "Cannot determine the type of `values` array, since neither" + << "`type` or `offsets` fields are present"; + return false; + } + } + + return got_values || got_offsets; +} + +bool CategoryInfoHandler::is_recognized_key(std::string const& key) { + return key == "type" || key == "offsets" || key == "values"; } /****************************************************************************** @@ -647,14 +765,17 @@ bool LearnerParamHandler::String(std::string const& str) { if (this->should_ignore_upcoming_value()) { return true; } - // For now, XGBoost always outputs a scalar base_score - return ( - assign_value("base_score", static_cast(std::stof(str)), output.base_score) - || assign_value("num_class", std::max(std::stoi(str), 1), output.num_class) - || assign_value("num_target", static_cast(std::stoi(str)), output.num_target) - || assign_value("num_feature", std::stoi(str), output.num_feature) - || assign_value( - 
"boost_from_average", static_cast(std::stoi(str)), output.boost_from_average)); + + // Special handling logic for base_score + bool got_base_score = check_cur_key("base_score"); + if (got_base_score) { + output.base_score = ParseBaseScore(str); + } + return got_base_score || assign_value("num_class", std::max(std::stoi(str), 1), output.num_class) + || assign_value("num_target", static_cast(std::stoi(str)), output.num_target) + || assign_value("num_feature", std::stoi(str), output.num_feature) + || assign_value( + "boost_from_average", static_cast(std::stoi(str)), output.boost_from_average); } bool LearnerParamHandler::is_recognized_key(std::string const& key) { @@ -680,6 +801,13 @@ bool LearnerHandler::StartObject() { } bool LearnerHandler::EndObject() { + /* Throw an exception if category encoding is required. + * TODO(hcho3): Implement categorical encoding */ + TREELITE_CHECK(output.category_container.enc.empty() + && output.category_container.feature_segments.empty() + && output.category_container.sorted_idx.empty()) + << "Treelite does not yet support XGBoost models with categorical encoder"; + /* Set metadata */ auto const num_tree = output.num_tree; auto const num_feature = learner_params.num_feature; @@ -743,18 +871,30 @@ bool LearnerHandler::EndObject() { leaf_vector_shape[1] = 1; } } - // Set base scores. For now, XGBoost only supports a scalar base score for all targets / classes. - auto base_score = static_cast(learner_params.base_score); + // Set base scores + // Assume: Either num_target or num_class must be 1 + TREELITE_CHECK(learner_params.num_target == 1 || learner_params.num_class == 1); + std::vector base_scores(learner_params.num_target * learner_params.num_class); + if (learner_params.base_score.size() == 1) { + // Scalar base_score (XGBoost <3.1) + // Starting from 3.1, the base score is a vector. 
/*!
 * \brief Type tags for serialized values.
 *
 * CategoryInfoHandler::StartArray() casts the integer `type` field of a CategoryInfo
 * object to this enum in order to choose the element type of the `values` array.
 * NOTE(review): the numeric values presumably mirror the type codes written by XGBoost's
 * serializer -- confirm against the XGBoost schema before changing any of them.
 */
enum class ValueKind : std::int64_t {
  // Scalar and container kinds
  kString = 0,
  kNumber = 1,
  kInteger = 2,
  kObject = 3,
  kArray = 4,
  kBoolean = 5,
  kNull = 6,
  // Typed array kinds; these are the only kinds accepted for the `values` array
  kF32Array = 7,
  kF64Array = 8,
  kI8Array = 9,
  kU8Array = 10,
  kI16Array = 11,
  kU16Array = 12,
  kI32Array = 13,
  kU32Array = 14,
  kI64Array = 15,
  kU64Array = 16,
};
\brief Handler for CategoryInfo objects */ +class CategoryInfoHandler : public OutputHandler { + public: + using OutputHandler::OutputHandler; + bool Int64(std::int64_t i) override; + bool Uint64(std::uint64_t u) override; + bool StartArray() override; + + protected: + bool is_recognized_key(std::string const& key) override; +}; + /*! \brief Handler for GradientBoosterHandler objects from XGBoost schema */ class GradientBoosterHandler : public OutputHandler { public: diff --git a/tests/cpp/test_model_loader.cc b/tests/cpp/test_model_loader.cc index 41191d89..7f2ba877 100644 --- a/tests/cpp/test_model_loader.cc +++ b/tests/cpp/test_model_loader.cc @@ -5,11 +5,14 @@ * \brief C++ tests for model loader */ +#include #include +#include #include #include "model_loader/detail/string_utils.h" +#include "model_loader/detail/xgboost.h" TEST(ModelLoader, StringTrim) { std::string s{"foobar\r\n"}; @@ -21,3 +24,19 @@ TEST(ModelLoader, StringStartsWith) { std::string s{"foobar"}; EXPECT_TRUE(treelite::model_loader::detail::StringStartsWith(s, "foo")); } + +TEST(ModelLoader, XGBoostBaseScore) { + { + std::string s{"[5.2008224E-1,4.665861E-1]"}; + std::vector parsed = treelite::model_loader::detail::xgboost::ParseBaseScore(s); + std::vector expected{5.2008224E-1, 4.665861E-1}; + EXPECT_EQ(parsed, expected); + } + + { + std::string s{"4.9333417E-1"}; + std::vector parsed = treelite::model_loader::detail::xgboost::ParseBaseScore(s); + std::vector expected{4.9333417E-1}; + EXPECT_EQ(parsed, expected); + } +} diff --git a/tests/python/test_xgboost_integration.py b/tests/python/test_xgboost_integration.py index 76f69618..74272aa0 100644 --- a/tests/python/test_xgboost_integration.py +++ b/tests/python/test_xgboost_integration.py @@ -27,7 +27,7 @@ standard_regression_datasets, standard_settings, ) -from .util import TemporaryDirectory, to_categorical +from .util import TemporaryDirectory, has_pandas, to_categorical def generate_data_for_squared_log_error(n_targets: int = 1): @@ -84,6 
+84,8 @@ def test_xgb_regressor( else: X, y = callback.draw(standard_regression_datasets()) use_categorical = callback.draw(sampled_from([True, False])) + # TODO(hcho3): Remove this once Treelite supports categorical encoding + use_categorical = False if use_categorical: n_categorical = callback.draw(integers(min_value=1, max_value=X.shape[1])) df, X_pred = to_categorical(X, n_categorical=n_categorical, invalid_frac=0.1) @@ -92,7 +94,7 @@ def test_xgb_regressor( else: dtrain = xgb.DMatrix(X, label=y) X_pred = X.copy() - model_format = callback.draw(sampled_from(["json", "ubjson", "legacy_binary"])) + model_format = callback.draw(sampled_from(["json", "ubjson"])) param = { "max_depth": 8, "eta": 0.1, @@ -106,16 +108,11 @@ def test_xgb_regressor( num_boost_round=num_boost_round, ) with TemporaryDirectory() as tmpdir: - if model_format in ["json", "ubjson"]: - model_path = pathlib.Path(tmpdir) / f"model.{model_format}" - xgb_model.save_model(model_path) - tl_model = treelite.frontend.load_xgboost_model( - model_path, format_choice=model_format - ) - else: - model_path = pathlib.Path(tmpdir) / "model.deprecated" - xgb_model.save_model(model_path) - tl_model = treelite.frontend.load_xgboost_model_legacy_binary(model_path) + model_path = pathlib.Path(tmpdir) / f"model.{model_format}" + xgb_model.save_model(model_path) + tl_model = treelite.frontend.load_xgboost_model( + model_path, format_choice=model_format + ) assert ( len(json.loads(tl_model.dump_as_json())["trees"]) == num_boost_round * num_parallel_tree @@ -153,6 +150,8 @@ def test_xgb_multiclass_classifier( # pylint: disable=too-many-locals """Test XGBoost with multi-class classification problem""" X, y = dataset + # TODO(hcho3): Remove this once Treelite supports categorical encoding + use_categorical = False if use_categorical: n_categorical = callback.draw(integers(min_value=1, max_value=X.shape[1])) df, X_pred = to_categorical(X, n_categorical=n_categorical, invalid_frac=0.1) @@ -161,7 +160,7 @@ def 
test_xgb_multiclass_classifier( else: dtrain = xgb.DMatrix(X, label=y) X_pred = X.copy() - model_format = callback.draw(sampled_from(["json", "ubjson", "legacy_binary"])) + model_format = callback.draw(sampled_from(["json", "ubjson"])) num_class = np.max(y) + 1 param = { @@ -180,16 +179,11 @@ def test_xgb_multiclass_classifier( ) with TemporaryDirectory() as tmpdir: - if model_format in ["json", "ubjson"]: - model_path = pathlib.Path(tmpdir) / f"iris.{model_format}" - xgb_model.save_model(model_path) - tl_model = treelite.frontend.load_xgboost_model( - model_path, format_choice=model_format - ) - else: - model_path = pathlib.Path(tmpdir) / "iris.deprecated" - xgb_model.save_model(model_path) - tl_model = treelite.frontend.load_xgboost_model_legacy_binary(model_path) + model_path = pathlib.Path(tmpdir) / f"iris.{model_format}" + xgb_model.save_model(model_path) + tl_model = treelite.frontend.load_xgboost_model( + model_path, format_choice=model_format + ) expected_num_tree = num_class * num_boost_round * num_parallel_tree assert len(json.loads(tl_model.dump_as_json())["trees"]) == expected_num_tree @@ -239,6 +233,8 @@ def test_xgb_nonlinear_objective( n_classes=just(num_class), n_informative=just(5) ) ) + # TODO(hcho3): Remove this once Treelite supports categorical encoding + use_categorical = False if use_categorical: n_categorical = callback.draw(integers(min_value=1, max_value=X.shape[1])) df, X_pred = to_categorical(X, n_categorical=n_categorical, invalid_frac=0.1) @@ -247,7 +243,7 @@ def test_xgb_nonlinear_objective( else: dtrain = xgb.DMatrix(X, label=y) X_pred = X.copy() - model_format = callback.draw(sampled_from(["json", "ubjson", "legacy_binary"])) + model_format = callback.draw(sampled_from(["json", "ubjson"])) assert np.min(y) == 0 assert np.max(y) == num_class - 1 @@ -274,12 +270,9 @@ def test_xgb_nonlinear_objective( model_path = pathlib.Path(tmpdir) / model_name xgb_model.save_model(model_path) - if model_format in ["json", "ubjson"]: - tl_model = 
treelite.frontend.load_xgboost_model( - model_path, format_choice=model_format - ) - else: - tl_model = treelite.frontend.load_xgboost_model_legacy_binary(model_path) + tl_model = treelite.frontend.load_xgboost_model( + model_path, format_choice=model_format + ) out_pred = treelite.gtil.predict(tl_model, X_pred, pred_margin=True) expected_pred = xgb_model.predict( @@ -439,6 +432,8 @@ def test_xgb_multi_target_binary_classifier( ): """Test XGBoost with multi-target binary classification problem""" X, y = dataset + # TODO(hcho3): Remove this once Treelite supports categorical encoding + use_categorical = False if use_categorical: n_categorical = callback.draw(integers(min_value=1, max_value=X.shape[1])) df, X_pred = to_categorical(X, n_categorical=n_categorical, invalid_frac=0.1) @@ -447,10 +442,7 @@ def test_xgb_multi_target_binary_classifier( dtrain = xgb.DMatrix(X, label=y) X_pred = X.copy() - if use_categorical or multi_strategy == "multi_output_tree" or in_memory: - model_format = callback.draw(sampled_from(["ubjson", "json"])) - else: - model_format = callback.draw(sampled_from(["legacy_binary", "ubjson", "json"])) + model_format = callback.draw(sampled_from(["ubjson", "json"])) params = { "tree_method": "hist", @@ -466,18 +458,11 @@ def test_xgb_multi_target_binary_classifier( tl_model = treelite.frontend.from_xgboost(bst) else: with TemporaryDirectory() as tmpdir: - if model_format in ["json", "ubjson"]: - model_path = pathlib.Path(tmpdir) / f"multi_target.{model_format}" - bst.save_model(model_path) - tl_model = treelite.frontend.load_xgboost_model( - model_path, format_choice=model_format - ) - else: - model_path = pathlib.Path(tmpdir) / "multi_target.deprecated" - bst.save_model(model_path) - tl_model = treelite.frontend.load_xgboost_model_legacy_binary( - model_path - ) + model_path = pathlib.Path(tmpdir) / f"multi_target.{model_format}" + bst.save_model(model_path) + tl_model = treelite.frontend.load_xgboost_model( + model_path, 
# TODO(hcho3): Remove this unit test once categorical encoding is added
@pytest.mark.skipif(not has_pandas(), reason="Pandas is not installed")
def test_categorical_encoding():
    """Loading an XGBoost model that carries a categorical encoder must raise TreeliteError"""
    import pandas as pd

    # Two categorical columns: one with string categories, one with integer categories
    columns = {"c": ["a", "b", "c"], "a": [-1, 1, 2]}
    features = pd.DataFrame(columns, dtype="category")
    labels = np.array([0, 0, 1])

    dtrain = xgb.DMatrix(features, labels, enable_categorical=True)
    train_params = {"max_depth": 1, "base_score": 0}
    booster = xgb.train(train_params, dtrain, num_boost_round=1)

    expected_error = (
        r".*Treelite does not yet support XGBoost models with categorical encoder.*"
    )
    with pytest.raises(treelite.TreeliteError, match=expected_error):
        treelite.frontend.from_xgboost(booster)