Added CatBoostScorer (#209)

nikiduki · Samoed · voorhs · web-flow · commit ac1b732c86d4 · 2025-06-18T19:56:19.000+03:00
* Added catboost in dependencies * Raw implementation of CatboostScorer * Update init.py files * Minor fix * Added CatBoostScorer * Added tests for CatBoostScorer * Fix init * Fix mypy * Minor fix * Fix loss function * fix multilabel prediction * refactor catboost scorer * minor fix * fix test to match * fix/wandb-final-metrics-skipped (#212) * fix * sklearn scorer proper name * fix typing errors * try to fix pydantic errors * Remove artifacts from final metrics (#216) * Update wandb.py * Update wandb.py * Update wandb.py * Update _optimization_info.py * remove print * fix few shot split (#219) * fix few shot split * lint * remove egor (#221) * Feat/bert early stopping (#223) * change how `clear_cache` is called * first version of early stopping * change mypy version * train_test_split bug fix * add `compute_metrics` and `EarlyStoppingCallback` * bug fix * fix mypy * try to fix `"eval_f1" not found` error * forgot to upd `from_context` * try to fix mypy * ty to fix "not found f1" error * refactor a little bit * disable early stopping for lora * fix typing errors * update contributing and makefile * minor change * use our metrics * add docstrings * set 3.10 for mypy * upd contributing.md * try to fix bug * try to fix typing issue * try to fix * add early stopping to ptuning * Check if metric can handle dataset type (#224) * add test for configuration * lint * satisfy mypy * add prompt logging (#220) * add prompt logging * Update optimizer_config.schema.json * fix --------- Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> * fix caching (#225) * Fix default prompt (#226) * fix default prompt * allow to use default prompt with override * some refactors * add `use_embedding_features` * fix tests * fix lint * fix strenum * Added catboost in dependencies * Raw implementation of CatboostScorer * Update init.py files * Minor fix * Added CatBoostScorer * Added tests for CatBoostScorer * Fix init * Fix mypy * Minor fix * Fix loss function * 
fix multilabel prediction * refactor catboost scorer * minor fix * fix test to match * fix loading * fix lint * fix typing * fix dumper * add early stopping * fix errors * codestyle * patch catboost with early stopping and catboost * try to fix * fix embed type --------- Co-authored-by: Roman Solomatin <36135455+Samoed@users.noreply.github.com> Co-authored-by: Алексеев Илья <44509110+voorhs@users.noreply.github.com> Co-authored-by: Roman Solomatin <samoed.roman@gmail.com> Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com> Co-authored-by: voorhs <ilya_alekseev_2016@list.ru>
diff --git a/autointent/_dump_tools.py b/autointent/_dump_tools.py
@@ -7,6 +7,7 @@
 import joblib
 import numpy as np
 import numpy.typing as npt
+from catboost import CatBoostClassifier
 from peft import PeftModel
 from pydantic import BaseModel
 from sklearn.base import BaseEstimator
@@ -47,6 +48,7 @@ class Dumper:
     hf_tokenizers = "hf_tokenizers"
     torch_models = "torch_models"
     ptuning_models = "ptuning_models"
+    catboost_models = "catboost_models"
 
     @staticmethod
     def make_subdirectories(path: Path, exists_ok: bool = False) -> None:
@@ -67,6 +69,7 @@ def make_subdirectories(path: Path, exists_ok: bool = False) -> None:
             path / Dumper.hf_tokenizers,
             path / Dumper.torch_models,
             path / Dumper.ptuning_models,
+            path / Dumper.catboost_models,
         ]
         for subdir in subdirectories:
             subdir.mkdir(parents=True, exist_ok=exists_ok)
@@ -165,6 +168,8 @@ def dump(obj: Any, path: Path, exists_ok: bool = False, exclude: list[type[Any]]
                 except Exception as e:
                     msg = f"Error dumping HF tokenizer {key}: {e}"
                     logger.exception(msg)
+            elif isinstance(val, CatBoostClassifier):
+                val.save_model(str(path / Dumper.catboost_models / key), format="cbm")
             else:
                 msg = f"Attribute {key} of type {type(val)} cannot be dumped to file system."
                 logger.error(msg)
@@ -192,6 +197,7 @@ def load(  # noqa: C901, PLR0912, PLR0915
         pydantic_models: dict[str, Any] = {}
         hf_models: dict[str, Any] = {}
         hf_tokenizers: dict[str, Any] = {}
+        catboost_models: dict[str, Any] = {}
         torch_models: dict[str, Any] = {}
 
         for child in path.iterdir():
@@ -267,6 +273,15 @@ def load(  # noqa: C901, PLR0912, PLR0915
                     except Exception as e:  # noqa: PERF203
                         msg = f"Error loading HF tokenizer {tokenizer_dir.name}: {e}"
                         logger.exception(msg)
+            elif child.name == Dumper.catboost_models:
+                for model_file in child.iterdir():
+                    try:
+                        model = CatBoostClassifier()
+                        model.load_model(str(model_file))  # iterdir() already yields the full path; don't re-join
+                        catboost_models[model_file.name] = model
+                    except Exception as e:  # noqa: PERF203
+                        msg = f"Error loading CatBoost model: {e}"
+                        logger.exception(msg)
             elif child.name == Dumper.torch_models:
                 try:
                     for model_dir in child.iterdir():
@@ -294,5 +309,6 @@ def load(  # noqa: C901, PLR0912, PLR0915
             | pydantic_models
             | hf_models
             | hf_tokenizers
+            | catboost_models
             | torch_models
         )
diff --git a/autointent/modules/__init__.py b/autointent/modules/__init__.py
@@ -15,6 +15,7 @@
 from .scoring import (
     BERTLoRAScorer,
     BertScorer,
+    CatBoostScorer,
     CNNScorer,
     DescriptionScorer,
     DNNCScorer,
@@ -41,6 +42,7 @@ def _create_modules_dict(modules: list[type[T]]) -> dict[str, type[T]]:
 
 SCORING_MODULES: dict[str, type[BaseScorer]] = _create_modules_dict(
     [
+        CatBoostScorer,
         DNNCScorer,
         KNNScorer,
         LinearScorer,
@@ -68,6 +70,7 @@ def _create_modules_dict(modules: list[type[T]]) -> dict[str, type[T]]:
     "BaseModule",
     "BaseRegex",
     "BaseScorer",
+    "CatBoostScorer",
     "DNNCScorer",
     "DescriptionScorer",
     "JinoosDecision",
diff --git a/autointent/modules/scoring/__init__.py b/autointent/modules/scoring/__init__.py
@@ -1,4 +1,5 @@
 from ._bert import BertScorer
+from ._catboost import CatBoostScorer
 from ._cnn import CNNScorer
 from ._description import DescriptionScorer
 from ._dnnc import DNNCScorer
@@ -13,6 +14,7 @@
     "BERTLoRAScorer",
     "BertScorer",
     "CNNScorer",
+    "CatBoostScorer",
     "DNNCScorer",
     "DescriptionScorer",
     "KNNScorer",
diff --git a/autointent/modules/scoring/_catboost/__init__.py b/autointent/modules/scoring/_catboost/__init__.py
@@ -0,0 +1,3 @@
+from .catboost_scorer import CatBoostScorer
+
+__all__ = ["CatBoostScorer"]
diff --git a/autointent/modules/scoring/_catboost/catboost_scorer.py b/autointent/modules/scoring/_catboost/catboost_scorer.py
@@ -0,0 +1,230 @@
+"""CatBoostScorer class for CatBoost-based classification with switchable encoding."""
+
+import logging
+from enum import Enum
+from typing import Any, cast
+
+import numpy as np
+import numpy.typing as npt
+import pandas as pd
+from catboost import CatBoostClassifier
+
+from autointent import Context, Embedder
+from autointent.configs import EmbedderConfig, TaskTypeEnum
+from autointent.custom_types import FloatFromZeroToOne, ListOfLabels
+from autointent.modules.base import BaseScorer
+
+logger = logging.getLogger(__name__)
+
+
+class FeaturesType(str, Enum):
+    """Type of features used in CatBoostScorer."""
+
+    TEXT = "text"  # raw utterances only, passed to CatBoost as ``text_features``
+    EMBEDDING = "embedding"  # embedder output only
+    BOTH = "both"  # raw utterances together with embedder output
+
+
+class CatBoostScorer(BaseScorer):
+    """CatBoost scorer using either external embeddings or CatBoost's own BoW encoding.
+
+    Args:
+        embedder_config: Config of the base transformer model (EmbedderConfig, str, or dict).
+                The embedder is only instantiated when ``features_type`` is "embedding" or "both";
+                with ``features_type="text"`` the scorer relies on CatBoost's own Bag-of-Words encoding.
+
+        features_type: Type of features used in CatBoost. Can be one of:
+                - "text": Use only text features (CatBoost's BoW encoding).
+                - "embedding": Use only embedding features.
+                - "both": Use both text and embedding features.
+
+        use_embedding_features: If True, the model uses CatBoost `embedding_features` otherwise
+                each number will be in separate column.
+
+        loss_function: CatBoost loss function.  If None, an appropriate loss is
+                chosen automatically from the task type.
+
+        verbose: If True, CatBoost prints training progress.
+
+        val_fraction: fraction of training data used for early stopping. Set to None to disable early stopping.
+                Note: early stopping is not supported with multilabel classification; it is switched off
+                for such tasks at fit time without modifying this attribute.
+
+        early_stopping_rounds: number of iterations without metric improvement to wait before stopping early.
+                Ignored when ``val_fraction`` is ``None``.
+
+        **catboost_kwargs: Any additional keyword arguments forwarded to
+                :class:`catboost.CatBoostClassifier`. Please refer to
+                `catboost's documentation <https://catboost.ai/docs/en/concepts/python-reference_catboostclassifier>`_
+
+    Example:
+    -------
+
+    .. testcode::
+
+        from autointent.modules import CatBoostScorer
+
+        scorer = CatBoostScorer(
+            iterations=50,
+            learning_rate=0.05,
+            depth=6,
+            l2_leaf_reg=3,
+            eval_metric="Accuracy",
+            random_seed=42,
+            verbose=False,
+            features_type="embedding",  # or "text" or "both"
+        )
+        utterances = ["hello", "goodbye", "allo", "sayonara"]
+        labels = [0, 1, 0, 1]
+        scorer.fit(utterances, labels)
+        test_utterances = ["hi", "bye"]
+        probabilities = scorer.predict(test_utterances)
+        print(probabilities)
+
+    .. testoutput::
+
+        [[0.41493207 0.58506793]
+         [0.55036046 0.44963954]]
+
+    """
+
+    name = "catboost"
+    supports_multiclass = True
+    supports_multilabel = True
+
+    _model: CatBoostClassifier
+
+    # Feature types for which an embedder has to be created and applied.
+    encoder_features_types = (FeaturesType.EMBEDDING, FeaturesType.BOTH)
+
+    def __init__(
+        self,
+        embedder_config: EmbedderConfig | str | dict[str, Any] | None = None,
+        features_type: FeaturesType = FeaturesType.BOTH,
+        use_embedding_features: bool = True,
+        loss_function: str | None = None,
+        verbose: bool = False,
+        val_fraction: float | None = 0.2,
+        early_stopping_rounds: int = 100,
+        **catboost_kwargs: dict[str, Any],
+    ) -> None:
+        """Store the hyperparameters; no model or embedder is created until :meth:`fit`."""
+        self.val_fraction = val_fraction
+        self.early_stopping_rounds = early_stopping_rounds
+        self.features_type = features_type
+        self.use_embedding_features = use_embedding_features
+        if features_type == FeaturesType.TEXT and use_embedding_features:
+            msg = "Only catbooost text features will be used, `use_embedding_features` is ignored."
+            logger.warning(msg)
+
+        self.embedder_config = EmbedderConfig.from_search_config(embedder_config)
+        self.loss_function = loss_function
+        self.verbose = verbose
+        self.catboost_kwargs = catboost_kwargs or {}
+
+    @classmethod
+    def from_context(
+        cls,
+        context: Context,
+        embedder_config: EmbedderConfig | str | dict[str, Any] | None = None,
+        features_type: FeaturesType = FeaturesType.BOTH,
+        use_embedding_features: bool = True,
+        loss_function: str | None = None,
+        verbose: bool = False,
+        val_fraction: FloatFromZeroToOne | None = 0.2,
+        early_stopping_rounds: int = 100,
+        **catboost_kwargs: dict[str, Any],
+    ) -> "CatBoostScorer":
+        """Build a scorer, taking the embedder from ``context`` when none is given explicitly.
+
+        All other arguments are forwarded unchanged to the constructor.
+        """
+        if embedder_config is None:
+            embedder_config = context.resolve_embedder()
+        return cls(
+            embedder_config=embedder_config,
+            loss_function=loss_function,
+            verbose=verbose,
+            features_type=features_type,
+            use_embedding_features=use_embedding_features,
+            val_fraction=val_fraction,
+            early_stopping_rounds=early_stopping_rounds,
+            **catboost_kwargs,
+        )
+
+    def get_implicit_initialization_params(self) -> dict[str, Any]:
+        """Return the dumped embedder config, or ``None`` for it when no embedder is used."""
+        return {
+            "embedder_config": self.embedder_config.model_dump()
+            if self.features_type in self.encoder_features_types
+            else None,
+        }
+
+    def _prepare_data_for_fit(
+        self,
+        utterances: list[str],
+    ) -> pd.DataFrame:
+        """Turn utterances into the feature frame consumed by CatBoost.
+
+        Despite its name this is used by both :meth:`fit` and :meth:`predict`, so that
+        training and inference see identically shaped features.
+
+        Args:
+            utterances: Raw input texts.
+
+        Returns:
+            A frame with a ``"text"`` column and/or embedding data. Embeddings are stored in a
+            single ``"embedding"`` column when ``use_embedding_features`` is set, otherwise they
+            are spread over one numeric column per embedding dimension.
+        """
+        if self.features_type in self.encoder_features_types:
+            encoded_utterances = self._embedder.embed(utterances, TaskTypeEnum.classification).tolist()
+            if self.use_embedding_features:
+                data = pd.DataFrame({"embedding": encoded_utterances})
+            else:
+                data = pd.DataFrame(np.array(encoded_utterances))
+            if self.features_type == FeaturesType.BOTH:
+                data["text"] = utterances
+        else:
+            data = pd.DataFrame({"text": utterances})
+
+        return data
+
+    def get_extra_params(self) -> dict[str, Any]:
+        """Return the ``text_features`` / ``embedding_features`` arguments for CatBoost.
+
+        Raises:
+            ValueError: If ``features_type`` is not one of the supported values.
+        """
+        extra_params: dict[str, Any] = {}
+        if self.features_type == FeaturesType.EMBEDDING:
+            if self.use_embedding_features:  # to not raise error if embedding without embedding_features
+                extra_params["embedding_features"] = ["embedding"]
+        elif self.features_type in {FeaturesType.TEXT, FeaturesType.BOTH}:
+            extra_params["text_features"] = ["text"]
+            if self.features_type == FeaturesType.BOTH and self.use_embedding_features:
+                extra_params["embedding_features"] = ["embedding"]
+        else:
+            msg = f"Unsupported features type: {self.features_type}"
+            raise ValueError(msg)
+        return extra_params
+
+    def fit(
+        self,
+        utterances: list[str],
+        labels: ListOfLabels,
+    ) -> None:
+        """Train the CatBoost classifier.
+
+        Args:
+            utterances: Training texts.
+            labels: Targets aligned with ``utterances``.
+        """
+        # NOTE(review): ``_validate_task`` is defined on the base class; it is presumably what
+        # populates ``self._multilabel`` and ``self._n_classes`` used below - confirm.
+        self._validate_task(labels)
+
+        if self.features_type in self.encoder_features_types:
+            self._embedder = Embedder(self.embedder_config)
+
+        dataset = self._prepare_data_for_fit(utterances)
+
+        if self._multilabel:
+            default_loss = "MultiLogloss"
+        elif self._n_classes > 2:  # noqa: PLR2004
+            default_loss = "MultiClass"
+        else:
+            default_loss = "Logloss"
+
+        # Work on a local copy: ``fit`` must not overwrite the user-supplied hyperparameter,
+        # and the warning is only relevant when early stopping was actually requested.
+        val_fraction = self.val_fraction
+        if self._multilabel and val_fraction is not None:
+            val_fraction = None
+            msg = "Disabling early stopping in CatBoostClassifier as it is not supported with multi-label task."
+            logger.warning(msg)
+
+        self._model = CatBoostClassifier(
+            loss_function=self.loss_function or default_loss,
+            verbose=self.verbose,
+            allow_writing_files=False,
+            eval_fraction=val_fraction,
+            **self.catboost_kwargs,
+            **self.get_extra_params(),
+        )
+        self._model.fit(
+            dataset,
+            labels,
+            early_stopping_rounds=self.early_stopping_rounds if val_fraction is not None else None,
+        )
+
+    def predict(self, utterances: list[str]) -> npt.NDArray[np.float64]:
+        """Return the output of CatBoost's ``predict_proba`` for the given utterances.
+
+        Raises:
+            RuntimeError: If the scorer has not been fitted yet.
+        """
+        if getattr(self, "_model", None) is None:
+            msg = "Model is not trained. Call fit() first."
+            raise RuntimeError(msg)
+        data = self._prepare_data_for_fit(utterances)
+        return cast("npt.NDArray[np.float64]", self._model.predict_proba(data))
+
+    def clear_cache(self) -> None:
+        """Drop the trained model and the embedder, if present."""
+        if hasattr(self, "_model"):
+            del self._model
+        if hasattr(self, "_embedder"):
+            del self._embedder
diff --git a/pyproject.toml b/pyproject.toml
@@ -46,6 +46,7 @@ dependencies = [
     "transformers[torch] (>=4.49.0,<5.0.0)",
     "peft (>= 0.10.0, !=0.15.0, !=0.15.1, <1.0.0)",
     "codecarbon (==2.6)",
+    "catboost (>=1.2.8,<2.0.0)",
 ]
 
 [project.optional-dependencies]
@@ -69,6 +70,7 @@ typing = [
     "types-pygments (>=2.18.0.20240506,<3.0.0)",
     "types-setuptools (>=75.2.0.20241019,<76.0.0)",
     "joblib-stubs (>=1.4.2.5.20240918,<2.0.0)",
+    "pandas-stubs (>= 2.2.3.250527, <3.0.0)",
 ]
 docs = [
     "sphinx (>=8.1.3,<9.0.0)",
@@ -219,6 +221,7 @@ module = [
     "wandb",
     "dspy",
     "dspy.evaluate.auto_evaluation",
+    "catboost",
 ]
 ignore_missing_imports = true
 
diff --git a/tests/assets/configs/multiclass.yaml b/tests/assets/configs/multiclass.yaml
@@ -29,6 +29,16 @@
     - module_name: sklearn
       clf_name: [RandomForestClassifier]
       n_estimators: [5, 10]
+    - module_name: catboost
+      iterations: [50, 100]
+      learning_rate: [0.05, 0.1]
+      depth: [1, 10]
+      l2_leaf_reg: [1, 5]
+      eval_metric: ["Accuracy"]
+      random_seed: [42]
+      features_type: ["embedding"]
+      embedder_config:
+        - model_name: prajjwal1/bert-tiny
     - module_name: bert
       classification_model_config:
         - model_name: avsolatorio/GIST-small-Embedding-v0
diff --git a/tests/assets/configs/multilabel.yaml b/tests/assets/configs/multilabel.yaml
@@ -25,6 +25,16 @@
     - module_name: sklearn
       clf_name: [RandomForestClassifier]
       n_estimators: [5, 10]
+    - module_name: catboost
+      iterations: [50, 100]
+      learning_rate: [0.05, 0.1]
+      depth: [1, 10]
+      l2_leaf_reg: [1, 5]
+      loss_function: ["MultiLogloss"]
+      random_seed: [42]
+      embedder_config:
+        - null
+        - model_name: prajjwal1/bert-tiny
     - module_name: bert
       classification_model_config:
         - model_name: avsolatorio/GIST-small-Embedding-v0
diff --git a/tests/modules/scoring/test_catboost.py b/tests/modules/scoring/test_catboost.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from .catboost_scorer import CatBoostScorer`
	`2`	`+`
	`3`	`+__all__ = ["CatBoostScorer"]`