AutoIntent/autointent/modules/scoring/_linear.py at d1462f18af49d1155db5b3a3e6962910ec6f2831 · deeppavlov/AutoIntent · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
"""LinearScorer class for linear classification."""

from typing import Any

import numpy as np
import numpy.typing as npt
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.multioutput import MultiOutputClassifier

from autointent import Context, Embedder
from autointent.configs import EmbedderConfig, TaskTypeEnum
from autointent.custom_types import ListOfLabels
from autointent.modules.abc import BaseScorer


class LinearScorer(BaseScorer):
    """
    Scoring module for linear classification using logistic regression.

    Utterances are turned into embeddings by the configured embedder and a
    logistic regression classifier is trained on top of them. For multiclass
    data a cross-validated ``LogisticRegressionCV`` is used; for multilabel
    data one plain ``LogisticRegression`` per label is trained through
    ``MultiOutputClassifier``.

    :ivar name: Name of the scorer, defaults to "linear".

    Example
    --------
    .. testcode::

        from autointent.modules import LinearScorer
        scorer = LinearScorer(
            embedder_config="sergeyzh/rubert-tiny-turbo", cv=2
        )
        utterances = ["hello", "goodbye", "allo", "sayonara"]
        labels = [0, 1, 0, 1]
        scorer.fit(utterances, labels)
        test_utterances = ["hi", "bye"]
        probabilities = scorer.predict(test_utterances)
        print(probabilities)

    .. testoutput::

        [[0.50000032 0.49999968]
         [0.50000032 0.49999968]]

    """

    name = "linear"
    _multilabel: bool
    _clf: LogisticRegressionCV | MultiOutputClassifier
    _embedder: Embedder
    supports_multiclass = True
    supports_multilabel = True

    def __init__(
        self,
        embedder_config: EmbedderConfig | str | dict[str, Any] | None = None,
        cv: int = 3,
        seed: int = 0,
    ) -> None:
        """
        Initialize the LinearScorer.

        :param embedder_config: Config of the embedder model.
        :param cv: Number of cross-validation folds, defaults to 3.
            Only used by the multiclass classifier.
        :param seed: Random seed for reproducibility, defaults to 0.
        :raises ValueError: If ``cv`` is not a positive integer.
        """
        self.cv = cv
        self.seed = seed
        self.embedder_config = EmbedderConfig.from_search_config(embedder_config)

        # The type is checked first so that a non-numeric ``cv`` produces the
        # ValueError below instead of a TypeError from the comparison. Zero is
        # rejected too, since it is not a positive number of folds.
        if not isinstance(self.cv, int) or self.cv <= 0:
            msg = "`cv` argument of `LinearScorer` must be a positive int"
            raise ValueError(msg)

    @classmethod
    def from_context(
        cls,
        context: Context,
        embedder_config: EmbedderConfig | str | None = None,
    ) -> "LinearScorer":
        """
        Create a LinearScorer instance using a Context object.

        Only the embedder config is forwarded; ``cv`` and ``seed`` keep their
        constructor defaults.

        :param context: Context containing configurations and utilities.
        :param embedder_config: Config of the embedder, or None to use the
            embedder resolved by ``context.resolve_embedder()``.
        :return: Initialized LinearScorer instance.
        """
        if embedder_config is None:
            embedder_config = context.resolve_embedder()

        return cls(
            embedder_config=embedder_config,
        )

    def get_embedder_config(self) -> dict[str, Any]:
        """
        Get the embedder configuration.

        :return: The embedder config serialized to a dictionary via ``model_dump()``.
        """
        return self.embedder_config.model_dump()

    def fit(
        self,
        utterances: list[str],
        labels: ListOfLabels,
    ) -> None:
        """
        Train the logistic regression classifier.

        Re-fitting an already fitted scorer first clears the cache of the
        previous embedder.

        :param utterances: List of training utterances.
        :param labels: List of labels corresponding to the utterances.
        """
        if hasattr(self, "_clf"):
            self.clear_cache()

        # NOTE(review): ``_validate_task`` is inherited and not visible here; it
        # is presumably what sets ``self._multilabel`` read below -- confirm.
        self._validate_task(labels)

        embedder = Embedder(
            self.embedder_config,
        )
        features = embedder.embed(utterances, TaskTypeEnum.classification)

        if self._multilabel:
            # One independent binary classifier per label; no cross-validation
            # is performed in this branch, so ``cv`` and ``seed`` are unused.
            base_clf = LogisticRegression()
            clf = MultiOutputClassifier(base_clf)
        else:
            clf = LogisticRegressionCV(cv=self.cv, random_state=self.seed)

        clf.fit(features, labels)

        # Publish the fitted state only after training succeeded, so a failed
        # fit does not leave a half-initialized scorer behind.
        self._clf = clf
        self._embedder = embedder

    def predict(self, utterances: list[str]) -> npt.NDArray[Any]:
        """
        Predict probabilities for the given utterances.

        Must be called after :meth:`fit`.

        :param utterances: List of query utterances.
        :return: Array of predicted probabilities for each class.
        """
        features = self._embedder.embed(utterances, TaskTypeEnum.classification)
        probas = self._clf.predict_proba(features)
        if self._multilabel:
            # ``MultiOutputClassifier.predict_proba`` yields one array per label;
            # stack them along axis 1 and keep index 1 of the last axis (the
            # positive-class column) to get one probability per label.
            probas = np.stack(probas, axis=1)[..., 1]
        return probas  # type: ignore[no-any-return]

    def clear_cache(self) -> None:
        """
        Clear cached data in memory used by the embedder.

        Does nothing when the scorer has not been fitted yet, because no
        embedder exists at that point.
        """
        embedder = getattr(self, "_embedder", None)
        if embedder is not None:
            embedder.clear_ram()