
Commit 3d18626

JakeRaskind and Samoed authored
Feat/sklearn scorer (#59)
* add_logit_adaptivness
* Update __init__.py
* Update logit_adaptivness.py
* Update __init__.py
* Update __init__.py
* Update __init__.py
* Update logit_adaptivness.py
* Update logit_adaptivness.py
* Update logit_adaptivness.py
* Update logit_adaptivness.py
* Update logit_adaptivness.py
* Update logit_adaptivness.py
* Update logit_adaptivness.py
* Update regexp.py
* Update logit_adaptivness.py
* Update __init__.py
* update to multilabel
* Update adaptive.py
* update
* update
* .
* .
* .
* .
* Update argmax.py
* Update adaptive.py
* Update adaptive.py
* Update argmax.py
* Update threshold.py
* Update threshold.py
* Update argmax.py
* Update argmax.py
* Update threshold.py
* Update base.py
* Update custom_types.py
* Update threshold.py
* Update argmax.py
* Update threshold.py
* Update argmax.py
* Update __init__.py
* up_to_date
* .
* .
* .
* add sklearn scorer
* Delete autointent/modules/prediction/adaptive.py
* Rename scorer.py to scorer.py
* Rename __init__.py to __init__.py
* add docstrings
* Update __init__.py
* Update scorer.py
* Update scorer.py
* Update __init__.py
* Update scorer.py
* Update scorer.py
* Update __init__.py
* Update scorer.py
* Update scorer.py
* Update scorer.py
* Update __init__.py
* Update scorer.py
* update after merge
* add test
* update
* upd tests
* fix sklearn test
* update args
* update args

---------

Co-authored-by: Roman Solomatin <[email protected]>
1 parent e859a31 commit 3d18626

9 files changed: 262 additions, 7 deletions

autointent/_embedder.py

Lines changed: 6 additions & 2 deletions
@@ -12,6 +12,7 @@
 
 import numpy as np
 import numpy.typing as npt
+import torch
 from appdirs import user_cache_dir
 from sentence_transformers import SentenceTransformer
 
@@ -58,6 +59,7 @@ class Embedder:
     """
 
     metadata_dict_name: str = "metadata.json"
+    dump_dir: Path | None = None
 
     def __init__(
         self,
@@ -70,7 +72,7 @@ def __init__(
         """
         Initialize the Embedder.
 
-        :param model_name: Path to a local model directory or a Hugging Face model name.
+        :param model_name_or_path: Path to a local model directory or a Hugging Face model name.
         :param device: Device to run the model on (e.g., "cpu", "cuda").
         :param batch_size: Batch size for embedding calculations.
         :param max_length: Maximum sequence length for the embedding model.
@@ -103,11 +105,13 @@ def clear_ram(self) -> None:
         self.logger.debug("Clearing embedder %s from memory", self.model_name)
         self.embedding_model.cpu()
         del self.embedding_model
+        torch.cuda.empty_cache()
 
     def delete(self) -> None:
         """Delete the embedding model and its associated directory."""
         self.clear_ram()
-        shutil.rmtree(self.dump_dir)
+        if self.dump_dir is not None:
+            shutil.rmtree(self.dump_dir)
 
     def dump(self, path: Path) -> None:
         """

autointent/modules/__init__.py

Lines changed: 15 additions & 3 deletions
@@ -11,7 +11,7 @@
     TunableDecision,
 )
 from .embedding import RetrievalEmbedding
-from .scoring import DescriptionScorer, DNNCScorer, KNNScorer, LinearScorer, MLKnnScorer, RerankScorer
+from .scoring import DescriptionScorer, DNNCScorer, KNNScorer, LinearScorer, MLKnnScorer, RerankScorer, SklearnScorer
 
 T = TypeVar("T", bound=Module)
 
@@ -25,11 +25,23 @@ def _create_modules_dict(modules: list[type[T]]) -> dict[str, type[T]]:
 RETRIEVAL_MODULES_MULTILABEL = RETRIEVAL_MODULES_MULTICLASS
 
 SCORING_MODULES_MULTICLASS: dict[str, type[ScoringModule]] = _create_modules_dict(
-    [DNNCScorer, KNNScorer, LinearScorer, DescriptionScorer, RerankScorer]
+    [
+        DNNCScorer,
+        KNNScorer,
+        LinearScorer,
+        DescriptionScorer,
+        RerankScorer,
+        SklearnScorer,
+    ]
 )
 
 SCORING_MODULES_MULTILABEL: dict[str, type[ScoringModule]] = _create_modules_dict(
-    [MLKnnScorer, LinearScorer, DescriptionScorer],
+    [
+        MLKnnScorer,
+        LinearScorer,
+        DescriptionScorer,
+        SklearnScorer,
+    ],
 )
 
 PREDICTION_MODULES_MULTICLASS: dict[str, type[DecisionModule]] = _create_modules_dict(
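How these dicts are keyed is not shown in this diff, but since each module exposes a string `name` (the new scorer sets `name = "sklearn"`) and the configs below select modules via `module_name`, `_create_modules_dict` presumably builds a name-to-class mapping, roughly like this assumed sketch:

# Assumed shape of the registry helper (not part of this diff):
# it maps each module's `name` attribute to the class itself, so that
# a config entry `module_name: sklearn` resolves to SklearnScorer.
def _create_modules_dict(modules):
    return {module.name: module for module in modules}

# e.g. SCORING_MODULES_MULTICLASS["sklearn"] would then be SklearnScorer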
Lines changed: 3 additions & 2 deletions
@@ -1,10 +1,9 @@
-"""These modules take utterance as input and output predicted probabilities for each intent."""
-
 from ._description import DescriptionScorer
 from ._dnnc import DNNCScorer
 from ._knn import KNNScorer, RerankScorer
 from ._linear import LinearScorer
 from ._mlknn import MLKnnScorer
+from ._sklearn import SklearnScorer
 
 __all__ = [
     "DNNCScorer",
@@ -13,4 +12,6 @@
     "LinearScorer",
     "MLKnnScorer",
     "RerankScorer",
+    "ScoringModule",
+    "SklearnScorer",
 ]
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+from .sklearn_scorer import SklearnScorer
+
+__all__ = ["SklearnScorer"]
Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
+import logging
+from typing import Any
+
+import numpy as np
+import numpy.typing as npt
+from sklearn.linear_model import LogisticRegression
+from sklearn.multioutput import MultiOutputClassifier
+from sklearn.utils import all_estimators
+from typing_extensions import Self
+
+from autointent import Context, Embedder
+from autointent.custom_types import LabelType
+from autointent.modules.abc import ScoringModule
+
+logger = logging.getLogger(__name__)
+AVAILABLE_CLASSIFIERS = {
+    name: class_
+    for name, class_ in all_estimators(
+        type_filter=[
+            # restrict to these estimator types, excluding transformers (e.g. TfidfTransformer)
+            "classifier",
+            "regressor",
+            "cluster",
+        ]
+    )
+    if hasattr(class_, "predict_proba")
+}
+
+
+class SklearnScorer(ScoringModule):
+    """
+    Scoring module for classification using sklearn classifiers that implement the predict_proba() method.
+
+    This module uses embeddings generated by a transformer model to train
+    the chosen sklearn classifier for intent classification.
+
+    :ivar name: Name of the scorer, defaults to "sklearn".
+    """
+
+    name = "sklearn"
+
+    def __init__(
+        self,
+        embedder_name: str,
+        clf_name: str,
+        embedder_batch_size: int = 32,
+        embedder_max_length: int | None = None,
+        embedder_device: str = "cpu",
+        embedder_use_cache: bool = True,
+        clf_args: dict[str, Any] | None = None,
+    ) -> None:
+        """
+        Initialize the SklearnScorer.
+
+        :param embedder_name: Name of the embedder model.
+        :param clf_name: Name of the sklearn classifier to use.
+        :param clf_args: Dictionary with arguments for the chosen sklearn classifier, defaults to {}.
+        :param embedder_batch_size: Batch size for embedding generation, defaults to 32.
+        :param embedder_max_length: Maximum sequence length for embedding, or None for the model default.
+        :param embedder_device: Device to run operations on, e.g., "cpu" or "cuda".
+        :param embedder_use_cache: Flag indicating whether to cache intermediate embeddings.
+        """
+        self.embedder_name = embedder_name
+        self.clf_name = clf_name
+        self.clf_args = clf_args or {}
+        self.embedder_batch_size = embedder_batch_size
+        self.embedder_max_length = embedder_max_length
+        self.embedder_device = embedder_device
+        self.embedder_use_cache = embedder_use_cache
+
+    @classmethod
+    def from_context(
+        cls,
+        context: Context,
+        clf_name: str = LogisticRegression.__name__,
+        clf_args: dict[str, Any] | None = None,
+        embedder_name: str | None = None,
+    ) -> Self:
+        """
+        Create a SklearnScorer instance using a Context object.
+
+        :param context: Context containing configurations and utilities.
+        :param clf_name: Name of the sklearn classifier to use.
+        :param clf_args: Dictionary with arguments for the chosen sklearn classifier, defaults to {}.
+        :param embedder_name: Name of the embedder, or None to use the best embedder.
+        :return: Initialized SklearnScorer instance.
+        """
+        if embedder_name is None:
+            embedder_name = context.optimization_info.get_best_embedder()
+
+        return cls(
+            embedder_name=embedder_name,
+            embedder_device=context.get_device(),
+            embedder_batch_size=context.get_batch_size(),
+            embedder_max_length=context.get_max_length(),
+            embedder_use_cache=context.get_use_cache(),
+            clf_name=clf_name,
+            clf_args=clf_args,
+        )
+
+    def fit(
+        self,
+        utterances: list[str],
+        labels: list[LabelType],
+    ) -> None:
+        """
+        Train the chosen sklearn classifier.
+
+        :param utterances: List of training utterances.
+        :param labels: List of labels corresponding to the utterances.
+        :raises ValueError: If the requested classifier is not available in sklearn or lacks predict_proba().
+        """
+        self._multilabel = isinstance(labels[0], list)
+
+        embedder = Embedder(
+            device=self.embedder_device,
+            model_name_or_path=self.embedder_name,
+            batch_size=self.embedder_batch_size,
+            max_length=self.embedder_max_length,
+            use_cache=self.embedder_use_cache,
+        )
+        features = embedder.embed(utterances)
+        if AVAILABLE_CLASSIFIERS.get(self.clf_name):
+            base_clf = AVAILABLE_CLASSIFIERS[self.clf_name](**self.clf_args)
+        else:
+            msg = f"Class {self.clf_name} does not exist in sklearn or does not have predict_proba method"
+            logger.error(msg)
+            raise ValueError(msg)
+
+        clf = MultiOutputClassifier(base_clf) if self._multilabel else base_clf
+
+        clf.fit(features, labels)
+
+        self._clf = clf
+        self._embedder = embedder
+
+    def predict(self, utterances: list[str]) -> npt.NDArray[Any]:
+        """
+        Predict probabilities for the given utterances.
+
+        :param utterances: List of query utterances.
+        :return: Array of predicted probabilities for each class.
+        """
+        features = self._embedder.embed(utterances)
+        probas = self._clf.predict_proba(features)
+        if self._multilabel:
+            probas = np.stack(probas, axis=1)[..., 1]
+        return probas  # type: ignore[no-any-return]
+
+    def clear_cache(self) -> None:
+        """Clear cached data in memory used by the embedder."""
+        self._embedder.delete()
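Two illustrations of the new scorer, assuming nothing beyond what the diff and the tests below show: a minimal usage sketch (the toy utterances and labels are made up; the embedder name is the one used in the tests), and a toy numpy demo of why the multilabel branch of predict() indexes `np.stack(probas, axis=1)[..., 1]`. `MultiOutputClassifier.predict_proba` returns one `(n_samples, 2)` array per label, and the indexing keeps only each label's positive-class column.

import numpy as np

from autointent.modules import SklearnScorer

# Minimal usage sketch: fit on utterances + integer labels, then score new queries.
scorer = SklearnScorer(embedder_name="sergeyzh/rubert-tiny-turbo", clf_name="LogisticRegression")
scorer.fit(["block my card", "what is my balance"], [0, 1])   # toy training data
probas = scorer.predict(["please freeze my card"])            # shape (1, n_classes)

# Why predict() reshapes the multilabel output:
per_label = [np.array([[0.9, 0.1]]), np.array([[0.3, 0.7]])]  # 1 sample, 2 labels, (neg, pos) each
scores = np.stack(per_label, axis=1)[..., 1]                  # -> array([[0.1, 0.7]])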

tests/assets/configs/multiclass.yaml

Lines changed: 6 additions & 0 deletions
@@ -19,6 +19,12 @@
         - avsolatorio/GIST-small-Embedding-v0
       k: [1, 3]
       train_head: [false, true]
+    - module_name: sklearn
+      embedder_name:
+        - sergeyzh/rubert-tiny-turbo
+      clf_name:
+        - LogisticRegression
+        - RandomForestClassifier
     - module_name: rerank
       k: [ 5, 10 ]
       weights: [uniform, distance, closest]

tests/assets/configs/multilabel.yaml

Lines changed: 6 additions & 0 deletions
@@ -15,6 +15,12 @@
     - module_name: linear
     - module_name: mlknn
       k: [5]
+    - module_name: sklearn
+      embedder_name:
+        - sergeyzh/rubert-tiny-turbo
+      clf_name:
+        - LogisticRegression
+        - RandomForestClassifier
     - module_name: rerank
       k: [ 5, 10 ]
       weights: [ uniform, distance, closest ]
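Both YAML entries above (and the matching dicts in tests/nodes/test_scoring.py below) appear to declare a search space rather than a single configuration: each list holds candidate values for one SklearnScorer argument. Assuming the optimizer simply enumerates the grid of combinations, the sklearn entry expands roughly like this hypothetical sketch:

from itertools import product

# Hypothetical expansion of the `sklearn` search-space entry above.
search_space = {
    "embedder_name": ["sergeyzh/rubert-tiny-turbo"],
    "clf_name": ["LogisticRegression", "RandomForestClassifier"],
}
candidates = [dict(zip(search_space, combo)) for combo in product(*search_space.values())]
# -> two candidate configurations, one per classifier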
Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
+import numpy as np
+
+from autointent.context.data_handler import DataHandler
+from autointent.modules import SklearnScorer
+
+
+def test_base_sklearn(dataset):
+    data_handler = DataHandler(dataset)
+
+    scorer = SklearnScorer(embedder_name="sergeyzh/rubert-tiny-turbo", clf_name="LogisticRegression")
+
+    scorer.fit(data_handler.train_utterances(0), data_handler.train_labels(0))
+    test_data = [
+        "why is there a hold on my american saving bank account",
+        "i am nost sure why my account is blocked",
+        "why is there a hold on my capital one checking account",
+        "i think my account is blocked but i do not know the reason",
+        "can you tell me why is my bank account frozen",
+    ]
+    predictions = scorer.predict(test_data)
+
+    np.testing.assert_almost_equal(
+        np.array(
+            [
+                [
+                    0.23748632,
+                    0.39067508,
+                    0.2393372,
+                    0.13250139,
+                ],
+                [0.23913757, 0.37610976, 0.24952359, 0.13522908],
+                [
+                    0.25714506,
+                    0.34984371,
+                    0.25495681,
+                    0.13805442,
+                ],
+                [
+                    0.2571957,
+                    0.34850898,
+                    0.25346288,
+                    0.14083245,
+                ],
+                [
+                    0.23885061,
+                    0.41527567,
+                    0.21830964,
+                    0.12756408,
+                ],
+            ],
+        ),
+        predictions,
+        decimal=2,
+    )
+
+    predictions, metadata = scorer.predict_with_metadata(test_data)
+    assert len(predictions) == len(test_data)
+    assert metadata is None

tests/nodes/test_scoring.py

Lines changed: 13 additions & 0 deletions
@@ -36,6 +36,11 @@ def test_scoring_multiclass(embedding_optimizer_multiclass):
             "k": [3],
             "train_head": [False, True],
         },
+        {
+            "module_name": "sklearn",
+            "embedder_name": ["sergeyzh/rubert-tiny-turbo"],
+            "clf_name": ["LogisticRegression", "RandomForestClassifier"],
+        },
         {
             "module_name": "description",
             "temperature": [1.0, 0.5, 0.1, 0.05],
@@ -89,6 +94,14 @@ def test_scoring_multilabel(embedding_optimizer_multilabel):
             "embedder_name": ["sergeyzh/rubert-tiny-turbo"],
         },
         {"module_name": "mlknn", "k": [5], "embedder_name": ["sergeyzh/rubert-tiny-turbo"]},
+        {
+            "module_name": "sklearn",
+            "embedder_name": ["sergeyzh/rubert-tiny-turbo"],
+            "clf_name": [
+                "LogisticRegression",
+                "RandomForestClassifier",
+            ],
+        },
        {
             "module_name": "rerank",
             "weights": ["uniform", "distance", "closest"],
