Skip to content

Commit d925df9

Browse files
committed
Merge branch 'dev' into feat/crossencoder-description
2 parents 3bc1845 + 57b0b6a commit d925df9

File tree

22 files changed

+170
-37
lines changed

22 files changed

+170
-37
lines changed

Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ poetry = poetry run
33

44
.PHONY: install
55
install:
6-
poetry install --with dev,test,typing,docs
6+
poetry install --extras "dev test typing docs"
77

88
.PHONY: test
99
test:
@@ -24,7 +24,7 @@ lint:
2424

2525
.PHONY: sync
2626
sync:
27-
poetry sync --with dev,test,typing,docs
27+
poetry sync --extras "dev test typing docs"
2828

2929
.PHONY: docs
3030
docs:

autointent/_dataset/_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ def from_hub(cls, repo_name: str) -> "Dataset":
9898
"""Loads a dataset from the Hugging Face Hub.
9999
100100
Args:
101-
repo_name: The name of the Hugging Face repository, like `AutoIntent/clinc150`.
101+
repo_name: The name of the Hugging Face repository, like `DeepPavlov/clinc150`.
102102
"""
103103
from ._reader import DictReader
104104

autointent/_dump_tools.py

Lines changed: 66 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,8 @@ class Dumper:
3333
estimators = "estimators"
3434
cross_encoders = "cross_encoders"
3535
pydantic_models: str = "pydantic"
36+
hf_models = "hf_models"
37+
hf_tokenizers = "hf_tokenizers"
3638

3739
@staticmethod
3840
def make_subdirectories(path: Path) -> None:
@@ -48,12 +50,14 @@ def make_subdirectories(path: Path) -> None:
4850
path / Dumper.estimators,
4951
path / Dumper.cross_encoders,
5052
path / Dumper.pydantic_models,
53+
path / Dumper.hf_models,
54+
path / Dumper.hf_tokenizers,
5155
]
5256
for subdir in subdirectories:
5357
subdir.mkdir(parents=True, exist_ok=True)
5458

5559
@staticmethod
56-
def dump(obj: Any, path: Path) -> None: # noqa: ANN401, C901
60+
def dump(obj: Any, path: Path) -> None: # noqa: ANN401, C901, PLR0912, PLR0915
5761
"""Dump modules attributes to filesystem.
5862
5963
Args:
@@ -89,6 +93,28 @@ def dump(obj: Any, path: Path) -> None: # noqa: ANN401, C901
8993
except Exception as e:
9094
msg = f"Error dumping pydantic model {key}: {e}"
9195
logging.exception(msg)
96+
elif (key == "_model" or "model" in key.lower()) and hasattr(val, "save_pretrained"):
97+
model_path = path / Dumper.hf_models / key
98+
model_path.mkdir(parents=True, exist_ok=True)
99+
try:
100+
val.save_pretrained(model_path)
101+
class_info = {"module": val.__class__.__module__, "name": val.__class__.__name__}
102+
with (model_path / "class_info.json").open("w") as f:
103+
json.dump(class_info, f)
104+
except Exception as e:
105+
msg = f"Error dumping HF model {key}: {e}"
106+
logger.exception(msg)
107+
elif (key == "_tokenizer" or "tokenizer" in key.lower()) and hasattr(val, "save_pretrained"):
108+
tokenizer_path = path / Dumper.hf_tokenizers / key
109+
tokenizer_path.mkdir(parents=True, exist_ok=True)
110+
try:
111+
val.save_pretrained(tokenizer_path)
112+
class_info = {"module": val.__class__.__module__, "name": val.__class__.__name__}
113+
with (tokenizer_path / "class_info.json").open("w") as f:
114+
json.dump(class_info, f)
115+
except Exception as e:
116+
msg = f"Error dumping HF tokenizer {key}: {e}"
117+
logger.exception(msg)
92118
else:
93119
msg = f"Attribute {key} of type {type(val)} cannot be dumped to file system."
94120
logger.error(msg)
@@ -114,6 +140,8 @@ def load( # noqa: PLR0912, C901, PLR0915
114140
estimators: dict[str, Any] = {}
115141
cross_encoders: dict[str, Any] = {}
116142
pydantic_models: dict[str, Any] = {}
143+
hf_models: dict[str, Any] = {}
144+
hf_tokenizers: dict[str, Any] = {}
117145

118146
for child in path.iterdir():
119147
if child.name == Dumper.tags:
@@ -151,7 +179,6 @@ def load( # noqa: PLR0912, C901, PLR0915
151179
sig = inspect.signature(obj.__init__)
152180
if variable_name in sig.parameters:
153181
model_type = sig.parameters[variable_name].annotation
154-
155182
if model_type is None:
156183
msg = f"No type annotation found for {variable_name}"
157184
logger.error(msg)
@@ -174,9 +201,45 @@ def load( # noqa: PLR0912, C901, PLR0915
174201
continue
175202

176203
pydantic_models[variable_name] = model_type(**content)
204+
elif child.name == Dumper.hf_models:
205+
for model_dir in child.iterdir():
206+
try:
207+
with (model_dir / "class_info.json").open("r") as f:
208+
class_info = json.load(f)
209+
210+
module = __import__(class_info["module"], fromlist=[class_info["name"]])
211+
model_class = getattr(module, class_info["name"])
212+
213+
hf_models[model_dir.name] = model_class.from_pretrained(model_dir)
214+
except Exception as e: # noqa: PERF203
215+
msg = f"Error loading HF model {model_dir.name}: {e}"
216+
logger.exception(msg)
217+
elif child.name == Dumper.hf_tokenizers:
218+
for tokenizer_dir in child.iterdir():
219+
try:
220+
with (tokenizer_dir / "class_info.json").open("r") as f:
221+
class_info = json.load(f)
222+
223+
module = __import__(class_info["module"], fromlist=[class_info["name"]])
224+
tokenizer_class = getattr(module, class_info["name"])
225+
226+
hf_tokenizers[tokenizer_dir.name] = tokenizer_class.from_pretrained(tokenizer_dir)
227+
except Exception as e: # noqa: PERF203
228+
msg = f"Error loading HF tokenizer {tokenizer_dir.name}: {e}"
229+
logger.exception(msg)
177230
else:
178231
msg = f"Found unexpected child {child}"
179232
logger.error(msg)
233+
180234
obj.__dict__.update(
181-
tags | simple_attrs | arrays | embedders | indexes | estimators | cross_encoders | pydantic_models
235+
tags
236+
| simple_attrs
237+
| arrays
238+
| embedders
239+
| indexes
240+
| estimators
241+
| cross_encoders
242+
| pydantic_models
243+
| hf_models
244+
| hf_tokenizers
182245
)

autointent/_embedder.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def __init__(self, embedder_config: EmbedderConfig) -> None:
7979
device=self.config.device,
8080
prompts=embedder_config.get_prompt_config(),
8181
similarity_fn_name=self.config.similarity_fn_name,
82+
trust_remote_code=self.config.trust_remote_code,
8283
)
8384

8485
self._logger = logging.getLogger(__name__)
@@ -184,7 +185,7 @@ def embed(self, utterances: list[str], task_type: TaskTypeEnum | None = None) ->
184185
convert_to_numpy=True,
185186
batch_size=self.config.batch_size,
186187
normalize_embeddings=True,
187-
prompt_name=self.config.get_prompt_type(task_type),
188+
prompt=self.config.get_prompt_type(task_type),
188189
)
189190

190191
if self.config.use_cache:

autointent/_ranker.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,7 @@ def __init__(
111111
self.config = CrossEncoderConfig.from_search_config(cross_encoder_config)
112112
self.cross_encoder = st.CrossEncoder(
113113
self.config.model_name,
114-
trust_remote_code=True,
114+
trust_remote_code=self.config.trust_remote_code,
115115
device=self.config.device,
116116
max_length=self.config.tokenizer_config.max_length, # type: ignore[arg-type]
117117
)

autointent/configs/_transformers.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ class HFModelConfig(BaseModel):
1919
batch_size: PositiveInt = Field(32, description="Batch size for model inference.")
2020
device: str | None = Field(None, description="Torch notation for CPU or CUDA.")
2121
tokenizer_config: TokenizerConfig = Field(default_factory=TokenizerConfig)
22+
trust_remote_code: bool = Field(False, description="Whether to trust the remote code when loading the model.")
2223

2324
@classmethod
2425
def from_search_config(cls, values: dict[str, Any] | str | BaseModel | None) -> Self:

autointent/context/data_handler/_stratification.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from numpy import typing as npt
1313
from sklearn.model_selection import train_test_split
1414
from skmultilearn.model_selection import IterativeStratification
15-
from transformers import set_seed
15+
from transformers import set_seed # type: ignore[attr-defined]
1616

1717
from autointent import Dataset
1818
from autointent.custom_types import LabelType
@@ -128,7 +128,8 @@ def _split_multilabel(self, dataset: HFDataset, test_size: float) -> Sequence[np
128128
Returns:
129129
A sequence containing indices for train and test splits.
130130
"""
131-
set_seed(self.random_seed) # workaround for buggy nature of IterativeStratification from skmultilearn
131+
if self.random_seed is not None:
132+
set_seed(self.random_seed) # workaround for buggy nature of IterativeStratification from skmultilearn
132133
splitter = IterativeStratification(
133134
n_splits=2,
134135
order=2,

autointent/modules/scoring/_bert.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import numpy.typing as npt
88
import torch
99
from datasets import Dataset
10-
from transformers import (
10+
from transformers import ( # type: ignore[attr-defined]
1111
AutoModelForSequenceClassification,
1212
AutoTokenizer,
1313
DataCollatorWithPadding,
@@ -89,6 +89,7 @@ def fit(
8989

9090
self._model = AutoModelForSequenceClassification.from_pretrained(
9191
model_name,
92+
trust_remote_code=self.classification_model_config.trust_remote_code,
9293
num_labels=self._n_classes,
9394
label2id=label2id,
9495
id2label=id2label,
@@ -127,15 +128,15 @@ def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
127128
use_cpu=use_cpu,
128129
)
129130

130-
trainer = Trainer(
131+
trainer = Trainer( # type: ignore[no-untyped-call]
131132
model=self._model,
132133
args=training_args,
133134
train_dataset=tokenized_dataset,
134135
tokenizer=self._tokenizer,
135136
data_collator=DataCollatorWithPadding(tokenizer=self._tokenizer),
136137
)
137138

138-
trainer.train()
139+
trainer.train() # type: ignore[attr-defined]
139140

140141
self._model.eval()
141142

docs/optimizer_config.schema.json

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@
3232
"tokenizer_config": {
3333
"$ref": "#/$defs/TokenizerConfig"
3434
},
35+
"trust_remote_code": {
36+
"default": false,
37+
"description": "Whether to trust the remote code when loading the model.",
38+
"title": "Trust Remote Code",
39+
"type": "boolean"
40+
},
3541
"train_head": {
3642
"default": false,
3743
"description": "Whether to train the head of the model. If False, LogReg will be trained.",
@@ -122,6 +128,12 @@
122128
"tokenizer_config": {
123129
"$ref": "#/$defs/TokenizerConfig"
124130
},
131+
"trust_remote_code": {
132+
"default": false,
133+
"description": "Whether to trust the remote code when loading the model.",
134+
"title": "Trust Remote Code",
135+
"type": "boolean"
136+
},
125137
"default_prompt": {
126138
"anyOf": [
127139
{
@@ -383,6 +395,7 @@
383395
"padding": true,
384396
"truncation": true
385397
},
398+
"trust_remote_code": false,
386399
"default_prompt": null,
387400
"classifier_prompt": null,
388401
"cluster_prompt": null,
@@ -404,6 +417,7 @@
404417
"padding": true,
405418
"truncation": true
406419
},
420+
"trust_remote_code": false,
407421
"train_head": false
408422
}
409423
},
@@ -429,4 +443,4 @@
429443
],
430444
"title": "OptimizationConfig",
431445
"type": "object"
432-
}
446+
}

docs/source/conf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@
126126
},
127127
{
128128
"name": "HuggingFace",
129-
"url": "https://huggingface.co/AutoIntent",
129+
"url": "https://huggingface.co/DeepPavlov",
130130
"icon": f"{BASE_STATIC_URL}/hf-logo.svg",
131131
"type": "local",
132132
},

0 commit comments

Comments
 (0)