deeppavlov
diff --git a/Makefile b/Makefile
Lines changed: 2 additions & 2 deletions
diff --git a/autointent/_dataset/_dataset.py b/autointent/_dataset/_dataset.py
Lines changed: 1 addition & 1 deletion
diff --git a/autointent/_dump_tools.py b/autointent/_dump_tools.py
Lines changed: 66 additions & 3 deletions
diff --git a/autointent/_embedder.py b/autointent/_embedder.py
Lines changed: 2 additions & 1 deletion
diff --git a/autointent/_ranker.py b/autointent/_ranker.py
Lines changed: 1 addition & 1 deletion
diff --git a/autointent/configs/_transformers.py b/autointent/configs/_transformers.py
Lines changed: 1 addition & 0 deletions
diff --git a/autointent/context/data_handler/_stratification.py b/autointent/context/data_handler/_stratification.py
Lines changed: 3 additions & 2 deletions
diff --git a/autointent/modules/scoring/_bert.py b/autointent/modules/scoring/_bert.py
Lines changed: 4 additions & 3 deletions
diff --git a/docs/optimizer_config.schema.json b/docs/optimizer_config.schema.json
Lines changed: 15 additions & 1 deletion
diff --git a/docs/source/conf.py b/docs/source/conf.py
Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ poetry = poetry run
 
 .PHONY: install
 install:
-	poetry install --with dev,test,typing,docs
+	poetry install --extras "dev test typing docs"
 
 .PHONY: test
 test:
@@ -24,7 +24,7 @@ lint:
 
 .PHONY: sync
 sync:
-	poetry sync --with dev,test,typing,docs
+	poetry sync --extras "dev test typing docs"
 
 .PHONY: docs
 docs:
 
@@ -98,7 +98,7 @@ def from_hub(cls, repo_name: str) -> "Dataset":
         """Loads a dataset from the Hugging Face Hub.
 
         Args:
-            repo_name: The name of the Hugging Face repository, like `AutoIntent/clinc150`.
+            repo_name: The name of the Hugging Face repository, like `DeepPavlov/clinc150`.
         """
         from ._reader import DictReader
 
 
@@ -33,6 +33,8 @@ class Dumper:
     estimators = "estimators"
     cross_encoders = "cross_encoders"
     pydantic_models: str = "pydantic"
+    hf_models = "hf_models"
+    hf_tokenizers = "hf_tokenizers"
 
     @staticmethod
     def make_subdirectories(path: Path) -> None:
@@ -48,12 +50,14 @@ def make_subdirectories(path: Path) -> None:
             path / Dumper.estimators,
             path / Dumper.cross_encoders,
             path / Dumper.pydantic_models,
+            path / Dumper.hf_models,
+            path / Dumper.hf_tokenizers,
         ]
         for subdir in subdirectories:
             subdir.mkdir(parents=True, exist_ok=True)
 
     @staticmethod
-    def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901
+    def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901, PLR0912, PLR0915
         """Dump modules attributes to filesystem.
 
         Args:
@@ -89,6 +93,28 @@ def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901
                 except Exception as e:
                     msg = f"Error dumping pydantic model {key}: {e}"
                     logging.exception(msg)
+            elif (key == "_model" or "model" in key.lower()) and hasattr(val, "save_pretrained"):
+                model_path = path / Dumper.hf_models / key
+                model_path.mkdir(parents=True, exist_ok=True)
+                try:
+                    val.save_pretrained(model_path)
+                    class_info = {"module": val.__class__.__module__, "name": val.__class__.__name__}
+                    with (model_path / "class_info.json").open("w") as f:
+                        json.dump(class_info, f)
+                except Exception as e:
+                    msg = f"Error dumping HF model {key}: {e}"
+                    logger.exception(msg)
+            elif (key == "_tokenizer" or "tokenizer" in key.lower()) and hasattr(val, "save_pretrained"):
+                tokenizer_path = path / Dumper.hf_tokenizers / key
+                tokenizer_path.mkdir(parents=True, exist_ok=True)
+                try:
+                    val.save_pretrained(tokenizer_path)
+                    class_info = {"module": val.__class__.__module__, "name": val.__class__.__name__}
+                    with (tokenizer_path / "class_info.json").open("w") as f:
+                        json.dump(class_info, f)
+                except Exception as e:
+                    msg = f"Error dumping HF tokenizer {key}: {e}"
+                    logger.exception(msg)
             else:
                 msg = f"Attribute {key} of type {type(val)} cannot be dumped to file system."
                 logger.error(msg)
@@ -114,6 +140,8 @@ def load(  # noqa: PLR0912, C901, PLR0915
         estimators: dict[str, Any] = {}
         cross_encoders: dict[str, Any] = {}
         pydantic_models: dict[str, Any] = {}
+        hf_models: dict[str, Any] = {}
+        hf_tokenizers: dict[str, Any] = {}
 
         for child in path.iterdir():
             if child.name == Dumper.tags:
@@ -151,7 +179,6 @@ def load(  # noqa: PLR0912, C901, PLR0915
                         sig = inspect.signature(obj.__init__)
                         if variable_name in sig.parameters:
                             model_type = sig.parameters[variable_name].annotation
-
                     if model_type is None:
                         msg = f"No type annotation found for {variable_name}"
                         logger.error(msg)
@@ -174,9 +201,45 @@ def load(  # noqa: PLR0912, C901, PLR0915
                         continue
 
                     pydantic_models[variable_name] = model_type(**content)
+            elif child.name == Dumper.hf_models:
+                for model_dir in child.iterdir():
+                    try:
+                        with (model_dir / "class_info.json").open("r") as f:
+                            class_info = json.load(f)
+
+                        module = __import__(class_info["module"], fromlist=[class_info["name"]])
+                        model_class = getattr(module, class_info["name"])
+
+                        hf_models[model_dir.name] = model_class.from_pretrained(model_dir)
+                    except Exception as e:  # noqa: PERF203
+                        msg = f"Error loading HF model {model_dir.name}: {e}"
+                        logger.exception(msg)
+            elif child.name == Dumper.hf_tokenizers:
+                for tokenizer_dir in child.iterdir():
+                    try:
+                        with (tokenizer_dir / "class_info.json").open("r") as f:
+                            class_info = json.load(f)
+
+                        module = __import__(class_info["module"], fromlist=[class_info["name"]])
+                        tokenizer_class = getattr(module, class_info["name"])
+
+                        hf_tokenizers[tokenizer_dir.name] = tokenizer_class.from_pretrained(tokenizer_dir)
+                    except Exception as e:  # noqa: PERF203
+                        msg = f"Error loading HF tokenizer {tokenizer_dir.name}: {e}"
+                        logger.exception(msg)
             else:
                 msg = f"Found unexpected child {child}"
                 logger.error(msg)
+
         obj.__dict__.update(
-            tags | simple_attrs | arrays | embedders | indexes | estimators | cross_encoders | pydantic_models
+            tags
+            | simple_attrs
+            | arrays
+            | embedders
+            | indexes
+            | estimators
+            | cross_encoders
+            | pydantic_models
+            | hf_models
+            | hf_tokenizers
         )
@@ -79,6 +79,7 @@ def __init__(self, embedder_config: EmbedderConfig) -> None:
             device=self.config.device,
             prompts=embedder_config.get_prompt_config(),
             similarity_fn_name=self.config.similarity_fn_name,
+            trust_remote_code=self.config.trust_remote_code,
         )
 
         self._logger = logging.getLogger(__name__)
@@ -184,7 +185,7 @@ def embed(self, utterances: list[str], task_type: TaskTypeEnum | None = None) ->
             convert_to_numpy=True,
             batch_size=self.config.batch_size,
             normalize_embeddings=True,
-            prompt_name=self.config.get_prompt_type(task_type),
+            prompt=self.config.get_prompt_type(task_type),
         )
 
         if self.config.use_cache:
 
@@ -111,7 +111,7 @@ def __init__(
         self.config = CrossEncoderConfig.from_search_config(cross_encoder_config)
         self.cross_encoder = st.CrossEncoder(
             self.config.model_name,
-            trust_remote_code=True,
+            trust_remote_code=self.config.trust_remote_code,
             device=self.config.device,
             max_length=self.config.tokenizer_config.max_length,  # type: ignore[arg-type]
         )
 
@@ -19,6 +19,7 @@ class HFModelConfig(BaseModel):
     batch_size: PositiveInt = Field(32, description="Batch size for model inference.")
     device: str | None = Field(None, description="Torch notation for CPU or CUDA.")
     tokenizer_config: TokenizerConfig = Field(default_factory=TokenizerConfig)
+    trust_remote_code: bool = Field(False, description="Whether to trust the remote code when loading the model.")
 
     @classmethod
     def from_search_config(cls, values: dict[str, Any] | str | BaseModel | None) -> Self:
 
@@ -12,7 +12,7 @@
 from numpy import typing as npt
 from sklearn.model_selection import train_test_split
 from skmultilearn.model_selection import IterativeStratification
-from transformers import set_seed
+from transformers import set_seed  # type: ignore[attr-defined]
 
 from autointent import Dataset
 from autointent.custom_types import LabelType
@@ -128,7 +128,8 @@ def _split_multilabel(self, dataset: HFDataset, test_size: float) -> Sequence[np
         Returns:
             A sequence containing indices for train and test splits.
         """
-        set_seed(self.random_seed)  # workaround for buggy nature of IterativeStratification from skmultilearn
+        if self.random_seed is not None:
+            set_seed(self.random_seed)  # workaround for buggy nature of IterativeStratification from skmultilearn
         splitter = IterativeStratification(
             n_splits=2,
             order=2,
 
@@ -7,7 +7,7 @@
 import numpy.typing as npt
 import torch
 from datasets import Dataset
-from transformers import (
+from transformers import (  # type: ignore[attr-defined]
     AutoModelForSequenceClassification,
     AutoTokenizer,
     DataCollatorWithPadding,
@@ -89,6 +89,7 @@ def fit(
 
         self._model = AutoModelForSequenceClassification.from_pretrained(
             model_name,
+            trust_remote_code=self.classification_model_config.trust_remote_code,
             num_labels=self._n_classes,
             label2id=label2id,
             id2label=id2label,
@@ -127,15 +128,15 @@ def tokenize_function(examples: dict[str, Any]) -> dict[str, Any]:
                 use_cpu=use_cpu,
             )
 
-            trainer = Trainer(
+            trainer = Trainer(  # type: ignore[no-untyped-call]
                 model=self._model,
                 args=training_args,
                 train_dataset=tokenized_dataset,
                 tokenizer=self._tokenizer,
                 data_collator=DataCollatorWithPadding(tokenizer=self._tokenizer),
             )
 
-            trainer.train()
+            trainer.train()  # type: ignore[attr-defined]
 
         self._model.eval()
 
 
@@ -32,6 +32,12 @@
                 "tokenizer_config": {
                     "$ref": "#/$defs/TokenizerConfig"
                 },
+                "trust_remote_code": {
+                    "default": false,
+                    "description": "Whether to trust the remote code when loading the model.",
+                    "title": "Trust Remote Code",
+                    "type": "boolean"
+                },
                 "train_head": {
                     "default": false,
                     "description": "Whether to train the head of the model. If False, LogReg will be trained.",
@@ -122,6 +128,12 @@
                 "tokenizer_config": {
                     "$ref": "#/$defs/TokenizerConfig"
                 },
+                "trust_remote_code": {
+                    "default": false,
+                    "description": "Whether to trust the remote code when loading the model.",
+                    "title": "Trust Remote Code",
+                    "type": "boolean"
+                },
                 "default_prompt": {
                     "anyOf": [
                         {
@@ -383,6 +395,7 @@
                     "padding": true,
                     "truncation": true
                 },
+                "trust_remote_code": false,
                 "default_prompt": null,
                 "classifier_prompt": null,
                 "cluster_prompt": null,
@@ -404,6 +417,7 @@
                     "padding": true,
                     "truncation": true
                 },
+                "trust_remote_code": false,
                 "train_head": false
             }
         },
@@ -429,4 +443,4 @@
     ],
     "title": "OptimizationConfig",
     "type": "object"
-}
+}
@@ -126,7 +126,7 @@
         },
         {
             "name": "HuggingFace",
-            "url": "https://huggingface.co/AutoIntent",
+            "url": "https://huggingface.co/DeepPavlov",
             "icon": f"{BASE_STATIC_URL}/hf-logo.svg",
             "type": "local",
         },
Original file line number	Diff line number	Diff line change
`@@ -79,6 +79,7 @@ def __init__(self, embedder_config: EmbedderConfig) -> None:`
`79`	`79`	`device=self.config.device,`
`80`	`80`	`prompts=embedder_config.get_prompt_config(),`
`81`	`81`	`similarity_fn_name=self.config.similarity_fn_name,`
	`82`	`+ trust_remote_code=self.config.trust_remote_code,`
`82`	`83`	`)`
`83`	`84`
`84`	`85`	`self._logger = logging.getLogger(__name__)`
`@@ -184,7 +185,7 @@ def embed(self, utterances: list[str], task_type: TaskTypeEnum \| None = None) ->`
`184`	`185`	`convert_to_numpy=True,`
`185`	`186`	`batch_size=self.config.batch_size,`
`186`	`187`	`normalize_embeddings=True,`
`187`		`- prompt_name=self.config.get_prompt_type(task_type),`
	`188`	`+ prompt=self.config.get_prompt_type(task_type),`
`188`	`189`	`)`
`189`	`190`
`190`	`191`	`if self.config.use_cache:`
Original file line number	Diff line number	Diff line change
`@@ -111,7 +111,7 @@ def __init__(`
`111`	`111`	`self.config = CrossEncoderConfig.from_search_config(cross_encoder_config)`
`112`	`112`	`self.cross_encoder = st.CrossEncoder(`
`113`	`113`	`self.config.model_name,`
`114`		`- trust_remote_code=True,`
	`114`	`+ trust_remote_code=self.config.trust_remote_code,`
`115`	`115`	`device=self.config.device,`
`116`	`116`	`max_length=self.config.tokenizer_config.max_length, # type: ignore[arg-type]`
`117`	`117`	`)`
Original file line number	Diff line number	Diff line change
`@@ -126,7 +126,7 @@`
`126`	`126`	`},`
`127`	`127`	`{`
`128`	`128`	`"name": "HuggingFace",`
`129`		`- "url": "https://huggingface.co/AutoIntent",`
	`129`	`+ "url": "https://huggingface.co/DeepPavlov",`
`130`	`130`	`"icon": f"{BASE_STATIC_URL}/hf-logo.svg",`
`131`	`131`	`"type": "local",`
`132`	`132`	`},`