Commit 8580f19

Authored by Samoed, github-actions[bot], voorhs, SeBorgey, and Darinochka
add interruption handling (#169)
* add interruption handling
* fix test
* fix test
* update
* fix test
* lint
* remove step
* use patch instead of monkeypatch
* add n_jobs as param
* change n_jobs to -1
* try fix
* remove old study
* add logging warning
* Update optimizer_config.schema.json
* lint
* try dumping
* lint
* np encoder
* update warning trigger
* Fix/n trials issue (#196)
* try to fix
* fix typing errors
* bug fix
* Update autointent/nodes/_node_optimizer.py
Co-authored-by: Roman Solomatin <[email protected]>
---------
Co-authored-by: Roman Solomatin <[email protected]>
* Fix/context not dumped error (#197)
* try to fix
* dump context constantly and fix serialization issues
* add exclude option to dumper
* fix codestyle and typing errors
* try to fix file exists error
* fix no fixture found error
* Update interruption handling (#198)
* full tuning (#165)
* Added code for full tuning
* work on review
* renaming
* fix ruff
* mypy test
* ignote mypy
* Feat/bert scorer config refactoring (#168)
* refactor configs
* add proper configs to BERTScorer
* fix typing
* fix tokenizer's parameters
* fix transformers and accelerate issue
* Update optimizer_config.schema.json
* bug fix
* update callback test
* fix tests
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* delete validate_task
* report_to
* batches
* Fix/docs building for bert scorer (#171)
* fix
* fix codestyle
---------
Co-authored-by: Алексеев Илья <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* bert-scorer ending (#172)
* batches
* tests check
* fix
* return to torch
* fix for tests
* Fix/bert scorer (#174)
* fix str and float issue and shrinken search space
* update `inference node config` overriding logic
* fix typing
* fix codestyle
* fix multilabel issue
* attempt to fix `inference node config` bugs
* another attempt
---------
Co-authored-by: Алексеев Илья <[email protected]>
* Feat/code carbon each node (#175)
* feat: update codecarbon
* feat: update codecarbon
* feat: added codecarbon
* Update optimizer_config.schema.json
* fix: fixed import mypy
* fix: codecarbon package
* fix: only float\integer log
* fix: codecarbon package
* fix: mypy
* fix: test
* fix: delete emissions
* fix: test
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* standartize pyproject & speedup tests (#176)
* speedup tests
* fix pyproject
* Update optimizer_config.schema.json
* move optional dependencies
* fixes
* add xdist
* fix ci
* download data from hub in doc
* add caching
* add doc cache
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: voorhs <[email protected]>
* add proper `omit` definition for tests coverage report (#179)
* add proper `omit` definition
* Update optimizer_config.schema.json
* exclude tmp from coverage report
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* add node validators (#177)
* add node validators
* add comments
* Update optimizer_config.schema.json
* rename bert model
* lint
* fixes
* fix test
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: voorhs <[email protected]>
* dumper saving (#180)
* added main code for saving models
* Update optimizer_config.schema.json
* checker fixes
* Revert "checker fixes". This reverts commit 6e32eb9.
* Revert "added main code for saving models". This reverts commit 5637fb8.
* drat main code for new dumper
* ruf fix
* comments
* added code for test dumper
* Check dumper (#182)
* Feat/code carbon each node (#175)
* feat: update codecarbon
* feat: update codecarbon
* feat: added codecarbon
* Update optimizer_config.schema.json
* fix: fixed import mypy
* fix: codecarbon package
* fix: only float\integer log
* fix: codecarbon package
* fix: mypy
* fix: test
* fix: delete emissions
* fix: test
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* standartize pyproject & speedup tests (#176)
* speedup tests
* fix pyproject
* Update optimizer_config.schema.json
* move optional dependencies
* fixes
* add xdist
* fix ci
* download data from hub in doc
* add caching
* add doc cache
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: voorhs <[email protected]>
* add proper `omit` definition for tests coverage report (#179)
* add proper `omit` definition
* Update optimizer_config.schema.json
* exclude tmp from coverage report
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* add node validators (#177)
* add node validators
* add comments
* Update optimizer_config.schema.json
* rename bert model
* lint
* fixes
* fix test
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: voorhs <[email protected]>
* update makefile
* update bert test
* mypy workaround
* attempt to fix windows permission error
* workaround
---------
Co-authored-by: Darinochka <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Roman Solomatin <[email protected]>
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Алексеев Илья <[email protected]>
Co-authored-by: Darinochka <[email protected]>
Co-authored-by: Roman Solomatin <[email protected]>
* Update embedder prompt (#183)
* Add trust remote code (#185)
* lint
* fix trust remote code
* Update optimizer_config.schema.json
* update fix trust remote code
* fix test cllback
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* Remove autointent org from docs (#186)
* lint
* update paths
* feat: added crossencoder (#181)
* feat: added crossencoder
* refactor
* feat: added arg similarity
* Update optimizer_config.schema.json
* feat: added tests
* feat: added errors
* fix: scoring test
* fix: description vectors error
* fix: description vectors error
* fix: lint
* fix: test
* add node validators (#177)
* add node validators
* add comments
* Update optimizer_config.schema.json
* rename bert model
* lint
* fixes
* fix test
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: voorhs <[email protected]>
* fix: unit tests
* feat: added test for description
* feat: delete encoder_type from the class args
* feat: update assets
* feat: update assets
* fix: fixed test
* Update optimizer_config.schema.json
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Roman Solomatin <[email protected]>
Co-authored-by: voorhs <[email protected]>
* Add few shot (#187)
* init few shot
* Update optimizer_config.schema.json
* apply few shot to all
* Update optimizer_config.schema.json
* fix test
* lint
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* update numpy typing (#188)
* Lora scorer (#170)
* added lora scorer
* fix ruff
* Update __init__.py
* updated after mr #165
* Update pyproject.toml
* fixed requested changes
* fixed ruff failing
* fixed remarks
* Update optimizer_config.schema.json
* added test
* ruff fix
* convert labels to float
* Update autointent/modules/scoring/_lora/lora.py
Co-authored-by: Roman Solomatin <[email protected]>
* Update autointent/modules/scoring/_lora/lora.py
Co-authored-by: Roman Solomatin <[email protected]>
* change model_config name, added trust_remote_code
* Update lora.py
* inherited lora from bert
* fix ruff
* fix search space
* Update lora.py
* Update lora.py
* added dump check
* Update test_lora.py
* Update test_lora.py
* added docstring
* fix ruff
* Update test_lora.py
* Update test_lora.py
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Roman Solomatin <[email protected]>
* PTuningScorer (#178)
* Initial commit of PTuningScorer module
* Added peft (>=0.10.0, <0.15.0) in dependencies
* Implement fit/predict PTuningScorer
* Added PTuningScorer in __init__ file
* Update optimizer_config.schema.json
* Minor fixs
* PGH00
* Refactor clear_cache in fit method
* Refactor typing ignore + remove unnecessary
* Fix fit method status check
* Added test for PTuningScorer
* Fix mypy typing
* Update and fix peft version dependencies
* Fix mypy typing
* Added test in multiclass.yaml, multilabel.yaml
* Update docs strings
* Fix mypy typing
* Added trust_remote_code
* make proper rst reference
* Added test for dump lod
* feat: added crossencoder (#181)
* feat: added crossencoder
* refactor
* feat: added arg similarity
* Update optimizer_config.schema.json
* feat: added tests
* feat: added errors
* fix: scoring test
* fix: description vectors error
* fix: description vectors error
* fix: lint
* fix: test
* add node validators (#177)
* add node validators
* add comments
* Update optimizer_config.schema.json
* rename bert model
* lint
* fixes
* fix test
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: voorhs <[email protected]>
* fix: unit tests
* feat: added test for description
* feat: delete encoder_type from the class args
* feat: update assets
* feat: update assets
* fix: fixed test
* Update optimizer_config.schema.json
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Roman Solomatin <[email protected]>
Co-authored-by: voorhs <[email protected]>
* Added fixed seed to test reproduction
* Pull LoraScorer and Bert Refactor
* Refactor PTuningScorer
* Refactor test for ptuning
* Fix typing
* Fix multilabel multiclass tests
* Fix typing
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: voorhs <[email protected]>
Co-authored-by: Darinochka <[email protected]>
Co-authored-by: Roman Solomatin <[email protected]>
* Rerank scorer: option to choose the source for computing the probability vector (#115)
* Enable rerank scorer to use crossencoder scores for the probability vector
* add cross encoder scores range options
* upd test
---------
Co-authored-by: voorhs <[email protected]>
* feat: add DISABLE_EMISSIONS_TRACKING (#191)
* feat: add DISABLE_EMISSIONS_TRACKING
* try to fix docs error
* Update optimizer_config.schema.json
* another attempt
* Update optimizer_config.schema.json
* i give up for now
* Update optimizer_config.schema.json
---------
Co-authored-by: voorhs <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* fix issue (#194)
* Refactor/embedding caching (#195)
* implement new hashing strategy
* fix codestyle
* Update optimizer_config.schema.json
* minor bug fix
* fix typing error
* refactor similarity calculation
* Update optimizer_config.schema.json
* upd callback test
* solve 429 error
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
* forgot something
---------
Co-authored-by: Сергей Малышев <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Darinochka <[email protected]>
Co-authored-by: Roman Solomatin <[email protected]>
Co-authored-by: VALERIA RUBANOVA <[email protected]>
Co-authored-by: nikiduki <[email protected]>
Co-authored-by: Dmitryv-2024 <[email protected]>
---------
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: Алексеев Илья <[email protected]>
Co-authored-by: Сергей Малышев <[email protected]>
Co-authored-by: Darinochka <[email protected]>
Co-authored-by: VALERIA RUBANOVA <[email protected]>
Co-authored-by: nikiduki <[email protected]>
Co-authored-by: Dmitryv-2024 <[email protected]>
Co-authored-by: voorhs <[email protected]>
1 parent 389eb40 commit 8580f19
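
Taken together, this commit makes pipeline optimization resumable after an interruption: `Dumper` gains `exists_ok` and `exclude` parameters, `Context` gains a `load()` counterpart to `dump()`, optimization info is serialized as trials complete, and the pipeline warns when module dumping is disabled (see the diffs below). A rough usage sketch follows; only `LoggingConfig.dump_modules` and `Context.load()` appear in this diff, while the `Context()` construction and the fitting entry point are schematic assumptions, not the library's documented API:

```python
# Schematic resume flow; constructor and fit entry point are assumed, see note above.
from autointent.context import Context


def run_optimization(ctx: Context) -> None:
    """Placeholder for pipeline fitting; the real entry point is the Pipeline class."""


context = Context()                          # assumed: configure data/logging as usual
context.logging_config.dump_modules = True   # from this diff: required for resuming

try:
    run_optimization(context)
except KeyboardInterrupt:
    pass  # state is dumped as trials complete, so a later process can pick it up

resumed = Context()  # a fresh process pointing at the same logging dirpath
resumed.load()       # from this diff: raises RuntimeError if nothing was dumped
```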

File tree

10 files changed (+511 additions, -81 deletions)


.gitignore

Lines changed: 2 additions & 0 deletions

```diff
@@ -179,4 +179,6 @@ tests_logs
 tests/logs
 runs/
 vector_db*
+*.db
+*.sqlite
 /wandb
```

autointent/_dump_tools.py

Lines changed: 43 additions & 42 deletions

```diff
@@ -1,9 +1,8 @@
-import inspect
+import importlib
 import json
 import logging
 from pathlib import Path
-from types import UnionType
-from typing import Any, TypeAlias, Union, get_args, get_origin
+from typing import Any, TypeAlias
 
 import joblib
 import numpy as np
@@ -37,11 +36,12 @@ class Dumper:
     hf_tokenizers = "hf_tokenizers"
 
     @staticmethod
-    def make_subdirectories(path: Path) -> None:
+    def make_subdirectories(path: Path, exists_ok: bool = False) -> None:
         """Make subdirectories for dumping.
 
         Args:
             path: Path to make subdirectories in
+            exists_ok: If True, do not raise an error if the directory already exists
         """
         subdirectories = [
             path / Dumper.tags,
@@ -54,23 +54,27 @@ def make_subdirectories(path: Path) -> None:
             path / Dumper.hf_tokenizers,
         ]
         for subdir in subdirectories:
-            subdir.mkdir(parents=True, exist_ok=True)
+            subdir.mkdir(parents=True, exist_ok=exists_ok)
 
     @staticmethod
-    def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901, PLR0912, PLR0915
+    def dump(obj: Any, path: Path, exists_ok: bool = False, exclude: list[type[Any]] | None = None) -> None:  # noqa: ANN401, C901, PLR0912, PLR0915
         """Dump modules attributes to filestystem.
 
         Args:
             obj: Object to dump
             path: Path to dump to
+            exists_ok: If True, do not raise an error if the directory already exists
+            exclude: List of types to exclude from dumping
         """
         attrs: dict[str, ModuleAttributes] = vars(obj)
         simple_attrs = {}
         arrays: dict[str, npt.NDArray[Any]] = {}
 
-        Dumper.make_subdirectories(path)
+        Dumper.make_subdirectories(path, exists_ok)
 
         for key, val in attrs.items():
+            if exclude and isinstance(val, tuple(exclude)):
+                continue
             if isinstance(val, TagsList):
                 val.dump(path / Dumper.tags / key)
             elif isinstance(val, ModuleSimpleAttributes):
@@ -85,10 +89,14 @@ def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901, PLR0912, PLR0915
                 joblib.dump(val, path / Dumper.estimators / key)
             elif isinstance(val, Ranker):
                 val.save(str(path / Dumper.cross_encoders / key))
-            elif isinstance(val, CrossEncoderConfig | EmbedderConfig):
+            elif isinstance(val, BaseModel):
                 try:
-                    pydantic_path = path / Dumper.pydantic_models / f"{key}.json"
-                    with pydantic_path.open("w", encoding="utf-8") as file:
+                    class_info = {"name": val.__class__.__name__, "module": val.__class__.__module__}
+                    pydantic_path = path / Dumper.pydantic_models / key
+                    pydantic_path.mkdir(parents=True, exist_ok=exists_ok)
+                    with (pydantic_path / "class_info.json").open("w", encoding="utf-8") as file:
+                        json.dump(class_info, file, ensure_ascii=False, indent=4)
+                    with (pydantic_path / "model_dump.json").open("w", encoding="utf-8") as file:
                         json.dump(val.model_dump(), file, ensure_ascii=False, indent=4)
                 except Exception as e:
                     msg = f"Error dumping pydantic model {key}: {e}"
@@ -125,7 +133,7 @@ def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901, PLR0912, PLR0915
         np.savez(path / Dumper.arrays, allow_pickle=False, **arrays)
 
     @staticmethod
-    def load(  # noqa: PLR0912, C901, PLR0915
+    def load(  # noqa: C901, PLR0912, PLR0915
         obj: Any,  # noqa: ANN401
         path: Path,
         embedder_config: EmbedderConfig | None = None,
@@ -166,41 +174,34 @@ def load(  # noqa: PLR0912, C901, PLR0915
                     for cross_encoder_dump in child.iterdir()
                 }
             elif child.name == Dumper.pydantic_models:
-                for model_file in child.iterdir():
-                    with model_file.open("r", encoding="utf-8") as file:
-                        content = json.load(file)
-                    variable_name = model_file.stem
-
-                    # First try to get the type annotation from the class annotations.
-                    model_type = obj.__class__.__annotations__.get(variable_name)
-
-                    # Fallback: inspect __init__ signature if not found in class-level annotations.
-                    if model_type is None:
-                        sig = inspect.signature(obj.__init__)
-                        if variable_name in sig.parameters:
-                            model_type = sig.parameters[variable_name].annotation
-                    if model_type is None:
-                        msg = f"No type annotation found for {variable_name}"
-                        logger.error(msg)
-                        continue
+                for model_dir in child.iterdir():
+                    try:
+                        with (model_dir / "model_dump.json").open("r", encoding="utf-8") as file:
+                            content = json.load(file)
+
+                        variable_name = model_dir.name
 
-                    # If the annotation is a Union, extract the pydantic model type.
-                    if get_origin(model_type) in (UnionType, Union):
-                        for arg in get_args(model_type):
-                            if isinstance(arg, type) and issubclass(arg, BaseModel):
-                                model_type = arg
-                                break
-                        else:
-                            msg = f"No pydantic type found in Union for {variable_name}"
-                            logger.error(msg)
+                        with (model_dir / "class_info.json").open("r", encoding="utf-8") as file:
+                            class_info = json.load(file)
+
+                        try:
+                            model_type = importlib.import_module(class_info["module"])
+                            model_type = getattr(model_type, class_info["name"])
+                        except (ImportError, AttributeError) as e:
+                            msg = f"Failed to import model type for {variable_name}: {e}"
+                            logger.exception(msg)
                             continue
 
-                    if not (isinstance(model_type, type) and issubclass(model_type, BaseModel)):
-                        msg = f"Type for {variable_name} is not a pydantic model: {model_type}"
-                        logger.error(msg)
+                        try:
+                            pydantic_models[variable_name] = model_type.model_validate(content)
+                        except Exception as e:
+                            msg = f"Failed to reconstruct Pydantic model {variable_name}: {e}"
+                            logger.exception(msg)
+                            continue
+                    except Exception as e:
+                        msg = f"Error loading Pydantic model from {model_dir}: {e}"
+                        logger.exception(msg)
                         continue
-
-                    pydantic_models[variable_name] = model_type(**content)
             elif child.name == Dumper.hf_models:
                 for model_dir in child.iterdir():
                     try:
```
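
The net effect of the `Dumper` changes above: instead of guessing a pydantic model's type from class or `__init__` annotations, `dump` now writes every `BaseModel` attribute to its own directory as `class_info.json` (import path) plus `model_dump.json` (field values), and `load` rebuilds the instance via `importlib`. A standalone sketch of that round trip; `EmbedderConfig` here is a local stand-in, not the library class:

```python
import importlib
import json
import tempfile
from pathlib import Path

from pydantic import BaseModel


class EmbedderConfig(BaseModel):
    """Local stand-in for an autointent config model (name borrowed from the diff)."""

    device: str = "cpu"


def dump_model(val: BaseModel, pydantic_path: Path) -> None:
    # Mirrors the new dump branch: one directory per model, with the class's
    # import path in class_info.json and its field values in model_dump.json.
    pydantic_path.mkdir(parents=True, exist_ok=True)
    class_info = {"name": val.__class__.__name__, "module": val.__class__.__module__}
    (pydantic_path / "class_info.json").write_text(json.dumps(class_info))
    (pydantic_path / "model_dump.json").write_text(json.dumps(val.model_dump()))


def load_model(pydantic_path: Path) -> BaseModel:
    # Mirrors the new load branch: import the recorded class, then
    # reconstruct the instance with model_validate.
    class_info = json.loads((pydantic_path / "class_info.json").read_text())
    content = json.loads((pydantic_path / "model_dump.json").read_text())
    model_type = getattr(importlib.import_module(class_info["module"]), class_info["name"])
    return model_type.model_validate(content)


model_dir = Path(tempfile.mkdtemp()) / "embedder_config"
dump_model(EmbedderConfig(), model_dir)
assert isinstance(load_model(model_dir), EmbedderConfig)
```

The import-path indirection is what lets `load` handle any `BaseModel` subclass, at the cost of trusting whatever module name was recorded on disk.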

autointent/_pipeline/_pipeline.py

Lines changed: 8 additions & 0 deletions

```diff
@@ -144,6 +144,14 @@ def _fit(self, context: Context, sampler: SamplerType) -> None:
         """
         self.context = context
         self._logger.info("starting pipeline optimization...")
+
+        if not context.logging_config.dump_modules:
+            self._logger.warning(
+                "Memory storage is not compatible with resuming optimization. "
+                "Modules from previous runs won't be available. "
+                "Set dump_modules=True in LoggingConfig to enable proper resuming."
+            )
+
         self.context.callback_handler.start_run(
             run_name=self.context.logging_config.get_run_name(),
             dirpath=self.context.logging_config.dirpath,
```

autointent/context/_context.py

Lines changed: 18 additions & 9 deletions

```diff
@@ -1,6 +1,5 @@
 """Context manager for configuring and managing data handling, vector indexing, and optimization."""
 
-import json
 import logging
 from pathlib import Path
 
@@ -10,7 +9,6 @@
 from autointent._callbacks import CallbackHandler, get_callbacks
 from autointent.configs import CrossEncoderConfig, DataConfig, EmbedderConfig, LoggingConfig
 
-from ._utils import NumpyEncoder
 from .data_handler import DataHandler
 from .optimization_info import OptimizationInfo
 
@@ -77,15 +75,9 @@ def dump(self) -> None:
         Save metrics, hyperparameters, inference, configurations, and datasets to disk.
         """
         self._logger.debug("dumping logs...")
-        optimization_results = self.optimization_info.dump_evaluation_results()
-
         logs_dir = self.logging_config.dirpath
-        logs_dir.mkdir(parents=True, exist_ok=True)
-
-        logs_path = logs_dir / "logs.json"
-        with logs_path.open("w") as file:
-            json.dump(optimization_results, file, indent=4, ensure_ascii=False, cls=NumpyEncoder)
 
+        self.optimization_info.dump(logs_dir)
         self.data_handler.dataset.to_json(logs_dir / "dataset.json")
 
         self._logger.info("logs and other assets are saved to %s", logs_dir)
@@ -95,6 +87,23 @@ def dump(self) -> None:
         with inference_config_path.open("w") as file:
             yaml.dump(inference_config, file)
 
+    def load(self) -> None:
+        """Restore the context state to resume the optimization process.
+
+        Raises:
+            RuntimeError: If the modules artifacts are not found.
+        """
+        self._logger.debug("loading logs...")
+        logs_dir = self.logging_config.dirpath
+        self.optimization_info.load(logs_dir)
+        if not self.optimization_info.artifacts.has_artifacts():
+            msg = (
+                "It is impossible to continue from the previous point, "
+                "start again with dump_modules=True settings if you want to resume the run."
+                "To load optimization info only, use Context.optimization_info.load(logs_dir)."
+            )
+            raise RuntimeError(msg)
+
     def get_dump_dir(self) -> Path | None:
         """Get the directory for saving dumped modules.
```
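
The `load()` contract added above is worth spelling out: it restores `OptimizationInfo` from `logging_config.dirpath`, raises `RuntimeError` when no module artifacts were dumped, and its error text points at `optimization_info.load()` as the metrics-only fallback. A hedged handling sketch (the `Context` setup is assumed; the two `load` calls are from this diff):

```python
from autointent.context import Context

context = Context()  # assumed: already configured to point at an existing run

try:
    context.load()  # full restore; needs artifacts dumped with dump_modules=True
except RuntimeError:
    # Fallback suggested by the error message: restore trials/metrics only.
    context.optimization_info.load(context.logging_config.dirpath)
```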

autointent/context/optimization_info/_data_models.py

Lines changed: 60 additions & 0 deletions

```diff
@@ -52,6 +52,32 @@ class ScorerArtifact(Artifact):
         None, description="Scores for each fold from cross-validation"
     )
 
+    def model_dump(self, **kwargs: Any) -> dict[str, Any]:  # noqa: ANN401
+        """Convert the model to a dictionary, converting numpy arrays to lists."""
+        data = super().model_dump(**kwargs)
+        if data["train_scores"] is not None:
+            data["train_scores"] = data["train_scores"].tolist()
+        if data["validation_scores"] is not None:
+            data["validation_scores"] = data["validation_scores"].tolist()
+        if data["test_scores"] is not None:
+            data["test_scores"] = data["test_scores"].tolist()
+        if data["folded_scores"] is not None:
+            data["folded_scores"] = [arr.tolist() for arr in data["folded_scores"]]
+        return data
+
+    @classmethod
+    def model_validate(cls, obj: dict[str, Any]) -> "ScorerArtifact":
+        """Convert lists back to numpy arrays during validation."""
+        if obj.get("train_scores") is not None:
+            obj["train_scores"] = np.array(obj["train_scores"])
+        if obj.get("validation_scores") is not None:
+            obj["validation_scores"] = np.array(obj["validation_scores"])
+        if obj.get("test_scores") is not None:
+            obj["test_scores"] = np.array(obj["test_scores"])
+        if obj.get("folded_scores") is not None:
+            obj["folded_scores"] = [np.array(arr) for arr in obj["folded_scores"]]
+        return super().model_validate(obj)
+
 
 class DecisionArtifact(Artifact):
     """Artifact containing outputs from the predictor node.
@@ -104,6 +130,31 @@ class Artifacts(BaseModel):
     scoring: list[ScorerArtifact] = []
     decision: list[DecisionArtifact] = []
 
+    def model_dump(self, **kwargs: Any) -> dict[str, Any]:  # noqa: ANN401
+        """Convert the model to a dictionary, ensuring nested artifacts are properly serialized."""
+        data = super().model_dump(**kwargs)
+        for node_type in [NodeType.regex, NodeType.embedding, NodeType.scoring, NodeType.decision]:
+            artifacts = getattr(self, node_type.value)
+            data[node_type.value] = [artifact.model_dump(**kwargs) for artifact in artifacts]
+        return data
+
+    @classmethod
+    def model_validate(cls, obj: dict[str, Any]) -> "Artifacts":
+        """Convert the dictionary back to an Artifacts instance, ensuring nested artifacts are properly deserialized."""
+        # First convert the lists back to numpy arrays in the scoring artifacts
+        if "scoring" in obj:
+            for artifact in obj["scoring"]:
+                if artifact.get("train_scores") is not None:
+                    artifact["train_scores"] = np.array(artifact["train_scores"])
+                if artifact.get("validation_scores") is not None:
+                    artifact["validation_scores"] = np.array(artifact["validation_scores"])
+                if artifact.get("test_scores") is not None:
+                    artifact["test_scores"] = np.array(artifact["test_scores"])
+                if artifact.get("folded_scores") is not None:
+                    artifact["folded_scores"] = [np.array(arr) for arr in artifact["folded_scores"]]
+
+        return super().model_validate(obj)
+
     def add_artifact(self, node_type: str, artifact: Artifact) -> None:
         """Add an artifact to the specified node type.
 
@@ -136,6 +187,15 @@ def get_best_artifact(self, node_type: str, idx: int) -> Artifact:
         """
         return self.get_artifacts(node_type)[idx]
 
+    def has_artifacts(self) -> bool:
+        """Check if any artifacts have been saved in RAM.
+
+        Returns:
+            True if any artifacts exist, False otherwise.
+        """
+        node_types = [NodeType.regex, NodeType.embedding, NodeType.scoring, NodeType.decision]
+        return any(len(self.get_artifacts(nt)) > 0 for nt in node_types)
+
 
 class Trial(BaseModel):
     """Representation of an individual optimization trial.
```

autointent/context/optimization_info/_optimization_info.py

Lines changed: 31 additions & 3 deletions

```diff
@@ -4,13 +4,16 @@
 trials, and modules during the pipeline's execution.
 """
 
+import json
 import logging
 from dataclasses import dataclass, field
+from pathlib import Path
 from typing import TYPE_CHECKING, Any
 
 import numpy as np
 from numpy.typing import NDArray
 
+from autointent._dump_tools import Dumper
 from autointent.configs import EmbedderConfig, InferenceNodeConfig
 from autointent.custom_types import NodeType
 
@@ -20,6 +23,9 @@
     from autointent.modules.base import BaseModule
 
 
+logger = logging.getLogger(__name__)
+
+
 @dataclass
 class ModulesList:
     """Container for managing lists of modules for each node type.
@@ -56,6 +62,19 @@ def add_module(self, node_type: str, module: "BaseModule") -> None:
         """
         self.get(node_type).append(module)
 
+    def model_dump(self) -> dict[str, list["BaseModule"]]:
+        """Dump the modules to a dictionary format.
+
+        Returns:
+            Dictionary representation of the modules.
+        """
+        return {
+            "regex": self.regex,
+            "embedding": self.embedding,
+            "scoring": self.scoring,
+            "decision": self.decision,
+        }
+
 
 class OptimizationInfo:
     """Tracks optimization results, including trials, artifacts, and modules.
@@ -73,8 +92,6 @@ class OptimizationInfo:
 
     def __init__(self) -> None:
        """Initialize optimization info."""
-        self._logger = logging.getLogger(__name__)
-
         self.artifacts = Artifacts()
         self.trials = Trials()
         self._trials_best_ids = TrialsIds()
@@ -115,7 +132,7 @@ def log_module_optimization(
             metrics=metrics,
         )
         self.trials.add_trial(node_type, trial)
-        self._logger.debug("module %s fitted and saved to optimization info", module_name, extra=trial.model_dump())
+        logger.debug("module %s fitted and saved to optimization info %s", module_name, json.dumps(trial.model_dump()))
 
         if module:
             self.modules.add_module(node_type, module)
@@ -225,8 +242,19 @@ def dump_evaluation_results(self) -> dict[str, Any]:
             "pipeline_metrics": self.pipeline_metrics,
             "metrics": node_wise_metrics,
             "configs": self.trials.model_dump(),
+            "artifacts": self.artifacts.model_dump(),
+            "modules": self.modules.model_dump(),
         }
 
+    def dump(self, path: Path) -> None:
+        """Dump the optimization information to a file."""
+        exclude = [ModulesList]
+        Dumper.dump(self, path / "optimization_info", exists_ok=True, exclude=exclude)
+
+    def load(self, path: Path) -> None:
+        """Load the optimization information from a file."""
+        Dumper.load(self, path / "optimization_info")
+
     def get_inference_nodes_config(self, asdict: bool = False) -> list[InferenceNodeConfig]:
         """Generate configuration for inference nodes based on the best trials.
```

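Note how `OptimizationInfo.dump` above uses the new `exclude` parameter to keep the fitted modules (`ModulesList`) out of the serialized state, since modules are persisted through a separate mechanism. A minimal standalone illustration of that filter, with hypothetical stand-in classes:

```python
from typing import Any


class Modules:
    """Hypothetical stand-in for an attribute excluded from dumping, like ModulesList."""


def dump_attrs(obj: Any, exclude: list[type[Any]] | None = None) -> dict[str, Any]:
    dumped = {}
    for key, val in vars(obj).items():
        if exclude and isinstance(val, tuple(exclude)):
            continue  # same isinstance check the new Dumper.dump performs
        dumped[key] = val
    return dumped


class Holder:
    def __init__(self) -> None:
        self.name = "run-1"
        self.modules = Modules()


assert dump_attrs(Holder(), exclude=[Modules]) == {"name": "run-1"}
```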