
Commit d4249aa

Fix/context not dumped error (#197)
* try to fix
* dump context constantly and fix serialization issues
* add exclude option to dumper
* fix codestyle and typing errors
* try to fix file exists error
* fix no fixture found error
1 parent b11f845 commit d4249aa

File tree: 6 files changed, +129 −67 lines

autointent/_dump_tools.py

Lines changed: 44 additions & 45 deletions
@@ -1,9 +1,8 @@
-import inspect
+import importlib
 import json
 import logging
 from pathlib import Path
-from types import UnionType
-from typing import Any, TypeAlias, Union, get_args, get_origin
+from typing import Any, TypeAlias

 import joblib
 import numpy as np
@@ -13,7 +12,6 @@

 from autointent import Embedder, Ranker, VectorIndex
 from autointent.configs import CrossEncoderConfig, EmbedderConfig
-from autointent.context._utils import NumpyEncoder
 from autointent.schemas import TagsList

 ModuleSimpleAttributes = None | str | int | float | bool | list  # type: ignore[type-arg]
@@ -36,11 +34,12 @@ class Dumper:
     pydantic_models: str = "pydantic"

     @staticmethod
-    def make_subdirectories(path: Path) -> None:
+    def make_subdirectories(path: Path, exists_ok: bool = False) -> None:
         """Make subdirectories for dumping.

         Args:
             path: Path to make subdirectories in
+            exists_ok: If True, do not raise an error if the directory already exists
         """
         subdirectories = [
             path / Dumper.tags,
@@ -51,23 +50,27 @@ def make_subdirectories(path: Path) -> None:
             path / Dumper.pydantic_models,
         ]
         for subdir in subdirectories:
-            subdir.mkdir(parents=True, exist_ok=True)
+            subdir.mkdir(parents=True, exist_ok=exists_ok)

     @staticmethod
-    def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901
+    def dump(obj: Any, path: Path, exists_ok: bool = False, exclude: list[type[Any]] | None = None) -> None:  # noqa: ANN401, C901
         """Dump modules attributes to filestystem.

         Args:
             obj: Object to dump
             path: Path to dump to
+            exists_ok: If True, do not raise an error if the directory already exists
+            exclude: List of types to exclude from dumping
         """
         attrs: dict[str, ModuleAttributes] = vars(obj)
         simple_attrs = {}
         arrays: dict[str, npt.NDArray[Any]] = {}

-        Dumper.make_subdirectories(path)
+        Dumper.make_subdirectories(path, exists_ok)

         for key, val in attrs.items():
+            if exclude and isinstance(val, tuple(exclude)):
+                continue
             if isinstance(val, TagsList):
                 val.dump(path / Dumper.tags / key)
             elif isinstance(val, ModuleSimpleAttributes):
@@ -84,9 +87,13 @@ def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901
                 val.save(str(path / Dumper.cross_encoders / key))
             elif isinstance(val, BaseModel):
                 try:
-                    pydantic_path = path / Dumper.pydantic_models / f"{key}.json"
-                    with pydantic_path.open("w", encoding="utf-8") as file:
-                        json.dump(val.model_dump(), file, ensure_ascii=False, indent=4, cls=NumpyEncoder)
+                    class_info = {"name": val.__class__.__name__, "module": val.__class__.__module__}
+                    pydantic_path = path / Dumper.pydantic_models / key
+                    pydantic_path.mkdir(parents=True, exist_ok=exists_ok)
+                    with (pydantic_path / "class_info.json").open("w", encoding="utf-8") as file:
+                        json.dump(class_info, file, ensure_ascii=False, indent=4)
+                    with (pydantic_path / "model_dump.json").open("w", encoding="utf-8") as file:
+                        json.dump(val.model_dump(), file, ensure_ascii=False, indent=4)
                 except Exception as e:
                     msg = f"Error dumping pydantic model {key}: {e}"
                     logging.exception(msg)
@@ -100,7 +107,7 @@ def dump(obj: Any, path: Path) -> None:  # noqa: ANN401, C901
         np.savez(path / Dumper.arrays, allow_pickle=False, **arrays)

     @staticmethod
-    def load(  # noqa: PLR0912, C901, PLR0915
+    def load(  # noqa: C901, PLR0912, PLR0915
         obj: Any,  # noqa: ANN401
         path: Path,
         embedder_config: EmbedderConfig | None = None,
@@ -139,42 +146,34 @@ def load(  # noqa: PLR0912, C901, PLR0915
                     for cross_encoder_dump in child.iterdir()
                 }
             elif child.name == Dumper.pydantic_models:
-                for model_file in child.iterdir():
-                    with model_file.open("r", encoding="utf-8") as file:
-                        content = json.load(file)
-                    variable_name = model_file.stem
-
-                    # First try to get the type annotation from the class annotations.
-                    model_type = obj.__class__.__annotations__.get(variable_name)
-
-                    # Fallback: inspect __init__ signature if not found in class-level annotations.
-                    if model_type is None:
-                        sig = inspect.signature(obj.__init__)
-                        if variable_name in sig.parameters:
-                            model_type = sig.parameters[variable_name].annotation
-
-                    if model_type is None:
-                        msg = f"No type annotation found for {variable_name}"
-                        logger.error(msg)
-                        continue
-
-                    # If the annotation is a Union, extract the pydantic model type.
-                    if get_origin(model_type) in (UnionType, Union):
-                        for arg in get_args(model_type):
-                            if isinstance(arg, type) and issubclass(arg, BaseModel):
-                                model_type = arg
-                                break
-                        else:
-                            msg = f"No pydantic type found in Union for {variable_name}"
-                            logger.error(msg)
+                for model_dir in child.iterdir():
+                    try:
+                        with (model_dir / "model_dump.json").open("r", encoding="utf-8") as file:
+                            content = json.load(file)
+
+                        variable_name = model_dir.name
+
+                        with (model_dir / "class_info.json").open("r", encoding="utf-8") as file:
+                            class_info = json.load(file)
+
+                        try:
+                            model_type = importlib.import_module(class_info["module"])
+                            model_type = getattr(model_type, class_info["name"])
+                        except (ImportError, AttributeError) as e:
+                            msg = f"Failed to import model type for {variable_name}: {e}"
+                            logger.exception(msg)
                             continue

-                    if not (isinstance(model_type, type) and issubclass(model_type, BaseModel)):
-                        msg = f"Type for {variable_name} is not a pydantic model: {model_type}"
-                        logger.error(msg)
+                        try:
+                            pydantic_models[variable_name] = model_type.model_validate(content)
+                        except Exception as e:
+                            msg = f"Failed to reconstruct Pydantic model {variable_name}: {e}"
+                            logger.exception(msg)
+                            continue
+                    except Exception as e:
+                        msg = f"Error loading Pydantic model from {model_dir}: {e}"
+                        logger.exception(msg)
                         continue
-
-                    pydantic_models[variable_name] = model_type(**content)
             else:
                 msg = f"Found unexpected child {child}"
                 logger.error(msg)
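
The change above replaces annotation-based reconstruction with a small per-model directory: class_info.json records which class to import and model_dump.json records its payload. Below is a minimal standalone sketch of that round-trip; the DummyConfig model and the dump_dir path are illustrative assumptions, not part of the library.

# Sketch of the class_info.json / model_dump.json layout (DummyConfig and dump_dir are hypothetical).
import importlib
import json
from pathlib import Path

from pydantic import BaseModel


class DummyConfig(BaseModel):
    """Hypothetical stand-in for a config model such as EmbedderConfig."""

    model_name: str
    batch_size: int = 32


def dump_model(val: BaseModel, path: Path) -> None:
    path.mkdir(parents=True, exist_ok=True)
    class_info = {"name": val.__class__.__name__, "module": val.__class__.__module__}
    (path / "class_info.json").write_text(json.dumps(class_info, indent=4), encoding="utf-8")
    (path / "model_dump.json").write_text(json.dumps(val.model_dump(), indent=4), encoding="utf-8")


def load_model(path: Path) -> BaseModel:
    class_info = json.loads((path / "class_info.json").read_text(encoding="utf-8"))
    content = json.loads((path / "model_dump.json").read_text(encoding="utf-8"))
    model_type = getattr(importlib.import_module(class_info["module"]), class_info["name"])
    return model_type.model_validate(content)


dump_dir = Path("pydantic") / "embedder_config"  # illustrative path
dump_model(DummyConfig(model_name="some-embedder"), dump_dir)
restored = load_model(dump_dir)
assert isinstance(restored, DummyConfig)

Storing the import path alongside the payload removes the need to inspect type annotations or Union members when loading.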

autointent/context/_context.py

Lines changed: 7 additions & 2 deletions
@@ -88,14 +88,19 @@ def dump(self) -> None:
             yaml.dump(inference_config, file)

     def load(self) -> None:
-        """Load all information about optimization process from disk."""
+        """Restore the context state to resume the optimization process.
+
+        Raises:
+            RuntimeError: If the modules artifacts are not found.
+        """
         self._logger.debug("loading logs...")
         logs_dir = self.logging_config.dirpath
         self.optimization_info.load(logs_dir)
         if not self.optimization_info.artifacts.has_artifacts():
             msg = (
                 "It is impossible to continue from the previous point, "
-                "start again with dump_modules=True settings if you want to resume the run"
+                "start again with dump_modules=True settings if you want to resume the run."
+                "To load optimization info only, use Context.optimization_info.load(logs_dir)."
             )
             raise RuntimeError(msg)

autointent/context/optimization_info/_data_models.py

Lines changed: 51 additions & 0 deletions
@@ -52,6 +52,32 @@ class ScorerArtifact(Artifact):
         None, description="Scores for each fold from cross-validation"
     )

+    def model_dump(self, **kwargs: Any) -> dict[str, Any]:  # noqa: ANN401
+        """Convert the model to a dictionary, converting numpy arrays to lists."""
+        data = super().model_dump(**kwargs)
+        if data["train_scores"] is not None:
+            data["train_scores"] = data["train_scores"].tolist()
+        if data["validation_scores"] is not None:
+            data["validation_scores"] = data["validation_scores"].tolist()
+        if data["test_scores"] is not None:
+            data["test_scores"] = data["test_scores"].tolist()
+        if data["folded_scores"] is not None:
+            data["folded_scores"] = [arr.tolist() for arr in data["folded_scores"]]
+        return data
+
+    @classmethod
+    def model_validate(cls, obj: dict[str, Any]) -> "ScorerArtifact":
+        """Convert lists back to numpy arrays during validation."""
+        if obj.get("train_scores") is not None:
+            obj["train_scores"] = np.array(obj["train_scores"])
+        if obj.get("validation_scores") is not None:
+            obj["validation_scores"] = np.array(obj["validation_scores"])
+        if obj.get("test_scores") is not None:
+            obj["test_scores"] = np.array(obj["test_scores"])
+        if obj.get("folded_scores") is not None:
+            obj["folded_scores"] = [np.array(arr) for arr in obj["folded_scores"]]
+        return super().model_validate(obj)
+

 class DecisionArtifact(Artifact):
     """Artifact containing outputs from the predictor node.
@@ -104,6 +130,31 @@ class Artifacts(BaseModel):
     scoring: list[ScorerArtifact] = []
     decision: list[DecisionArtifact] = []

+    def model_dump(self, **kwargs: Any) -> dict[str, Any]:  # noqa: ANN401
+        """Convert the model to a dictionary, ensuring nested artifacts are properly serialized."""
+        data = super().model_dump(**kwargs)
+        for node_type in [NodeType.regex, NodeType.embedding, NodeType.scoring, NodeType.decision]:
+            artifacts = getattr(self, node_type.value)
+            data[node_type.value] = [artifact.model_dump(**kwargs) for artifact in artifacts]
+        return data
+
+    @classmethod
+    def model_validate(cls, obj: dict[str, Any]) -> "Artifacts":
+        """Convert the dictionary back to an Artifacts instance, ensuring nested artifacts are properly deserialized."""
+        # First convert the lists back to numpy arrays in the scoring artifacts
+        if "scoring" in obj:
+            for artifact in obj["scoring"]:
+                if artifact.get("train_scores") is not None:
+                    artifact["train_scores"] = np.array(artifact["train_scores"])
+                if artifact.get("validation_scores") is not None:
+                    artifact["validation_scores"] = np.array(artifact["validation_scores"])
+                if artifact.get("test_scores") is not None:
+                    artifact["test_scores"] = np.array(artifact["test_scores"])
+                if artifact.get("folded_scores") is not None:
+                    artifact["folded_scores"] = [np.array(arr) for arr in artifact["folded_scores"]]
+
+        return super().model_validate(obj)
+
     def add_artifact(self, node_type: str, artifact: Artifact) -> None:
         """Add an artifact to the specified node type.
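
The same serialization idea in isolation: overriding model_dump/model_validate so numpy fields become plain lists in JSON and come back as arrays. This is a minimal sketch with a hypothetical ExampleArtifact, not the library's class.

# Standalone sketch of the numpy <-> list round-trip (ExampleArtifact is hypothetical).
import json
from typing import Any

import numpy as np
from pydantic import BaseModel, ConfigDict


class ExampleArtifact(BaseModel):
    model_config = ConfigDict(arbitrary_types_allowed=True)

    train_scores: np.ndarray | None = None

    def model_dump(self, **kwargs: Any) -> dict[str, Any]:
        data = super().model_dump(**kwargs)
        if data["train_scores"] is not None:
            data["train_scores"] = data["train_scores"].tolist()  # array -> JSON-safe list
        return data

    @classmethod
    def model_validate(cls, obj: dict[str, Any]) -> "ExampleArtifact":
        if obj.get("train_scores") is not None:
            obj["train_scores"] = np.array(obj["train_scores"])  # list -> array
        return super().model_validate(obj)


artifact = ExampleArtifact(train_scores=np.array([0.1, 0.9]))
payload = json.dumps(artifact.model_dump())  # no custom JSON encoder needed
restored = ExampleArtifact.model_validate(json.loads(payload))
assert isinstance(restored.train_scores, np.ndarray)

Unlike a dump-only custom encoder, the paired override also restores arrays on load, which is what the model_validate additions above do.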

autointent/context/optimization_info/_optimization_info.py

Lines changed: 7 additions & 4 deletions
@@ -4,6 +4,7 @@
 trials, and modules during the pipeline's execution.
 """

+import json
 import logging
 from dataclasses import dataclass, field
 from pathlib import Path
@@ -22,6 +23,9 @@
 from autointent.modules.base import BaseModule


+logger = logging.getLogger(__name__)
+
+
 @dataclass
 class ModulesList:
     """Container for managing lists of modules for each node type.
@@ -88,8 +92,6 @@ class OptimizationInfo:

     def __init__(self) -> None:
         """Initialize optimization info."""
-        self._logger = logging.getLogger(__name__)
-
         self.artifacts = Artifacts()
         self.trials = Trials()
         self._trials_best_ids = TrialsIds()
@@ -130,7 +132,7 @@ def log_module_optimization(
             metrics=metrics,
         )
         self.trials.add_trial(node_type, trial)
-        self._logger.debug("module %s fitted and saved to optimization info", module_name, extra=trial.model_dump())
+        logger.debug("module %s fitted and saved to optimization info %s", module_name, json.dumps(trial.model_dump()))

         if module:
             self.modules.add_module(node_type, module)
@@ -246,7 +248,8 @@ def dump_evaluation_results(self) -> dict[str, Any]:

     def dump(self, path: Path) -> None:
         """Dump the optimization information to a file."""
-        Dumper.dump(self, path / "optimization_info")
+        exclude = [ModulesList]
+        Dumper.dump(self, path / "optimization_info", exists_ok=True, exclude=exclude)

     def load(self, path: Path) -> None:
         """Load the optimization information from a file."""

autointent/nodes/_node_optimizer.py

Lines changed: 11 additions & 10 deletions
@@ -2,6 +2,7 @@

 import gc
 import itertools as it
+import json
 import logging
 from abc import ABC, abstractmethod
 from copy import deepcopy
@@ -140,7 +141,7 @@ def fit(self, context: Context, sampler: SamplerType = "brute", n_jobs: int = 1)
         Raises:
             AssertionError: If an invalid sampler type is provided.
         """
-        self._logger.info("Starting %s node optimization...", self.node_info.node_type)
+        self._logger.info("Starting %s node optimization...", self.node_info.node_type.value)
         for search_space in deepcopy(self.modules_search_spaces):
             self._counter: int = 0
             module_name = search_space.pop("module_name")
@@ -163,21 +164,18 @@ def fit(self, context: Context, sampler: SamplerType = "brute", n_jobs: int = 1)

             study, finished_trials, n_trials = load_or_create_study(
                 study_name=f"{self.node_info.node_type}_{module_name}",
-                storage_dir=context.get_dump_dir(),
+                context=context,
                 direction="maximize",
                 sampler=sampler_instance,
                 n_trials=n_trials,
             )
             self._counter = max(self._counter, finished_trials)

-            if n_trials == 0:
-                context.load()
-
             optuna.logging.set_verbosity(optuna.logging.WARNING)
             obj = partial(self.objective, module_name=module_name, search_space=search_space, context=context)

             study.optimize(obj, n_trials=n_trials, n_jobs=n_jobs)
-            context.dump()
+
         self._logger.info("%s node optimization is finished!", self.node_info.node_type)

     def objective(
@@ -200,7 +198,7 @@ def objective(
         """
         config = self.suggest(trial, search_space)

-        self._logger.debug("Initializing %s module...", module_name)
+        self._logger.debug("Initializing %s module with config: %s", module_name, json.dumps(config))
         module = self.node_info.modules_available[module_name].from_context(context, **config)

         embedder_config = module.get_embedder_config()
@@ -235,6 +233,7 @@ def objective(
             module_dump_dir,
             module=module if not context.is_ram_to_clear() else None,
         )
+        context.dump()

         if context.is_ram_to_clear():
             module.clear_cache()
@@ -416,7 +415,7 @@ def get_storage_url(study_name: str, storage_dir: Path | None) -> str | None:

 def load_or_create_study(
     study_name: str,
-    storage_dir: Path | None,
+    context: Context,
     sampler: optuna.samplers.BaseSampler,
     direction: str = "maximize",
     n_trials: int = 10,
@@ -425,7 +424,7 @@

     Args:
         study_name: Name of the study
-        storage_dir: Directory where study databases are stored
+        context: Context object
         direction: Optimization direction (maximize or minimize)
         sampler: Optuna sampler instance
         n_trials: n_trials
@@ -436,7 +435,7 @@
     remaining_trials = n_trials
     finished_trials = 0

-    storage_url = get_storage_url(study_name, storage_dir)
+    storage_url = get_storage_url(study_name, context.get_dump_dir())

     try:
         # will catch exception if study does not exist
@@ -451,6 +450,8 @@
         finished_trials = max(t.number for t in study.trials) + 1
         # Calculate remaining trials if n_trials is specified
         remaining_trials = n_trials if n_trials is None else max(0, n_trials - len(study.trials))
+
+        context.load()
         return study, finished_trials, remaining_trials  # noqa: TRY300
     except Exception:  # noqa: BLE001
         # Create a new study if none exists
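
To make the resume flow concrete, here is a minimal optuna-only sketch of the load-or-create pattern, leaving out the Context wiring; the study name and the SQLite storage URL are illustrative assumptions.

# Minimal load-or-create sketch with plain optuna; names and storage URL are illustrative.
import optuna


def load_or_create(study_name: str, storage_url: str, n_trials: int) -> tuple[optuna.Study, int, int]:
    sampler = optuna.samplers.TPESampler(seed=0)
    try:
        # optuna.load_study raises KeyError when the study is not in the storage yet.
        study = optuna.load_study(study_name=study_name, storage=storage_url, sampler=sampler)
        finished = max((t.number for t in study.trials), default=-1) + 1
        remaining = max(0, n_trials - len(study.trials))
        # The library restores its optimization state here (context.load() in the diff above).
    except KeyError:
        study = optuna.create_study(
            study_name=study_name, storage=storage_url, direction="maximize", sampler=sampler
        )
        finished, remaining = 0, n_trials
    return study, finished, remaining


study, finished, remaining = load_or_create("scoring_linear", "sqlite:///optuna.db", n_trials=10)
study.optimize(lambda trial: trial.suggest_float("x", 0.0, 1.0), n_trials=remaining)

Per the diff, the context is now also dumped after each objective call rather than once per search space, so an interrupted run leaves a usable checkpoint behind.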
