Artifact cache

schustmi · schustmi · commit 87e2a54e0667 · 2025-11-04T15:36:08.000+08:00
diff --git a/src/zenml/artifacts/in_memory_cache.py b/src/zenml/artifacts/in_memory_cache.py
@@ -0,0 +1,54 @@
+#  Copyright (c) ZenML GmbH 2025. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at:
+#
+#       https://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+#  or implied. See the License for the specific language governing
+#  permissions and limitations under the License.
+"""In-memory artifact cache."""
+
+from typing import Any, Dict
+from uuid import UUID
+
+from zenml.utils import context_utils
+
+
+class InMemoryArtifactCache(context_utils.BaseContext):
+    """In-memory cache of loaded artifact data, keyed by artifact ID."""
+
+    __context_var__ = context_utils.ContextVar("in_memory_artifact_cache")
+
+    def __init__(self) -> None:
+        """Initialize an empty artifact cache."""
+        super().__init__()
+        self._cache: Dict[UUID, Any] = {}
+
+    def clear(self) -> None:
+        """Remove all cached artifact data."""
+        self._cache = {}
+
+    def get_artifact_data(self, id_: UUID) -> Any:
+        """Get the cached data of an artifact.
+
+        Args:
+            id_: The ID of the artifact to get the data for.
+
+        Returns:
+            The cached artifact data, or `None` if nothing is cached.
+        """
+        return self._cache.get(id_)
+
+    def set_artifact_data(self, id_: UUID, data: Any) -> None:
+        """Store the data of an artifact, replacing any existing entry.
+
+        Args:
+            id_: The ID of the artifact to set the data for.
+            data: The artifact data to set.
+        """
+        self._cache[id_] = data
diff --git a/src/zenml/execution/pipeline/dynamic/outputs.py b/src/zenml/execution/pipeline/dynamic/outputs.py
@@ -103,13 +103,16 @@ def result(self) -> OutputArtifact:
                 f"{result}."
             )
 
-    def load(self) -> Any:
+    def load(self, disable_cache: bool = False) -> Any:
         """Load the step run output artifact data.
 
+        Args:
+            disable_cache: If True, load the artifact with the cache disabled.
+
         Returns:
             The step run output artifact data.
         """
-        return self.result().load()
+        return self.result().load(disable_cache=disable_cache)
 
 
 class StepRunOutputsFuture(_BaseStepRunFuture):
@@ -157,9 +160,12 @@ def artifacts(self) -> StepRunOutputs:
         """
         return self._wrapped.result()
 
-    def load(self) -> Any:
+    def load(self, disable_cache: bool = False) -> Any:
         """Get the step run output artifact data.
 
+        Args:
+            disable_cache: Whether to disable the artifact cache.
+
         Raises:
             ValueError: If the step run output is invalid.
 
@@ -171,9 +177,11 @@ def load(self) -> Any:
         if result is None:
             return None
         elif isinstance(result, ArtifactVersionResponse):
-            return result.load()
+            return result.load(disable_cache=disable_cache)
         elif isinstance(result, tuple):
-            return tuple(item.load() for item in result)
+            return tuple(
+                item.load(disable_cache=disable_cache) for item in result
+            )
         else:
             raise ValueError(f"Invalid step run output: {result}")
 
diff --git a/src/zenml/execution/pipeline/dynamic/runner.py b/src/zenml/execution/pipeline/dynamic/runner.py
@@ -31,6 +31,7 @@
 from uuid import UUID
 
 from zenml import ExternalArtifact
+from zenml.artifacts.in_memory_cache import InMemoryArtifactCache
 from zenml.client import Client
 from zenml.config.compiler import Compiler
 from zenml.config.step_configurations import Step
@@ -150,44 +151,45 @@ def run_pipeline(self) -> None:
             snapshot=self._snapshot,
             run_id=self._run.id if self._run else None,
         ) as logs_request:
-            run = self._run or create_placeholder_run(
-                snapshot=self._snapshot,
-                orchestrator_run_id=self._orchestrator_run_id,
-                logs=logs_request,
-            )
-
-            assert (
-                self._snapshot.pipeline_spec
-            )  # Always exists for new snapshots
-            pipeline_parameters = self._snapshot.pipeline_spec.parameters
+            with InMemoryArtifactCache():
+                run = self._run or create_placeholder_run(
+                    snapshot=self._snapshot,
+                    orchestrator_run_id=self._orchestrator_run_id,
+                    logs=logs_request,
+                )
 
-            with DynamicPipelineRunContext(
-                pipeline=self.pipeline,
-                run=run,
-                snapshot=self._snapshot,
-                runner=self,
-            ):
-                self._orchestrator.run_init_hook(snapshot=self._snapshot)
-                try:
-                    # TODO: step logging isn't threadsafe
-                    # TODO: what should be allowed as pipeline returns?
-                    #  (artifacts, json serializable, anything?)
-                    #  how do we show it in the UI?
-                    self.pipeline._call_entrypoint(**pipeline_parameters)
-                except:
-                    publish_failed_pipeline_run(run.id)
-                    logger.error(
-                        "Pipeline run failed. All in-progress step runs will "
-                        "still finish executing."
-                    )
-                    raise
-                finally:
-                    self._orchestrator.run_cleanup_hook(
-                        snapshot=self._snapshot
-                    )
-                    self._executor.shutdown(wait=True, cancel_futures=True)
-                # self.await_all_step_run_futures()
-                publish_successful_pipeline_run(run.id)
+                assert (
+                    self._snapshot.pipeline_spec
+                )  # Always exists for new snapshots
+                pipeline_parameters = self._snapshot.pipeline_spec.parameters
+
+                with DynamicPipelineRunContext(
+                    pipeline=self.pipeline,
+                    run=run,
+                    snapshot=self._snapshot,
+                    runner=self,
+                ):
+                    self._orchestrator.run_init_hook(snapshot=self._snapshot)
+                    try:
+                        # TODO: step logging isn't threadsafe
+                        # TODO: what should be allowed as pipeline returns?
+                        #  (artifacts, json serializable, anything?)
+                        #  how do we show it in the UI?
+                        self.pipeline._call_entrypoint(**pipeline_parameters)
+                    except:
+                        publish_failed_pipeline_run(run.id)
+                        logger.error(
+                            "Pipeline run failed. All in-progress step runs "
+                            "will still finish executing."
+                        )
+                        raise
+                    finally:
+                        self._orchestrator.run_cleanup_hook(
+                            snapshot=self._snapshot
+                        )
+                        self._executor.shutdown(wait=True, cancel_futures=True)
+                    # self.await_all_step_run_futures()
+                    publish_successful_pipeline_run(run.id)
 
     @overload
     def launch_step(
diff --git a/src/zenml/models/v2/core/artifact_version.py b/src/zenml/models/v2/core/artifact_version.py
@@ -440,15 +440,28 @@ def run(self) -> "PipelineRunResponse":
 
         return Client().get_pipeline_run(self.step.pipeline_run_id)
 
-    def load(self) -> Any:
+    def load(self, disable_cache: bool = False) -> Any:
         """Materializes (loads) the data stored in this artifact.
 
+        Args:
+            disable_cache: If True, bypass the in-memory artifact cache.
+
         Returns:
             The materialized data.
         """
+        from zenml.artifacts.in_memory_cache import InMemoryArtifactCache
         from zenml.artifacts.utils import load_artifact_from_response
 
-        return load_artifact_from_response(self)
+        # A disabled cache or a missing cache context means: always load.
+        cache = None if disable_cache else InMemoryArtifactCache.get()
+        # `is not None` so falsy or array-like data still counts as a hit.
+        if cache and (data := cache.get_artifact_data(self.id)) is not None:
+            logger.debug("Returning artifact data (%s) from cache", self.id)
+            return data
+        data = load_artifact_from_response(self)
+        if cache:
+            cache.set_artifact_data(self.id, data)
+        return data
 
     def download_files(self, path: str, overwrite: bool = False) -> None:
         """Downloads data for an artifact with no materializing.