Skip to content

Commit 50e652c

Browse files
committed
Fix using multiple data inputs with deferred datasets & Pulsar.
We really have to eliminate that input_datasets dictionary - this is very hacky. Maybe we can take a pass at rewriting Pulsar staging after we have stronger typing in the tool state branch for this stuff. Now we're putting multiple copies of an HDA into the deferred object structure - so I am now caching them during materialization so we don't materialize them more than once. This was probably needed regardless, since an input can be supplied to a tool more than once.
1 parent da575fe commit 50e652c

File tree

2 files changed

+17
-0
lines changed

2 files changed

+17
-0
lines changed

lib/galaxy/model/deferred.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import os
44
import shutil
55
from typing import (
6+
Dict,
67
NamedTuple,
78
Optional,
89
Union,
@@ -88,6 +89,7 @@ def __init__(
8889
self._object_store_populator = object_store_populator
8990
self._file_sources = file_sources
9091
self._sa_session = sa_session
92+
self._previously_materialized: Dict[int, HistoryDatasetAssociation] = {}
9193

9294
def ensure_materialized(
9395
self,
@@ -105,6 +107,12 @@ def ensure_materialized(
105107
if dataset.state != Dataset.states.DEFERRED and isinstance(dataset_instance, HistoryDatasetAssociation):
106108
return dataset_instance
107109

110+
if dataset_instance.id in self._previously_materialized and isinstance(
111+
dataset_instance, HistoryDatasetAssociation
112+
):
113+
# If we have already materialized this dataset, return the previously materialized instance.
114+
return self._previously_materialized[dataset_instance.id]
115+
108116
materialized_dataset_hashes = [h.copy() for h in dataset.hashes]
109117
if in_place:
110118
materialized_dataset = dataset_instance.dataset
@@ -195,6 +203,7 @@ def ensure_materialized(
195203
metadata_tmp_files_dir = None
196204
materialized_dataset_instance.set_meta(metadata_tmp_files_dir=metadata_tmp_files_dir)
197205
materialized_dataset_instance.metadata_deferred = False
206+
self._previously_materialized[dataset_instance.id] = materialized_dataset_instance
198207
return materialized_dataset_instance
199208

200209
def _stream_source(self, target_source: DatasetSource, datatype, dataset: Dataset) -> str:

lib/galaxy/tools/evaluation.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,14 @@ def find_deferred_datasets(input, value, context, prefixed_name=None, **kwargs):
405405
visit_input_values(self.tool.inputs, incoming, find_deferred_datasets)
406406
visit_input_values(self.tool.inputs, incoming, find_deferred_collections)
407407

408+
# now place the inputX datasets hacked in for multiple inputs into the deferred
409+
# object array also. This is so messy. I think in this case we only need these for
410+
# Pulsar stage-up, which uses the hackier input_datasets flat dict.
411+
for key, value in input_datasets.items():
412+
if key not in deferred_objects and value is not None and value.state == model.Dataset.states.DEFERRED:
413+
if self._should_materialize_deferred_input(key, value):
414+
deferred_objects[key] = value
415+
408416
return deferred_objects
409417

410418
def _should_materialize_deferred_input(self, input_name: str, input_value: DeferrableObjectsT) -> bool:

0 commit comments

Comments
 (0)