
Commit 5566adf

Merge pull request #21021 from mvdbeek/fix_implicit_conversion_breaking_job_cache
[25.0] Use job cache also for implicit conversions
2 parents (2fc9903 + b6ff36e), commit 5566adf

7 files changed: +136, -10 lines
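This merge threads a use_cached_job flag from tool and workflow execution down into implicit datatype conversions, so converter jobs (for example fastqsanger.gz to fastqsanger) can be copied from the job cache instead of being rerun. For context, the sketch below shows roughly how a client could request cached-job reuse when invoking a workflow over the Galaxy API. The server URL, API key, and identifiers are placeholders, and the payload shape follows the request built in the new test_workflows.py test rather than a documented contract.

import json
import requests

# Placeholders -- substitute a real Galaxy server, API key, workflow id, and dataset id.
GALAXY_URL = "https://galaxy.example.org"
API_KEY = "YOUR_API_KEY"
WORKFLOW_ID = "WORKFLOW_ID"
DATASET_ID = "DATASET_ID"

# Mirrors the request built in the new test: inputs mapped by step label, plus
# use_cached_job so equivalent jobs (including the implicit conversion) are
# copied from the cache, and new_history_name to send results to a fresh history.
payload = {
    "inputs": json.dumps({"fastq_input": {"src": "hda", "id": DATASET_ID}}),
    "inputs_by": "name",
    "use_cached_job": True,
    "new_history_name": "cached rerun",
}

response = requests.post(
    f"{GALAXY_URL}/api/workflows/{WORKFLOW_ID}/invocations",
    headers={"x-api-key": API_KEY},
    json=payload,
)
response.raise_for_status()
print(response.json())  # invocation details, including the new history_id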

lib/galaxy/datatypes/data.py

Lines changed: 10 additions & 1 deletion
@@ -849,6 +849,7 @@ def convert_dataset(
     deps: Optional[Dict] = None,
     target_context: Optional[Dict] = None,
     history=None,
+    use_cached_job: bool = False,
 ):
     """This function adds a job to the queue to convert a dataset to another type. Returns a message about success/failure."""
     converter = trans.app.datatypes_registry.get_converter_by_target_type(original_dataset.ext, target_type)
@@ -864,8 +865,16 @@ def convert_dataset(
     # Make the target datatype available to the converter
     params["__target_datatype__"] = target_type
     # Run converter, job is dispatched through Queue
+    # Always use cached job if it exists
+    completed_jobs = converter.completed_jobs(trans, all_params=[params], use_cached_job=use_cached_job)
+    completed_job = completed_jobs[0] if completed_jobs else None
     job, converted_datasets, *_ = converter.execute(
-        trans, incoming=params, set_output_hid=visible, history=history, flush_job=False
+        trans,
+        incoming=params,
+        set_output_hid=visible,
+        history=history,
+        flush_job=False,
+        completed_job=completed_job,
     )
     # We should only have a single converted output, but let's be defensive here
     n_converted_datasets = len(converted_datasets)
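The converter is an ordinary Galaxy tool, so the cache lookup reuses the same machinery as regular tool execution: completed_jobs() asks the job cache for an equivalent finished job per parameter set, and passing that job to execute() makes Galaxy copy its outputs instead of scheduling a new converter run (completed_job=None falls back to normal execution). A condensed sketch of the pattern, with run_converter being a hypothetical wrapper rather than Galaxy code:

# Hypothetical wrapper illustrating the cache-then-execute pattern from the diff above.
def run_converter(trans, converter, params, history, use_cached_job=False):
    # Ask the job cache for an already-completed, equivalent converter job.
    completed_jobs = converter.completed_jobs(trans, all_params=[params], use_cached_job=use_cached_job)
    completed_job = completed_jobs[0] if completed_jobs else None

    # With completed_job set, execute() copies the cached outputs; with None it
    # dispatches a new converter job through the queue as before.
    job, converted_datasets, *_ = converter.execute(
        trans,
        incoming=params,
        history=history,
        flush_job=False,
        completed_job=completed_job,
    )
    return job, converted_datasets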

lib/galaxy/managers/jobs.py

Lines changed: 4 additions & 2 deletions
@@ -368,6 +368,8 @@ def stop(self, job, message=None):
 class JobSearch:
     """Search for jobs using tool inputs or other jobs"""
 
+    IGNORED_NON_JOB_PARAMETERS = ("__use_cached_job__", "__workflow_invocation_uuid__", "__when_value__", "__input_ext")
+
     def __init__(
         self,
         sa_session: galaxy_scoped_session,
@@ -559,7 +561,7 @@ def replace_dataset_ids(path, key, value):
                 continue
             elif k == "chromInfo" and "?.len" in v:
                 continue
-            elif k == "__when_value__":
+            elif k in self.IGNORED_NON_JOB_PARAMETERS:
                 continue
             a = aliased(model.JobParameter)
             job_parameter_conditions.append(
@@ -646,7 +648,7 @@ def _filter_jobs(
                 continue
             elif k == "chromInfo" and "?.len" in v:
                 continue
-            elif k == "__when_value__":
+            elif k in self.IGNORED_NON_JOB_PARAMETERS:
                 # TODO: really need to separate this.
                 continue
             value_dump = None if v is None else json.dumps(v, sort_keys=True)
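IGNORED_NON_JOB_PARAMETERS generalizes the previous special case for __when_value__: these keys record how a job was requested (cache preference, workflow invocation UUID, conditional when-value, declared input extension) but do not change its outputs, so two otherwise identical runs should still match in the cache even when they differ there. A small self-contained illustration of that idea (not Galaxy code):

# Illustration only: parameters that steer scheduling are dropped before comparing runs.
IGNORED_NON_JOB_PARAMETERS = ("__use_cached_job__", "__workflow_invocation_uuid__", "__when_value__", "__input_ext")

def comparable_parameters(job_parameters: dict) -> dict:
    """Return only the parameters that actually influence a job's outputs."""
    return {k: v for k, v in job_parameters.items() if k not in IGNORED_NON_JOB_PARAMETERS}

first_run = {"input": 42, "__workflow_invocation_uuid__": "aaa", "__use_cached_job__": False}
second_run = {"input": 42, "__workflow_invocation_uuid__": "bbb", "__use_cached_job__": True}

# Differing control parameters no longer prevent a cache hit.
assert comparable_parameters(first_run) == comparable_parameters(second_run)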

lib/galaxy/model/__init__.py

Lines changed: 7 additions & 4 deletions
@@ -5165,7 +5165,7 @@ def get_converted_files_by_type(self, file_type, include_errored=False):
                 return item
         return None
 
-    def get_converted_dataset_deps(self, trans, target_ext):
+    def get_converted_dataset_deps(self, trans, target_ext, use_cached_job=False):
         """
         Returns dict of { "dependency" => HDA }
         """
@@ -5174,9 +5174,11 @@ def get_converted_dataset_deps(self, trans, target_ext):
             depends_list = trans.app.datatypes_registry.converter_deps[self.extension][target_ext]
         except KeyError:
             depends_list = []
-        return {dep: self.get_converted_dataset(trans, dep) for dep in depends_list}
+        return {dep: self.get_converted_dataset(trans, dep, use_cached_job=use_cached_job) for dep in depends_list}
 
-    def get_converted_dataset(self, trans, target_ext, target_context=None, history=None, include_errored=False):
+    def get_converted_dataset(
+        self, trans, target_ext, target_context=None, history=None, include_errored=False, use_cached_job=False
+    ):
         """
         Return converted dataset(s) if they exist, along with a dict of dependencies.
         If not converted yet, do so and return None (the first time). If unconvertible, raise exception.
@@ -5201,7 +5203,7 @@ def get_converted_dataset(self, trans, target_ext, target_context=None, history=
         # Check if we have dependencies
         try:
             for dependency in depends_list:
-                dep_dataset = self.get_converted_dataset(trans, dependency)
+                dep_dataset = self.get_converted_dataset(trans, dependency, use_cached_job=use_cached_job)
                 if dep_dataset is None:
                     # None means converter is running first time
                     return None
@@ -5226,6 +5228,7 @@ def get_converted_dataset(self, trans, target_ext, target_context=None, history=
                     deps=deps,
                     target_context=target_context,
                     history=history,
+                    use_cached_job=use_cached_job,
                 ).values()
             )
         )

lib/galaxy/tools/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -2402,6 +2402,7 @@ def execute(
         history: Optional[model.History] = None,
         set_output_hid: bool = DEFAULT_SET_OUTPUT_HID,
         flush_job: bool = True,
+        completed_job: Optional[Job] = None,
     ):
         """
         Execute the tool using parameter values in `incoming`. This just
@@ -2419,6 +2420,7 @@ def execute(
             history=history,
             set_output_hid=set_output_hid,
             flush_job=flush_job,
+            completed_job=completed_job,
         )
 
     def _execute(

lib/galaxy/tools/actions/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -173,7 +173,13 @@ def process_dataset(data, formats=None):
             if converted_dataset:
                 data = converted_dataset
             else:
-                data = data.get_converted_dataset(trans, target_ext, target_context=parent, history=history)
+                data = data.get_converted_dataset(
+                    trans,
+                    target_ext,
+                    target_context=parent,
+                    history=history,
+                    use_cached_job=param_values.get("__use_cached_job__", False),
+                )
 
         input_name = prefixed_name
         # Checked security of whole collection all at once if mapping over this input, else

lib/galaxy_test/api/test_tools.py

Lines changed: 13 additions & 2 deletions
@@ -2613,7 +2613,7 @@ def test_multi_param_column_nested_list_fails_on_invalid_column(self):
             exception_raised = e
         assert exception_raised, "Expected invalid column selection to fail job"
 
-    @skip_without_tool("implicit_conversion_format_input")
+    @skip_without_tool("Grep1")
     def test_implicit_conversion_input_dataset_tracking(self):
         with self.dataset_populator.test_history() as history_id:
             compressed_path = self.test_data_resolver.get_filename("1.fastqsanger.gz")
@@ -2622,7 +2622,7 @@ def test_implicit_conversion_input_dataset_tracking(self):
                 history_id, content=fh, file_type="fastqsanger.gz", wait=True
             )
             outputs = self._run(
-                "Grep1", history_id=history_id, inputs={"data": {"src": "hda", "id": dataset["id"]}}, assert_ok=True
+                "Grep1", history_id=history_id, inputs={"input": {"src": "hda", "id": dataset["id"]}}, assert_ok=True
             )
             job_details = self.dataset_populator.get_job_details(outputs["jobs"][0]["id"], full=True).json()
             assert job_details["inputs"]["input"]["id"] != dataset["id"]
@@ -2631,6 +2631,17 @@ def test_implicit_conversion_input_dataset_tracking(self):
             )
             assert converted_input["extension"] == "fastqsanger"
 
+            outputs = self._run(
+                "Grep1",
+                history_id=history_id,
+                inputs={"input": {"src": "hda", "id": dataset["id"]}},
+                use_cached_job=True,
+                wait_for_job=True,
+                assert_ok=True,
+            )
+            job_details = self.dataset_populator.get_job_details(outputs["jobs"][0]["id"], full=True).json()
+            assert job_details["copied_from_job_id"]
+
     @skip_without_tool("column_multi_param")
     def test_implicit_conversion_and_reduce(self):
         with self.dataset_populator.test_history() as history_id:
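The updated API test reruns Grep1 with use_cached_job=True through the populator helper; outside the test framework the same behavior is exposed by the tool execution API. Below is a rough sketch of such a rerun using requests; the payload keys mirror what the populator sends and should be treated as an assumption rather than a documented contract, and all identifiers are placeholders.

import json
import requests

GALAXY_URL = "https://galaxy.example.org"   # placeholder
API_KEY = "YOUR_API_KEY"                    # placeholder

payload = {
    "tool_id": "Grep1",
    "history_id": "HISTORY_ID",             # placeholder
    "inputs": json.dumps({"input": {"src": "hda", "id": "DATASET_ID"}}),
    "use_cached_job": True,
}

response = requests.post(
    f"{GALAXY_URL}/api/tools",
    headers={"x-api-key": API_KEY},
    json=payload,
)
response.raise_for_status()
job_id = response.json()["jobs"][0]["id"]
# Once the job completes, its details should report a copied_from_job_id when
# the cached conversion and tool run were reused.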

lib/galaxy_test/api/test_workflows.py

Lines changed: 93 additions & 0 deletions
@@ -8487,3 +8487,96 @@ def test_run_workflow_use_cached_job_format_source_pick_param(self):
             ).strip()
             == "2"
         )
+
+    def test_run_workflow_use_cached_job_implicit_conversion_send_to_new_history(self):
+        wf = """class: GalaxyWorkflow
+inputs:
+  fastq_input:
+    type: data
+steps:
+  grep:
+    # Grep1 requires fastqsanger, so fastqsanger.gz will be implicitly converted
+    tool_id: Grep1
+    in:
+      input: fastq_input
+"""
+        with self.dataset_populator.test_history() as history_id:
+            # Create a fastqsanger.gz dataset
+            compressed_path = self.test_data_resolver.get_filename("1.fastqsanger.gz")
+            with open(compressed_path, "rb") as fh:
+                dataset = self.dataset_populator.new_dataset(
+                    history_id, content=fh, file_type="fastqsanger.gz", wait=True
+                )
+
+            # Upload workflow
+            workflow_id = self.workflow_populator.upload_yaml_workflow(wf)
+
+            # Run workflow first time
+            workflow_request: Dict[str, Any] = {
+                "inputs": json.dumps({"fastq_input": self._ds_entry(dataset)}),
+                "history": f"hist_id={history_id}",
+                "inputs_by": "name",
+            }
+            first_invocation_summary = self.workflow_populator.invoke_workflow_and_wait(
+                workflow_id, request=workflow_request
+            ).json()
+            self.workflow_populator.wait_for_invocation_and_jobs(
+                history_id=first_invocation_summary["history_id"],
+                workflow_id=workflow_id,
+                invocation_id=first_invocation_summary["id"],
+                assert_ok=True,
+            )
+            first_invocation = self.workflow_populator.get_invocation(first_invocation_summary["id"], step_details=True)
+            first_job_id = first_invocation["steps"][1]["jobs"][0]["id"]
+            first_job_details = self.dataset_populator.get_job_details(first_job_id, full=True).json()
+            assert first_job_details["state"] == "ok"
+            assert not first_job_details["copied_from_job_id"]
+
+            # Verify implicit conversion happened (input to Grep1 should be fastqsanger, not fastqsanger.gz)
+            grep_input_id = first_job_details["inputs"]["input"]["id"]
+            grep_input = self.dataset_populator.get_history_dataset_details(
+                history_id=first_job_details["history_id"], content_id=grep_input_id
+            )
+            assert grep_input["extension"] == "fastqsanger", "Expected implicit conversion to fastqsanger"
+            assert grep_input_id != dataset["id"], "Input should be implicitly converted dataset"
+
+            # Run workflow second time with use_cached_job and new_history_name
+            # Remove history parameter since we're specifying new_history_name
+            workflow_request.pop("history", None)
+            workflow_request["use_cached_job"] = True
+            workflow_request["new_history_name"] = self.dataset_populator.get_random_name()
+            second_invocation_response = self.workflow_populator.invoke_workflow(workflow_id, request=workflow_request)
+            second_invocation_summary = second_invocation_response.json()
+            second_history_id = second_invocation_summary["history_id"]
+            # Wait for the workflow to complete
+            self.workflow_populator.wait_for_invocation_and_jobs(
+                history_id=second_history_id,
+                workflow_id=workflow_id,
+                invocation_id=second_invocation_summary["id"],
+                assert_ok=True,
+            )
+            second_invocation = self.workflow_populator.get_invocation(
+                second_invocation_summary["id"], step_details=True
+            )
+            second_job_id = second_invocation["steps"][1]["jobs"][0]["id"]
+            second_job_details = self.dataset_populator.get_job_details(second_job_id, full=True).json()
+
+            # Verify job was cached
+            assert second_job_details["state"] == "ok"
+            assert second_job_details["copied_from_job_id"] == first_job_id, "Second job should be cached from first"
+
+            # Verify the second invocation is in a different history
+            assert (
+                second_job_details["history_id"] != first_job_details["history_id"]
+            ), "Second invocation should be in a new history"
+
+            # Verify implicit conversion dataset was copied to the new history
+            cached_grep_input_id = second_job_details["inputs"]["input"]["id"]
+            cached_grep_input = self.dataset_populator.get_history_dataset_details(
+                history_id=second_job_details["history_id"], content_id=cached_grep_input_id
+            )
+            assert cached_grep_input["extension"] == "fastqsanger"
+            # The implicitly converted dataset should have a different HDA ID but same underlying dataset
+            assert (
+                cached_grep_input_id != grep_input_id
+            ), "Cached run should have copied the implicitly converted dataset to the new history"