
Commit 67a4689

Enable hash-based job matching for composite datasets
by computing and storing hashes for all of a dataset's files (the primary file and every extra file). ``has_same_hash()`` now matches two datasets only when all of their hashes match, which prevents partial matches on composite datasets.
1 parent 5f2790b
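
In rough terms (a conceptual sketch only, not the actual SQL added to ``has_same_hash()`` below), a dataset's stored hashes can be viewed as a set of (hash_function, extra_files_path, hash_value) records, where an extra_files_path of None marks the primary file; two datasets count as equivalent for job caching only when those sets are identical. The hash values below are placeholders.

# Conceptual illustration only; the real check is the SQL in has_same_hash() below.
def datasets_match(a_hashes, b_hashes):
    # All of A's hashes must be present in B and B must carry no additional hashes,
    # so a partial match on a composite dataset no longer counts.
    return set(a_hashes) == set(b_hashes)


velvet_a = [("SHA-256", None, "aa11"), ("SHA-256", "Roadmaps", "bb22"), ("SHA-256", "Log", "cc33")]
velvet_b = [("SHA-256", None, "aa11"), ("SHA-256", "Roadmaps", "bb22"), ("SHA-256", "Log", "cc33")]
velvet_c = [("SHA-256", None, "aa11")]  # only the primary file hash

assert datasets_match(velvet_a, velvet_b)
assert not datasets_match(velvet_a, velvet_c)  # previously a single matching hash was enough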

File tree (4 files changed, +101 -5 lines):

lib/galaxy/jobs/__init__.py
lib/galaxy/managers/jobs.py
lib/galaxy_test/api/test_datasets.py
lib/galaxy_test/api/test_tool_execute.py

lib/galaxy/jobs/__init__.py

Lines changed: 16 additions & 2 deletions
@@ -2242,14 +2242,28 @@ def fail(message=job.info, exception=None):
            if self.app.config.enable_celery_tasks:
                from galaxy.celery.tasks import compute_dataset_hash

-                extra_files_path = dataset.extra_files_path if dataset.extra_files_path_exists() else None
                request = ComputeDatasetHashTaskRequest(
                    dataset_id=dataset.id,
-                    extra_files_path=extra_files_path,
+                    extra_files_path=None,
                    hash_function=self.app.config.hash_function,
                )
                compute_dataset_hash.delay(request=request)

+                # For composite datasets with extra files, hash each extra file individually
+                if dataset.extra_files_path_exists():
+                    for root, _, files in os.walk(dataset.extra_files_path):
+                        for file in files:
+                            file_path = os.path.join(root, file)
+                            if os.path.exists(file_path):
+                                # Calculate relative path from extra_files_path
+                                relative_path = os.path.relpath(file_path, dataset.extra_files_path)
+                                request = ComputeDatasetHashTaskRequest(
+                                    dataset_id=dataset.id,
+                                    extra_files_path=relative_path,
+                                    hash_function=self.app.config.hash_function,
+                                )
+                                compute_dataset_hash.delay(request=request)
+
        user = job.user
        if user and collected_bytes > 0 and quota_source_info is not None and quota_source_info.use:
            user.adjust_total_disk_usage(collected_bytes, quota_source_info.label)
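
For illustration, here is a hypothetical standalone sketch of what the queued tasks end up computing for a composite dataset: one digest for the primary file (stored with extra_files_path=None) and one digest per extra file, keyed by its path relative to the extra-files directory. The helper name and the direct use of hashlib are assumptions made for the sketch; the actual hashing happens inside Galaxy's compute_dataset_hash Celery task.

import hashlib
import os
from typing import Optional


def hash_composite_dataset(primary_path: str, extra_files_dir: Optional[str], hash_function: str = "sha256"):
    """Hypothetical helper mirroring the hashing scheduled in the diff above."""

    def file_digest(path: str) -> str:
        h = hashlib.new(hash_function)
        with open(path, "rb") as fh:
            for chunk in iter(lambda: fh.read(1024 * 1024), b""):
                h.update(chunk)
        return h.hexdigest()

    # (extra_files_path, hash_value); None marks the primary file, as in the task requests above
    hashes = [(None, file_digest(primary_path))]
    if extra_files_dir and os.path.isdir(extra_files_dir):
        for root, _, files in os.walk(extra_files_dir):
            for name in files:
                file_path = os.path.join(root, name)
                hashes.append((os.path.relpath(file_path, extra_files_dir), file_digest(file_path)))
    return hashes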

lib/galaxy/managers/jobs.py

Lines changed: 33 additions & 1 deletion
@@ -177,14 +177,18 @@ def has_same_hash(
) -> "Select[tuple[int]]":
    a_hash = aliased(model.DatasetHash)
    b_hash = aliased(model.DatasetHash)
+    b_hash_total = aliased(model.DatasetHash)
+
    # Join b directly, checking for either direct dataset match or hash match
    # The hash match uses a correlated subquery to avoid the expensive cartesian product
    stmt = stmt.join(
        b,
        or_(
            # Direct dataset match
            b.dataset_id == a.dataset_id,
-            # Hash match: b's dataset has a hash that matches any of a's hashes
+            # Hash match: b's dataset has hashes that match all of a's hashes
+            # For composite datasets, this means matching the primary file hash AND all extra file hashes
+            # For regular datasets, this means matching the single primary file hash
            b.dataset_id.in_(
                select(b_hash.dataset_id)
                .select_from(a_hash)
@@ -193,9 +197,37 @@ def has_same_hash(
                    and_(
                        a_hash.hash_function == b_hash.hash_function,
                        a_hash.hash_value == b_hash.hash_value,
+                        # Match extra_files_path: both NULL or both the same path
+                        or_(
+                            and_(
+                                a_hash.extra_files_path.is_(None),
+                                b_hash.extra_files_path.is_(None),
+                            ),
+                            a_hash.extra_files_path == b_hash.extra_files_path,
+                        ),
                    ),
                )
                .where(a_hash.dataset_id == a.dataset_id)
+                # Group by b's dataset_id and ensure all of a's hashes are matched
+                .group_by(b_hash.dataset_id)
+                .having(
+                    and_(
+                        # Number of matched hashes equals total hashes in A
+                        func.count(b_hash.id)
+                        == select(func.count(model.DatasetHash.id))
+                        .where(model.DatasetHash.dataset_id == a.dataset_id)
+                        .correlate(a)
+                        .scalar_subquery(),
+                        # Total hashes in B equals total hashes in A (ensures no extra hashes in B)
+                        select(func.count(b_hash_total.id))
+                        .where(b_hash_total.dataset_id == b_hash.dataset_id)
+                        .scalar_subquery()
+                        == select(func.count(model.DatasetHash.id))
+                        .where(model.DatasetHash.dataset_id == a.dataset_id)
+                        .correlate(a)
+                        .scalar_subquery(),
+                    )
+                )
            ),
        ),
    )
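
The group_by/having block above implements a relational-division check: B's hash rows must cover every one of A's hash rows, and B must not carry any rows beyond A's. The following is a self-contained sketch of the same pattern, written against a simplified stand-in table rather than Galaxy's real model, with the NULL handling collapsed into coalesce for brevity; it only builds and prints the statement.

from sqlalchemy import Column, Integer, MetaData, String, Table, and_, func, select

metadata = MetaData()
dataset_hash = Table(
    "dataset_hash",
    metadata,
    Column("id", Integer, primary_key=True),
    Column("dataset_id", Integer),
    Column("hash_function", String),
    Column("hash_value", String),
    Column("extra_files_path", String, nullable=True),
)

a_hash = dataset_hash.alias("a_hash")
b_hash = dataset_hash.alias("b_hash")
b_total = dataset_hash.alias("b_total")

A_DATASET_ID = 1  # the dataset whose hashes must be matched exactly

# Total number of hashes recorded for A (primary file plus any extra files)
a_count = select(func.count(a_hash.c.id)).where(a_hash.c.dataset_id == A_DATASET_ID).scalar_subquery()

matching_dataset_ids = (
    select(b_hash.c.dataset_id)
    .join(
        a_hash,
        and_(
            a_hash.c.dataset_id == A_DATASET_ID,
            a_hash.c.hash_function == b_hash.c.hash_function,
            a_hash.c.hash_value == b_hash.c.hash_value,
            # simplified NULL handling: both NULL or both the same relative path
            func.coalesce(a_hash.c.extra_files_path, "") == func.coalesce(b_hash.c.extra_files_path, ""),
        ),
    )
    .group_by(b_hash.c.dataset_id)
    .having(
        and_(
            # every one of A's hashes was matched ...
            func.count(b_hash.c.id) == a_count,
            # ... and B carries no hashes beyond A's
            select(func.count(b_total.c.id)).where(b_total.c.dataset_id == b_hash.c.dataset_id).scalar_subquery()
            == a_count,
        )
    )
)

print(matching_dataset_ids)  # renders the generated SQL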

lib/galaxy_test/api/test_datasets.py

Lines changed: 4 additions & 2 deletions
@@ -789,10 +789,12 @@ def test_compute_md5_on_primary_dataset(self, history_id):
        hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=hda)
        self.assert_hash_value(hda_details, "940cbe15c94d7e339dc15550f6bdcf4d", "MD5")

-    def test_compute_sha1_on_composite_dataset(self, history_id):
+    def test_compute_sha256_on_composite_dataset_by_default(self, history_id):
        output = self.dataset_populator.fetch_hda(history_id, COMPOSITE_DATA_FETCH_REQUEST_1, wait=True)
-        self.dataset_populator.compute_hash(output["id"], hash_function="SHA-256", extra_files_path="Roadmaps")
        hda_details = self.dataset_populator.get_history_dataset_details(history_id, dataset=output)
+        self.assert_hash_value(
+            hda_details, "94e09ae129f1ec32d1736af833160e8bdaa3a75cef2982712076c7bcd7d155d3", "SHA-256"
+        )
        self.assert_hash_value(
            hda_details,
            "3cbd311889963528954fe03b28b68a09685ea7a75660bd2268d5b44cafbe0d22",

lib/galaxy_test/api/test_tool_execute.py

Lines changed: 48 additions & 0 deletions
@@ -7,7 +7,9 @@
files, etc..).
"""

+import copy
from dataclasses import dataclass
+from typing import Any

import pytest

@@ -610,6 +612,52 @@ def test_job_cache_with_dataset_hash(target_history: TargetHistory, required_too
    assert execution.final_details["copied_from_job_id"]


+@requires_tool_id("gx_data")
+def test_job_cache_with_extra_files(target_history: TargetHistory, required_tool: RequiredTool) -> None:
+    # Upload a composite dataset (velvet) which creates extra files
+    velvet_upload_request: dict[str, Any] = {
+        "src": "composite",
+        "ext": "velvet",
+        "composite": {
+            "items": [
+                {"src": "pasted", "paste_content": "sequences content"},
+                {"src": "pasted", "paste_content": "roadmaps content"},
+                {"src": "pasted", "paste_content": "log content"},
+            ]
+        },
+    }
+
+    # Upload first velvet dataset - access the private _dataset_populator
+    velvet1_hda = target_history._dataset_populator.fetch_hda(target_history.id, velvet_upload_request, wait=True)
+    velvet1 = {"src": "hda", "id": velvet1_hda["id"]}
+
+    # Run gx_data tool on the first velvet dataset
+    _ = required_tool.execute().with_inputs({"parameter": velvet1}).assert_has_single_job
+
+    # Upload the same velvet dataset a second time
+    velvet2_hda = target_history._dataset_populator.fetch_hda(target_history.id, velvet_upload_request, wait=True)
+    velvet2 = {"src": "hda", "id": velvet2_hda["id"]}
+
+    # Run gx_data on the second velvet dataset with job cache enabled
+    job = required_tool.execute(use_cached_job=True).with_inputs({"parameter": velvet2}).assert_has_single_job
+
+    # Job cache should be used when all hashes match
+    assert job.final_details["copied_from_job_id"]
+
+    # Upload a third velvet dataset with modified content in one of the extra files
+    velvet_modified_request = copy.deepcopy(velvet_upload_request)
+    velvet_modified_request["composite"]["items"][1]["paste_content"] = "roadmaps content MODIFIED"
+
+    velvet3_hda = target_history._dataset_populator.fetch_hda(target_history.id, velvet_modified_request, wait=True)
+    velvet3 = {"src": "hda", "id": velvet3_hda["id"]}
+
+    # Run gx_data on the third velvet dataset with job cache enabled
+    job3 = required_tool.execute(use_cached_job=True).with_inputs({"parameter": velvet3}).assert_has_single_job
+
+    # Job cache should NOT be used when hashes don't match completely
+    assert not job3.final_details["copied_from_job_id"]
+
+
@requires_tool_id("gx_repeat_boolean_min")
def test_optional_repeats_with_mins_filled_id(target_history: TargetHistory, required_tool: RequiredTool):
    # we have a tool test for this but I wanted to verify it wasn't just the
