Skip to content

Commit d63fb42

Browse files
committed
test: regression test for CUR hash-join mismatch in query_run_costs
CUR costs.hash = LEFT(full_md5, 8) e.g. 'abcdef12'. Task hash = Nextflow workdir path e.g. 'ab/cdef12'. LEFT(t.hash, 8) = 'ab/cdef1' — never matches because of '/'. Two xfail tests prove the JOIN produces task-level fallback costs instead of CUR costs.
1 parent 130c99e commit d63fb42

File tree

3 files changed

+62
-0
lines changed

3 files changed

+62
-0
lines changed

.beads/interactions.jsonl

Whitespace-only changes.

.beads/issues.jsonl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
{"id":"nf-agg-cpi","title":"Epic: benchmark report production-readiness","status":"open","priority":1,"issue_type":"epic","owner":"edmund.miller@seqera.io","created_at":"2026-03-21T22:14:46Z","created_by":"Edmund Miller","updated_at":"2026-03-21T22:14:46Z"}
2+
{"id":"nf-agg-cpi.1","title":"Fix CUR hash-join mismatch in query_run_costs","description":"query_run_costs JOINs tasks to CUR on LEFT(t.hash, 8) = c.hash, but CUR has full MD5 hashes (e.g. 0d1cd26c8d8619d3b63df19c47b314c1) while tasks have Nextflow workdir paths (e.g. 45/d87388). The JOIN never matches, so reports always fall back to API task-level costs. CUR run-level totals differ significantly (e.g. cpu: $39.94 CUR vs $18.27 task-level). Fixing unlocks used-vs-unused cost charts.","status":"open","priority":1,"issue_type":"bug","owner":"edmund.miller@seqera.io","created_at":"2026-03-21T22:14:55Z","created_by":"Edmund Miller","updated_at":"2026-03-21T22:14:55Z","dependencies":[{"issue_id":"nf-agg-cpi.1","depends_on_id":"nf-agg-cpi","type":"parent-child","created_at":"2026-03-21T17:14:55Z","created_by":"Edmund Miller","metadata":"{}"}]}
3+
{"id":"nf-agg-cpi.2","title":"Squash/rebase remove-legacy-report branch before PR","description":"10 commits on remove-legacy-report could be tidied. Fix/test pairs can be squashed, rename commit can fold into removal. Goal: clean reviewable history.","status":"open","priority":2,"issue_type":"chore","owner":"edmund.miller@seqera.io","created_at":"2026-03-21T22:15:05Z","created_by":"Edmund Miller","updated_at":"2026-03-21T22:15:05Z","dependencies":[{"issue_id":"nf-agg-cpi.2","depends_on_id":"nf-agg-cpi","type":"parent-child","created_at":"2026-03-21T17:15:05Z","created_by":"Edmund Miller","metadata":"{}"}]}
4+
{"id":"nf-agg-cpi.3","title":"Open PR for remove-legacy-report branch","description":"Ship the branch: legacy R/Quarto removal, v2→v1 rename, nested task unwrap fix, CUR format support, regression tests. Target: small-nf base branch.","status":"open","priority":1,"issue_type":"task","owner":"edmund.miller@seqera.io","created_at":"2026-03-21T22:15:06Z","created_by":"Edmund Miller","updated_at":"2026-03-21T22:15:06Z","dependencies":[{"issue_id":"nf-agg-cpi.3","depends_on_id":"nf-agg-cpi","type":"parent-child","created_at":"2026-03-21T17:15:05Z","created_by":"Edmund Miller","metadata":"{}"},{"issue_id":"nf-agg-cpi.3","depends_on_id":"nf-agg-cpi.2","type":"blocks","created_at":"2026-03-21T17:15:11Z","created_by":"Edmund Miller","metadata":"{}"},{"issue_id":"nf-agg-cpi.3","depends_on_id":"nf-agg-cpi.4","type":"blocks","created_at":"2026-03-21T17:15:11Z","created_by":"Edmund Miller","metadata":"{}"}]}
5+
{"id":"nf-agg-cpi.4","title":"Verify nf-test passes with renamed benchmark_report module","description":"modules/local/benchmark_report/tests/main.nf.test exists but hasn't been run after the v2→v1 rename and new fixtures. Need to verify nf-test integration works end-to-end.","status":"open","priority":2,"issue_type":"task","owner":"edmund.miller@seqera.io","created_at":"2026-03-21T22:15:06Z","created_by":"Edmund Miller","updated_at":"2026-03-21T22:15:06Z","dependencies":[{"issue_id":"nf-agg-cpi.4","depends_on_id":"nf-agg-cpi","type":"parent-child","created_at":"2026-03-21T17:15:06Z","created_by":"Edmund Miller","metadata":"{}"}]}

bin/test_benchmark_report.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,63 @@ def test_mixed_nested_and_flat(self):
109109
assert count == 2
110110

111111

112+
# ── Bug: CUR hash-join mismatch in query_run_costs (nf-agg-cpi.1) ───────────
113+
114+
115+
class TestCurHashJoinMismatch:
    """Regression tests for the CUR hash-join mismatch in ``query_run_costs``.

    CUR costs.hash = LEFT(full_md5, 8) e.g. 'abcdef12'.
    Task hash = Nextflow workdir path e.g. 'ab/cdef12'.
    LEFT(t.hash, 8) = 'ab/cdef1' — never matches CUR hash because of '/'.

    Fix: normalize task hash by stripping '/' before comparing.

    Both tests are marked ``xfail(strict=True)`` so that they start failing
    the suite (as XPASS) once the join is fixed, prompting marker removal.
    """

    # Shared by both xfail markers so the two reasons cannot drift apart.
    _XFAIL_REASON = "CUR hash-join mismatch: LEFT(t.hash,8) includes '/' so never matches CUR hash"

    def _write_cur_parquet(self, tmp_path, run_id="run1", task_hash_md5="abcdef1234567890"):
        """Write a one-row CUR parquet file with a known task hash.

        Args:
            tmp_path: Directory in which ``cur.parquet`` is created.
            run_id: Value stored in ``resource_tags_user_unique_run_id``.
            task_hash_md5: Value stored in ``resource_tags_user_task_hash``.

        Returns:
            The path of the written parquet file, as built by ``os.path.join``.
        """
        path = os.path.join(tmp_path, "cur.parquet")
        db = duckdb.connect()
        try:
            db.execute(f"""
                COPY (
                    SELECT
                        '{run_id}' AS resource_tags_user_unique_run_id,
                        'PROC_A' AS resource_tags_user_pipeline_process,
                        '{task_hash_md5}' AS resource_tags_user_task_hash,
                        10.0 AS line_item_unblended_cost,
                        8.0 AS split_line_item_split_cost,
                        2.0 AS split_line_item_unused_cost
                ) TO '{path}' (FORMAT PARQUET)
            """)
        finally:
            # Release the connection even when the COPY fails, so a broken
            # fixture does not leak a DuckDB handle into later tests.
            db.close()
        return path

    def _query_costs(self, tmp_path):
        """Build a one-task run plus matching CUR data and return run costs.

        The task hash 'ab/cdef1234567890' is the workdir-style spelling of
        the CUR hash 'abcdef1234567890' (same characters, plus the '/').
        """
        task = _flat_task(hash_val="ab/cdef1234567890", cost=1.50)
        run = _make_run(run_id="run1", tasks=[task])
        cur = self._write_cur_parquet(tmp_path, run_id="run1", task_hash_md5="abcdef1234567890")

        db = build_database([run], cur)
        return query_run_costs(db)

    @pytest.mark.xfail(strict=True, reason=_XFAIL_REASON)
    def test_cur_costs_override_task_costs(self, tmp_path):
        """When CUR data is present, query_run_costs should use CUR cost (10.0)
        not the task-level cost (1.50)."""
        costs = self._query_costs(tmp_path)
        assert len(costs) == 1
        # CUR total cost = split_cost + unused = 8.0 + 2.0 = 10.0
        assert costs[0]["cost"] == pytest.approx(10.0)

    @pytest.mark.xfail(strict=True, reason=_XFAIL_REASON)
    def test_cur_used_and_unused_cost_split(self, tmp_path):
        """CUR provides used_cost and unused_cost breakdown."""
        costs = self._query_costs(tmp_path)
        # Guard the index below so an empty result reports as a clear
        # assertion failure rather than an IndexError.
        assert len(costs) == 1
        assert costs[0]["used_cost"] == pytest.approx(8.0)
        assert costs[0]["unused_cost"] == pytest.approx(2.0)
167+
168+
112169
# ── Bug: CUR old format missing nf_ column (1c1894d) ────────────────────────
113170

114171

0 commit comments

Comments
 (0)