Skip to content

Commit dba2a6e

Browse files
authored
chore: round earlier in TPC-H q15 to try and reduce non-determinism due to aggregating twice (#1877)
* chore: round earlier in TPC-H q15 to try and reduce non-determinism due to aggregating twice
* remove unnecessary code
1 parent 4185afe commit dba2a6e

File tree

1 file changed

+5
-4
lines changed
  • third_party/bigframes_vendored/tpch/queries

1 file changed

+5
-4
lines changed

third_party/bigframes_vendored/tpch/queries/q15.py

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,11 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
3131
.agg(TOTAL_REVENUE=bpd.NamedAgg(column="REVENUE", aggfunc="sum"))
3232
.rename(columns={"L_SUPPKEY": "SUPPLIER_NO"})
3333
)
34+
# Round earlier to prevent non-determinism in the later join due to
35+
# differences in distributed floating point operation sort order.
36+
grouped_revenue = grouped_revenue.assign(
37+
TOTAL_REVENUE=grouped_revenue["TOTAL_REVENUE"].round(2)
38+
)
3439

3540
joined_data = bpd.merge(
3641
supplier, grouped_revenue, left_on="S_SUPPKEY", right_on="SUPPLIER_NO"
@@ -43,10 +48,6 @@ def q(project_id: str, dataset_id: str, session: bigframes.Session):
4348
max_revenue_suppliers = joined_data[
4449
joined_data["TOTAL_REVENUE"] == joined_data["MAX_REVENUE"]
4550
]
46-
47-
max_revenue_suppliers["TOTAL_REVENUE"] = max_revenue_suppliers[
48-
"TOTAL_REVENUE"
49-
].round(2)
5051
q_final = max_revenue_suppliers[
5152
["S_SUPPKEY", "S_NAME", "S_ADDRESS", "S_PHONE", "TOTAL_REVENUE"]
5253
].sort_values("S_SUPPKEY")

0 commit comments

Comments
 (0)