2121from pyspark .sql import SparkSession
2222import time
2323
def dedup_columns(df):
    """Return *df* with duplicate column names renamed to be unique.

    Duplicate occurrences get a numeric suffix, e.g. ``a, a, b, b`` becomes
    ``a, a_1, b, b_1``. This matters when writing results to formats such as
    Parquet, where column-name uniqueness is required.

    The suffix counter skips any candidate name that already exists in the
    DataFrame (or was already generated), so e.g. ``a, a, a_1`` becomes
    ``a, a_2, a_1`` instead of producing a second ``a_1`` collision.

    :param df: a PySpark DataFrame (anything exposing ``columns`` and ``toDF``)
    :return: a new DataFrame with uniquely-named columns
    """
    counts = {}
    # All names that are no longer available: every original column name,
    # plus each suffixed name we hand out below.
    taken = set(df.columns)
    new_cols = []
    for c in df.columns:
        if c not in counts:
            # First occurrence keeps its original name.
            counts[c] = 0
            new_cols.append(c)
        else:
            # Bump the suffix until the candidate does not collide with an
            # existing column or a previously generated name.
            while True:
                counts[c] += 1
                candidate = f"{c}_{counts[c]}"
                if candidate not in taken:
                    break
            taken.add(candidate)
            new_cols.append(candidate)
    return df.toDF(*new_cols)
39+
2440def main (benchmark : str , data_path : str , query_path : str , iterations : int , output : str , name : str , query_num : int = None , write_path : str = None ):
2541
2642 # Initialize a SparkSession
@@ -91,9 +107,11 @@ def main(benchmark: str, data_path: str, query_path: str, iterations: int, outpu
91107 df .explain ()
92108
93109 if write_path is not None :
110+ # skip results with empty schema
111+ # coming across for running DDL stmt
94112 if len (df .columns ) > 0 :
95113 output_path = f"{ write_path } /q{ query } "
96- df .coalesce (1 ).write .mode ("overwrite" ).parquet (output_path )
114+ dedup_columns ( df ) .coalesce (1 ).write .mode ("overwrite" ).parquet (output_path )
97115 print (f"Query { query } results written to { output_path } " )
98116 else :
99117 print (f"Skipping write: DataFrame has no schema for { output_path } " )
@@ -135,4 +153,5 @@ def main(benchmark: str, data_path: str, query_path: str, iterations: int, outpu
135153 parser .add_argument ("--write" , required = False , help = "Path to save query results to, in Parquet format." )
136154 args = parser .parse_args ()
137155
138- main (args .benchmark , args .data , args .queries , int (args .iterations ), args .output , args .name , args .query , args .write )
156+ main (args .benchmark , args .data , args .queries , int (args .iterations ), args .output , args .name , args .query , args .write )
157+
0 commit comments