Commit 5b941f3

Add support for running benchmarks in both single and partitioned modes & add comments

1 parent f853e46 commit 5b941f3

File tree

3 files changed: +56 -25 lines changed

daft-parquet/benchmark.sh

Lines changed: 16 additions & 2 deletions
@@ -1,5 +1,16 @@
 #!/bin/bash
 
+machine=${1:-"c6a.4xlarge"}
+case "$machine" in
+    "c6a.4xlarge"|"c6a.metal")
+        machine_name="$machine"
+        ;;
+    *)
+        echo "Invalid machine parameter. Allowed: c6a.4xlarge or c6a.metal"
+        exit 1
+        ;;
+esac
+
 # Install
 sudo apt-get update
 sudo apt-get install -y python3-pip
@@ -8,10 +19,13 @@ pip install --break-system-packages packaging
 pip install --break-system-packages daft==0.4.9
 
 # Use for Daft (Parquet, partitioned)
-# seq 0 99 | xargs -P100 -I{} bash -c 'wget --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'
+seq 0 99 | xargs -P100 -I{} bash -c 'wget --continue https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_{}.parquet'
 
 # Use for Daft (Parquet, single)
 wget --continue https://datasets.clickhouse.com/hits_compatible/athena/hits.parquet
 
 # Run the queries
-./run.sh 2>&1 | tee daft_log.txt
+for mode in partitioned single; do
+    echo "Running $mode mode..."
+    ./run.sh $machine_name $mode 2>&1 | tee "daft_log_${mode}.txt"
+done
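
Note: the restored download line fans out 100 parallel wget processes via xargs -P100. A rough Python equivalent of that fan-out, purely for illustration (thread pool in place of processes):

    from concurrent.futures import ThreadPoolExecutor
    import subprocess

    BASE = "https://datasets.clickhouse.com/hits_compatible/athena_partitioned"

    def fetch(i: int) -> None:
        # --continue resumes partial downloads on re-runs, as in the script
        subprocess.run(["wget", "--continue", f"{BASE}/hits_{i}.parquet"], check=True)

    # 100 workers mirror xargs -P100; each worker grabs one partition file
    with ThreadPoolExecutor(max_workers=100) as pool:
        list(pool.map(fetch, range(100)))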

daft-parquet/query.py

Lines changed: 33 additions & 20 deletions
@@ -1,13 +1,21 @@
 #!/usr/bin/env python3
 
 import daft
+import os
+import sys
 import timeit
 import traceback
 import pandas as pd
 from daft import col, DataType, TimeUnit
-from pathlib import Path
 
 hits = None
+current_dir = os.path.dirname(os.path.abspath(__file__))
+query_idx = int(sys.argv[1]) - 1
+is_single_mode = len(sys.argv) > 2 and sys.argv[2] == "single"
+parquet_path = os.path.join(
+    current_dir,
+    "hits.parquet" if is_single_mode else "hits_*.parquet"
+)
 
 with open("queries.sql") as f:
     sql_list = [q.strip() for q in f.read().split(';') if q.strip()]
@@ -21,8 +29,17 @@ def daft_offset(df, start ,end):
 for idx, sql in enumerate(sql_list):
     query_entry = {"sql": sql}
 
-    if idx+1 in [19, 36, 43]:
-        if idx+1 == 19:
+    # Current limitations and workarounds for Daft execution:
+
+    # 1. Queries q18, q35, q42 require manual API workarounds:
+    #    - q18: The function `extract(minute FROM EventTime)` causes an error:
+    #      `expected input to minute to be temporal, got UInt32`.
+    #    - q35: The error is `duplicate field name ClientIP in the schema`;
+    #      aliasing the column in SQL was attempted but still failed.
+    #    - q42: The function `DATE_TRUNC('minute', EventTime)` causes an error:
+    #      `Unsupported SQL: Function date_trunc not found`.
+    if idx in [18, 35, 42]:
+        if idx == 18:
             query_entry["lambda"] = lambda: (
                 hits.with_column("m", col("EventTime").dt.minute())
                 .groupby("UserID", "m", "SearchPhrase")
@@ -31,7 +48,7 @@ def daft_offset(df, start ,end):
                 .limit(10)
                 .select("UserID", "m", "SearchPhrase", "COUNT(*)")
             )
-        elif idx+1 == 36:
+        elif idx == 35:
             query_entry["lambda"] = lambda: (
                 hits.groupby(
                     "ClientIP",
@@ -43,7 +60,7 @@ def daft_offset(df, start ,end):
                 .limit(10)
                 .select("ClientIP", "ClientIP - 1", "ClientIP - 2", "ClientIP - 3", "c")
             )
-        elif idx+1 == 43:
+        elif idx == 42:
             query_entry["lambda"] = lambda: (
                 hits.with_column("M", col("EventTime").dt.truncate("1 minute"))
                 .where("CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0")
@@ -54,16 +71,19 @@ def daft_offset(df, start ,end):
                 .select("M", "PageViews")
             )
 
-    if 39 <= idx+1 <= 43:
-        if idx+1 == 39:
+    # 2. OFFSET operator not supported in Daft:
+    #    For queries q38, q39, q40, q41, q42, after executing the query,
+    #    manually implement the `OFFSET` truncation logic via the API.
+    if 38 <= idx <= 42:
+        if idx == 38:
             query_entry["extra_api"] = lambda df: daft_offset(df, 1000, 1010)
-        elif idx+1 == 40:
+        elif idx == 39:
             query_entry["extra_api"] = lambda df: daft_offset(df, 1000, 1010)
-        elif idx+1 == 41:
+        elif idx == 40:
             query_entry["extra_api"] = lambda df: daft_offset(df, 100, 110)
-        elif idx+1 == 42:
+        elif idx == 41:
             query_entry["extra_api"] = lambda df: daft_offset(df, 10000, 10010)
-        elif idx+1 == 43:
+        elif idx == 42:
             query_entry["extra_api"] = lambda df: daft_offset(df, 1000, 1010)
 
     queries.append(query_entry)
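
The daft_offset helper named in the hunk headers is defined outside this diff; only its signature is visible. A minimal sketch of what such a helper could look like, assuming the (already LIMITed) result is small enough to collect — the body below is hypothetical:

    import daft

    # Hypothetical body: Daft has no OFFSET, so collect the result and
    # slice rows [start, end) positionally with pandas
    def daft_offset(df: daft.DataFrame, start: int, end: int):
        return df.to_pandas().iloc[start:end]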
@@ -74,12 +94,7 @@ def run_single_query(query, i):
 
     global hits
     if hits is None:
-        # Use for Daft (Parquet, partitioned)
-        # Use absolute path when using wildcards
-        # hits = daft.read_parquet("/path/to/hits_*.parquet")
-
-        # Use for Daft (Parquet, single)
-        hits = daft.read_parquet("hits.parquet")
+        hits = daft.read_parquet(parquet_path)
         hits = hits.with_column("EventTime", col("EventTime").cast(daft.DataType.timestamp("s")))
         hits = hits.with_column("EventDate", col("EventDate").cast(daft.DataType.date()))
         hits = hits.with_column("URL", col("URL").decode("utf-8"))
@@ -104,13 +119,11 @@ def run_single_query(query, i):
 
         return run_time
     except Exception as e:
-        print(f"Error executing query {query_idx+1}: {str(e)[:100]}", file=sys.stderr)
+        print(f"Error executing query {query_idx}: {str(e)[:100]}", file=sys.stderr)
         traceback.print_exc()
         return None
 
 if __name__ == "__main__":
-    import sys
-    query_idx = int(sys.argv[1]) - 1
     query = queries[query_idx]
 
     times = []
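
For context, daft.read_parquet accepts glob patterns, which is why switching modes above only changes the file name joined into parquet_path. A standalone sketch of the two read paths (paths are illustrative):

    import daft

    # Partitioned mode: the glob matches hits_0.parquet .. hits_99.parquet,
    # scanned together as one table
    hits = daft.read_parquet("/abs/path/hits_*.parquet")

    # Single mode reads one monolithic file instead:
    # hits = daft.read_parquet("/abs/path/hits.parquet")

    print(hits.count_rows())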

daft-parquet/run.sh

Lines changed: 7 additions & 3 deletions
@@ -1,8 +1,12 @@
 #!/bin/bash
 
+machine_name=${1}
+mode=${2}
+full_machine="${1}, 500gb gp2"
+
 TRIES=3
 QUERY_COUNT=43
-RESULT_FILE="results/c6a.metal.json"
+RESULT_FILE="results/${machine_name}.${mode}.json"
 FILE_SIZE=$(wc -c < hits.parquet | awk '{print $1}')
 
 declare -a results=()
@@ -19,7 +23,7 @@ for ((q=1; q<=QUERY_COUNT; q++)); do
     sync
     echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null
 
-    output=$(python3 query.py $q 2>&1)
+    output=$(python3 query.py $q $mode 2>&1)
     IFS=',' read -r t1 t2 t3 <<< "$(echo "$output" | tail -1)"
 
     results[$((q-1))]="[${t1:-null},${t2:-null},${t3:-null}]"
@@ -28,7 +32,7 @@ done
 echo '{
     "system": "Daft",
     "date": "'$(date +%Y-%m-%d)'",
-    "machine": "c6a.4xlarge, 500gb gp2",
+    "machine": "'$full_machine'",
    "cluster_size": 1,
    "comment": "",
    "tags": [
