Fix: CI SQL Remote (#5836)

connortsui20 · web-flow · commit a4368bf67c08 · 2025-12-29T11:50:38.000Z
Removes lance from the SQL remote benchmarks on S3 and cleans up some
commands.

Signed-off-by: Connor Tsui <connor.tsui20@gmail.com>
diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml
@@ -124,9 +124,8 @@ jobs:
             "name": "TPC-H SF=1 on S3",
             "local_dir": "vortex-bench/data/tpch/1.0",
             "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/1.0/",
-            "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
-            "scale_factor": "1.0",
-            "build_lance": true
+            "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
+            "scale_factor": "1.0"
           },
           {
             "id": "tpch-nvme-10",
@@ -142,9 +141,8 @@ jobs:
             "name": "TPC-H SF=10 on S3",
             "local_dir": "vortex-bench/data/tpch/10.0",
             "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
-            "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
-            "scale_factor": "10.0",
-            "build_lance": true
+            "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact",
+            "scale_factor": "10.0"
           },
           {
             "id": "tpcds-nvme",
diff --git a/.github/workflows/nightly-bench.yml b/.github/workflows/nightly-bench.yml
@@ -31,25 +31,24 @@ jobs:
             "subcommand": "clickbench",
             "name": "Clickbench on NVME",
             "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
-            "build_args": "--features lance"
+            "build_lance": true
           },
           {
             "id": "tpch-nvme",
             "subcommand": "tpch",
             "name": "TPC-H on NVME",
             "targets": "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:duckdb",
             "scale_factor": "10.0",
-            "build_args": "--features lance"
+            "build_lance": true
           },
           {
             "id": "tpch-s3",
             "subcommand": "tpch",
             "name": "TPC-H on S3",
             "local_dir": "vortex-bench/data/tpch/10.0",
             "remote_storage": "s3://vortex-bench-dev-eu/${{github.ref_name}}/${{github.run_id}}/tpch/10.0/",
-            "targets": "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet,duckdb:vortex",
-            "scale_factor": "10.0",
-            "build_args": "--features lance"
+            "targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex",
+            "scale_factor": "10.0"
           },
           {
             "id": "tpch-nvme",
diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml
@@ -190,9 +190,14 @@ jobs:
           OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
           OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}"
         run: |
-          # Extract formats for each engine (filter out lance from df_formats)
-          df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^datafusion:' | grep -v ':lance$' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
-          ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
+          # Extract formats for each engine from the targets string.
+          # Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet"
+          #
+          # Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
+
+          # Lance is filtered out of df_formats because it uses a separate binary (lance-bench).
+          df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | grep -v ':lance$' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
+          ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
           has_lance=$(echo "${{ matrix.targets }}" | grep -q 'datafusion:lance' && echo "true" || echo "false")
 
           # Build options string if scale_factor is set
@@ -247,9 +252,22 @@ jobs:
           OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
           OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}"
         run: |
-          # Extract formats for each engine
-          ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
+          # Lance on remote storage is not supported. The infrastructure to generate and upload
+          # lance files to S3 does not exist. If you need lance on S3, you must first implement:
+          # 1. Lance data generation in data-gen (or a separate step)
+          # 2. Lance data upload to S3 before this step runs
+          if echo "${{ matrix.targets }}" | grep -q 'lance'; then
+            echo "ERROR: Lance format is not supported for remote storage benchmarks."
+            echo "Remove 'datafusion:lance' from targets for benchmark '${{ matrix.id }}'."
+            exit 1
+          fi
+
+          # Extract formats for each engine from the targets string.
+          # Example input: "datafusion:parquet,datafusion:vortex,duckdb:parquet"
+          #
+          # Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
           df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
+          ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
 
           # Build options string if scale_factor is set
           opts="--opt remote-data-dir=${{ matrix.remote_storage }}"
diff --git a/benchmarks/datafusion-bench/src/lib.rs b/benchmarks/datafusion-bench/src/lib.rs
@@ -107,6 +107,8 @@ pub fn format_to_df_format(format: Format) -> Arc<dyn FileFormat> {
         Format::OnDiskVortex | Format::VortexCompact => {
             Arc::new(VortexFormat::new(SESSION.clone()))
         }
-        _ => unimplemented!(),
+        Format::OnDiskDuckDB | Format::Lance => {
+            unimplemented!("Format {format} cannot be turned into a DataFusion `FileFormat`")
+        }
     }
 }

Original file line number	Diff line number	Diff line change
`@@ -107,6 +107,8 @@ pub fn format_to_df_format(format: Format) -> Arc<dyn FileFormat> {`
`107`	`107`	`Format::OnDiskVortex \| Format::VortexCompact => {`
`108`	`108`	`Arc::new(VortexFormat::new(SESSION.clone()))`
`109`	`109`	`}`
`110`		`- _ => unimplemented!(),`
	`110`	`+ Format::OnDiskDuckDB \| Format::Lance => {`
	`111`	`` + unimplemented!("Format {format} cannot be turned into a DataFusion `FileFormat`") ``
	`112`	`+ }`
`111`	`113`	`}`
`112`	`114`	`}`