@@ -190,9 +190,14 @@ jobs:
190190 OTEL_EXPORTER_OTLP_HEADERS : " ${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
191191 OTEL_RESOURCE_ATTRIBUTES : " bench-name=${{ matrix.id }}"
192192 run : |
193- # Extract formats for each engine (filter out lance from df_formats)
194- df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^datafusion:' | grep -v ':lance$' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
195- ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
193+ # Extract formats for each engine from the targets string.
194+ # Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet"
195+ #
196+ # Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
197+
198+ # Lance is filtered out of df_formats because it uses a separate binary (lance-bench). grep exits non-zero when nothing matches, so each grep stage is wrapped in "( ... || true)": an engine with no targets then yields an empty string instead of aborting the step when the shell runs with errexit + pipefail.
199+ df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^datafusion:' | grep -v ':lance$' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
200+ ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
196201 has_lance=$(echo "${{ matrix.targets }}" | grep -q 'datafusion:lance' && echo "true" || echo "false")
197202
198203 # Build options string if scale_factor is set
@@ -247,9 +252,22 @@ jobs:
247252 OTEL_EXPORTER_OTLP_HEADERS : " ${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
248253 OTEL_RESOURCE_ATTRIBUTES : " bench-name=${{ matrix.id }}"
249254 run : |
250- # Extract formats for each engine
251- ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
255+ # Lance on remote storage is not supported. The infrastructure to generate and upload
256+ # lance files to S3 does not exist. If you need lance on S3, you must first implement:
257+ # 1. Lance data generation in data-gen (or a separate step)
258+ # 2. Lance data upload to S3 before this step runs
259+ if echo "${{ matrix.targets }}" | grep -q 'lance'; then
260+ echo "::error::Lance format is not supported for remote storage benchmarks." # "::error::" is a GitHub Actions workflow command: the message is shown as an error annotation on the run, not only in the step log
261+ echo "Remove 'datafusion:lance' from targets for benchmark '${{ matrix.id }}'."
262+ exit 1
263+ fi
264+
265+ # Extract formats for each engine from the targets string.
266+ # Example input: "datafusion:parquet,datafusion:vortex,duckdb:parquet"
267+ #
268+ # Pipeline: split by comma -> filter by engine prefix -> remove prefix -> rejoin with commas
252269 df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
270+ ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
253271
254272 # Build options string if scale_factor is set
255273 opts="--opt remote-data-dir=${{ matrix.remote_storage }}"
0 commit comments