Skip to content

Commit 76335c9

Browse files
committed
Fixing some benchmarks
Signed-off-by: Adam Gutglick <[email protected]>
1 parent c207af0 commit 76335c9

File tree

10 files changed

+194
-40
lines changed

10 files changed

+194
-40
lines changed

.github/workflows/sql-benchmarks.yml

Lines changed: 6 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -169,7 +169,7 @@ jobs:
169169
target/release_debug/df-bench ${{ matrix.subcommand }} \
170170
--formats "$df_formats" \
171171
-i1 \
172-
-d gh-json \
172+
-d table \
173173
$opts
174174
fi
175175
@@ -178,15 +178,15 @@ jobs:
178178
target/release_debug/ddb-bench ${{ matrix.subcommand }} \
179179
--formats "$ddb_formats" \
180180
-i1 \
181-
-d gh-json \
181+
-d table \
182182
$opts
183183
fi
184184
185185
# Generate data with lance-bench (runs each query once)
186186
if [ "$has_lance" = "true" ] && [ -f "target/release_debug/lance-bench" ]; then
187187
target/release_debug/lance-bench ${{ matrix.subcommand }} \
188188
-i1 \
189-
-d gh-json \
189+
-d table \
190190
$opts
191191
fi
192192
@@ -278,10 +278,9 @@ jobs:
278278
OTEL_EXPORTER_OTLP_HEADERS: "${{ (inputs.mode != 'pr' || github.event.pull_request.head.repo.fork == false) && secrets.OTEL_EXPORTER_OTLP_HEADERS || '' }}"
279279
OTEL_RESOURCE_ATTRIBUTES: "bench-name=${{ matrix.id }}"
280280
run: |
281-
# Extract formats for each engine (filter out lance from df_formats)
282-
df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | grep -v ':lance$' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
281+
# Extract formats for each engine
282+
df_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^datafusion:' | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
283283
ddb_formats=$(echo "${{ matrix.targets }}" | tr ',' '\n' | grep '^duckdb:' | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
284-
has_lance=$(echo "${{ matrix.targets }}" | grep -q 'datafusion:lance' && echo "true" || echo "false")
285284
286285
# Build options string if scale_factor is set
287286
opts=""
@@ -310,17 +309,8 @@ jobs:
310309
-o ddb-results.json
311310
fi
312311
313-
# Run lance-bench with remote storage
314-
if [ "$has_lance" = "true" ] && [ -f "target/release_debug/lance-bench" ]; then
315-
target/release_debug/lance-bench ${{ matrix.subcommand }} \
316-
--use-remote-data-dir ${{ matrix.remote_storage }} \
317-
-d gh-json \
318-
$opts \
319-
-o lance-results.json
320-
fi
321-
322312
# Merge results
323-
cat df-results.json ddb-results.json lance-results.json 2>/dev/null > results.json || true
313+
cat df-results.json ddb-results.json 2>/dev/null > results.json || true
324314
325315
- name: Install uv
326316
if: inputs.mode == 'pr'

Cargo.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

benchmarks/ddb-bench/src/main.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@ use clap::Parser;
99
use clap::value_parser;
1010
use ddb_bench::DuckDBCtx;
1111
use vortex_bench::BenchmarkArg;
12-
use vortex_bench::BenchmarkOutput;
1312
use vortex_bench::CompactionStrategy;
1413
use vortex_bench::Engine;
1514
use vortex_bench::Format;
1615
use vortex_bench::Opt;
1716
use vortex_bench::Opts;
1817
use vortex_bench::conversions::convert_parquet_to_vortex;
1918
use vortex_bench::create_benchmark;
19+
use vortex_bench::create_output_writer;
2020
use vortex_bench::display::DisplayFormat;
2121
use vortex_bench::runner::SqlBenchmarkRunner;
2222
use vortex_bench::runner::filter_queries;
@@ -132,8 +132,9 @@ async fn main() -> anyhow::Result<()> {
132132
|ctx, _query_idx, query| ctx.execute_query(query),
133133
)?;
134134

135-
let output = BenchmarkOutput::with_path(benchmark.dataset_name(), args.output_path);
136-
runner.export_to(&args.display_format, output.create_writer()?)?;
135+
let benchmark_id = format!("duckdb-{}", benchmark.dataset_name());
136+
let writer = create_output_writer(&args.display_format, args.output_path, &benchmark_id)?;
137+
runner.export_to(&args.display_format, writer)?;
137138

138139
Ok(())
139140
}

benchmarks/df-bench/Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ version = { workspace = true }
1515

1616
[dependencies]
1717
anyhow = { workspace = true }
18+
arrow-ipc.workspace = true
1819
clap = { workspace = true, features = ["derive"] }
1920
datafusion = { workspace = true, features = [
2021
"parquet",
@@ -24,6 +25,7 @@ datafusion = { workspace = true, features = [
2425
] }
2526
datafusion-common = { workspace = true }
2627
datafusion-physical-plan = { workspace = true }
28+
futures.workspace = true
2729
itertools.workspace = true
2830
object_store = { workspace = true, features = ["aws", "gcp"] }
2931
opentelemetry.workspace = true

benchmarks/df-bench/src/main.rs

Lines changed: 84 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,22 @@ use datafusion::datasource::listing::ListingOptions;
1111
use datafusion::datasource::listing::ListingTable;
1212
use datafusion::datasource::listing::ListingTableConfig;
1313
use datafusion::datasource::listing::ListingTableUrl;
14+
use datafusion::parquet::arrow::ParquetRecordBatchStreamBuilder;
1415
use datafusion::prelude::SessionContext;
1516
use datafusion_physical_plan::ExecutionPlan;
1617
use df_bench::format_to_df_format;
18+
use futures::StreamExt;
19+
use tokio::fs::File;
1720
use vortex_bench::Benchmark;
1821
use vortex_bench::BenchmarkArg;
19-
use vortex_bench::BenchmarkOutput;
2022
use vortex_bench::CompactionStrategy;
2123
use vortex_bench::Engine;
2224
use vortex_bench::Format;
2325
use vortex_bench::Opt;
2426
use vortex_bench::Opts;
2527
use vortex_bench::conversions::convert_parquet_to_vortex;
2628
use vortex_bench::create_benchmark;
29+
use vortex_bench::create_output_writer;
2730
use vortex_bench::display::DisplayFormat;
2831
use vortex_bench::runner::SqlBenchmarkRunner;
2932
use vortex_bench::runner::filter_queries;
@@ -160,8 +163,9 @@ async fn main() -> anyhow::Result<()> {
160163
)
161164
.await?;
162165

163-
let output = BenchmarkOutput::with_path(benchmark.dataset_name(), args.output_path);
164-
runner.export_to(&args.display_format, output.create_writer()?)?;
166+
let benchmark_id = format!("datafusion-{}", benchmark.dataset_name());
167+
let writer = create_output_writer(&args.display_format, args.output_path, &benchmark_id)?;
168+
runner.export_to(&args.display_format, writer)?;
165169

166170
Ok(())
167171
}
@@ -171,26 +175,88 @@ async fn register_benchmark_tables<B: Benchmark + ?Sized>(
171175
benchmark: &B,
172176
format: Format,
173177
) -> anyhow::Result<()> {
174-
let benchmark_base = benchmark.data_url().join(&format!("{}/", format.name()))?;
175-
let file_format = format_to_df_format(format);
178+
if matches!(format, Format::Arrow) {
179+
// For Arrow format, decode the benchmark's Parquet files into in-memory Arrow tables
180+
register_arrow_tables(session, benchmark).await
181+
} else {
182+
let benchmark_base = benchmark.data_url().join(&format!("{}/", format.name()))?;
183+
let file_format = format_to_df_format(format);
184+
185+
for table in benchmark.table_specs().iter() {
186+
let pattern = benchmark.pattern(table.name, format);
187+
let table_url = ListingTableUrl::try_new(benchmark_base.clone(), pattern)?;
188+
189+
let mut config = ListingTableConfig::new(table_url).with_listing_options(
190+
ListingOptions::new(file_format.clone())
191+
.with_session_config_options(session.state().config()),
192+
);
193+
194+
config = match table.schema.as_ref() {
195+
Some(schema) => config.with_schema(Arc::new(schema.clone())),
196+
None => config.infer_schema(&session.state()).await?,
197+
};
198+
199+
let listing_table = Arc::new(ListingTable::try_new(config)?);
200+
201+
session.register_table(table.name, listing_table)?;
202+
}
176203

177-
for table in benchmark.table_specs().iter() {
178-
let pattern = benchmark.pattern(table.name, format);
179-
let table_url = ListingTableUrl::try_new(benchmark_base.clone(), pattern)?;
204+
Ok(())
205+
}
206+
}
180207

181-
let mut config = ListingTableConfig::new(table_url).with_listing_options(
182-
ListingOptions::new(file_format.clone())
183-
.with_session_config_options(session.state().config()),
184-
);
208+
/// Load the benchmark's Parquet files into in-memory DataFusion tables (Arrow record batches).
209+
async fn register_arrow_tables<B: Benchmark + ?Sized>(
210+
session: &SessionContext,
211+
benchmark: &B,
212+
) -> anyhow::Result<()> {
213+
use datafusion::datasource::MemTable;
185214

186-
config = match table.schema.as_ref() {
187-
Some(schema) => config.with_schema(Arc::new(schema.clone())),
188-
None => config.infer_schema(&session.state()).await?,
189-
};
215+
let parquet_dir = benchmark
216+
.data_url()
217+
.to_file_path()
218+
.map_err(|_| anyhow::anyhow!("Arrow format requires local file path"))?
219+
.join(Format::Parquet.name());
190220

191-
let listing_table = Arc::new(ListingTable::try_new(config)?);
221+
// List all entries in the Parquet data directory
222+
let data_files = std::fs::read_dir(&parquet_dir)?.collect::<Result<Vec<_>, _>>()?;
192223

193-
session.register_table(table.name, listing_table)?;
224+
for table in benchmark.table_specs().iter() {
225+
let pattern = benchmark.pattern(table.name, Format::Parquet);
226+
227+
// Find files matching this table's pattern
228+
let matching_files: Vec<_> = data_files
229+
.iter()
230+
.filter(|entry| {
231+
let filename = entry.file_name();
232+
let filename_str = filename.to_str().unwrap_or("");
233+
match &pattern {
234+
Some(p) => p.matches(filename_str),
235+
None => filename_str == format!("{}.{}", table.name, Format::Parquet.ext()),
236+
}
237+
})
238+
.collect();
239+
240+
// Load all matching files into memory
241+
let mut all_batches = Vec::new();
242+
let mut schema = None;
243+
244+
for dir_entry in matching_files {
245+
let file = File::open(dir_entry.path()).await?;
246+
let mut reader = ParquetRecordBatchStreamBuilder::new(file).await?.build()?;
247+
if schema.is_none() {
248+
schema = Some(reader.schema()).cloned();
249+
}
250+
251+
while let Some(batch) = reader.next().await {
252+
all_batches.push(batch?);
253+
}
254+
}
255+
256+
if let Some(schema) = schema {
257+
let mem_table = MemTable::try_new(schema, vec![all_batches])?;
258+
session.register_table(table.name, Arc::new(mem_table))?;
259+
}
194260
}
195261

196262
Ok(())

benchmarks/lance-bench/src/main.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,12 @@ use lance_bench::convert::convert_parquet_to_lance;
1515
use tracing::info;
1616
use vortex_bench::Benchmark;
1717
use vortex_bench::BenchmarkArg;
18-
use vortex_bench::BenchmarkOutput;
1918
use vortex_bench::Engine;
2019
use vortex_bench::Format;
2120
use vortex_bench::Opt;
2221
use vortex_bench::Opts;
2322
use vortex_bench::create_benchmark;
23+
use vortex_bench::create_output_writer;
2424
use vortex_bench::display::DisplayFormat;
2525
use vortex_bench::runner::SqlBenchmarkRunner;
2626
use vortex_bench::runner::filter_queries;
@@ -117,8 +117,9 @@ async fn main() -> anyhow::Result<()> {
117117
)
118118
.await?;
119119

120-
let output = BenchmarkOutput::with_path(benchmark.dataset_name(), args.output_path);
121-
runner.export_to(&args.display_format, output.create_writer()?)?;
120+
let benchmark_id = format!("lance-{}", benchmark.dataset_name());
121+
let writer = create_output_writer(&args.display_format, args.output_path, &benchmark_id)?;
122+
runner.export_to(&args.display_format, writer)?;
122123

123124
Ok(())
124125
}

vortex-bench/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ workspace = true
1919
[dependencies]
2020
anyhow = { workspace = true }
2121
arrow-array = { workspace = true }
22+
arrow-ipc = { workspace = true }
2223
arrow-schema = { workspace = true }
2324
arrow-select = { workspace = true }
2425
async-trait = { workspace = true }

vortex-bench/src/conversions.rs

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ use std::path::Path;
77
use std::path::PathBuf;
88

99
use arrow_array::RecordBatchReader;
10+
use arrow_ipc::writer::FileWriter;
1011
use futures::StreamExt;
1112
use futures::TryStreamExt;
1213
use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
@@ -119,3 +120,73 @@ pub async fn convert_parquet_to_vortex(
119120
.await?;
120121
Ok(())
121122
}
123+
124+
/// Convert all Parquet files in a directory to Arrow IPC format.
125+
///
126+
/// This function reads Parquet files from `{input_path}/parquet/` and writes
127+
/// Arrow IPC files to `{input_path}/arrow/`.
128+
///
129+
/// The conversion is idempotent - existing Arrow files will not be regenerated.
130+
pub async fn convert_parquet_to_arrow(input_path: &Path) -> anyhow::Result<()> {
131+
let arrow_dir = input_path.join(Format::Arrow.name());
132+
let parquet_path = input_path.join(Format::Parquet.name());
133+
create_dir_all(&arrow_dir).await?;
134+
135+
let parquet_inputs = fs::read_dir(&parquet_path)?.collect::<std::io::Result<Vec<_>>>()?;
136+
137+
trace!(
138+
"Found {} parquet files in {}",
139+
parquet_inputs.len(),
140+
parquet_path.to_str().unwrap()
141+
);
142+
143+
let iter = parquet_inputs
144+
.iter()
145+
.filter(|entry| entry.path().extension().is_some_and(|e| e == "parquet"));
146+
147+
futures::stream::iter(iter)
148+
.map(|dir_entry| {
149+
let filename = {
150+
let mut temp = dir_entry.path();
151+
temp.set_extension("");
152+
temp.file_name().unwrap().to_str().unwrap().to_string()
153+
};
154+
let parquet_file_path = parquet_path.join(format!("{filename}.parquet"));
155+
let output_path = arrow_dir.join(format!("{filename}.{}", Format::Arrow.ext()));
156+
157+
tokio::spawn(
158+
async move {
159+
idempotent_async(output_path.as_path(), move |arrow_file| async move {
160+
info!("Converting '{filename}' from Parquet to Arrow IPC");
161+
162+
// Read Parquet file
163+
let parquet_file = File::open(&parquet_file_path)?;
164+
let reader =
165+
ParquetRecordBatchReaderBuilder::try_new(parquet_file)?.build()?;
166+
let schema = reader.schema();
167+
168+
// Write Arrow IPC file
169+
let arrow_output = File::create(&arrow_file)?;
170+
let mut writer = FileWriter::try_new(arrow_output, &schema)?;
171+
172+
for batch in reader {
173+
let batch = batch?;
174+
writer.write(&batch)?;
175+
}
176+
177+
writer.finish()?;
178+
179+
anyhow::Ok(())
180+
})
181+
.await
182+
.expect("Failed to write Arrow file")
183+
}
184+
.in_current_span(),
185+
)
186+
})
187+
.buffer_unordered(16)
188+
.try_collect::<Vec<_>>()
189+
.await?;
190+
191+
Ok(())
192+
}

vortex-bench/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ pub use benchmark::TableSpec;
4949
pub use datasets::BenchmarkDataset;
5050
pub use engines::df;
5151
pub use output::BenchmarkOutput;
52+
pub use output::create_output_writer;
5253
use vortex::VortexSessionDefault;
5354
pub use vortex::error::vortex_panic;
5455
use vortex::io::session::RuntimeSessionExt;

0 commit comments

Comments
 (0)