
Commit a20ecc9: Duckdb run over public_bi (#3143)
Parent: 8b8f0f3

4 files changed: +54 -27 lines
Lines changed: 17 additions & 0 deletions

@@ -0,0 +1,17 @@
+#!/bin/bash
+
+set -Eeu -o pipefail -x
+
+# List the datasets in the public_bi benchmark directory
+readonly dir=$(dirname ${BASH_SOURCE[0]})
+readonly files=$(ls $dir/../public_bi/benchmark)
+
+for file in $files; do
+  echo "Running public BI: $file"
+
+  file_lowercase=$(echo "$file" | tr '[:upper:]' '[:lower:]')
+
+  cargo run --profile bench --bin public_bi -- --targets=datafusion:vortex,duckdb:vortex -d $file_lowercase -i1
+
+  echo ""
+done

bench-vortex/src/bin/public_bi.rs

Lines changed: 30 additions & 24 deletions
@@ -6,8 +6,8 @@ use bench_vortex::metrics::MetricsSetExt;
 use bench_vortex::public_bi::{FileType, PBI_DATASETS, PBIDataset};
 use bench_vortex::utils::constants::STORAGE_NVME;
 use bench_vortex::utils::new_tokio_runtime;
-use bench_vortex::{Engine, Format, Target, default_env_filter, df};
-use clap::Parser;
+use bench_vortex::{Format, Target, default_env_filter, df};
+use clap::{Parser, value_parser};
 use indicatif::ProgressBar;
 use itertools::Itertools;
 use tracing::info_span;
@@ -22,10 +22,20 @@ struct Args {
     iterations: usize,
     #[arg(short, long)]
     threads: Option<usize>,
-    #[arg(long, value_delimiter = ',', value_enum, default_values_t = vec![Format::Parquet, Format::OnDiskVortex])]
-    formats: Vec<Format>,
+    #[arg(long, value_delimiter = ',', value_parser = value_parser!(Target),
+        default_values = vec![
+            "datafusion:parquet",
+            "datafusion:vortex",
+            "duckdb:parquet",
+            "duckdb:vortex",
+            "duckdb:duckdb"
+        ]
+    )]
+    targets: Vec<Target>,
     #[arg(short, long)]
     verbose: bool,
+    #[arg(long)]
+    display_metrics: bool,
     #[arg(long, default_value_t, value_enum)]
     display_format: DisplayFormat,
     #[arg(long, default_value_t = false)]
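
A note on how the new --targets values are parsed: value_parser!(Target) typically resolves to the type's FromStr impl (unless Target supplies its own ValueParserFactory), so each comma-separated engine:format pair becomes one Target. A minimal sketch of such a parser, using a hypothetical simplified Target with string fields; the real type in bench_vortex wraps Engine and Format enums:

use std::str::FromStr;

// Hypothetical stand-in for bench_vortex's Target; the real type holds
// Engine and Format enums rather than plain strings.
#[derive(Clone, Debug, PartialEq)]
struct Target {
    engine: String,
    format: String,
}

impl FromStr for Target {
    type Err = String;

    // Parse an "engine:format" pair such as "duckdb:vortex".
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let (engine, format) = s
            .split_once(':')
            .ok_or_else(|| format!("expected <engine>:<format>, got {s:?}"))?;
        Ok(Target {
            engine: engine.to_string(),
            format: format.to_string(),
        })
    }
}

fn main() {
    let target: Target = "duckdb:vortex".parse().unwrap();
    assert_eq!(target.engine, "duckdb");
    assert_eq!(target.format, "vortex");
}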
@@ -76,12 +86,6 @@ fn main() -> anyhow::Result<()> {
 
     let runtime = new_tokio_runtime(args.threads);
 
-    let targets = args
-        .formats
-        .iter()
-        .map(|f| Target::new(Engine::DataFusion, *f))
-        .collect_vec();
-
     let pbi_dataset = PBI_DATASETS.get(args.dataset);
     let queries = match args.queries.clone() {
         None => pbi_dataset.queries()?,
@@ -92,7 +96,7 @@ fn main() -> anyhow::Result<()> {
             .collect(),
     };
 
-    let progress_bar = ProgressBar::new((queries.len() * args.formats.len()) as u64);
+    let progress_bar = ProgressBar::new((queries.len() * args.targets.len()) as u64);
     let mut all_measurements = Vec::default();
     let mut metrics = Vec::new();
 
@@ -101,7 +105,7 @@ fn main() -> anyhow::Result<()> {
     // download csvs, unzip, convert to parquet, and convert that to vortex
     runtime.block_on(dataset.write_as_vortex());
 
-    for target in &targets {
+    for target in &args.targets {
         let format = target.format();
         let session =
             df::get_session_context(args.emulate_object_store, args.disable_datafusion_cache);
@@ -164,21 +168,23 @@ fn main() -> anyhow::Result<()> {
 
     match args.display_format {
         DisplayFormat::Table => {
-            for (query, format, metric_sets) in metrics {
-                println!("\nmetrics for query={query}, {format}:");
-                for (idx, metric_set) in metric_sets.into_iter().enumerate() {
-                    println!("scan[{idx}]:");
-                    for m in metric_set
-                        .timestamps_removed()
-                        .aggregate()
-                        .sorted_for_display()
-                        .iter()
-                    {
-                        println!("{}", m);
+            if args.display_metrics {
+                for (query, format, metric_sets) in metrics {
+                    println!("\nmetrics for query={query}, {format}:");
+                    for (idx, metric_set) in metric_sets.into_iter().enumerate() {
+                        println!("scan[{idx}]:");
+                        for m in metric_set
+                            .timestamps_removed()
+                            .aggregate()
+                            .sorted_for_display()
+                            .iter()
+                        {
+                            println!("{}", m);
+                        }
                     }
                 }
             }
-            render_table(all_measurements, &targets)
+            render_table(all_measurements, &args.targets)
         }
         DisplayFormat::GhJson => print_measurements_json(all_measurements),
     }

bench-vortex/src/public_bi.rs

Lines changed: 4 additions & 2 deletions
@@ -42,7 +42,7 @@ pub static PBI_DATASETS: LazyLock<PBIDatasets> = LazyLock::new(|| {
 });
 
 #[derive(Copy, Clone, Debug, Eq, PartialEq, Hash, ValueEnum)]
-#[clap(rename_all = "PascalCase")]
+#[clap(rename_all = "LowerCase")]
 pub enum PBIDataset {
     Arade,
     Bimbo,
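
For illustration, the practical effect of the rename_all switch: clap now matches the lowercase dataset names the driver script produces with tr '[:upper:]' '[:lower:]', so "arade" is accepted where "Arade" was before. A self-contained sketch with a hypothetical two-variant enum standing in for the full PBIDataset list:

use clap::{Parser, ValueEnum};

// Hypothetical two-variant enum; the real PBIDataset has many more entries.
#[derive(Copy, Clone, Debug, ValueEnum)]
#[clap(rename_all = "LowerCase")]
enum Dataset {
    Arade,
    Bimbo,
}

#[derive(Parser)]
struct Cli {
    #[arg(short, long, value_enum)]
    dataset: Dataset,
}

fn main() {
    // "arade", not "Arade", is now the accepted spelling on the CLI.
    let cli = Cli::parse_from(["demo", "--dataset", "arade"]);
    assert!(matches!(cli.dataset, Dataset::Arade));
}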
@@ -312,7 +312,9 @@ impl PBIData {
                 public_bi_csv_to_parquet_file(table, csv, &output_path).await
             })
             .await
-            .vortex_expect("failed to create parquet file");
+            .vortex_expect(
+                "failed to create parquet file, either the file or duckdb is missing",
+            );
         let pq_size = parquet_file.metadata().unwrap().size();
         info!(
             "Parquet size: {}, {}B",

vortex-array/src/arrays/extension/compute/to_arrow.rs

Lines changed: 3 additions & 1 deletion
@@ -24,7 +24,9 @@ impl ToArrowFn<&ExtensionArray> for ExtensionEncoding {
         // NOTE(ngates): this is really gross... but I guess it's ok given how tightly integrated
         // we are with Arrow.
         if is_temporal_ext_type(array.id()) {
-            temporal_to_arrow(TemporalArray::try_from(array.to_array())?).map(Some)
+            // TODO(joe): push this cast into `temporal_to_arrow`
+            let arrow = temporal_to_arrow(TemporalArray::try_from(array.to_array())?)?;
+            Ok(arrow_cast::cast(&arrow, data_type).map(Some)?)
         } else {
             // Convert storage array directly into arrow, losing type information
             // that will let us round-trip.
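
For context: temporal_to_arrow can return an Arrow array whose type differs from the data_type the caller asked for, so the new code casts the result into shape with arrow_cast::cast. A standalone sketch of that kind of adjustment, using a made-up millisecond-to-microsecond timestamp cast in place of the real conversion:

use std::sync::Arc;

use arrow_array::{ArrayRef, TimestampMillisecondArray};
use arrow_schema::{ArrowError, DataType, TimeUnit};

fn main() -> Result<(), ArrowError> {
    // Stand-in for temporal_to_arrow's output: millisecond timestamps.
    let arrow: ArrayRef = Arc::new(TimestampMillisecondArray::from(vec![1_700_000_000_000_i64]));

    // The caller requested microsecond precision; cast to match.
    let requested = DataType::Timestamp(TimeUnit::Microsecond, None);
    let cast = arrow_cast::cast(&arrow, &requested)?;
    assert_eq!(cast.data_type(), &requested);
    Ok(())
}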
