Skip to content

Commit c3ce79b

Browse files
dankinga10y
andauthored
feat: allow benchmarking against remote files (#2297)
Also with configurable: tracing & scale-factor. When I tried to use DataFusion to write directly into S3 objects, it returned successfully but no objects were created. I think my NamedLocks is a bit janky, but we don't atomic renames in the cloud. --------- Co-authored-by: Andrew Duffy <[email protected]>
1 parent 4d97e22 commit c3ce79b

File tree

5 files changed

+282
-92
lines changed

5 files changed

+282
-92
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

bench-vortex/Cargo.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ indicatif = { workspace = true }
4646
itertools = { workspace = true }
4747
log = { workspace = true, features = ["max_level_debug"] }
4848
mimalloc = { workspace = true }
49-
object_store = { workspace = true, features = ["aws"] }
49+
object_store = { workspace = true, features = ["aws", "gcp"] }
5050
parquet = { workspace = true, features = ["async"] }
5151
rand = { workspace = true }
5252
rand_distr = { workspace = true }
@@ -66,6 +66,7 @@ tracing-subscriber = { workspace = true, features = [
6666
"env-filter",
6767
"tracing-log",
6868
] }
69+
url = { workspace = true }
6970
uuid = { workspace = true, features = ["v4"] }
7071
vortex = { workspace = true, features = ["object_store", "parquet"] }
7172
vortex-datafusion = { workspace = true }

bench-vortex/src/bin/tpch_benchmark.rs

Lines changed: 70 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,9 @@ use futures::future::try_join_all;
1313
use indicatif::ProgressBar;
1414
use itertools::Itertools;
1515
use tokio::runtime::Builder;
16+
use url::Url;
1617
use vortex::aliases::hash_map::HashMap;
18+
use vortex::error::VortexExpect as _;
1719

1820
feature_flagged_allocator!();
1921

@@ -26,13 +28,19 @@ struct Args {
2628
exclude_queries: Option<Vec<usize>>,
2729
#[arg(short, long)]
2830
threads: Option<usize>,
31+
#[arg(long)]
32+
use_remote_data_dir: Option<String>,
2933
#[arg(short, long, default_value_t = true, default_missing_value = "true", action = ArgAction::Set)]
3034
warmup: bool,
3135
#[arg(short, long, default_value = "5")]
3236
iterations: usize,
37+
#[arg(long, value_delimiter = ',')]
38+
formats: Option<Vec<String>>,
39+
#[arg(long, default_value_t = 1)]
40+
scale_factor: u8,
3341
#[arg(long)]
3442
only_vortex: bool,
35-
#[arg(short, long)]
43+
#[arg(short)]
3644
verbose: bool,
3745
#[arg(short, long, default_value_t, value_enum)]
3846
display_format: DisplayFormat,
@@ -57,45 +65,98 @@ fn main() -> ExitCode {
5765
}
5866
.expect("Failed building the Runtime");
5967

68+
let url = match args.use_remote_data_dir {
69+
None => {
70+
let db_gen_options = DBGenOptions::default().with_scale_factor(args.scale_factor);
71+
let data_dir = DBGen::new(db_gen_options).generate().unwrap();
72+
eprintln!(
73+
"Using existing or generating new files located at {}.",
74+
data_dir.display()
75+
);
76+
Url::parse(
77+
("file:".to_owned() + data_dir.to_str().vortex_expect("path should be utf8") + "/")
78+
.as_ref(),
79+
)
80+
.unwrap()
81+
}
82+
Some(tpch_benchmark_remote_data_dir) => {
83+
// e.g. "s3://vortex-bench-dev/parquet/"
84+
//
85+
// The trailing slash is significant!
86+
//
87+
// The folder must already be populated with data!
88+
if !tpch_benchmark_remote_data_dir.ends_with("/") {
89+
eprintln!("Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev/parquet/");
90+
}
91+
eprintln!(
92+
concat!(
93+
"Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\n",
94+
"If it does not, you should kill this command, locally generate the files (by running without\n",
95+
"--use-remote-data-dir) and upload data/tpch/1/ to some remote location.",
96+
),
97+
tpch_benchmark_remote_data_dir,
98+
);
99+
Url::parse(&tpch_benchmark_remote_data_dir).unwrap()
100+
}
101+
};
102+
103+
if args.only_vortex {
104+
panic!("use `--formats vortex,arrow` instead of `--only-vortex`");
105+
}
106+
60107
runtime.block_on(bench_main(
61108
args.queries,
62109
args.exclude_queries,
63110
args.iterations,
64111
args.warmup,
65-
args.only_vortex,
112+
args.formats,
66113
args.display_format,
67114
args.emulate_object_store,
115+
url,
68116
))
69117
}
70118

119+
#[allow(clippy::too_many_arguments)]
71120
async fn bench_main(
72121
queries: Option<Vec<usize>>,
73122
exclude_queries: Option<Vec<usize>>,
74123
iterations: usize,
75124
warmup: bool,
76-
only_vortex: bool,
125+
formats: Option<Vec<String>>,
77126
display_format: DisplayFormat,
78127
emulate_object_store: bool,
128+
url: Url,
79129
) -> ExitCode {
80130
// uncomment the below to enable trace logging of datafusion execution
81131
// let filter = default_env_filter(true);
82132
// setup_logger(filter);
83133

84134
// Run TPC-H data gen.
85-
let data_dir = DBGen::new(DBGenOptions::default()).generate().unwrap();
86135

87136
// The formats to run against (vs the baseline)
88-
let formats = if only_vortex {
89-
vec![Format::Arrow, Format::OnDiskVortex]
90-
} else {
91-
vec![Format::Arrow, Format::Parquet, Format::OnDiskVortex]
137+
let formats = match formats {
138+
None => vec![Format::Arrow, Format::Parquet, Format::OnDiskVortex],
139+
Some(formats) => formats
140+
.into_iter()
141+
.map(|format| match format.as_ref() {
142+
"arrow" => Format::Arrow,
143+
"parquet" => Format::Parquet,
144+
"vortex" => Format::OnDiskVortex,
145+
_ => panic!("unrecognized format: {}", format),
146+
})
147+
.collect::<Vec<_>>(),
92148
};
93149

150+
eprintln!(
151+
"Benchmarking against these formats: {}.",
152+
formats.iter().join(", ")
153+
);
154+
94155
// Load datasets
95156
let ctxs = try_join_all(
96157
formats
97158
.iter()
98-
.map(|format| load_datasets(&data_dir, *format, emulate_object_store)),
159+
.map(|format| load_datasets(&url, *format, emulate_object_store)),
99160
)
100161
.await
101162
.unwrap();

bench-vortex/src/tpch/dbgen.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,14 @@ impl DBGenOptions {
5858
cache_dir: self.cache_dir,
5959
}
6060
}
61+
62+
pub fn with_scale_factor(self, scale_factor: u8) -> Self {
63+
Self {
64+
base_dir: self.base_dir,
65+
scale_factor,
66+
cache_dir: self.cache_dir,
67+
}
68+
}
6169
}
6270

6371
impl DBGen {

0 commit comments

Comments
 (0)