@@ -13,7 +13,9 @@ use futures::future::try_join_all;
1313use indicatif:: ProgressBar ;
1414use itertools:: Itertools ;
1515use tokio:: runtime:: Builder ;
16+ use url:: Url ;
1617use vortex:: aliases:: hash_map:: HashMap ;
18+ use vortex:: error:: VortexExpect as _;
1719
1820feature_flagged_allocator ! ( ) ;
1921
@@ -26,13 +28,19 @@ struct Args {
2628 exclude_queries : Option < Vec < usize > > ,
2729 #[ arg( short, long) ]
2830 threads : Option < usize > ,
31+ #[ arg( long) ]
32+ use_remote_data_dir : Option < String > ,
2933 #[ arg( short, long, default_value_t = true , default_missing_value = "true" , action = ArgAction :: Set ) ]
3034 warmup : bool ,
3135 #[ arg( short, long, default_value = "5" ) ]
3236 iterations : usize ,
37+ #[ arg( long, value_delimiter = ',' ) ]
38+ formats : Option < Vec < String > > ,
39+ #[ arg( long, default_value_t = 1 ) ]
40+ scale_factor : u8 ,
3341 #[ arg( long) ]
3442 only_vortex : bool ,
35- #[ arg( short, long ) ]
43+ #[ arg( short) ]
3644 verbose : bool ,
3745 #[ arg( short, long, default_value_t, value_enum) ]
3846 display_format : DisplayFormat ,
@@ -57,45 +65,98 @@ fn main() -> ExitCode {
5765 }
5866 . expect ( "Failed building the Runtime" ) ;
5967
68+ let url = match args. use_remote_data_dir {
69+ None => {
70+ let db_gen_options = DBGenOptions :: default ( ) . with_scale_factor ( args. scale_factor ) ;
71+ let data_dir = DBGen :: new ( db_gen_options) . generate ( ) . unwrap ( ) ;
72+ eprintln ! (
73+ "Using existing or generating new files located at {}." ,
74+ data_dir. display( )
75+ ) ;
76+ Url :: parse (
77+ ( "file:" . to_owned ( ) + data_dir. to_str ( ) . vortex_expect ( "path should be utf8" ) + "/" )
78+ . as_ref ( ) ,
79+ )
80+ . unwrap ( )
81+ }
82+ Some ( tpch_benchmark_remote_data_dir) => {
83+ // e.g. "s3://vortex-bench-dev/parquet/"
84+ //
85+ // The trailing slash is significant!
86+ //
87+ // The folder must already be populated with data!
88+ if !tpch_benchmark_remote_data_dir. ends_with ( "/" ) {
89+ eprintln ! ( "Supply a --use-remote-data-dir argument which ends in a slash e.g. s3://vortex-bench-dev/parquet/" ) ;
90+ }
91+ eprintln ! (
92+ concat!(
93+ "Assuming data already exists at this remote (e.g. S3, GCS) URL: {}.\n " ,
94+ "If it does not, you should kill this command, locally generate the files (by running without\n " ,
95+ "--use-remote-data-dir) and upload data/tpch/1/ to some remote location." ,
96+ ) ,
97+ tpch_benchmark_remote_data_dir,
98+ ) ;
99+ Url :: parse ( & tpch_benchmark_remote_data_dir) . unwrap ( )
100+ }
101+ } ;
102+
103+ if args. only_vortex {
104+ panic ! ( "use `--formats vortex,arrow` instead of `--only-vortex`" ) ;
105+ }
106+
60107 runtime. block_on ( bench_main (
61108 args. queries ,
62109 args. exclude_queries ,
63110 args. iterations ,
64111 args. warmup ,
65- args. only_vortex ,
112+ args. formats ,
66113 args. display_format ,
67114 args. emulate_object_store ,
115+ url,
68116 ) )
69117}
70118
119+ #[ allow( clippy:: too_many_arguments) ]
71120async fn bench_main (
72121 queries : Option < Vec < usize > > ,
73122 exclude_queries : Option < Vec < usize > > ,
74123 iterations : usize ,
75124 warmup : bool ,
76- only_vortex : bool ,
125+ formats : Option < Vec < String > > ,
77126 display_format : DisplayFormat ,
78127 emulate_object_store : bool ,
128+ url : Url ,
79129) -> ExitCode {
80130 // uncomment the below to enable trace logging of datafusion execution
81131 // let filter = default_env_filter(true);
82132 // setup_logger(filter);
83133
84134 // Run TPC-H data gen.
85- let data_dir = DBGen :: new ( DBGenOptions :: default ( ) ) . generate ( ) . unwrap ( ) ;
86135
87136 // The formats to run against (vs the baseline)
88- let formats = if only_vortex {
89- vec ! [ Format :: Arrow , Format :: OnDiskVortex ]
90- } else {
91- vec ! [ Format :: Arrow , Format :: Parquet , Format :: OnDiskVortex ]
137+ let formats = match formats {
138+ None => vec ! [ Format :: Arrow , Format :: Parquet , Format :: OnDiskVortex ] ,
139+ Some ( formats) => formats
140+ . into_iter ( )
141+ . map ( |format| match format. as_ref ( ) {
142+ "arrow" => Format :: Arrow ,
143+ "parquet" => Format :: Parquet ,
144+ "vortex" => Format :: OnDiskVortex ,
145+ _ => panic ! ( "unrecognized format: {}" , format) ,
146+ } )
147+ . collect :: < Vec < _ > > ( ) ,
92148 } ;
93149
150+ eprintln ! (
151+ "Benchmarking against these formats: {}." ,
152+ formats. iter( ) . join( ", " )
153+ ) ;
154+
94155 // Load datasets
95156 let ctxs = try_join_all (
96157 formats
97158 . iter ( )
98- . map ( |format| load_datasets ( & data_dir , * format, emulate_object_store) ) ,
159+ . map ( |format| load_datasets ( & url , * format, emulate_object_store) ) ,
99160 )
100161 . await
101162 . unwrap ( ) ;
0 commit comments