@@ -5,19 +5,34 @@ use datafusion::{
55 catalog:: { MemTable , TableProvider } ,
66} ;
77
8- pub fn tpch_table ( name : & str ) -> Arc < dyn TableProvider > {
9- let schema = Arc :: new ( get_tpch_table_schema ( name) ) ;
10- Arc :: new ( MemTable :: try_new ( schema, vec ! [ ] ) . unwrap ( ) )
11- }
8+ use std:: fs;
9+
10+ use arrow:: record_batch:: RecordBatch ;
11+ use parquet:: { arrow:: arrow_writer:: ArrowWriter , file:: properties:: WriterProperties } ;
12+ use tpchgen:: generators:: {
13+ CustomerGenerator , LineItemGenerator , NationGenerator , OrderGenerator , PartGenerator ,
14+ PartSuppGenerator , RegionGenerator , SupplierGenerator ,
15+ } ;
16+ use tpchgen_arrow:: {
17+ CustomerArrow , LineItemArrow , NationArrow , OrderArrow , PartArrow , PartSuppArrow , RegionArrow ,
18+ SupplierArrow ,
19+ } ;
1220
13- pub fn tpch_query ( num : u8 ) -> String {
14- // read the query from the test/tpch/queries/ directory and return it
15- let query_path = format ! ( "testing/tpch/queries/q{}.sql" , num) ;
16- std:: fs:: read_to_string ( query_path)
/// Reads TPC-H query number `num` from `queries_dir` and returns its SQL text.
///
/// The query is looked up in a file named `q{num}.sql` directly inside
/// `queries_dir`. Leading and trailing whitespace is stripped from the result.
///
/// # Panics
///
/// Panics if the file cannot be read. The message names the full path that was
/// tried and the underlying I/O error, so a wrong `queries_dir` is easy to spot.
pub fn tpch_query_from_dir(queries_dir: &std::path::Path, num: u8) -> String {
    let query_path = queries_dir.join(format!("q{num}.sql"));
    match fs::read_to_string(&query_path) {
        Ok(sql) => sql.trim().to_string(),
        // Keep the historical message prefix, but report where we looked and why it failed.
        Err(err) => panic!(
            "Failed to read TPCH query file: {}: {err}",
            query_path.display()
        ),
    }
}
/// Number of queries in the TPC-H benchmark; they are numbered from 1 to 22.
pub const NUM_QUERIES: u8 = 22;

/// Scale factor passed to each TPC-H generator when table data is produced.
const SCALE_FACTOR: f64 = 0.001;
31+
32+ pub fn tpch_table ( name : & str ) -> Arc < dyn TableProvider > {
33+ let schema = Arc :: new ( get_tpch_table_schema ( name) ) ;
34+ Arc :: new ( MemTable :: try_new ( schema, vec ! [ ] ) . unwrap ( ) )
35+ }
2136
2237pub fn get_tpch_table_schema ( table : & str ) -> Schema {
2338 // note that the schema intentionally uses signed integers so that any generated Parquet
@@ -113,3 +128,58 @@ pub fn get_tpch_table_schema(table: &str) -> Schema {
113128 _ => unimplemented ! ( ) ,
114129 }
115130}
131+
132+ // generate_table creates a parquet file in the data directory from an arrow RecordBatch row
133+ // source.
134+ fn generate_table < A > (
135+ mut data_source : A ,
136+ table_name : & str ,
137+ data_dir : & std:: path:: Path ,
138+ ) -> Result < ( ) , Box < dyn std:: error:: Error > >
139+ where
140+ A : Iterator < Item = RecordBatch > ,
141+ {
142+ let output_path = data_dir. join ( format ! ( "{}.parquet" , table_name) ) ;
143+
144+ if let Some ( first_batch) = data_source. next ( ) {
145+ let file = fs:: File :: create ( & output_path) ?;
146+ let props = WriterProperties :: builder ( ) . build ( ) ;
147+ let mut writer = ArrowWriter :: try_new ( file, first_batch. schema ( ) , Some ( props) ) ?;
148+
149+ writer. write ( & first_batch) ?;
150+
151+ while let Some ( batch) = data_source. next ( ) {
152+ writer. write ( & batch) ?;
153+ }
154+
155+ writer. close ( ) ?;
156+ }
157+
158+ Ok ( ( ) )
159+ }
160+
// must_generate_tpch_table builds the arrow batch source `$arrow_ty` on top of the
// row generator `$gen_ty`, writes it out through `generate_table` under the table
// name `$table` in directory `$dir`, and panics if that fails.
macro_rules! must_generate_tpch_table {
    ($gen_ty:ident, $arrow_ty:ident, $table:literal, $dir:expr) => {{
        // TODO: Consider adjusting the partitions and batch sizes.
        let batches = $arrow_ty::new($gen_ty::new(SCALE_FACTOR, 1, 1)).with_batch_size(1000);
        generate_table(batches, $table, $dir)
            .expect(concat!("Failed to generate ", $table, " table"));
    }};
}
172+
173+ // generate_tpch_data generates all TPC-H tables in the specified data directory.
174+ pub fn generate_tpch_data ( data_dir : & std:: path:: Path ) {
175+ fs:: create_dir_all ( data_dir) . expect ( "Failed to create data directory" ) ;
176+
177+ must_generate_tpch_table ! ( RegionGenerator , RegionArrow , "region" , data_dir) ;
178+ must_generate_tpch_table ! ( NationGenerator , NationArrow , "nation" , data_dir) ;
179+ must_generate_tpch_table ! ( CustomerGenerator , CustomerArrow , "customer" , data_dir) ;
180+ must_generate_tpch_table ! ( SupplierGenerator , SupplierArrow , "supplier" , data_dir) ;
181+ must_generate_tpch_table ! ( PartGenerator , PartArrow , "part" , data_dir) ;
182+ must_generate_tpch_table ! ( PartSuppGenerator , PartSuppArrow , "partsupp" , data_dir) ;
183+ must_generate_tpch_table ! ( OrderGenerator , OrderArrow , "orders" , data_dir) ;
184+ must_generate_tpch_table ! ( LineItemGenerator , LineItemArrow , "lineitem" , data_dir) ;
185+ }
0 commit comments