9494 worker_count: 2
9595"""
9696
97- flags .DEFINE_list (
98- 'bigquery_tables' ,
99- [],
100- 'A list of BigQuery tables to load as Temporary Spark SQL views instead '
101- 'of reading from external Hive tables.' ,
97+ _BIGQUERY_DATASET = flags .DEFINE_string (
98+ 'dpb_sparksql_bigquery_dataset' ,
99+ None ,
100+ 'BigQuery dataset with the tables to load as Temporary Spark SQL views'
101+ ' instead of reading from external Hive tables.' ,
102+ )
103+ _BIGQUERY_TABLES = flags .DEFINE_list (
104+ 'dpb_sparksql_bigquery_tables' ,
105+ None ,
106+ 'BigQuery table names (unqualified) to load as Temporary Spark SQL views'
107+ ' instead of reading from external Hive tables.' ,
102108)
103109flags .DEFINE_string (
104110 'bigquery_record_format' ,
@@ -132,15 +138,30 @@ def CheckPrerequisites(benchmark_config):
132138 raise errors .Config .InvalidValue (
133139 'You cannot create hive tables in a custom database.'
134140 )
141+ if bool (_BIGQUERY_DATASET .value ) != bool (_BIGQUERY_TABLES .value ):
142+ raise errors .Config .InvalidValue (
143+ '--dpb_sparksql_bigquery_dataset and '
144+ '--dpb_sparksql_bigquery_tables must be passed together.'
145+ )
135146 if not (
136147 FLAGS .dpb_sparksql_data
137- or FLAGS . bigquery_tables
148+ or _BIGQUERY_TABLES . value
138149 or FLAGS .dpb_sparksql_database
139150 ):
140151 # In the case of a static dpb_service, data could pre-exist
141152 logging .warning (
142- 'You did not specify --dpb_sparksql_data, --bigquery_tables, '
143- 'or dpb_sparksql_database. You will probably not have data to query!'
153+ 'You did not specify --dpb_sparksql_data,'
154+ ' --dpb_sparksql_bigquery_tables, or dpb_sparksql_database. You will'
155+ ' probably not have data to query!'
156+ )
157+ if sum ([
158+ bool (FLAGS .dpb_sparksql_data ),
159+ bool (_BIGQUERY_TABLES .value ),
160+ bool (FLAGS .dpb_sparksql_database ),
161+ ]) > 1 : # Warn only when more than one data source is passed.
162+ logging .warning (
163+ 'You should only pass one of them: --dpb_sparksql_data,'
164+ ' --dpb_sparksql_bigquery_tables, or --dpb_sparksql_database.'
144165 )
145166 if bool (FLAGS .dpb_sparksql_order ) == bool (FLAGS .dpb_sparksql_streams ):
146167 raise errors .Config .InvalidValue (
@@ -284,7 +305,6 @@ def _GetSampleMetadata(benchmark_spec):
284305def _RunQueries (benchmark_spec ) -> tuple [str , dpb_service .JobResult ]:
285306 """Runs queries. Returns storage path with metrics and JobResult object."""
286307 cluster = benchmark_spec .dpb_service
287- storage_service = cluster .storage_service
288308 report_dir = '/' .join ([cluster .base_dir , f'report-{ int (time .time ()* 1000 )} ' ])
289309 args = ['--sql-scripts-dir' , benchmark_spec .query_dir ]
290310 if FLAGS .dpb_sparksql_simultaneous :
@@ -299,18 +319,29 @@ def _RunQueries(benchmark_spec) -> tuple[str, dpb_service.JobResult]:
299319 args += ['--report-dir' , report_dir ]
300320 if FLAGS .dpb_sparksql_database :
301321 args += ['--database' , FLAGS .dpb_sparksql_database ]
302- table_metadata = _GetTableMetadata (benchmark_spec )
303- if table_metadata :
304- table_metadata_file = '/' .join ([cluster .base_dir , 'metadata.json' ])
305- dpb_sparksql_benchmark_helper .StageMetadata (
306- table_metadata , storage_service , table_metadata_file
307- )
308- args += ['--table-metadata' , table_metadata_file ]
309- else :
310- # If we don't pass in tables, we must be reading from hive.
322+ if FLAGS .dpb_sparksql_create_hive_tables :
311323 # Note you can even read from Hive without --create_hive_tables if they
312324 # were precreated.
313325 args += ['--enable-hive' , 'True' ]
326+ else :
327+ table_names = []
328+ if _BIGQUERY_DATASET .value :
329+ args += ['--bigquery-dataset' , _BIGQUERY_DATASET .value ]
330+ table_names = _BIGQUERY_TABLES .value
331+ elif benchmark_spec .data_dir :
332+ args += ['--table-base-dir' , benchmark_spec .data_dir ]
333+ table_names = benchmark_spec .table_subdirs or []
334+ if table_names :
335+ args += ['--table-names' , * table_names ]
336+ if FLAGS .dpb_sparksql_data_format :
337+ args += ['--table-format' , FLAGS .dpb_sparksql_data_format ]
338+ if (
339+ FLAGS .dpb_sparksql_data_format == 'csv'
340+ and FLAGS .dpb_sparksql_csv_delimiter
341+ ):
342+ args += ['--csv-delim' , FLAGS .dpb_sparksql_csv_delimiter ]
343+ if FLAGS .bigquery_record_format :
344+ args += ['--bigquery-read-data-format' , FLAGS .bigquery_record_format ]
314345 if FLAGS .dpb_sparksql_table_cache :
315346 args += ['--table-cache' , FLAGS .dpb_sparksql_table_cache ]
316347 if dpb_sparksql_benchmark_helper .DUMP_SPARK_CONF .value :
@@ -503,17 +534,6 @@ def _GetDistCpMetadata(base_dir: str, subdirs: List[str], extra_metadata=None):
503534 return metadata
504535
505536
506- def _GetTableMetadata (benchmark_spec ):
507- metadata = dpb_sparksql_benchmark_helper .GetTableMetadata (benchmark_spec )
508- for table in FLAGS .bigquery_tables :
509- name = table .split ('.' )[- 1 ]
510- bq_options = {'table' : table }
511- if FLAGS .bigquery_record_format :
512- bq_options ['readDataFormat' ] = FLAGS .bigquery_record_format
513- metadata [name ] = (FLAGS .dpb_sparksql_data_format or 'bigquery' , bq_options )
514- return metadata
515-
516-
517537def Cleanup (benchmark_spec ):
518538 """Cleans up the Benchmark."""
519539 del benchmark_spec # unused
0 commit comments