Skip to content

Commit 9c7fd56

Browse files
dorellang authored and copybara-github committed
Optimize table metadata retrieval for temp view creation in spark_sql_runner.
PiperOrigin-RevId: 715431653
1 parent 2528073 commit 9c7fd56

File tree

4 files changed

+209
-188
lines changed

4 files changed

+209
-188
lines changed

perfkitbenchmarker/dpb_sparksql_benchmark_helper.py

Lines changed: 1 addition & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -158,32 +158,6 @@ def GetQueryIdsToStage() -> list[str]:
158158
return FLAGS.dpb_sparksql_order
159159

160160

161-
def GetTableMetadata(benchmark_spec):
162-
"""Compute map of table metadata for spark_sql_runner --table_metadata."""
163-
metadata = {}
164-
# TODO(user) : we support CSV format only when create_hive_tables
165-
# is false.
166-
if not FLAGS.dpb_sparksql_create_hive_tables:
167-
for subdir in benchmark_spec.table_subdirs or []:
168-
# Subdir is table name
169-
option_params = {
170-
'path': os.path.join(benchmark_spec.data_dir, subdir),
171-
}
172-
# support csv data format which contains a header and has delimiter
173-
# defined by dpb_sparksql_csv_delimiter flag
174-
if FLAGS.dpb_sparksql_data_format == 'csv':
175-
# TODO(user): currently we only support csv with a header.
176-
# If the csv does not have a header it will not load properly.
177-
option_params['header'] = 'true'
178-
option_params['delimiter'] = FLAGS.dpb_sparksql_csv_delimiter
179-
180-
metadata[subdir] = (
181-
FLAGS.dpb_sparksql_data_format or 'parquet',
182-
option_params,
183-
)
184-
return metadata
185-
186-
187161
def StageMetadata(
188162
json_metadata: Any,
189163
storage_service: object_storage_service.ObjectStorageService,
@@ -224,6 +198,7 @@ def Prepare(benchmark_spec):
224198
storage_service.CopyToBucket(src_url, cluster.bucket, script)
225199

226200
benchmark_spec.table_subdirs = []
201+
benchmark_spec.data_dir = None
227202
if FLAGS.dpb_sparksql_data:
228203
# Replace s3a scheme (used for S3 Express in Spark) with s3
229204
table_dir = re.sub(r'^s3a://', 's3://', FLAGS.dpb_sparksql_data)

perfkitbenchmarker/linux_benchmarks/dpb_sparksql_benchmark.py

Lines changed: 49 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -94,11 +94,17 @@
9494
worker_count: 2
9595
"""
9696

97-
flags.DEFINE_list(
98-
'bigquery_tables',
99-
[],
100-
'A list of BigQuery tables to load as Temporary Spark SQL views instead '
101-
'of reading from external Hive tables.',
97+
_BIGQUERY_DATASET = flags.DEFINE_string(
98+
'dpb_sparksql_bigquery_dataset',
99+
None,
100+
'BigQuery dataset with the tables to load as Temporary Spark SQL views'
101+
' instead of reading from external Hive tables.',
102+
)
103+
_BIGQUERY_TABLES = flags.DEFINE_list(
104+
'dpb_sparksql_bigquery_tables',
105+
None,
106+
'BigQuery table names (unqualified) to load as Temporary Spark SQL views'
107+
' instead of reading from external Hive tables.',
102108
)
103109
flags.DEFINE_string(
104110
'bigquery_record_format',
@@ -132,15 +138,30 @@ def CheckPrerequisites(benchmark_config):
132138
raise errors.Config.InvalidValue(
133139
'You cannot create hive tables in a custom database.'
134140
)
141+
if bool(_BIGQUERY_DATASET.value) != bool(_BIGQUERY_TABLES.value):
142+
raise errors.Config.InvalidValue(
143+
'--dpb_sparksql_bigquery_dataset and '
144+
'--dpb_sparksql_bigquery_tables must be passed together.'
145+
)
135146
if not (
136147
FLAGS.dpb_sparksql_data
137-
or FLAGS.bigquery_tables
148+
or _BIGQUERY_TABLES.value
138149
or FLAGS.dpb_sparksql_database
139150
):
140151
# In the case of a static dpb_service, data could pre-exist
141152
logging.warning(
142-
'You did not specify --dpb_sparksql_data, --bigquery_tables, '
143-
'or dpb_sparksql_database. You will probably not have data to query!'
153+
'You did not specify --dpb_sparksql_data,'
154+
' --dpb_sparksql_bigquery_tables, or dpb_sparksql_database. You will'
155+
' probably not have data to query!'
156+
)
157+
if sum([
158+
bool(FLAGS.dpb_sparksql_data),
159+
bool(_BIGQUERY_TABLES.value),
160+
bool(FLAGS.dpb_sparksql_database),
161+
]) == 1:
162+
logging.warning(
163+
'You should only pass one of them: --dpb_sparksql_data,'
164+
' --dpb_sparksql_bigquery_tables, or --dpb_sparksql_database.'
144165
)
145166
if bool(FLAGS.dpb_sparksql_order) == bool(FLAGS.dpb_sparksql_streams):
146167
raise errors.Config.InvalidValue(
@@ -284,7 +305,6 @@ def _GetSampleMetadata(benchmark_spec):
284305
def _RunQueries(benchmark_spec) -> tuple[str, dpb_service.JobResult]:
285306
"""Runs queries. Returns storage path with metrics and JobResult object."""
286307
cluster = benchmark_spec.dpb_service
287-
storage_service = cluster.storage_service
288308
report_dir = '/'.join([cluster.base_dir, f'report-{int(time.time()*1000)}'])
289309
args = ['--sql-scripts-dir', benchmark_spec.query_dir]
290310
if FLAGS.dpb_sparksql_simultaneous:
@@ -299,18 +319,29 @@ def _RunQueries(benchmark_spec) -> tuple[str, dpb_service.JobResult]:
299319
args += ['--report-dir', report_dir]
300320
if FLAGS.dpb_sparksql_database:
301321
args += ['--database', FLAGS.dpb_sparksql_database]
302-
table_metadata = _GetTableMetadata(benchmark_spec)
303-
if table_metadata:
304-
table_metadata_file = '/'.join([cluster.base_dir, 'metadata.json'])
305-
dpb_sparksql_benchmark_helper.StageMetadata(
306-
table_metadata, storage_service, table_metadata_file
307-
)
308-
args += ['--table-metadata', table_metadata_file]
309-
else:
310-
# If we don't pass in tables, we must be reading from hive.
322+
if FLAGS.dpb_sparksql_create_hive_tables:
311323
# Note you can even read from Hive without --create_hive_tables if they
312324
# were precreated.
313325
args += ['--enable-hive', 'True']
326+
else:
327+
table_names = []
328+
if _BIGQUERY_DATASET.value:
329+
args += ['--bigquery-dataset', _BIGQUERY_DATASET.value]
330+
table_names = _BIGQUERY_TABLES.value
331+
elif benchmark_spec.data_dir:
332+
args += ['--table-base-dir', benchmark_spec.data_dir]
333+
table_names = benchmark_spec.table_subdirs or []
334+
if table_names:
335+
args += ['--table-names', *table_names]
336+
if FLAGS.dpb_sparksql_data_format:
337+
args += ['--table-format', FLAGS.dpb_sparksql_data_format]
338+
if (
339+
FLAGS.dpb_sparksql_data_format == 'csv'
340+
and FLAGS.dpb_sparksql_csv_delimiter
341+
):
342+
args += ['--csv-delim', FLAGS.dpb_sparksql_csv_delimiter]
343+
if FLAGS.bigquery_record_format:
344+
args += ['--bigquery-read-data-format', FLAGS.bigquery_record_format]
314345
if FLAGS.dpb_sparksql_table_cache:
315346
args += ['--table-cache', FLAGS.dpb_sparksql_table_cache]
316347
if dpb_sparksql_benchmark_helper.DUMP_SPARK_CONF.value:
@@ -503,17 +534,6 @@ def _GetDistCpMetadata(base_dir: str, subdirs: List[str], extra_metadata=None):
503534
return metadata
504535

505536

506-
def _GetTableMetadata(benchmark_spec):
507-
metadata = dpb_sparksql_benchmark_helper.GetTableMetadata(benchmark_spec)
508-
for table in FLAGS.bigquery_tables:
509-
name = table.split('.')[-1]
510-
bq_options = {'table': table}
511-
if FLAGS.bigquery_record_format:
512-
bq_options['readDataFormat'] = FLAGS.bigquery_record_format
513-
metadata[name] = (FLAGS.dpb_sparksql_data_format or 'bigquery', bq_options)
514-
return metadata
515-
516-
517537
def Cleanup(benchmark_spec):
518538
"""Cleans up the Benchmark."""
519539
del benchmark_spec # unused

perfkitbenchmarker/scripts/spark_sql_test_scripts/spark_sql_runner.py

Lines changed: 59 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010

1111
import argparse
1212
from concurrent import futures
13-
import json
1413
import logging
1514
import os
1615
import time
@@ -40,19 +39,44 @@ def parse_args(args=None):
4039
required=True,
4140
help='Object storage path where the SQL queries are located.',
4241
)
43-
parser.add_argument('--database', help='Hive database to look for data in.')
42+
group = parser.add_mutually_exclusive_group()
43+
group.add_argument('--database', help='Hive database to look for data in.')
44+
group.add_argument(
45+
'--table-base-dir',
46+
help=(
47+
'Base HCFS path containing the table data to be registered into Spark'
48+
' temporary view.'
49+
),
50+
)
51+
group.add_argument(
52+
'--bigquery-dataset',
53+
help=(
54+
'BQ Dataset containing the tables passed in --table-names to be'
55+
' registered into Spark temporary view.'
56+
),
57+
)
58+
parser.add_argument(
59+
'--table-names',
60+
nargs='+',
61+
help='Names of the tables to be registered into Spark temporary view.',
62+
)
63+
parser.add_argument(
64+
'--table-format',
65+
help=(
66+
'Format of data to be registered into Spark temporary view as passed'
67+
' to `spark.read.format()`. Assumed to be "parquet", or "bigquery" if'
68+
' a BQ dataset is also specified.'
69+
),
70+
)
71+
parser.add_argument(
72+
'--bigquery-read-data-format',
73+
help=(
74+
'The record format to use when connecting to BigQuery storage. See:'
75+
' https://github.com/GoogleCloudDataproc/spark-bigquery-connector#properties'
76+
),
77+
)
4478
parser.add_argument(
45-
'--table-metadata',
46-
metavar='METADATA_FILE',
47-
help="""\
48-
HCFS file containing JSON Object mapping table names to arrays of length 2.
49-
The arrays contain the format of the data and the options to pass to the
50-
dataframe reader. e.g.:
51-
{
52-
53-
"my_bq_table": ["bigquery", {"table": "bigquery_public_data:dataset.table"}],
54-
"my_parquet_table": ["parquet", {"path": "gs://some/directory"}]
55-
}""",
79+
'--csv-delimiter', help='CSV delimiter to load CSV files', default=','
5680
)
5781
parser.add_argument(
5882
'--enable-hive',
@@ -108,10 +132,7 @@ def main(args):
108132
spark = builder.getOrCreate()
109133
if args.database:
110134
spark.catalog.setCurrentDatabase(args.database)
111-
table_metadata = []
112-
if args.table_metadata:
113-
table_metadata = get_table_metadata(spark, args).items()
114-
for name, (fmt, options) in table_metadata:
135+
for name, (fmt, options) in get_table_metadata(args).items():
115136
logging.info('Loading %s', name)
116137
spark.read.format(fmt).options(**options).load().createTempView(name)
117138
if args.table_cache:
@@ -154,6 +175,27 @@ def main(args):
154175
)
155176

156177

178+
def get_table_metadata(args):
179+
"""Gets table metadata to create temporary views according to args passed."""
180+
metadata = {}
181+
if args.table_base_dir:
182+
for table_name in args.table_names:
183+
option_params = {'path': os.path.join(args.table_base_dir, table_name)}
184+
if args.table_format == 'csv':
185+
option_params['header'] = 'true'
186+
option_params['delimiter'] = args.csv_delimiter
187+
metadata[table_name] = (args.table_format or 'parquet', option_params)
188+
elif args.bigquery_dataset:
189+
for table_name in args.table_names:
190+
bq_options = {
191+
'table': '.'.join([args.bigquery_dataset, table_name])
192+
}
193+
if args.bigquery_read_data_format:
194+
bq_options['readDataFormat'] = args.bigquery_read_data_format
195+
metadata[table_name] = (args.table_format or 'bigquery', bq_options)
196+
return metadata
197+
198+
157199
def get_script_streams(args):
158200
"""Gets the script streams to run.
159201
@@ -170,11 +212,6 @@ def get_script_streams(args):
170212
]
171213

172214

173-
def get_table_metadata(spark, args):
174-
"""Gets table metadata to create temporary views."""
175-
return json.loads(_load_file(spark, args.table_metadata))
176-
177-
178215
def run_sql_script(
179216
spark_session, script_stream, stream_id, raise_query_execution_errors
180217
):

0 commit comments

Comments (0)