
Commit c2c9275

[Python] Added the validate support for ReadAllFromBigQuery (#34910)
* added validate
* updated the doc strings
1 parent 1f31eb0 commit c2c9275

File tree

2 files changed: +10 additions, -4 deletions

sdks/python/apache_beam/io/gcp/bigquery.py

Lines changed: 5 additions & 2 deletions
@@ -2744,7 +2744,8 @@ class ReadFromBigQuery(PTransform):
       :data:`True` for most scenarios in order to catch errors as early as
       possible (pipeline construction instead of pipeline execution). It
       should be :data:`False` if the table is created during pipeline
-      execution by a previous step.
+      execution by a previous step. Set this to :data:`False`
+      if the BigQuery export method is slow due to checking file existence.
     coder (~apache_beam.coders.coders.Coder): The coder for the table
       rows. If :data:`None`, then the default coder is
       _JsonToDictCoder, which will interpret every row as a JSON
@@ -3033,7 +3034,8 @@ class ReadAllFromBigQuery(PTransform):
       bucket where the extracted table should be written as a string. If
       :data:`None`, then the temp_location parameter is used.
     validate (bool): If :data:`True`, various checks will be done when source
-      gets initialized (e.g., is table present?).
+      gets initialized (e.g., is table present?). Set this to :data:`False`
+      if the BigQuery export method is slow due to checking file existence.
     kms_key (str): Experimental. Optional Cloud KMS key name for use when
       creating new temporary tables.
   """
@@ -3078,6 +3080,7 @@ def expand(self, pcoll):
         _BigQueryReadSplit(
             options=pcoll.pipeline.options,
             gcs_location=self.gcs_location,
+            validate=self.validate,
             bigquery_job_labels=self.bigquery_job_labels,
             job_name=job_name,
             step_name=step_name,
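For context, a minimal usage sketch of the parameter being documented above. It assumes the ReadAllFromBigQuery constructor exposes the validate keyword described in the docstring, and that the pipeline options carry a GCS temp_location; the project, dataset, and table names are placeholders.

import apache_beam as beam
from apache_beam.io.gcp.bigquery import ReadAllFromBigQuery, ReadFromBigQueryRequest

with beam.Pipeline() as pipeline:
  rows = (
      pipeline
      | 'CreateRequests' >> beam.Create([
          ReadFromBigQueryRequest(table='my-project:my_dataset.my_table'),
      ])
      # validate=False skips the construction-time checks (table presence,
      # export file existence) and defers any failure to pipeline execution.
      | 'ReadAll' >> ReadAllFromBigQuery(validate=False)
      | 'Print' >> beam.Map(print))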

sdks/python/apache_beam/io/gcp/bigquery_read_internal.py

Lines changed: 5 additions & 2 deletions
@@ -195,6 +195,7 @@ def __init__(
       self,
       options: PipelineOptions,
       gcs_location: Union[str, ValueProvider] = None,
+      validate: bool = False,
       use_json_exports: bool = False,
       bigquery_job_labels: Dict[str, str] = None,
       step_name: str = None,
@@ -205,6 +206,7 @@ def __init__(
       temp_dataset: Union[str, DatasetReference] = None,
       query_priority: Optional[str] = None):
     self.options = options
+    self.validate = validate
     self.use_json_exports = use_json_exports
     self.gcs_location = gcs_location
     self.bigquery_job_labels = bigquery_job_labels or {}
@@ -285,14 +287,15 @@ def _get_bq_metadata(self):
 
   def _create_source(self, path, schema):
     if not self.use_json_exports:
-      return _create_avro_source(path)
+      return _create_avro_source(path, validate=self.validate)
     else:
       return _TextSource(
           path,
           min_bundle_size=0,
           compression_type=CompressionTypes.UNCOMPRESSED,
           strip_trailing_newlines=True,
-          coder=_JsonToDictCoder(schema))
+          coder=_JsonToDictCoder(schema),
+          validate=self.validate)
 
   def _setup_temporary_dataset(
       self,
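The helpers touched here (_create_avro_source, _TextSource) are internal, but Beam's public file-based read transforms expose the same validate switch. A hedged sketch of the same pass-through pattern using the public transforms; read_export is a hypothetical helper and the GCS path is a placeholder.

import apache_beam as beam
from apache_beam.io.avroio import ReadFromAvro
from apache_beam.io.textio import ReadFromText

def read_export(pipeline, path, use_json_exports=False, validate=False):
  # Mirrors the pattern in _create_source above: the caller's validate flag
  # is forwarded unchanged, so validate=False skips the check that files
  # matching the pattern already exist when the source is constructed.
  if not use_json_exports:
    return pipeline | 'ReadAvro' >> ReadFromAvro(path, validate=validate)
  return pipeline | 'ReadJson' >> ReadFromText(path, validate=validate)

with beam.Pipeline() as p:
  rows = read_export(p, 'gs://my-bucket/bq-export/*.avro', validate=False)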
