Skip to content

Commit ac7b1fe

Browse files
feat: Add new options to LoadJobConfig and ExternalConfig
This commit introduces new configuration options for BigQuery load jobs and external table definitions, aligning with recent updates to the underlying protos.

New options added:
- `time_zone`: Specifies the default time zone for parsing timestamps. (Applies to LoadJobConfig, ExternalConfig; CSV & JSON)
- `date_format`: Specifies the format for parsing DATE values. (Applies to LoadJobConfig, ExternalConfig; CSV & JSON)
- `datetime_format`: Specifies the format for parsing DATETIME values. (Applies to LoadJobConfig, ExternalConfig; CSV & JSON)
- `time_format`: Specifies the format for parsing TIME values. (Applies to LoadJobConfig, ExternalConfig; CSV & JSON)
- `timestamp_format`: Specifies the format for parsing TIMESTAMP values. (Applies to LoadJobConfig, ExternalConfig; CSV & JSON)
- `null_markers`: A list of strings to be interpreted as NULL. (Applies to LoadJobConfig, ExternalConfig (via CSVOptions); CSV only)
- `source_column_name_match_option`: Controls how source columns are matched to the schema. (Applies to LoadJobConfig, ExternalConfig (via CSVOptions); CSV only)

Changes include:
- Added corresponding properties (getters/setters) to `LoadJobConfig`, `ExternalConfig`, and `CSVOptions`, and matching read-only properties to `LoadJob`.
- Updated docstrings and type hints for all new attributes.
- Updated unit tests to cover the new options, ensuring they are correctly handled during object initialization, serialization to API representation, and deserialization from API responses.
1 parent cd2e138 commit ac7b1fe

File tree

4 files changed

+365
-1
lines changed

4 files changed

+365
-1
lines changed

google/cloud/bigquery/external_config.py

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
import base64
2424
import copy
2525
import typing
26-
from typing import Any, Dict, FrozenSet, Iterable, Optional, Union
26+
from typing import Any, Dict, FrozenSet, Iterable, List, Optional, Union
2727

2828
from google.cloud.bigquery._helpers import _to_bytes
2929
from google.cloud.bigquery._helpers import _bytes_to_json
@@ -474,6 +474,36 @@ def skip_leading_rows(self):
474474
def skip_leading_rows(self, value):
475475
self._properties["skipLeadingRows"] = str(value)
476476

477+
@property
def null_markers(self) -> Optional[List[str]]:
    """Optional[List[str]]: Strings that are interpreted as SQL NULL.

    Backed by the ``nullMarkers`` entry of the resource properties; the
    lookup uses ``.get`` so an unset entry reads back as ``None``.

    See
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#CsvOptions.FIELDS.null_marker

    NOTE(review): the linked REST anchor is the singular ``null_marker``,
    while this property maps to the plural, list-valued ``nullMarkers``
    field - confirm the anchor against the published reference.
    """
    properties = self._properties
    return properties.get("nullMarkers")


@null_markers.setter
def null_markers(self, value: Optional[List[str]]):
    # The value is stored exactly as given (no copy, no validation).
    resource_key = "nullMarkers"
    self._properties[resource_key] = value
490+
491+
@property
def source_column_name_match_option(self) -> Optional[str]:
    """Optional[str]: Strategy for matching loaded columns to the schema.

    Acceptable values are: "POSITION", "NAME".

    Backed by the ``sourceColumnMatch`` entry of the resource properties;
    an unset entry reads back as ``None``.

    See
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.source_column_match

    NOTE(review): the REST reference lists this field under
    ExternalDataConfiguration, whereas it is exposed here with the CSV
    options to follow the proto layout for external tables - confirm
    which level the API actually expects.
    """
    properties = self._properties
    return properties.get("sourceColumnMatch")


@source_column_name_match_option.setter
def source_column_name_match_option(self, value: Optional[str]):
    # The value is stored as given; it is not checked against the
    # documented set of acceptable strings.
    resource_key = "sourceColumnMatch"
    self._properties[resource_key] = value
506+
477507
def to_api_repr(self) -> dict:
478508
"""Build an API representation of this object.
479509
@@ -848,6 +878,82 @@ def schema(self, value):
848878
prop = {"fields": [field.to_api_repr() for field in value]}
849879
self._properties["schema"] = prop
850880

881+
@property
def time_zone(self) -> Optional[str]:
    """Optional[str]: Time zone assumed when parsing timestamp values that
    do not specify a time zone themselves.

    (Valid for CSV and NEWLINE_DELIMITED_JSON)

    Backed by the ``timeZone`` entry of the resource properties; an unset
    entry reads back as ``None``.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.time_zone
    """
    properties = self._properties
    return properties.get("timeZone")


@time_zone.setter
def time_zone(self, value: Optional[str]):
    # Stored verbatim under the API field name.
    resource_key = "timeZone"
    self._properties[resource_key] = value
896+
897+
@property
def date_format(self) -> Optional[str]:
    """Optional[str]: Format string applied when parsing DATE values.

    (Valid for CSV and NEWLINE_DELIMITED_JSON)

    Backed by the ``dateFormat`` entry of the resource properties; an
    unset entry reads back as ``None``.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.date_format
    """
    properties = self._properties
    return properties.get("dateFormat")


@date_format.setter
def date_format(self, value: Optional[str]):
    # Stored verbatim under the API field name.
    resource_key = "dateFormat"
    self._properties[resource_key] = value
911+
912+
@property
def datetime_format(self) -> Optional[str]:
    """Optional[str]: Format string applied when parsing DATETIME values.

    (Valid for CSV and NEWLINE_DELIMITED_JSON)

    Backed by the ``datetimeFormat`` entry of the resource properties; an
    unset entry reads back as ``None``.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.datetime_format
    """
    properties = self._properties
    return properties.get("datetimeFormat")


@datetime_format.setter
def datetime_format(self, value: Optional[str]):
    # Stored verbatim under the API field name.
    resource_key = "datetimeFormat"
    self._properties[resource_key] = value
926+
927+
@property
def time_format(self) -> Optional[str]:
    """Optional[str]: Format string applied when parsing TIME values.

    (Valid for CSV and NEWLINE_DELIMITED_JSON)

    Backed by the ``timeFormat`` entry of the resource properties; an
    unset entry reads back as ``None``.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.time_format
    """
    properties = self._properties
    return properties.get("timeFormat")


@time_format.setter
def time_format(self, value: Optional[str]):
    # Stored verbatim under the API field name.
    resource_key = "timeFormat"
    self._properties[resource_key] = value
941+
942+
@property
def timestamp_format(self) -> Optional[str]:
    """Optional[str]: Format string applied when parsing TIMESTAMP values.

    (Valid for CSV and NEWLINE_DELIMITED_JSON)

    Backed by the ``timestampFormat`` entry of the resource properties; an
    unset entry reads back as ``None``.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.timestamp_format
    """
    properties = self._properties
    return properties.get("timestampFormat")


@timestamp_format.setter
def timestamp_format(self, value: Optional[str]):
    # Stored verbatim under the API field name.
    resource_key = "timestampFormat"
    self._properties[resource_key] = value
956+
851957
@property
852958
def connection_id(self):
853959
"""Optional[str]: [Experimental] ID of a BigQuery Connection API

google/cloud/bigquery/job/load.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,104 @@ def source_format(self):
548548
def source_format(self, value):
549549
self._set_sub_prop("sourceFormat", value)
550550

551+
@property
def time_zone(self) -> Optional[str]:
    """Optional[str]: Default time zone that will apply when parsing
    timestamp values that have no specific time zone.

    Read from the ``timeZone`` sub-property of the load configuration via
    ``_get_sub_prop``.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.time_zone
    """
    # The return annotation matches the sibling format/null-marker getters,
    # which all declare their Optional[...] result type.
    return self._get_sub_prop("timeZone")


@time_zone.setter
def time_zone(self, value: Optional[str]):
    # Written verbatim to the ``timeZone`` sub-property; no validation here.
    self._set_sub_prop("timeZone", value)
564+
565+
@property
def date_format(self) -> Optional[str]:
    """Optional[str]: Format string applied when parsing DATE values.

    Read from the ``dateFormat`` sub-property of the load configuration.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.date_format
    """
    configured = self._get_sub_prop("dateFormat")
    return configured


@date_format.setter
def date_format(self, value: Optional[str]):
    # Written verbatim to the ``dateFormat`` sub-property.
    sub_prop_key = "dateFormat"
    self._set_sub_prop(sub_prop_key, value)
577+
578+
@property
def datetime_format(self) -> Optional[str]:
    """Optional[str]: Format string applied when parsing DATETIME values.

    Read from the ``datetimeFormat`` sub-property of the load configuration.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.datetime_format
    """
    configured = self._get_sub_prop("datetimeFormat")
    return configured


@datetime_format.setter
def datetime_format(self, value: Optional[str]):
    # Written verbatim to the ``datetimeFormat`` sub-property.
    sub_prop_key = "datetimeFormat"
    self._set_sub_prop(sub_prop_key, value)
590+
591+
@property
def time_format(self) -> Optional[str]:
    """Optional[str]: Format string applied when parsing TIME values.

    Read from the ``timeFormat`` sub-property of the load configuration.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.time_format
    """
    configured = self._get_sub_prop("timeFormat")
    return configured


@time_format.setter
def time_format(self, value: Optional[str]):
    # Written verbatim to the ``timeFormat`` sub-property.
    sub_prop_key = "timeFormat"
    self._set_sub_prop(sub_prop_key, value)
603+
604+
@property
def timestamp_format(self) -> Optional[str]:
    """Optional[str]: Format string applied when parsing TIMESTAMP values.

    Read from the ``timestampFormat`` sub-property of the load configuration.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.timestamp_format
    """
    configured = self._get_sub_prop("timestampFormat")
    return configured


@timestamp_format.setter
def timestamp_format(self, value: Optional[str]):
    # Written verbatim to the ``timestampFormat`` sub-property.
    sub_prop_key = "timestampFormat"
    self._set_sub_prop(sub_prop_key, value)
616+
617+
@property
def null_markers(self) -> Optional[List[str]]:
    """Optional[List[str]]: Strings that are interpreted as SQL NULL in a CSV file.

    (CSV only).

    Read from the ``nullMarkers`` sub-property of the load configuration.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.null_markers
    """
    configured = self._get_sub_prop("nullMarkers")
    return configured


@null_markers.setter
def null_markers(self, value: Optional[List[str]]):
    # The list is passed through as given (no copy, no validation).
    sub_prop_key = "nullMarkers"
    self._set_sub_prop(sub_prop_key, value)
631+
632+
@property
def source_column_name_match_option(self) -> Optional[str]:
    """Optional[str]: Controls the strategy used to match loaded columns to the schema.

    (CSV only).
    Acceptable values are: "POSITION", "NAME". These are the same strings
    documented for the ``sourceColumnMatch`` field on the external-table
    CSV options, so one value works for both configurations.

    Read from the ``sourceColumnMatch`` sub-property of the load
    configuration.

    See:
    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_column_match
    """
    return self._get_sub_prop("sourceColumnMatch")


@source_column_name_match_option.setter
def source_column_name_match_option(self, value: Optional[str]):
    # Written verbatim to ``sourceColumnMatch``; the value is not checked
    # against the documented set of acceptable strings.
    self._set_sub_prop("sourceColumnMatch", value)
648+
551649
@property
552650
def time_partitioning(self):
553651
"""Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies time-based
@@ -889,6 +987,55 @@ def clustering_fields(self):
889987
"""
890988
return self.configuration.clustering_fields
891989

990+
@property
def time_zone(self):
    """Time zone configured for this load job.

    See
    :attr:`google.cloud.bigquery.job.LoadJobConfig.time_zone`.
    """
    # Read-only view: delegates to the job's configuration object.
    job_config = self.configuration
    return job_config.time_zone
996+
997+
@property
def date_format(self):
    """DATE parsing format configured for this load job.

    See
    :attr:`google.cloud.bigquery.job.LoadJobConfig.date_format`.
    """
    # Read-only view: delegates to the job's configuration object.
    job_config = self.configuration
    return job_config.date_format
1003+
1004+
@property
def datetime_format(self):
    """DATETIME parsing format configured for this load job.

    See
    :attr:`google.cloud.bigquery.job.LoadJobConfig.datetime_format`.
    """
    # Read-only view: delegates to the job's configuration object.
    job_config = self.configuration
    return job_config.datetime_format
1010+
1011+
@property
def time_format(self):
    """TIME parsing format configured for this load job.

    See
    :attr:`google.cloud.bigquery.job.LoadJobConfig.time_format`.
    """
    # Read-only view: delegates to the job's configuration object.
    job_config = self.configuration
    return job_config.time_format
1017+
1018+
@property
def timestamp_format(self):
    """TIMESTAMP parsing format configured for this load job.

    See
    :attr:`google.cloud.bigquery.job.LoadJobConfig.timestamp_format`.
    """
    # Read-only view: delegates to the job's configuration object.
    job_config = self.configuration
    return job_config.timestamp_format
1024+
1025+
@property
def null_markers(self):
    """NULL marker strings configured for this load job.

    See
    :attr:`google.cloud.bigquery.job.LoadJobConfig.null_markers`.
    """
    # Read-only view: delegates to the job's configuration object.
    job_config = self.configuration
    return job_config.null_markers
1031+
1032+
@property
def source_column_name_match_option(self):
    """Column matching strategy configured for this load job.

    See
    :attr:`google.cloud.bigquery.job.LoadJobConfig.source_column_name_match_option`.
    """
    # Read-only view: delegates to the job's configuration object.
    job_config = self.configuration
    return job_config.source_column_name_match_option
1038+
8921039
@property
8931040
def schema_update_options(self):
8941041
"""See

tests/unit/job/test_load.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,11 +37,25 @@ def _setUpConstants(self):
3737
self.OUTPUT_BYTES = 23456
3838
self.OUTPUT_ROWS = 345
3939
self.REFERENCE_FILE_SCHEMA_URI = "gs://path/to/reference"
40+
self.TIME_ZONE = "UTC"
41+
self.DATE_FORMAT = "%Y-%m-%d"
42+
self.DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
43+
self.TIME_FORMAT = "%H:%M:%S"
44+
self.TIMESTAMP_FORMAT = "YYYY-MM-DD HH:MM:SS.SSSSSSZ"
45+
self.NULL_MARKERS = ["N/A", "\\N"]
46+
self.SOURCE_COLUMN_NAME_MATCH_OPTION = "MATCH_BY_NAME"
4047

4148
def _make_resource(self, started=False, ended=False):
4249
resource = super(TestLoadJob, self)._make_resource(started, ended)
4350
config = resource["configuration"]["load"]
4451
config["sourceUris"] = [self.SOURCE1]
52+
config["timeZone"] = self.TIME_ZONE
53+
config["dateFormat"] = self.DATE_FORMAT
54+
config["datetimeFormat"] = self.DATETIME_FORMAT
55+
config["timeFormat"] = self.TIME_FORMAT
56+
config["timestampFormat"] = self.TIMESTAMP_FORMAT
57+
config["nullMarkers"] = self.NULL_MARKERS
58+
config["sourceColumnMatch"] = self.SOURCE_COLUMN_NAME_MATCH_OPTION
4559
config["destinationTable"] = {
4660
"projectId": self.PROJECT,
4761
"datasetId": self.DS_ID,
@@ -153,6 +167,37 @@ def _verifyResourceProperties(self, job, resource):
153167
else:
154168
self.assertIsNone(job.destination_encryption_configuration)
155169

170+
if "timeZone" in config:
171+
self.assertEqual(job.time_zone, config["timeZone"])
172+
else:
173+
self.assertIsNone(job.time_zone)
174+
if "dateFormat" in config:
175+
self.assertEqual(job.date_format, config["dateFormat"])
176+
else:
177+
self.assertIsNone(job.date_format)
178+
if "datetimeFormat" in config:
179+
self.assertEqual(job.datetime_format, config["datetimeFormat"])
180+
else:
181+
self.assertIsNone(job.datetime_format)
182+
if "timeFormat" in config:
183+
self.assertEqual(job.time_format, config["timeFormat"])
184+
else:
185+
self.assertIsNone(job.time_format)
186+
if "timestampFormat" in config:
187+
self.assertEqual(job.timestamp_format, config["timestampFormat"])
188+
else:
189+
self.assertIsNone(job.timestamp_format)
190+
if "nullMarkers" in config:
191+
self.assertEqual(job.null_markers, config["nullMarkers"])
192+
else:
193+
self.assertIsNone(job.null_markers)
194+
if "sourceColumnMatch" in config:
195+
self.assertEqual(
196+
job.source_column_name_match_option, config["sourceColumnMatch"]
197+
)
198+
else:
199+
self.assertIsNone(job.source_column_name_match_option)
200+
156201
def test_ctor(self):
157202
client = _make_client(project=self.PROJECT)
158203
job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client)
@@ -194,6 +239,13 @@ def test_ctor(self):
194239
self.assertIsNone(job.clustering_fields)
195240
self.assertIsNone(job.schema_update_options)
196241
self.assertIsNone(job.reference_file_schema_uri)
242+
self.assertIsNone(job.time_zone)
243+
self.assertIsNone(job.date_format)
244+
self.assertIsNone(job.datetime_format)
245+
self.assertIsNone(job.time_format)
246+
self.assertIsNone(job.timestamp_format)
247+
self.assertIsNone(job.null_markers)
248+
self.assertIsNone(job.source_column_name_match_option)
197249

198250
def test_ctor_w_config(self):
199251
from google.cloud.bigquery.schema import SchemaField
@@ -571,6 +623,13 @@ def test_begin_w_alternate_client(self):
571623
]
572624
},
573625
"schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION],
626+
"timeZone": self.TIME_ZONE,
627+
"dateFormat": self.DATE_FORMAT,
628+
"datetimeFormat": self.DATETIME_FORMAT,
629+
"timeFormat": self.TIME_FORMAT,
630+
"timestampFormat": self.TIMESTAMP_FORMAT,
631+
"nullMarkers": self.NULL_MARKERS,
632+
"sourceColumnMatch": self.SOURCE_COLUMN_NAME_MATCH_OPTION,
574633
}
575634
RESOURCE["configuration"]["load"] = LOAD_CONFIGURATION
576635
conn1 = make_connection()
@@ -599,6 +658,13 @@ def test_begin_w_alternate_client(self):
599658
config.write_disposition = WriteDisposition.WRITE_TRUNCATE
600659
config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_ADDITION]
601660
config.reference_file_schema_uri = "gs://path/to/reference"
661+
config.time_zone = self.TIME_ZONE
662+
config.date_format = self.DATE_FORMAT
663+
config.datetime_format = self.DATETIME_FORMAT
664+
config.time_format = self.TIME_FORMAT
665+
config.timestamp_format = self.TIMESTAMP_FORMAT
666+
config.null_markers = self.NULL_MARKERS
667+
config.source_column_name_match_option = self.SOURCE_COLUMN_NAME_MATCH_OPTION
602668
with mock.patch(
603669
"google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
604670
) as final_attributes:

0 commit comments

Comments
 (0)