Skip to content

Commit d721ea4

Browse files
refactor: Integrate user's changes and new LoadJobConfig tests
This commit restores user changes from commit a9b187f that were inadvertently overwritten. These changes include:
- Addition of SourceColumnMatch enum in enums.py.
- Updates to docstrings in external_config.py and job/load.py.
- Renaming of source_column_name_match_option to source_column_match_strategy in LoadJobConfig and LoadJob, now using the SourceColumnMatch enum.
- Adjustments in related unit tests in test_load.py and test_external_config.py.

Additionally, this commit incorporates the new unit tests for all recently added LoadJobConfig properties in tests/unit/job/test_load_config.py. Corrections were made to tests/unit/job/test_load.py to align with the renamed source_column_match_strategy property and its enum type.
1 parent c963716 commit d721ea4

File tree

5 files changed

+116
-41
lines changed

5 files changed

+116
-41
lines changed

google/cloud/bigquery/enums.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,3 +462,23 @@ class JobCreationMode(object):
462462
The conditions under which BigQuery can decide to not create a Job are
463463
subject to change.
464464
"""
465+
466+
467+
class SourceColumnMatch(str, enum.Enum):
468+
"""Strategy used to match loaded columns to the schema. When unspecified, sensible defaults are used based on how the schema is provided.
469+
470+
If autodetect is used, then columns are matched by name. Otherwise, columns
471+
are matched by position. This is done to keep the behavior
472+
backward-compatible.
473+
"""
474+
475+
SOURCE_COLUMN_MATCH_UNSPECIFIED = "SOURCE_COLUMN_MATCH_UNSPECIFIED"
476+
"""Unspecified column name match option."""
477+
478+
POSITION = "POSITION"
479+
"""Matches by position. This assumes that the columns are ordered the same
480+
way as the schema."""
481+
482+
NAME = "NAME"
483+
"""Matches by name. This reads the header row as column names and reorders
484+
columns to match the field names in the schema."""

google/cloud/bigquery/external_config.py

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -476,11 +476,19 @@ def skip_leading_rows(self, value):
476476

477477
@property
478478
def null_markers(self) -> Optional[List[str]]:
479-
"""Optional[List[str]]: A list of strings represented as SQL NULL value.
479+
"""Optional[List[str]]: A list of strings represented as SQL NULL value in a CSV file.
480+
481+
null_marker and null_markers can't be set at the same time.
482+
If null_marker is set, null_markers has to be not set.
483+
If null_markers is set, null_marker has to be not set.
484+
If both null_marker and null_markers are set at the same time, a user
485+
error would be thrown.
486+
Any strings listed in null_markers, including the
487+
empty string, would be interpreted as SQL NULL. This applies to all column
488+
types.
480489
481490
See
482-
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#CsvOptions.FIELDS.null_marker
483-
(Note: API doc refers to null_marker singular, but proto is null_markers plural and a list)
491+
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#CsvOptions.FIELDS.null_markers
484492
"""
485493
return self._properties.get("nullMarkers")
486494

@@ -490,13 +498,19 @@ def null_markers(self, value: Optional[List[str]]):
490498

491499
@property
492500
def source_column_name_match_option(self) -> Optional[str]:
493-
"""Optional[str]: Controls the strategy used to match loaded columns to the schema.
494-
Acceptable values are: "POSITION", "NAME".
501+
"""Optional[str]: Controls the strategy used to match loaded columns to the schema. If not
502+
set, a sensible default is chosen based on how the schema is provided. If
503+
autodetect is used, then columns are matched by name. Otherwise, columns
504+
are matched by position. This is done to keep the behavior
505+
backward-compatible.
506+
Acceptable values are:
507+
POSITION - matches by position. This assumes that the columns are ordered
508+
the same way as the schema.
509+
NAME - matches by name. This reads the header row as column names and
510+
reorders columns to match the field names in the schema.
495511
496512
See
497513
https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.source_column_match
498-
(Note: This field is documented under ExternalDataConfiguration in the REST API docs but seems
499-
more appropriate here for CSVOptions, matching the proto structure for external tables)
500514
"""
501515
return self._properties.get("sourceColumnMatch")
502516

google/cloud/bigquery/job/load.py

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from google.cloud.bigquery.job.base import _JobConfig
3131
from google.cloud.bigquery.job.base import _JobReference
3232
from google.cloud.bigquery.query import ConnectionProperty
33+
from google.cloud.bigquery.enums import SourceColumnMatch
3334

3435

3536
class ColumnNameCharacterMap:
@@ -550,8 +551,9 @@ def source_format(self, value):
550551

551552
@property
552553
def time_zone(self):
553-
"""Optional[str]: Default time zone that will apply when parsing
554-
timestamp values that have no specific time zone.
554+
"""Optional[str]: Default time zone that will apply when parsing timestamp
555+
values that have no specific time zone. This option is valid for CSV and
556+
JSON sources.
555557
556558
See:
557559
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.time_zone
@@ -565,6 +567,7 @@ def time_zone(self, value: Optional[str]):
565567
@property
566568
def date_format(self) -> Optional[str]:
567569
"""Optional[str]: Date format used for parsing DATE values.
570+
This option is valid for CSV and JSON sources.
568571
569572
See:
570573
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.date_format
@@ -578,6 +581,7 @@ def date_format(self, value: Optional[str]):
578581
@property
579582
def datetime_format(self) -> Optional[str]:
580583
"""Optional[str]: Date format used for parsing DATETIME values.
584+
This option is valid for CSV and JSON sources.
581585
582586
See:
583587
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.datetime_format
@@ -591,6 +595,7 @@ def datetime_format(self, value: Optional[str]):
591595
@property
592596
def time_format(self) -> Optional[str]:
593597
"""Optional[str]: Date format used for parsing TIME values.
598+
This option is valid for CSV and JSON sources.
594599
595600
See:
596601
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.time_format
@@ -604,6 +609,7 @@ def time_format(self, value: Optional[str]):
604609
@property
605610
def timestamp_format(self) -> Optional[str]:
606611
"""Optional[str]: Date format used for parsing TIMESTAMP values.
612+
This option is valid for CSV and JSON sources.
607613
608614
See:
609615
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.timestamp_format
@@ -620,6 +626,15 @@ def null_markers(self) -> Optional[List[str]]:
620626
621627
(CSV only).
622628
629+
null_marker and null_markers can't be set at the same time.
630+
If null_marker is set, null_markers has to be not set.
631+
If null_markers is set, null_marker has to be not set.
632+
If both null_marker and null_markers are set at the same time, a user
633+
error would be thrown.
634+
Any strings listed in null_markers, including the
635+
empty string, would be interpreted as SQL NULL. This applies to all column
636+
types.
637+
623638
See:
624639
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.null_markers
625640
"""
@@ -630,21 +645,30 @@ def null_markers(self, value: Optional[List[str]]):
630645
self._set_sub_prop("nullMarkers", value)
631646

632647
@property
633-
def source_column_name_match_option(self) -> Optional[str]:
634-
"""Optional[str]: Controls the strategy used to match loaded columns to the schema.
648+
def source_column_match_strategy(self) -> Optional[SourceColumnMatch]:
649+
"""Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls the strategy
650+
used to match loaded columns to the schema. If not set, a sensible default is
651+
chosen based on how the schema is provided. If autodetect is used, then
652+
columns are matched by name. Otherwise, columns are matched by position.
653+
This is done to keep the behavior backward-compatible.
635654
636655
(CSV only).
637-
Acceptable values are based on the SourceColumnMatch enum in the proto.
638-
Example values: "MATCH_BY_NAME", "MATCH_BY_POSITION".
639656
640657
See:
641-
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_column_match
658+
https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_column_match
642659
"""
643-
return self._get_sub_prop("sourceColumnMatch")
644-
645-
@source_column_name_match_option.setter
646-
def source_column_name_match_option(self, value: Optional[str]):
647-
self._set_sub_prop("sourceColumnMatch", value)
660+
value = self._get_sub_prop("sourceColumnMatchStrategy")
661+
if value is not None:
662+
return SourceColumnMatch(value)
663+
return None
664+
665+
@source_column_match_strategy.setter
666+
def source_column_match_strategy(self, value: Optional[SourceColumnMatch]):
667+
if value is not None and not isinstance(value, SourceColumnMatch):
668+
raise TypeError(
669+
"value must be a google.cloud.bigquery.enums.SourceColumnMatch or None"
670+
)
671+
self._set_sub_prop("sourceColumnMatchStrategy", value.value if value else None)
648672

649673
@property
650674
def time_partitioning(self):
@@ -1030,11 +1054,11 @@ def null_markers(self):
10301054
return self.configuration.null_markers
10311055

10321056
@property
1033-
def source_column_name_match_option(self):
1057+
def source_column_match_strategy(self):
10341058
"""See
1035-
:attr:`google.cloud.bigquery.job.LoadJobConfig.source_column_name_match_option`.
1059+
:attr:`google.cloud.bigquery.job.LoadJobConfig.source_column_match_strategy`.
10361060
"""
1037-
return self.configuration.source_column_name_match_option
1061+
return self.configuration.source_column_match_strategy
10381062

10391063
@property
10401064
def schema_update_options(self):

tests/unit/job/test_load.py

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,8 @@ def _setUpConstants(self):
4242
self.DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
4343
self.TIME_FORMAT = "%H:%M:%S"
4444
self.TIMESTAMP_FORMAT = "YYYY-MM-DD HH:MM:SS.SSSSSSZ"
45-
self.NULL_MARKERS = ["N/A", "\\N"]
46-
self.SOURCE_COLUMN_NAME_MATCH_OPTION = "MATCH_BY_NAME"
45+
self.NULL_MARKERS = ["N/A", "NA"]
46+
self.SOURCE_COLUMN_NAME_MATCH_OPTION = "NAME" # Corrected to actual enum value
4747

4848
def _make_resource(self, started=False, ended=False):
4949
resource = super(TestLoadJob, self)._make_resource(started, ended)
@@ -55,7 +55,7 @@ def _make_resource(self, started=False, ended=False):
5555
config["timeFormat"] = self.TIME_FORMAT
5656
config["timestampFormat"] = self.TIMESTAMP_FORMAT
5757
config["nullMarkers"] = self.NULL_MARKERS
58-
config["sourceColumnMatch"] = self.SOURCE_COLUMN_NAME_MATCH_OPTION
58+
config["sourceColumnMatchStrategy"] = self.SOURCE_COLUMN_NAME_MATCH_OPTION # Keep value as string for mock API repr
5959
config["destinationTable"] = {
6060
"projectId": self.PROJECT,
6161
"datasetId": self.DS_ID,
@@ -191,12 +191,14 @@ def _verifyResourceProperties(self, job, resource):
191191
self.assertEqual(job.null_markers, config["nullMarkers"])
192192
else:
193193
self.assertIsNone(job.null_markers)
194-
if "sourceColumnMatch" in config:
194+
if "sourceColumnMatchStrategy" in config:
195+
# job.source_column_match_strategy will be an Enum, config[...] is a string
195196
self.assertEqual(
196-
job.source_column_name_match_option, config["sourceColumnMatch"]
197+
job.source_column_match_strategy.value,
198+
config["sourceColumnMatchStrategy"],
197199
)
198200
else:
199-
self.assertIsNone(job.source_column_name_match_option)
201+
self.assertIsNone(job.source_column_match_strategy)
200202

201203
def test_ctor(self):
202204
client = _make_client(project=self.PROJECT)
@@ -245,7 +247,7 @@ def test_ctor(self):
245247
self.assertIsNone(job.time_format)
246248
self.assertIsNone(job.timestamp_format)
247249
self.assertIsNone(job.null_markers)
248-
self.assertIsNone(job.source_column_name_match_option)
250+
self.assertIsNone(job.source_column_match_strategy)
249251

250252
def test_ctor_w_config(self):
251253
from google.cloud.bigquery.schema import SchemaField
@@ -629,7 +631,7 @@ def test_begin_w_alternate_client(self):
629631
"timeFormat": self.TIME_FORMAT,
630632
"timestampFormat": self.TIMESTAMP_FORMAT,
631633
"nullMarkers": self.NULL_MARKERS,
632-
"sourceColumnMatch": self.SOURCE_COLUMN_NAME_MATCH_OPTION,
634+
"sourceColumnMatchStrategy": self.SOURCE_COLUMN_NAME_MATCH_OPTION, # Keep value as string for mock API repr
633635
}
634636
RESOURCE["configuration"]["load"] = LOAD_CONFIGURATION
635637
conn1 = make_connection()
@@ -664,7 +666,9 @@ def test_begin_w_alternate_client(self):
664666
config.time_format = self.TIME_FORMAT
665667
config.timestamp_format = self.TIMESTAMP_FORMAT
666668
config.null_markers = self.NULL_MARKERS
667-
config.source_column_name_match_option = self.SOURCE_COLUMN_NAME_MATCH_OPTION
669+
# Ensure we are setting with the Enum type if that's what the setter expects
670+
from google.cloud.bigquery.enums import SourceColumnMatch
671+
config.source_column_match_strategy = SourceColumnMatch(self.SOURCE_COLUMN_NAME_MATCH_OPTION)
668672
with mock.patch(
669673
"google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
670674
) as final_attributes:

tests/unit/job/test_load_config.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -924,21 +924,34 @@ def test_null_markers_setter(self):
924924
config.null_markers = null_markers
925925
self.assertEqual(config._properties["load"]["nullMarkers"], null_markers)
926926

927-
def test_source_column_name_match_option_missing(self):
927+
def test_source_column_match_strategy_missing(self):
928928
config = self._get_target_class()()
929-
self.assertIsNone(config.source_column_name_match_option)
929+
self.assertIsNone(config.source_column_match_strategy)
930930

931-
def test_source_column_name_match_option_hit(self):
932-
option = "MATCH_BY_NAME"
931+
def test_source_column_match_strategy_hit(self):
932+
from google.cloud.bigquery.enums import SourceColumnMatch
933+
934+
option_enum = SourceColumnMatch.NAME
933935
config = self._get_target_class()()
934-
config._properties["load"]["sourceColumnMatch"] = option
935-
self.assertEqual(config.source_column_name_match_option, option)
936+
# Assume API stores the string value of the enum
937+
config._properties["load"]["sourceColumnMatchStrategy"] = option_enum.value
938+
self.assertEqual(config.source_column_match_strategy, option_enum)
939+
940+
def test_source_column_match_strategy_setter(self):
941+
from google.cloud.bigquery.enums import SourceColumnMatch
936942

937-
def test_source_column_name_match_option_setter(self):
938-
option = "MATCH_BY_POSITION"
943+
option_enum = SourceColumnMatch.POSITION
939944
config = self._get_target_class()()
940-
config.source_column_name_match_option = option
941-
self.assertEqual(config._properties["load"]["sourceColumnMatch"], option)
945+
config.source_column_match_strategy = option_enum
946+
# Assert that the string value of the enum is stored
947+
self.assertEqual(
948+
config._properties["load"]["sourceColumnMatchStrategy"], option_enum.value
949+
)
950+
951+
def test_source_column_match_strategy_setter_invalid_type(self):
952+
config = self._get_target_class()()
953+
with self.assertRaises(TypeError):
954+
config.source_column_match_strategy = "INVALID_STRING_TYPE"
942955

943956
def test_parquet_options_missing(self):
944957
config = self._get_target_class()()

0 commit comments

Comments
 (0)