Skip to content

Commit 6d5d236

Browse files
authored
feat: Adds source_column_match and associated tests (#2227)
This commit introduces a new configuration option for BigQuery load jobs and external table definitions, aligning with recent updates to the underlying protos.

New option added:
- `source_column_match`: Controls how source columns are matched to the schema. (Applies to `LoadJobConfig`, `LoadJob`, and the `CSVOptions` of an `ExternalConfig`.)

Changes include:
- Added the corresponding property to `LoadJobConfig` and `CSVOptions` (getter and setter) and to `LoadJob` (read-only).
- Updated docstrings and type hints for the new attribute.
- Updated unit tests to cover the new option, ensuring it is correctly handled during object initialization, serialization to API representation, and deserialization from API responses.
1 parent 289446d commit 6d5d236

File tree

6 files changed

+183
-2
lines changed

6 files changed

+183
-2
lines changed

google/cloud/bigquery/enums.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -462,3 +462,21 @@ class JobCreationMode(object):
462462
The conditions under which BigQuery can decide to not create a Job are
463463
subject to change.
464464
"""
465+
466+
467+
class SourceColumnMatch(str, enum.Enum):
    """Strategy used to match loaded columns to the fields of the schema.

    When no strategy is specified, a sensible default is chosen based on how
    the schema is provided: if autodetect is used, columns are matched by
    name; otherwise, columns are matched by position. This keeps the
    behavior backward-compatible.
    """

    SOURCE_COLUMN_MATCH_UNSPECIFIED = "SOURCE_COLUMN_MATCH_UNSPECIFIED"
    """Unspecified column name match option."""

    POSITION = "POSITION"
    """Matches by position. This assumes that the columns are ordered the same
    way as the schema."""

    NAME = "NAME"
    """Matches by name. This reads the header row as column names and reorders
    columns to match the field names in the schema."""

google/cloud/bigquery/external_config.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from google.cloud.bigquery._helpers import _int_or_none
3131
from google.cloud.bigquery._helpers import _str_or_none
3232
from google.cloud.bigquery import _helpers
33+
from google.cloud.bigquery.enums import SourceColumnMatch
3334
from google.cloud.bigquery.format_options import AvroOptions, ParquetOptions
3435
from google.cloud.bigquery import schema
3536
from google.cloud.bigquery.schema import SchemaField
@@ -474,6 +475,39 @@ def skip_leading_rows(self):
474475
def skip_leading_rows(self, value):
475476
self._properties["skipLeadingRows"] = str(value)
476477

478+
@property
def source_column_match(self) -> Optional[SourceColumnMatch]:
    """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls the
    strategy used to match loaded columns to the schema. If not set, a sensible
    default is chosen based on how the schema is provided. If autodetect is
    used, then columns are matched by name. Otherwise, columns are matched by
    position. This is done to keep the behavior backward-compatible.

    Acceptable values are:

        SOURCE_COLUMN_MATCH_UNSPECIFIED: Unspecified column name match option.
        POSITION: matches by position. This assumes that the columns are ordered
        the same way as the schema.
        NAME: matches by name. This reads the header row as column names and
        reorders columns to match the field names in the schema.

    See
    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#CsvOptions.FIELDS.source_column_match

    Raises:
        TypeError: On assignment, if the value is not a ``SourceColumnMatch``,
            a ``str``, or ``None``.
        ValueError: On assignment, if a non-empty string is not the value of a
            ``SourceColumnMatch`` member; on read, if the stored string is
            not the value of a ``SourceColumnMatch`` member.
    """
    value = self._properties.get("sourceColumnMatch")
    return SourceColumnMatch(value) if value is not None else None


@source_column_match.setter
def source_column_match(self, value: Union[SourceColumnMatch, str, None]):
    if value is not None and not isinstance(value, (SourceColumnMatch, str)):
        raise TypeError(
            "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None"
        )
    # ``None`` and the empty string both clear the option. Anything else is
    # normalized through the enum so that an unknown string is rejected here
    # instead of being stored and only failing when the property is read back.
    normalized = SourceColumnMatch(value).value if value else None
    self._properties["sourceColumnMatch"] = normalized
510+
477511
@property
478512
def null_markers(self) -> Optional[Iterable[str]]:
479513
"""Optional[Iterable[str]]: A list of strings represented as SQL NULL values in a CSV file.

google/cloud/bigquery/job/load.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,10 @@
1515
"""Classes for load jobs."""
1616

1717
import typing
18-
from typing import FrozenSet, List, Iterable, Optional
18+
from typing import FrozenSet, List, Iterable, Optional, Union
1919

2020
from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
21+
from google.cloud.bigquery.enums import SourceColumnMatch
2122
from google.cloud.bigquery.external_config import HivePartitioningOptions
2223
from google.cloud.bigquery.format_options import ParquetOptions
2324
from google.cloud.bigquery import _helpers
@@ -569,6 +570,39 @@ def source_format(self):
569570
def source_format(self, value):
570571
self._set_sub_prop("sourceFormat", value)
571572

573+
@property
def source_column_match(self) -> Optional[SourceColumnMatch]:
    """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls the
    strategy used to match loaded columns to the schema. If not set, a sensible
    default is chosen based on how the schema is provided. If autodetect is
    used, then columns are matched by name. Otherwise, columns are matched by
    position. This is done to keep the behavior backward-compatible.

    Acceptable values are:

        SOURCE_COLUMN_MATCH_UNSPECIFIED: Unspecified column name match option.
        POSITION: matches by position. This assumes that the columns are ordered
        the same way as the schema.
        NAME: matches by name. This reads the header row as column names and
        reorders columns to match the field names in the schema.

    See:

    https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_column_match

    Raises:
        TypeError: On assignment, if the value is not a ``SourceColumnMatch``,
            a ``str``, or ``None``.
        ValueError: On assignment, if a non-empty string is not the value of a
            ``SourceColumnMatch`` member; on read, if the stored string is
            not the value of a ``SourceColumnMatch`` member.
    """
    value = self._get_sub_prop("sourceColumnMatch")
    return SourceColumnMatch(value) if value is not None else None


@source_column_match.setter
def source_column_match(self, value: Union[SourceColumnMatch, str, None]):
    if value is not None and not isinstance(value, (SourceColumnMatch, str)):
        raise TypeError(
            "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None"
        )
    # ``None`` and the empty string both clear the option. Anything else is
    # normalized through the enum so that an unknown string is rejected here
    # instead of being stored and only failing when the property is read back.
    normalized = SourceColumnMatch(value).value if value else None
    self._set_sub_prop("sourceColumnMatch", normalized)
self._set_sub_prop("sourceColumnMatch", value if value else None)
605+
572606
@property
573607
def date_format(self) -> Optional[str]:
574608
"""Optional[str]: Date format used for parsing DATE values.
@@ -983,6 +1017,13 @@ def clustering_fields(self):
9831017
"""
9841018
return self.configuration.clustering_fields
9851019

1020+
@property
def source_column_match(self) -> Optional[SourceColumnMatch]:
    """Column matching strategy of this job, read from its configuration.

    See
    :attr:`google.cloud.bigquery.job.LoadJobConfig.source_column_match`.
    """
    load_config = self.configuration
    return load_config.source_column_match
1026+
9861027
@property
9871028
def date_format(self):
9881029
"""See

tests/unit/job/test_load.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919

2020
from .helpers import _Base
2121
from .helpers import _make_client
22+
from google.cloud.bigquery.enums import SourceColumnMatch
2223

2324

2425
class TestLoadJob(_Base):
@@ -37,6 +38,7 @@ def _setUpConstants(self):
3738
self.OUTPUT_BYTES = 23456
3839
self.OUTPUT_ROWS = 345
3940
self.REFERENCE_FILE_SCHEMA_URI = "gs://path/to/reference"
41+
self.SOURCE_COLUMN_MATCH = "NAME"
4042
self.DATE_FORMAT = "%Y-%m-%d"
4143
self.DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
4244
self.TIME_ZONE = "UTC"
@@ -48,6 +50,7 @@ def _make_resource(self, started=False, ended=False):
4850
resource = super(TestLoadJob, self)._make_resource(started, ended)
4951
config = resource["configuration"]["load"]
5052
config["sourceUris"] = [self.SOURCE1]
53+
config["sourceColumnMatch"] = self.SOURCE_COLUMN_MATCH
5154
config["dateFormat"] = self.DATE_FORMAT
5255
config["datetimeFormat"] = self.DATETIME_FORMAT
5356
config["timeZone"] = self.TIME_ZONE
@@ -189,6 +192,15 @@ def _verifyResourceProperties(self, job, resource):
189192
else:
190193
self.assertIsNone(job.timestamp_format)
191194

195+
if "sourceColumnMatch" in config:
196+
# job.source_column_match will be an Enum, config[...] is a string
197+
self.assertEqual(
198+
job.source_column_match.value,
199+
config["sourceColumnMatch"],
200+
)
201+
else:
202+
self.assertIsNone(job.source_column_match)
203+
192204
def test_ctor(self):
193205
client = _make_client(project=self.PROJECT)
194206
job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client)
@@ -231,6 +243,7 @@ def test_ctor(self):
231243
self.assertIsNone(job.clustering_fields)
232244
self.assertIsNone(job.schema_update_options)
233245
self.assertIsNone(job.reference_file_schema_uri)
246+
self.assertIsNone(job.source_column_match)
234247
self.assertIsNone(job.date_format)
235248
self.assertIsNone(job.datetime_format)
236249
self.assertIsNone(job.time_zone)
@@ -631,6 +644,7 @@ def test_begin_w_alternate_client(self):
631644
]
632645
},
633646
"schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION],
647+
"sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
634648
"dateFormat": self.DATE_FORMAT,
635649
"datetimeFormat": self.DATETIME_FORMAT,
636650
"timeZone": self.TIME_ZONE,
@@ -665,6 +679,7 @@ def test_begin_w_alternate_client(self):
665679
config.write_disposition = WriteDisposition.WRITE_TRUNCATE
666680
config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_ADDITION]
667681
config.reference_file_schema_uri = "gs://path/to/reference"
682+
config.source_column_match = SourceColumnMatch(self.SOURCE_COLUMN_MATCH)
668683
config.date_format = self.DATE_FORMAT
669684
config.datetime_format = self.DATETIME_FORMAT
670685
config.time_zone = self.TIME_ZONE

tests/unit/job/test_load_config.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -844,6 +844,38 @@ def test_write_disposition_setter(self):
844844
config._properties["load"]["writeDisposition"], write_disposition
845845
)
846846

847+
def test_source_column_match_missing(self):
    """A freshly constructed config reports no column matching strategy."""
    target_class = self._get_target_class()
    self.assertIsNone(target_class().source_column_match)
850+
851+
def test_source_column_match_hit(self):
    """A raw string in the load properties is read back as the enum member."""
    from google.cloud.bigquery.enums import SourceColumnMatch

    expected = SourceColumnMatch.NAME
    config = self._get_target_class()()
    # Seed the underlying resource with the enum's plain string value, which
    # is the form this test assumes the API stores.
    config._properties["load"]["sourceColumnMatch"] = expected.value
    self.assertEqual(config.source_column_match, expected)
859+
860+
def test_source_column_match_setter(self):
    """Both enum members and plain strings are stored as plain strings."""
    from google.cloud.bigquery.enums import SourceColumnMatch

    config = self._get_target_class()()
    # (value assigned to the property, string expected in the resource)
    cases = (
        (SourceColumnMatch.POSITION, SourceColumnMatch.POSITION.value),
        ("NAME", "NAME"),
    )
    for assigned, stored in cases:
        config.source_column_match = assigned
        self.assertEqual(config._properties["load"]["sourceColumnMatch"], stored)
873+
874+
def test_source_column_match_setter_invalid_type(self):
    """Assigning a value that is neither enum, str, nor None raises TypeError."""
    config = self._get_target_class()()
    self.assertRaises(TypeError, setattr, config, "source_column_match", 3.14)
878+
847879
def test_date_format_missing(self):
848880
config = self._get_target_class()()
849881
self.assertIsNone(config.date_format)

tests/unit/test_external_config.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,14 @@
1919

2020
from google.cloud.bigquery import external_config
2121
from google.cloud.bigquery import schema
22+
from google.cloud.bigquery.enums import SourceColumnMatch
2223

2324
import pytest
2425

2526

2627
class TestExternalConfig(unittest.TestCase):
2728
SOURCE_URIS = ["gs://foo", "gs://bar"]
29+
SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME
2830
DATE_FORMAT = "MM/DD/YYYY"
2931
DATETIME_FORMAT = "MM/DD/YYYY HH24:MI:SS"
3032
TIME_ZONE = "America/Los_Angeles"
@@ -277,6 +279,7 @@ def test_from_api_repr_csv(self):
277279
"allowJaggedRows": False,
278280
"encoding": "encoding",
279281
"preserveAsciiControlCharacters": False,
282+
"sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
280283
"nullMarkers": ["", "NA"],
281284
},
282285
},
@@ -294,6 +297,10 @@ def test_from_api_repr_csv(self):
294297
self.assertEqual(ec.options.allow_jagged_rows, False)
295298
self.assertEqual(ec.options.encoding, "encoding")
296299
self.assertEqual(ec.options.preserve_ascii_control_characters, False)
300+
self.assertEqual(
301+
ec.options.source_column_match,
302+
self.SOURCE_COLUMN_MATCH,
303+
)
297304
self.assertEqual(ec.options.null_markers, ["", "NA"])
298305

299306
got_resource = ec.to_api_repr()
@@ -316,7 +323,9 @@ def test_to_api_repr_csv(self):
316323
options.skip_leading_rows = 123
317324
options.allow_jagged_rows = False
318325
options.preserve_ascii_control_characters = False
326+
options.source_column_match = self.SOURCE_COLUMN_MATCH
319327
options.null_markers = ["", "NA"]
328+
320329
ec.csv_options = options
321330

322331
exp_resource = {
@@ -329,6 +338,7 @@ def test_to_api_repr_csv(self):
329338
"allowJaggedRows": False,
330339
"encoding": "encoding",
331340
"preserveAsciiControlCharacters": False,
341+
"sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
332342
"nullMarkers": ["", "NA"],
333343
},
334344
}
@@ -881,7 +891,9 @@ def test_to_api_repr(self):
881891
)
882892

883893

884-
class CSVOptions(unittest.TestCase):
894+
class TestCSVOptions(unittest.TestCase):
895+
SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME
896+
885897
def test_to_api_repr(self):
886898
options = external_config.CSVOptions()
887899
options.field_delimiter = "\t"
@@ -891,6 +903,7 @@ def test_to_api_repr(self):
891903
options.allow_jagged_rows = False
892904
options.encoding = "UTF-8"
893905
options.preserve_ascii_control_characters = False
906+
options.source_column_match = self.SOURCE_COLUMN_MATCH
894907

895908
resource = options.to_api_repr()
896909

@@ -904,9 +917,37 @@ def test_to_api_repr(self):
904917
"allowJaggedRows": False,
905918
"encoding": "UTF-8",
906919
"preserveAsciiControlCharacters": False,
920+
"sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
907921
},
908922
)
909923

924+
def test_source_column_match_None(self):
    """Assigning None leaves the option unset when read back."""
    options = external_config.CSVOptions()
    options.source_column_match = None
    self.assertEqual(None, options.source_column_match)
930+
931+
def test_source_column_match_valid_input(self):
    """Enum members and their string values both read back as enum members."""
    options = external_config.CSVOptions()

    options.source_column_match = SourceColumnMatch.NAME
    result = options.source_column_match
    # Identity check: comparing against "NAME" alone would also pass for a
    # plain str, so it would not verify the conversion to the enum.
    self.assertIs(result, SourceColumnMatch.NAME)
    self.assertEqual("NAME", result)

    options.source_column_match = "POSITION"
    result = options.source_column_match
    self.assertIs(result, SourceColumnMatch.POSITION)
    self.assertEqual("POSITION", result)
942+
943+
def test_source_column_match_invalid_input(self):
    """A float is rejected with the documented TypeError message."""
    options = external_config.CSVOptions()
    expected_message = (
        "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None"
    )
    with self.assertRaisesRegex(TypeError, expected_message):
        options.source_column_match = 3.14
950+
910951

911952
class TestGoogleSheetsOptions(unittest.TestCase):
912953
def test_to_api_repr(self):

0 commit comments

Comments
 (0)