feat: Adds source_column_match and associated tests (#2227)

chalmerlowe · web-flow · commit 6d5d23685cd4 · 2025-07-15T22:04:24.000Z
This commit introduces new configuration options for BigQuery load jobs and external table definitions, aligning with recent updates to the underlying protos.

New options added:

- `source_column_match`: Controls how source columns are matched to the schema. (Applies to LoadJobConfig, LoadJob, and the CSVOptions of an ExternalConfig)

Changes include:
- Added a `source_column_match` property (getter/setter) to `LoadJobConfig` and to `CSVOptions` (the CSV options of an `ExternalConfig`), plus a read-only property on `LoadJob`.
- Updated docstrings and type hints for all new attributes.
- Updated unit tests to cover the new options, ensuring they are correctly handled during object initialization, serialization to API representation, and deserialization from API responses.
diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py
@@ -462,3 +462,21 @@ class JobCreationMode(object):
     The conditions under which BigQuery can decide to not create a Job are
     subject to change.
     """
+
+
+class SourceColumnMatch(str, enum.Enum):
+    """Strategy used to match loaded source columns to the table schema.
+    When no strategy is set, columns are matched by name if autodetect is used
+    and by position otherwise, which keeps the behavior backward-compatible.
+    """
+
+    SOURCE_COLUMN_MATCH_UNSPECIFIED = "SOURCE_COLUMN_MATCH_UNSPECIFIED"
+    """Unspecified column name match option."""
+
+    POSITION = "POSITION"
+    """Matches by position. This assumes that the columns are ordered the same
+    way as the schema."""
+
+    NAME = "NAME"
+    """Matches by name. This reads the header row as column names and reorders
+    columns to match the field names in the schema."""
diff --git a/google/cloud/bigquery/external_config.py b/google/cloud/bigquery/external_config.py
@@ -30,6 +30,7 @@
 from google.cloud.bigquery._helpers import _int_or_none
 from google.cloud.bigquery._helpers import _str_or_none
 from google.cloud.bigquery import _helpers
+from google.cloud.bigquery.enums import SourceColumnMatch
 from google.cloud.bigquery.format_options import AvroOptions, ParquetOptions
 from google.cloud.bigquery import schema
 from google.cloud.bigquery.schema import SchemaField
@@ -474,6 +475,39 @@ def skip_leading_rows(self):
     def skip_leading_rows(self, value):
         self._properties["skipLeadingRows"] = str(value)
 
+    @property
+    def source_column_match(self) -> Optional[SourceColumnMatch]:
+        """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls the
+        strategy used to match loaded columns to the schema. If not set, columns
+        are matched by name when autodetect is used and by position otherwise,
+        which keeps the behavior backward-compatible.
+
+        Acceptable values are:
+
+            SOURCE_COLUMN_MATCH_UNSPECIFIED: Unspecified column name match option.
+            POSITION: matches by position. This assumes that the columns are ordered
+            the same way as the schema.
+            NAME: matches by name. This reads the header row as column names and
+            reorders columns to match the field names in the schema.
+
+        The setter accepts an enum member, a str, or None (else TypeError); an
+        empty string is stored as None. Reading raises ValueError if the stored
+        string is not a valid member. See
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#CsvOptions.FIELDS.source_column_match
+        """
+        value = self._properties.get("sourceColumnMatch")
+        return SourceColumnMatch(value) if value is not None else None
+
+    @source_column_match.setter
+    def source_column_match(self, value: Union[SourceColumnMatch, str, None]):
+        if value is not None and not isinstance(value, (SourceColumnMatch, str)):
+            raise TypeError(
+                "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None"
+            )
+        if isinstance(value, SourceColumnMatch):
+            value = value.value
+        self._properties["sourceColumnMatch"] = value if value else None
+
     @property
     def null_markers(self) -> Optional[Iterable[str]]:
         """Optional[Iterable[str]]: A list of strings represented as SQL NULL values in a CSV file.
diff --git a/google/cloud/bigquery/job/load.py b/google/cloud/bigquery/job/load.py
@@ -15,9 +15,10 @@
 """Classes for load jobs."""
 
 import typing
-from typing import FrozenSet, List, Iterable, Optional
+from typing import FrozenSet, List, Iterable, Optional, Union
 
 from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
+from google.cloud.bigquery.enums import SourceColumnMatch
 from google.cloud.bigquery.external_config import HivePartitioningOptions
 from google.cloud.bigquery.format_options import ParquetOptions
 from google.cloud.bigquery import _helpers
@@ -569,6 +570,39 @@ def source_format(self):
     def source_format(self, value):
         self._set_sub_prop("sourceFormat", value)
 
+    @property
+    def source_column_match(self) -> Optional[SourceColumnMatch]:
+        """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls the
+        strategy used to match loaded columns to the schema. If not set, columns
+        are matched by name when autodetect is used and by position otherwise,
+        which keeps the behavior backward-compatible.
+
+        Acceptable values are:
+
+            SOURCE_COLUMN_MATCH_UNSPECIFIED: Unspecified column name match option.
+            POSITION: matches by position. This assumes that the columns are ordered
+            the same way as the schema.
+            NAME: matches by name. This reads the header row as column names and
+            reorders columns to match the field names in the schema.
+
+        The setter accepts an enum member, a str, or None (else TypeError); an
+        empty string is replaced by None. Reading raises ValueError if the stored
+        string is not a valid member. See
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_column_match
+        """
+        value = self._get_sub_prop("sourceColumnMatch")
+        return SourceColumnMatch(value) if value is not None else None
+
+    @source_column_match.setter
+    def source_column_match(self, value: Union[SourceColumnMatch, str, None]):
+        if value is not None and not isinstance(value, (SourceColumnMatch, str)):
+            raise TypeError(
+                "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None"
+            )
+        if isinstance(value, SourceColumnMatch):
+            value = value.value
+        self._set_sub_prop("sourceColumnMatch", value if value else None)
+
     @property
     def date_format(self) -> Optional[str]:
         """Optional[str]: Date format used for parsing DATE values.
@@ -983,6 +1017,13 @@ def clustering_fields(self):
         """
         return self.configuration.clustering_fields
 
+    @property
+    def source_column_match(self) -> Optional[SourceColumnMatch]:
+        """Optional[SourceColumnMatch]: Value from this job's configuration. See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.source_column_match`.
+        """
+        return self.configuration.source_column_match
+
     @property
     def date_format(self):
         """See
diff --git a/tests/unit/job/test_load.py b/tests/unit/job/test_load.py
@@ -19,6 +19,7 @@
 
 from .helpers import _Base
 from .helpers import _make_client
+from google.cloud.bigquery.enums import SourceColumnMatch
 
 
 class TestLoadJob(_Base):
@@ -37,6 +38,7 @@ def _setUpConstants(self):
         self.OUTPUT_BYTES = 23456
         self.OUTPUT_ROWS = 345
         self.REFERENCE_FILE_SCHEMA_URI = "gs://path/to/reference"
+        self.SOURCE_COLUMN_MATCH = "NAME"
         self.DATE_FORMAT = "%Y-%m-%d"
         self.DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
         self.TIME_ZONE = "UTC"
@@ -48,6 +50,7 @@ def _make_resource(self, started=False, ended=False):
         resource = super(TestLoadJob, self)._make_resource(started, ended)
         config = resource["configuration"]["load"]
         config["sourceUris"] = [self.SOURCE1]
+        config["sourceColumnMatch"] = self.SOURCE_COLUMN_MATCH
         config["dateFormat"] = self.DATE_FORMAT
         config["datetimeFormat"] = self.DATETIME_FORMAT
         config["timeZone"] = self.TIME_ZONE
@@ -189,6 +192,15 @@ def _verifyResourceProperties(self, job, resource):
         else:
             self.assertIsNone(job.timestamp_format)
 
+        if "sourceColumnMatch" in config:
+            # job.source_column_match will be an Enum, config[...] is a string
+            self.assertEqual(
+                job.source_column_match.value,
+                config["sourceColumnMatch"],
+            )
+        else:
+            self.assertIsNone(job.source_column_match)
+
     def test_ctor(self):
         client = _make_client(project=self.PROJECT)
         job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client)
@@ -231,6 +243,7 @@ def test_ctor(self):
         self.assertIsNone(job.clustering_fields)
         self.assertIsNone(job.schema_update_options)
         self.assertIsNone(job.reference_file_schema_uri)
+        self.assertIsNone(job.source_column_match)
         self.assertIsNone(job.date_format)
         self.assertIsNone(job.datetime_format)
         self.assertIsNone(job.time_zone)
@@ -631,6 +644,7 @@ def test_begin_w_alternate_client(self):
                 ]
             },
             "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION],
+            "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
             "dateFormat": self.DATE_FORMAT,
             "datetimeFormat": self.DATETIME_FORMAT,
             "timeZone": self.TIME_ZONE,
@@ -665,6 +679,7 @@ def test_begin_w_alternate_client(self):
         config.write_disposition = WriteDisposition.WRITE_TRUNCATE
         config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_ADDITION]
         config.reference_file_schema_uri = "gs://path/to/reference"
+        config.source_column_match = SourceColumnMatch(self.SOURCE_COLUMN_MATCH)
         config.date_format = self.DATE_FORMAT
         config.datetime_format = self.DATETIME_FORMAT
         config.time_zone = self.TIME_ZONE
diff --git a/tests/unit/job/test_load_config.py b/tests/unit/job/test_load_config.py
@@ -844,6 +844,38 @@ def test_write_disposition_setter(self):
             config._properties["load"]["writeDisposition"], write_disposition
         )
 
+    def test_source_column_match_missing(self):
+        # A freshly constructed config has no value set.
+        self.assertIsNone(self._get_target_class()().source_column_match)
+
+    def test_source_column_match_hit(self):
+        from google.cloud.bigquery.enums import SourceColumnMatch
+
+        option_enum = SourceColumnMatch.NAME
+        config = self._get_target_class()()
+        # Seed the raw resource with the enum's string value; the getter converts it back.
+        config._properties["load"]["sourceColumnMatch"] = option_enum.value
+        self.assertEqual(config.source_column_match, option_enum)
+
+    def test_source_column_match_setter(self):
+        from google.cloud.bigquery.enums import SourceColumnMatch
+
+        option_enum = SourceColumnMatch.POSITION
+        config = self._get_target_class()()
+        config.source_column_match = option_enum
+        # An enum member is stored as its string value; a plain str is stored as given.
+        self.assertEqual(
+            config._properties["load"]["sourceColumnMatch"], option_enum.value
+        )
+        option_str = "NAME"
+        config.source_column_match = option_str
+        self.assertEqual(config._properties["load"]["sourceColumnMatch"], option_str)
+
+    def test_source_column_match_setter_invalid_type(self):
+        config = self._get_target_class()()
+        # A float is neither an enum member, a str, nor None.
+        self.assertRaises(TypeError, setattr, config, "source_column_match", 3.14)
+
     def test_date_format_missing(self):
         config = self._get_target_class()()
         self.assertIsNone(config.date_format)
diff --git a/tests/unit/test_external_config.py b/tests/unit/test_external_config.py
@@ -19,12 +19,14 @@
 
 from google.cloud.bigquery import external_config
 from google.cloud.bigquery import schema
+from google.cloud.bigquery.enums import SourceColumnMatch
 
 import pytest
 
 
 class TestExternalConfig(unittest.TestCase):
     SOURCE_URIS = ["gs://foo", "gs://bar"]
+    SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME
     DATE_FORMAT = "MM/DD/YYYY"
     DATETIME_FORMAT = "MM/DD/YYYY HH24:MI:SS"
     TIME_ZONE = "America/Los_Angeles"
@@ -277,6 +279,7 @@ def test_from_api_repr_csv(self):
                     "allowJaggedRows": False,
                     "encoding": "encoding",
                     "preserveAsciiControlCharacters": False,
+                    "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
                     "nullMarkers": ["", "NA"],
                 },
             },
@@ -294,6 +297,10 @@ def test_from_api_repr_csv(self):
         self.assertEqual(ec.options.allow_jagged_rows, False)
         self.assertEqual(ec.options.encoding, "encoding")
         self.assertEqual(ec.options.preserve_ascii_control_characters, False)
+        self.assertEqual(
+            ec.options.source_column_match,
+            self.SOURCE_COLUMN_MATCH,
+        )
         self.assertEqual(ec.options.null_markers, ["", "NA"])
 
         got_resource = ec.to_api_repr()
@@ -316,7 +323,9 @@ def test_to_api_repr_csv(self):
         options.skip_leading_rows = 123
         options.allow_jagged_rows = False
         options.preserve_ascii_control_characters = False
+        options.source_column_match = self.SOURCE_COLUMN_MATCH
         options.null_markers = ["", "NA"]
+
         ec.csv_options = options
 
         exp_resource = {
@@ -329,6 +338,7 @@ def test_to_api_repr_csv(self):
                 "allowJaggedRows": False,
                 "encoding": "encoding",
                 "preserveAsciiControlCharacters": False,
+                "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
                 "nullMarkers": ["", "NA"],
             },
         }
@@ -881,7 +891,9 @@ def test_to_api_repr(self):
         )
 
 
-class CSVOptions(unittest.TestCase):
+class TestCSVOptions(unittest.TestCase):
+    SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME
+
     def test_to_api_repr(self):
         options = external_config.CSVOptions()
         options.field_delimiter = "\t"
@@ -891,6 +903,7 @@ def test_to_api_repr(self):
         options.allow_jagged_rows = False
         options.encoding = "UTF-8"
         options.preserve_ascii_control_characters = False
+        options.source_column_match = self.SOURCE_COLUMN_MATCH
 
         resource = options.to_api_repr()
 
@@ -904,9 +917,37 @@ def test_to_api_repr(self):
                 "allowJaggedRows": False,
                 "encoding": "UTF-8",
                 "preserveAsciiControlCharacters": False,
+                "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
             },
         )
 
+    def test_source_column_match_None(self):
+        # Assigning None must read back as None through the property.
+        options = external_config.CSVOptions()
+        options.source_column_match = None
+
+        self.assertEqual(None, options.source_column_match)
+
+    def test_source_column_match_valid_input(self):
+        ec = external_config.CSVOptions()
+
+        # An enum member is stored as its string value and read back as the member.
+        ec.source_column_match = SourceColumnMatch.NAME
+        self.assertIs(ec.source_column_match, SourceColumnMatch.NAME)
+        self.assertEqual(ec._properties["sourceColumnMatch"], "NAME")
+
+        # A plain string is accepted too and is read back as the enum member.
+        ec.source_column_match = "POSITION"
+        self.assertIs(ec.source_column_match, SourceColumnMatch.POSITION)
+
+    def test_source_column_match_invalid_input(self):
+        options = external_config.CSVOptions()
+        expected_message = (
+            "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None"
+        )
+        with self.assertRaisesRegex(TypeError, expected_message):
+            options.source_column_match = 3.14
+
 
 class TestGoogleSheetsOptions(unittest.TestCase):
     def test_to_api_repr(self):