Commit fc3dbf7

Adds source_column_match and associated tests
1 parent 37e4e0e commit fc3dbf7

6 files changed: 157 additions, 0 deletions

google/cloud/bigquery/enums.py

Lines changed: 18 additions & 0 deletions
@@ -462,3 +462,21 @@ class JobCreationMode(object):
    The conditions under which BigQuery can decide to not create a Job are
    subject to change.
    """


class SourceColumnMatch(str, enum.Enum):
    """Indicates the strategy used to match loaded columns to the schema.

    If not set, a sensible default is chosen based on how the schema is
    provided: if autodetect is used, columns are matched by name; otherwise,
    columns are matched by position. This keeps the behavior
    backward-compatible.
    """

    SOURCE_COLUMN_MATCH_UNSPECIFIED = "SOURCE_COLUMN_MATCH_UNSPECIFIED"
    """Unspecified column name match option."""

    POSITION = "POSITION"
    """Matches by position. This assumes that the columns are ordered the same
    way as the schema."""

    NAME = "NAME"
    """Matches by name. This reads the header row as column names and reorders
    columns to match the field names in the schema."""
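
Because the enum subclasses str, its members compare equal to the raw strings the API uses and can be rebuilt from them. A minimal sketch (not part of this commit) of that round-trip:

from google.cloud.bigquery.enums import SourceColumnMatch

# Members of a str-based enum equal their raw string values...
assert SourceColumnMatch.NAME == "NAME"
assert SourceColumnMatch.NAME.value == "NAME"
# ...and can be reconstructed from strings returned in API payloads.
assert SourceColumnMatch("POSITION") is SourceColumnMatch.POSITION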

google/cloud/bigquery/external_config.py

Lines changed: 31 additions & 0 deletions
@@ -30,6 +30,7 @@
from google.cloud.bigquery._helpers import _int_or_none
from google.cloud.bigquery._helpers import _str_or_none
from google.cloud.bigquery import _helpers
from google.cloud.bigquery.enums import SourceColumnMatch
from google.cloud.bigquery.format_options import AvroOptions, ParquetOptions
from google.cloud.bigquery import schema
from google.cloud.bigquery.schema import SchemaField
@@ -474,6 +475,36 @@ def skip_leading_rows(self):
    def skip_leading_rows(self, value):
        self._properties["skipLeadingRows"] = str(value)

    @property
    def source_column_match(self) -> Optional[SourceColumnMatch]:
        """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls
        the strategy used to match loaded columns to the schema.

        If not set, a sensible default is chosen based on how the schema is
        provided. If autodetect is used, then columns are matched by name.
        Otherwise, columns are matched by position. This is done to keep the
        behavior backward-compatible.

        Acceptable values are:

        * SOURCE_COLUMN_MATCH_UNSPECIFIED - Unspecified column name match option.
        * POSITION - Matches by position. This assumes that the columns are
          ordered the same way as the schema.
        * NAME - Matches by name. This reads the header row as column names
          and reorders columns to match the field names in the schema.

        See
        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.source_column_match
        """
        value = self._properties.get("sourceColumnMatch")
        if value is not None:
            return SourceColumnMatch(value)
        return None

    @source_column_match.setter
    def source_column_match(self, value: Optional[SourceColumnMatch]):
        if value is not None and not isinstance(value, SourceColumnMatch):
            raise TypeError(
                "value must be a google.cloud.bigquery.enums.SourceColumnMatch or None"
            )
        self._properties["sourceColumnMatch"] = value.value if value else None

    def to_api_repr(self) -> dict:
        """Build an API representation of this object.
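
Mirroring the pattern used in the new tests, a hedged sketch (bucket and URI below are placeholders, not part of this change) of configuring the option for CSV external data:

from google.cloud.bigquery import external_config
from google.cloud.bigquery.enums import SourceColumnMatch

options = external_config.CSVOptions()
options.skip_leading_rows = 1
options.source_column_match = SourceColumnMatch.NAME  # match header names to the schema

ec = external_config.ExternalConfig("CSV")
ec.source_uris = ["gs://example-bucket/data.csv"]  # placeholder URI
ec.csv_options = options

# The setter stores the enum's string value in the API payload.
assert ec.to_api_repr()["csvOptions"]["sourceColumnMatch"] == "NAME"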

google/cloud/bigquery/job/load.py

Lines changed: 37 additions & 0 deletions
@@ -18,6 +18,7 @@
from typing import FrozenSet, List, Iterable, Optional

from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
from google.cloud.bigquery.enums import SourceColumnMatch
from google.cloud.bigquery.external_config import HivePartitioningOptions
from google.cloud.bigquery.format_options import ParquetOptions
from google.cloud.bigquery import _helpers
@@ -548,6 +549,35 @@ def source_format(self):
    def source_format(self, value):
        self._set_sub_prop("sourceFormat", value)

    @property
    def source_column_match(self) -> Optional[SourceColumnMatch]:
        """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls
        the strategy used to match loaded columns to the schema.

        If not set, a sensible default is chosen based on how the schema is
        provided. If autodetect is used, then columns are matched by name.
        Otherwise, columns are matched by position. This is done to keep the
        behavior backward-compatible.

        Acceptable values are:

        * SOURCE_COLUMN_MATCH_UNSPECIFIED - Unspecified column name match option.
        * POSITION - Matches by position. This assumes that the columns are
          ordered the same way as the schema.
        * NAME - Matches by name. This reads the header row as column names
          and reorders columns to match the field names in the schema.

        See:
        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_column_match
        """
        value = self._get_sub_prop("sourceColumnMatch")
        if value is not None:
            return SourceColumnMatch(value)
        return None

    @source_column_match.setter
    def source_column_match(self, value: Optional[SourceColumnMatch]):
        if value is not None and not isinstance(value, SourceColumnMatch):
            raise TypeError(
                "value must be a google.cloud.bigquery.enums.SourceColumnMatch or None"
            )
        self._set_sub_prop("sourceColumnMatch", value.value if value else None)

    @property
    def time_partitioning(self):
        """Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies time-based
@@ -889,6 +919,13 @@ def clustering_fields(self):
        """
        return self.configuration.clustering_fields

    @property
    def source_column_match(self):
        """See
        :attr:`google.cloud.bigquery.job.LoadJobConfig.source_column_match`.
        """
        return self.configuration.source_column_match

    @property
    def schema_update_options(self):
        """See

tests/unit/job/test_load.py

Lines changed: 15 additions & 0 deletions
@@ -19,6 +19,7 @@
from .helpers import _Base
from .helpers import _make_client
from google.cloud.bigquery.enums import SourceColumnMatch


class TestLoadJob(_Base):
@@ -37,11 +38,13 @@ def _setUpConstants(self):
        self.OUTPUT_BYTES = 23456
        self.OUTPUT_ROWS = 345
        self.REFERENCE_FILE_SCHEMA_URI = "gs://path/to/reference"
        self.SOURCE_COLUMN_MATCH = "NAME"

    def _make_resource(self, started=False, ended=False):
        resource = super(TestLoadJob, self)._make_resource(started, ended)
        config = resource["configuration"]["load"]
        config["sourceUris"] = [self.SOURCE1]
        config["sourceColumnMatch"] = self.SOURCE_COLUMN_MATCH
        config["destinationTable"] = {
            "projectId": self.PROJECT,
            "datasetId": self.DS_ID,
@@ -153,6 +156,15 @@ def _verifyResourceProperties(self, job, resource):
        else:
            self.assertIsNone(job.destination_encryption_configuration)

        if "sourceColumnMatch" in config:
            # job.source_column_match will be an Enum, config[...] is a string
            self.assertEqual(
                job.source_column_match.value,
                config["sourceColumnMatch"],
            )
        else:
            self.assertIsNone(job.source_column_match)

    def test_ctor(self):
        client = _make_client(project=self.PROJECT)
        job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client)
@@ -194,6 +206,7 @@ def test_ctor(self):
        self.assertIsNone(job.clustering_fields)
        self.assertIsNone(job.schema_update_options)
        self.assertIsNone(job.reference_file_schema_uri)
        self.assertIsNone(job.source_column_match)

    def test_ctor_w_config(self):
        from google.cloud.bigquery.schema import SchemaField
@@ -571,6 +584,7 @@ def test_begin_w_alternate_client(self):
                ]
            },
            "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION],
            "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
        }
        RESOURCE["configuration"]["load"] = LOAD_CONFIGURATION
        conn1 = make_connection()
@@ -599,6 +613,7 @@ def test_begin_w_alternate_client(self):
        config.write_disposition = WriteDisposition.WRITE_TRUNCATE
        config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_ADDITION]
        config.reference_file_schema_uri = "gs://path/to/reference"
        config.source_column_match = SourceColumnMatch(self.SOURCE_COLUMN_MATCH)
        with mock.patch(
            "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
        ) as final_attributes:

tests/unit/job/test_load_config.py

Lines changed: 29 additions & 0 deletions
@@ -828,6 +828,35 @@ def test_write_disposition_setter(self):
            config._properties["load"]["writeDisposition"], write_disposition
        )

    def test_source_column_match_missing(self):
        config = self._get_target_class()()
        self.assertIsNone(config.source_column_match)

    def test_source_column_match_hit(self):
        from google.cloud.bigquery.enums import SourceColumnMatch

        option_enum = SourceColumnMatch.NAME
        config = self._get_target_class()()
        # Assume API stores the string value of the enum
        config._properties["load"]["sourceColumnMatch"] = option_enum.value
        self.assertEqual(config.source_column_match, option_enum)

    def test_source_column_match_setter(self):
        from google.cloud.bigquery.enums import SourceColumnMatch

        option_enum = SourceColumnMatch.POSITION
        config = self._get_target_class()()
        config.source_column_match = option_enum
        # Assert that the string value of the enum is stored
        self.assertEqual(
            config._properties["load"]["sourceColumnMatch"], option_enum.value
        )

    def test_source_column_match_setter_invalid_type(self):
        config = self._get_target_class()()
        with self.assertRaises(TypeError):
            config.source_column_match = "INVALID_STRING_TYPE"

    def test_parquet_options_missing(self):
        config = self._get_target_class()()
        self.assertIsNone(config.parquet_options)

tests/unit/test_external_config.py

Lines changed: 27 additions & 0 deletions
@@ -19,12 +19,14 @@
from google.cloud.bigquery import external_config
from google.cloud.bigquery import schema
from google.cloud.bigquery.enums import SourceColumnMatch

import pytest


class TestExternalConfig(unittest.TestCase):
    SOURCE_URIS = ["gs://foo", "gs://bar"]
    SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME

    BASE_RESOURCE = {
        "sourceFormat": "",
@@ -121,6 +123,20 @@ def test_schema_empty(self):
        want = {"sourceFormat": "", "schema": {"fields": []}}
        self.assertEqual(got, want)

    def test_source_column_match_None(self):
        ec = external_config.ExternalConfig("")
        ec.source_column_match = None
        expected = None
        result = ec.source_column_match
        self.assertEqual(expected, result)

    def test_source_column_match_valid_input(self):
        ec = external_config.ExternalConfig("")
        ec.source_column_match = SourceColumnMatch.NAME
        expected = "NAME"
        result = ec.source_column_match
        self.assertEqual(expected, result)

    def _verify_base(self, ec):
        self.assertEqual(ec.autodetect, True)
        self.assertEqual(ec.compression, "compression")
@@ -251,6 +267,7 @@ def test_from_api_repr_csv(self):
                    "allowJaggedRows": False,
                    "encoding": "encoding",
                    "preserveAsciiControlCharacters": False,
                    "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
                },
            },
        )
@@ -267,6 +284,10 @@ def test_from_api_repr_csv(self):
        self.assertEqual(ec.options.allow_jagged_rows, False)
        self.assertEqual(ec.options.encoding, "encoding")
        self.assertEqual(ec.options.preserve_ascii_control_characters, False)
        self.assertEqual(
            ec.options.source_column_match,
            self.SOURCE_COLUMN_MATCH,
        )

        got_resource = ec.to_api_repr()

@@ -288,6 +309,7 @@ def test_to_api_repr_csv(self):
        options.skip_leading_rows = 123
        options.allow_jagged_rows = False
        options.preserve_ascii_control_characters = False
        options.source_column_match = self.SOURCE_COLUMN_MATCH
        ec.csv_options = options

        exp_resource = {
@@ -300,6 +322,7 @@ def test_to_api_repr_csv(self):
                "allowJaggedRows": False,
                "encoding": "encoding",
                "preserveAsciiControlCharacters": False,
                "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
            },
        }

@@ -852,6 +875,8 @@ def test_to_api_repr(self):


class CSVOptions(unittest.TestCase):
    SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME

    def test_to_api_repr(self):
        options = external_config.CSVOptions()
        options.field_delimiter = "\t"
861886
options.allow_jagged_rows = False
862887
options.encoding = "UTF-8"
863888
options.preserve_ascii_control_characters = False
889+
options.source_column_match = self.SOURCE_COLUMN_MATCH
864890

865891
resource = options.to_api_repr()
866892

@@ -874,6 +900,7 @@ def test_to_api_repr(self):
                "allowJaggedRows": False,
                "encoding": "UTF-8",
                "preserveAsciiControlCharacters": False,
                "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
            },
        )