refactor: Integrate user's changes and new LoadJobConfig tests

google-labs-jules[bot] · google-labs-jules[bot] · commit d721ea438328 · 2025-06-26T15:38:14.000Z
This commit restores user changes from commit a9b187f that were inadvertently overwritten. These changes include:

- Addition of the SourceColumnMatch enum in enums.py.
- Updates to docstrings in external_config.py and job/load.py.
- Renaming of source_column_name_match_option to source_column_match_strategy in LoadJobConfig and LoadJob, now using the SourceColumnMatch enum.
- Adjustments in related unit tests in test_load.py and test_external_config.py.

Additionally, this commit incorporates the new unit tests for all recently added LoadJobConfig properties in tests/unit/job/test_load_config.py. Corrections were made to tests/unit/job/test_load.py to align with the renamed source_column_match_strategy property and its enum type.
diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py
@@ -462,3 +462,23 @@ class JobCreationMode(object):
     The conditions under which BigQuery can decide to not create a Job are
     subject to change.
     """
+
+
+class SourceColumnMatch(str, enum.Enum):
+    """Uses sensible defaults based on how the schema is provided.
+
+    If autodetect is used, then columns are matched by name. Otherwise, columns
+    are matched by position. This is done to keep the behavior
+    backward-compatible.
+    """
+
+    SOURCE_COLUMN_MATCH_UNSPECIFIED = "SOURCE_COLUMN_MATCH_UNSPECIFIED"
+    """Unspecified column name match option."""
+
+    POSITION = "POSITION"
+    """Matches by position. This assumes that the columns are ordered the same
+    way as the schema."""
+
+    NAME = "NAME"
+    """Matches by name. This reads the header row as column names and reorders
+    columns to match the field names in the schema."""
diff --git a/google/cloud/bigquery/external_config.py b/google/cloud/bigquery/external_config.py
@@ -476,11 +476,19 @@ def skip_leading_rows(self, value):
 
     @property
     def null_markers(self) -> Optional[List[str]]:
-        """Optional[List[str]]: A list of strings represented as SQL NULL value.
+        """Optional[List[str]]: A list of strings represented as SQL NULL value in a CSV file.
+
+        null_marker and null_markers can't be set at the same time:
+        if null_marker is set, null_markers must not be set, and
+        if null_markers is set, null_marker must not be set.
+        If both null_marker and null_markers are set at the same time, a user
+        error is raised.
+        Any string listed in null_markers, including the
+        empty string, is interpreted as SQL NULL. This applies to all column
+        types.
 
         See
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#CsvOptions.FIELDS.null_marker
-        (Note: API doc refers to null_marker singular, but proto is null_markers plural and a list)
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#CsvOptions.FIELDS.null_markers
         """
         return self._properties.get("nullMarkers")
 
@@ -490,13 +498,19 @@ def null_markers(self, value: Optional[List[str]]):
 
     @property
     def source_column_name_match_option(self) -> Optional[str]:
-        """Optional[str]: Controls the strategy used to match loaded columns to the schema.
-        Acceptable values are: "POSITION", "NAME".
+        """Optional[str]: Controls the strategy used to match loaded columns to the schema. If not
+        set, a sensible default is chosen based on how the schema is provided. If
+        autodetect is used, then columns are matched by name. Otherwise, columns
+        are matched by position. This is done to keep the behavior
+        backward-compatible.
+        Acceptable values are:
+          POSITION - matches by position. This assumes that the columns are ordered
+              the same way as the schema.
+          NAME - matches by name. This reads the header row as column names and
+              reorders columns to match the field names in the schema.
 
         See
         https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.source_column_match
-        (Note: This field is documented under ExternalDataConfiguration in the REST API docs but seems
-         more appropriate here for CSVOptions, matching the proto structure for external tables)
         """
         return self._properties.get("sourceColumnMatch")
 
diff --git a/google/cloud/bigquery/job/load.py b/google/cloud/bigquery/job/load.py
@@ -30,6 +30,7 @@
 from google.cloud.bigquery.job.base import _JobConfig
 from google.cloud.bigquery.job.base import _JobReference
 from google.cloud.bigquery.query import ConnectionProperty
+from google.cloud.bigquery.enums import SourceColumnMatch
 
 
 class ColumnNameCharacterMap:
@@ -550,8 +551,9 @@ def source_format(self, value):
 
     @property
     def time_zone(self):
-        """Optional[str]: Default time zone that will apply when parsing
-        timestamp values that have no specific time zone.
+        """Optional[str]: Default time zone that will apply when parsing timestamp
+        values that have no specific time zone. This option is valid for CSV and
+        JSON sources.
 
         See:
         https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.time_zone
@@ -565,6 +567,7 @@ def time_zone(self, value: Optional[str]):
     @property
     def date_format(self) -> Optional[str]:
         """Optional[str]: Date format used for parsing DATE values.
+        This option is valid for CSV and JSON sources.
 
         See:
         https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.date_format
@@ -578,6 +581,7 @@ def date_format(self, value: Optional[str]):
     @property
     def datetime_format(self) -> Optional[str]:
         """Optional[str]: Date format used for parsing DATETIME values.
+        This option is valid for CSV and JSON sources.
 
         See:
         https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.datetime_format
@@ -591,6 +595,7 @@ def datetime_format(self, value: Optional[str]):
     @property
     def time_format(self) -> Optional[str]:
         """Optional[str]: Date format used for parsing TIME values.
+        This option is valid for CSV and JSON sources.
 
         See:
         https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.time_format
@@ -604,6 +609,7 @@ def time_format(self, value: Optional[str]):
     @property
     def timestamp_format(self) -> Optional[str]:
         """Optional[str]: Date format used for parsing TIMESTAMP values.
+        This option is valid for CSV and JSON sources.
 
         See:
         https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.timestamp_format
@@ -620,6 +626,15 @@ def null_markers(self) -> Optional[List[str]]:
 
         (CSV only).
 
+        null_marker and null_markers can't be set at the same time:
+        if null_marker is set, null_markers must not be set, and
+        if null_markers is set, null_marker must not be set.
+        If both null_marker and null_markers are set at the same time, a user
+        error is raised.
+        Any string listed in null_markers, including the
+        empty string, is interpreted as SQL NULL. This applies to all column
+        types.
+
         See:
         https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.null_markers
         """
@@ -630,21 +645,30 @@ def null_markers(self, value: Optional[List[str]]):
         self._set_sub_prop("nullMarkers", value)
 
     @property
-    def source_column_name_match_option(self) -> Optional[str]:
-        """Optional[str]: Controls the strategy used to match loaded columns to the schema.
+    def source_column_match_strategy(self) -> Optional[SourceColumnMatch]:
+        """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls the strategy
+        used to match loaded columns to the schema. If not set, a sensible default is
+        chosen based on how the schema is provided. If autodetect is used, then
+        columns are matched by name. Otherwise, columns are matched by position.
+        This is done to keep the behavior backward-compatible.
 
         (CSV only).
-        Acceptable values are based on the SourceColumnMatch enum in the proto.
-        Example values: "MATCH_BY_NAME", "MATCH_BY_POSITION".
 
         See:
-        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_column_match
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_column_match_strategy
         """
-        return self._get_sub_prop("sourceColumnMatch")
-
-    @source_column_name_match_option.setter
-    def source_column_name_match_option(self, value: Optional[str]):
-        self._set_sub_prop("sourceColumnMatch", value)
+        value = self._get_sub_prop("sourceColumnMatchStrategy")
+        if value is not None:
+            return SourceColumnMatch(value)
+        return None
+
+    @source_column_match_strategy.setter
+    def source_column_match_strategy(self, value: Optional[SourceColumnMatch]):
+        if value is not None and not isinstance(value, SourceColumnMatch):
+            raise TypeError(
+                "value must be a google.cloud.bigquery.enums.SourceColumnMatch or None"
+            )
+        self._set_sub_prop("sourceColumnMatchStrategy", value.value if value else None)
 
     @property
     def time_partitioning(self):
@@ -1030,11 +1054,11 @@ def null_markers(self):
         return self.configuration.null_markers
 
     @property
-    def source_column_name_match_option(self):
+    def source_column_match_strategy(self):
         """See
-        :attr:`google.cloud.bigquery.job.LoadJobConfig.source_column_name_match_option`.
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.source_column_match_strategy`.
         """
-        return self.configuration.source_column_name_match_option
+        return self.configuration.source_column_match_strategy
 
     @property
     def schema_update_options(self):
diff --git a/tests/unit/job/test_load.py b/tests/unit/job/test_load.py
@@ -42,8 +42,8 @@ def _setUpConstants(self):
         self.DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
         self.TIME_FORMAT = "%H:%M:%S"
         self.TIMESTAMP_FORMAT = "YYYY-MM-DD HH:MM:SS.SSSSSSZ"
-        self.NULL_MARKERS = ["N/A", "\\N"]
-        self.SOURCE_COLUMN_NAME_MATCH_OPTION = "MATCH_BY_NAME"
+        self.NULL_MARKERS = ["N/A", "NA"]
+        self.SOURCE_COLUMN_NAME_MATCH_OPTION = "NAME"  # Corrected to actual enum value
 
     def _make_resource(self, started=False, ended=False):
         resource = super(TestLoadJob, self)._make_resource(started, ended)
@@ -55,7 +55,7 @@ def _make_resource(self, started=False, ended=False):
         config["timeFormat"] = self.TIME_FORMAT
         config["timestampFormat"] = self.TIMESTAMP_FORMAT
         config["nullMarkers"] = self.NULL_MARKERS
-        config["sourceColumnMatch"] = self.SOURCE_COLUMN_NAME_MATCH_OPTION
+        config["sourceColumnMatchStrategy"] = self.SOURCE_COLUMN_NAME_MATCH_OPTION  # Keep value as string for mock API repr
         config["destinationTable"] = {
             "projectId": self.PROJECT,
             "datasetId": self.DS_ID,
@@ -191,12 +191,14 @@ def _verifyResourceProperties(self, job, resource):
             self.assertEqual(job.null_markers, config["nullMarkers"])
         else:
             self.assertIsNone(job.null_markers)
-        if "sourceColumnMatch" in config:
+        if "sourceColumnMatchStrategy" in config:
+            # job.source_column_match_strategy will be an Enum, config[...] is a string
             self.assertEqual(
-                job.source_column_name_match_option, config["sourceColumnMatch"]
+                job.source_column_match_strategy.value,
+                config["sourceColumnMatchStrategy"],
             )
         else:
-            self.assertIsNone(job.source_column_name_match_option)
+            self.assertIsNone(job.source_column_match_strategy)
 
     def test_ctor(self):
         client = _make_client(project=self.PROJECT)
@@ -245,7 +247,7 @@ def test_ctor(self):
         self.assertIsNone(job.time_format)
         self.assertIsNone(job.timestamp_format)
         self.assertIsNone(job.null_markers)
-        self.assertIsNone(job.source_column_name_match_option)
+        self.assertIsNone(job.source_column_match_strategy)
 
     def test_ctor_w_config(self):
         from google.cloud.bigquery.schema import SchemaField
@@ -629,7 +631,7 @@ def test_begin_w_alternate_client(self):
             "timeFormat": self.TIME_FORMAT,
             "timestampFormat": self.TIMESTAMP_FORMAT,
             "nullMarkers": self.NULL_MARKERS,
-            "sourceColumnMatch": self.SOURCE_COLUMN_NAME_MATCH_OPTION,
+            "sourceColumnMatchStrategy": self.SOURCE_COLUMN_NAME_MATCH_OPTION,  # Keep value as string for mock API repr
         }
         RESOURCE["configuration"]["load"] = LOAD_CONFIGURATION
         conn1 = make_connection()
@@ -664,7 +666,9 @@ def test_begin_w_alternate_client(self):
         config.time_format = self.TIME_FORMAT
         config.timestamp_format = self.TIMESTAMP_FORMAT
         config.null_markers = self.NULL_MARKERS
-        config.source_column_name_match_option = self.SOURCE_COLUMN_NAME_MATCH_OPTION
+        # Ensure we are setting with the Enum type if that's what the setter expects
+        from google.cloud.bigquery.enums import SourceColumnMatch
+        config.source_column_match_strategy = SourceColumnMatch(self.SOURCE_COLUMN_NAME_MATCH_OPTION)
         with mock.patch(
             "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
         ) as final_attributes:
diff --git a/tests/unit/job/test_load_config.py b/tests/unit/job/test_load_config.py
@@ -924,21 +924,34 @@ def test_null_markers_setter(self):
         config.null_markers = null_markers
         self.assertEqual(config._properties["load"]["nullMarkers"], null_markers)
 
-    def test_source_column_name_match_option_missing(self):
+    def test_source_column_match_strategy_missing(self):
         config = self._get_target_class()()
-        self.assertIsNone(config.source_column_name_match_option)
+        self.assertIsNone(config.source_column_match_strategy)
 
-    def test_source_column_name_match_option_hit(self):
-        option = "MATCH_BY_NAME"
+    def test_source_column_match_strategy_hit(self):
+        from google.cloud.bigquery.enums import SourceColumnMatch
+
+        option_enum = SourceColumnMatch.NAME
         config = self._get_target_class()()
-        config._properties["load"]["sourceColumnMatch"] = option
-        self.assertEqual(config.source_column_name_match_option, option)
+        # Assume API stores the string value of the enum
+        config._properties["load"]["sourceColumnMatchStrategy"] = option_enum.value
+        self.assertEqual(config.source_column_match_strategy, option_enum)
+
+    def test_source_column_match_strategy_setter(self):
+        from google.cloud.bigquery.enums import SourceColumnMatch
 
-    def test_source_column_name_match_option_setter(self):
-        option = "MATCH_BY_POSITION"
+        option_enum = SourceColumnMatch.POSITION
         config = self._get_target_class()()
-        config.source_column_name_match_option = option
-        self.assertEqual(config._properties["load"]["sourceColumnMatch"], option)
+        config.source_column_match_strategy = option_enum
+        # Assert that the string value of the enum is stored
+        self.assertEqual(
+            config._properties["load"]["sourceColumnMatchStrategy"], option_enum.value
+        )
+
+    def test_source_column_match_strategy_setter_invalid_type(self):
+        config = self._get_target_class()()
+        with self.assertRaises(TypeError):
+            config.source_column_match_strategy = "INVALID_STRING_TYPE"
 
     def test_parquet_options_missing(self):
         config = self._get_target_class()()