diff --git a/google/cloud/bigquery/enums.py b/google/cloud/bigquery/enums.py
index 9a1e4880c..1b1eb241a 100644
--- a/google/cloud/bigquery/enums.py
+++ b/google/cloud/bigquery/enums.py
@@ -462,3 +462,21 @@ class JobCreationMode(object):
     The conditions under which BigQuery can decide to not create a Job are
     subject to change.
     """
+
+
+class SourceColumnMatch(str, enum.Enum):
+    """Strategy used to match loaded columns to the schema. If no strategy is
+    set, columns are matched by name when autodetect is used and by position
+    otherwise. This is done to keep the behavior backward-compatible.
+    """
+
+    SOURCE_COLUMN_MATCH_UNSPECIFIED = "SOURCE_COLUMN_MATCH_UNSPECIFIED"
+    """Unspecified column name match option."""
+
+    POSITION = "POSITION"
+    """Matches by position. This assumes that the columns are ordered the same
+    way as the schema."""
+
+    NAME = "NAME"
+    """Matches by name. This reads the header row as column names and reorders
+    columns to match the field names in the schema."""
diff --git a/google/cloud/bigquery/external_config.py b/google/cloud/bigquery/external_config.py
index 69ed72bc9..dc7a33e6a 100644
--- a/google/cloud/bigquery/external_config.py
+++ b/google/cloud/bigquery/external_config.py
@@ -30,6 +30,7 @@
 from google.cloud.bigquery._helpers import _int_or_none
 from google.cloud.bigquery._helpers import _str_or_none
 from google.cloud.bigquery import _helpers
+from google.cloud.bigquery.enums import SourceColumnMatch
 from google.cloud.bigquery.format_options import AvroOptions, ParquetOptions
 from google.cloud.bigquery import schema
 from google.cloud.bigquery.schema import SchemaField
@@ -474,6 +475,41 @@ def skip_leading_rows(self):
     def skip_leading_rows(self, value):
         self._properties["skipLeadingRows"] = str(value)
 
+    @property
+    def source_column_match(self) -> Optional[SourceColumnMatch]:
+        """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls the
+        strategy used to match loaded columns to the schema. If not set, a sensible
+        default is chosen based on how the schema is provided. If autodetect is
+        used, then columns are matched by name. Otherwise, columns are matched by
+        position. This is done to keep the behavior backward-compatible.
+
+        Acceptable values are:
+
+        * SOURCE_COLUMN_MATCH_UNSPECIFIED: Unspecified column name match option.
+        * POSITION: matches by position. This assumes that the columns are ordered
+          the same way as the schema.
+        * NAME: matches by name. This reads the header row as column names and
+          reorders columns to match the field names in the schema.
+
+        See
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#CsvOptions.FIELDS.source_column_match
+        """
+
+        value = self._properties.get("sourceColumnMatch")
+        return SourceColumnMatch(value) if value is not None else None
+
+    @source_column_match.setter
+    def source_column_match(self, value: Union[SourceColumnMatch, str, None]):
+        if value is not None and not isinstance(value, (SourceColumnMatch, str)):
+            raise TypeError(
+                "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None"
+            )
+        # Store the enum member's plain string value in the resource.
+        if isinstance(value, SourceColumnMatch):
+            value = value.value
+        # An empty string is treated the same as None (option not set).
+        self._properties["sourceColumnMatch"] = value if value else None
+
     @property
     def null_markers(self) -> Optional[Iterable[str]]:
         """Optional[Iterable[str]]: A list of strings represented as SQL NULL values in a CSV file.
diff --git a/google/cloud/bigquery/job/load.py b/google/cloud/bigquery/job/load.py
index eabc12cfc..8cdb779ac 100644
--- a/google/cloud/bigquery/job/load.py
+++ b/google/cloud/bigquery/job/load.py
@@ -15,9 +15,10 @@
 """Classes for load jobs."""
 
 import typing
-from typing import FrozenSet, List, Iterable, Optional
+from typing import FrozenSet, List, Iterable, Optional, Union
 
 from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
+from google.cloud.bigquery.enums import SourceColumnMatch
 from google.cloud.bigquery.external_config import HivePartitioningOptions
 from google.cloud.bigquery.format_options import ParquetOptions
 from google.cloud.bigquery import _helpers
@@ -569,6 +570,41 @@ def source_format(self):
     def source_format(self, value):
         self._set_sub_prop("sourceFormat", value)
 
+    @property
+    def source_column_match(self) -> Optional[SourceColumnMatch]:
+        """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls the
+        strategy used to match loaded columns to the schema. If not set, a sensible
+        default is chosen based on how the schema is provided. If autodetect is
+        used, then columns are matched by name. Otherwise, columns are matched by
+        position. This is done to keep the behavior backward-compatible.
+
+        Acceptable values are:
+
+        * SOURCE_COLUMN_MATCH_UNSPECIFIED: Unspecified column name match option.
+        * POSITION: matches by position. This assumes that the columns are ordered
+          the same way as the schema.
+        * NAME: matches by name. This reads the header row as column names and
+          reorders columns to match the field names in the schema.
+
+        See:
+
+        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_column_match
+        """
+        value = self._get_sub_prop("sourceColumnMatch")
+        return SourceColumnMatch(value) if value is not None else None
+
+    @source_column_match.setter
+    def source_column_match(self, value: Union[SourceColumnMatch, str, None]):
+        if value is not None and not isinstance(value, (SourceColumnMatch, str)):
+            raise TypeError(
+                "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None"
+            )
+        # Store the enum member's plain string value in the resource.
+        if isinstance(value, SourceColumnMatch):
+            value = value.value
+        # An empty string is treated the same as None (option not set).
+        self._set_sub_prop("sourceColumnMatch", value if value else None)
+
     @property
     def date_format(self) -> Optional[str]:
         """Optional[str]: Date format used for parsing DATE values.
@@ -983,6 +1019,13 @@ def clustering_fields(self):
         """
         return self.configuration.clustering_fields
 
+    @property
+    def source_column_match(self) -> Optional[SourceColumnMatch]:
+        """See
+        :attr:`google.cloud.bigquery.job.LoadJobConfig.source_column_match`.
+        """
+        return self.configuration.source_column_match
+
     @property
     def date_format(self):
         """See
diff --git a/tests/unit/job/test_load.py b/tests/unit/job/test_load.py
index 5d52401c9..b551d52dd 100644
--- a/tests/unit/job/test_load.py
+++ b/tests/unit/job/test_load.py
@@ -19,6 +19,7 @@
 
 from .helpers import _Base
 from .helpers import _make_client
+from google.cloud.bigquery.enums import SourceColumnMatch
 
 
 class TestLoadJob(_Base):
@@ -37,6 +38,7 @@ def _setUpConstants(self):
         self.OUTPUT_BYTES = 23456
         self.OUTPUT_ROWS = 345
         self.REFERENCE_FILE_SCHEMA_URI = "gs://path/to/reference"
+        self.SOURCE_COLUMN_MATCH = "NAME"
         self.DATE_FORMAT = "%Y-%m-%d"
         self.DATETIME_FORMAT = "%Y-%m-%dT%H:%M:%S"
         self.TIME_ZONE = "UTC"
@@ -48,6 +50,7 @@ def _make_resource(self, started=False, ended=False):
         resource = super(TestLoadJob, self)._make_resource(started, ended)
         config = resource["configuration"]["load"]
         config["sourceUris"] = [self.SOURCE1]
+        config["sourceColumnMatch"] = self.SOURCE_COLUMN_MATCH
         config["dateFormat"] = self.DATE_FORMAT
         config["datetimeFormat"] = self.DATETIME_FORMAT
         config["timeZone"] = self.TIME_ZONE
@@ -189,6 +192,15 @@ def _verifyResourceProperties(self, job, resource):
         else:
             self.assertIsNone(job.timestamp_format)
 
+        if "sourceColumnMatch" in config:
+            # job.source_column_match will be an Enum, config[...] is a string
+            self.assertEqual(
+                job.source_column_match.value,
+                config["sourceColumnMatch"],
+            )
+        else:
+            self.assertIsNone(job.source_column_match)
+
     def test_ctor(self):
         client = _make_client(project=self.PROJECT)
         job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client)
@@ -231,6 +243,7 @@ def test_ctor(self):
         self.assertIsNone(job.clustering_fields)
         self.assertIsNone(job.schema_update_options)
         self.assertIsNone(job.reference_file_schema_uri)
+        self.assertIsNone(job.source_column_match)
         self.assertIsNone(job.date_format)
         self.assertIsNone(job.datetime_format)
         self.assertIsNone(job.time_zone)
@@ -631,6 +644,7 @@ def test_begin_w_alternate_client(self):
                 ]
             },
             "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION],
+            "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
             "dateFormat": self.DATE_FORMAT,
             "datetimeFormat": self.DATETIME_FORMAT,
             "timeZone": self.TIME_ZONE,
@@ -665,6 +679,7 @@ def test_begin_w_alternate_client(self):
         config.write_disposition = WriteDisposition.WRITE_TRUNCATE
         config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_ADDITION]
         config.reference_file_schema_uri = "gs://path/to/reference"
+        config.source_column_match = SourceColumnMatch(self.SOURCE_COLUMN_MATCH)
         config.date_format = self.DATE_FORMAT
         config.datetime_format = self.DATETIME_FORMAT
         config.time_zone = self.TIME_ZONE
diff --git a/tests/unit/job/test_load_config.py b/tests/unit/job/test_load_config.py
index 8ff9244d2..27d3cead1 100644
--- a/tests/unit/job/test_load_config.py
+++ b/tests/unit/job/test_load_config.py
@@ -844,6 +844,38 @@ def test_write_disposition_setter(self):
             config._properties["load"]["writeDisposition"], write_disposition
         )
 
+    def test_source_column_match_missing(self):
+        config = self._get_target_class()()
+        self.assertIsNone(config.source_column_match)
+
+    def test_source_column_match_hit(self):
+        from google.cloud.bigquery.enums import SourceColumnMatch
+
+        option_enum = SourceColumnMatch.NAME
+        config = self._get_target_class()()
+        # Assume API stores the string value of the enum
+        config._properties["load"]["sourceColumnMatch"] = option_enum.value
+        self.assertEqual(config.source_column_match, option_enum)
+
+    def test_source_column_match_setter(self):
+        from google.cloud.bigquery.enums import SourceColumnMatch
+
+        option_enum = SourceColumnMatch.POSITION
+        config = self._get_target_class()()
+        config.source_column_match = option_enum
+        # Assert that the string value of the enum is stored
+        self.assertEqual(
+            config._properties["load"]["sourceColumnMatch"], option_enum.value
+        )
+        option_str = "NAME"
+        config.source_column_match = option_str
+        self.assertEqual(config._properties["load"]["sourceColumnMatch"], option_str)
+
+    def test_source_column_match_setter_invalid_type(self):
+        config = self._get_target_class()()
+        with self.assertRaises(TypeError):
+            config.source_column_match = 3.14
+
     def test_date_format_missing(self):
         config = self._get_target_class()()
         self.assertIsNone(config.date_format)
diff --git a/tests/unit/test_external_config.py b/tests/unit/test_external_config.py
index 61532b4b8..ea827a560 100644
--- a/tests/unit/test_external_config.py
+++ b/tests/unit/test_external_config.py
@@ -19,12 +19,14 @@
 
 from google.cloud.bigquery import external_config
 from google.cloud.bigquery import schema
+from google.cloud.bigquery.enums import SourceColumnMatch
 
 import pytest
 
 
 class TestExternalConfig(unittest.TestCase):
     SOURCE_URIS = ["gs://foo", "gs://bar"]
+    SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME
     DATE_FORMAT = "MM/DD/YYYY"
     DATETIME_FORMAT = "MM/DD/YYYY HH24:MI:SS"
     TIME_ZONE = "America/Los_Angeles"
@@ -277,6 +279,7 @@ def test_from_api_repr_csv(self):
                     "allowJaggedRows": False,
                     "encoding": "encoding",
                     "preserveAsciiControlCharacters": False,
+                    "sourceColumnMatch": self.SOURCE_COLUMN_MATCH.value,
                     "nullMarkers": ["", "NA"],
                 },
             },
@@ -294,6 +297,10 @@ def test_from_api_repr_csv(self):
         self.assertEqual(ec.options.allow_jagged_rows, False)
         self.assertEqual(ec.options.encoding, "encoding")
         self.assertEqual(ec.options.preserve_ascii_control_characters, False)
+        self.assertEqual(
+            ec.options.source_column_match,
+            self.SOURCE_COLUMN_MATCH,
+        )
         self.assertEqual(ec.options.null_markers, ["", "NA"])
 
         got_resource = ec.to_api_repr()
@@ -316,7 +323,9 @@ def test_to_api_repr_csv(self):
         options.skip_leading_rows = 123
         options.allow_jagged_rows = False
         options.preserve_ascii_control_characters = False
+        options.source_column_match = self.SOURCE_COLUMN_MATCH
         options.null_markers = ["", "NA"]
+
         ec.csv_options = options
 
         exp_resource = {
@@ -329,6 +338,7 @@ def test_to_api_repr_csv(self):
                 "allowJaggedRows": False,
                 "encoding": "encoding",
                 "preserveAsciiControlCharacters": False,
+                "sourceColumnMatch": self.SOURCE_COLUMN_MATCH.value,
                 "nullMarkers": ["", "NA"],
             },
         }
@@ -881,7 +891,9 @@ def test_to_api_repr(self):
         )
 
 
-class CSVOptions(unittest.TestCase):
+class TestCSVOptions(unittest.TestCase):
+    SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME
+
     def test_to_api_repr(self):
         options = external_config.CSVOptions()
         options.field_delimiter = "\t"
@@ -891,6 +903,7 @@ def test_to_api_repr(self):
         options.allow_jagged_rows = False
         options.encoding = "UTF-8"
         options.preserve_ascii_control_characters = False
+        options.source_column_match = self.SOURCE_COLUMN_MATCH
 
         resource = options.to_api_repr()
 
@@ -904,9 +917,37 @@ def test_to_api_repr(self):
                 "allowJaggedRows": False,
                 "encoding": "UTF-8",
                 "preserveAsciiControlCharacters": False,
+                "sourceColumnMatch": self.SOURCE_COLUMN_MATCH.value,
             },
         )
 
+    def test_source_column_match_None(self):
+        ec = external_config.CSVOptions()
+        ec.source_column_match = None
+        expected = None
+        result = ec.source_column_match
+        self.assertEqual(expected, result)
+
+    def test_source_column_match_valid_input(self):
+        ec = external_config.CSVOptions()
+        ec.source_column_match = SourceColumnMatch.NAME
+        expected = SourceColumnMatch.NAME
+        result = ec.source_column_match
+        self.assertEqual(expected, result)
+
+        ec.source_column_match = "POSITION"
+        expected = SourceColumnMatch.POSITION
+        result = ec.source_column_match
+        self.assertEqual(expected, result)
+
+    def test_source_column_match_invalid_input(self):
+        ec = external_config.CSVOptions()
+        with self.assertRaisesRegex(
+            TypeError,
+            "value must be a google.cloud.bigquery.enums.SourceColumnMatch, str, or None",
+        ):
+            ec.source_column_match = 3.14
+
 
 class TestGoogleSheetsOptions(unittest.TestCase):
     def test_to_api_repr(self):