Commit fc3dbf7

Adds source_column_match and associated tests
1 parent 37e4e0e commit fc3dbf7

6 files changed: 157 additions, 0 deletions

google/cloud/bigquery/enums.py

Lines changed: 18 additions & 0 deletions
@@ -462,3 +462,21 @@ class JobCreationMode(object):
    The conditions under which BigQuery can decide to not create a Job are
    subject to change.
    """


class SourceColumnMatch(str, enum.Enum):
    """Indicates the strategy used to match loaded columns to the schema.

    If not set, a sensible default is chosen based on how the schema is
    provided: if autodetect is used, columns are matched by name; otherwise,
    columns are matched by position. This keeps the behavior
    backward-compatible.
    """

    SOURCE_COLUMN_MATCH_UNSPECIFIED = "SOURCE_COLUMN_MATCH_UNSPECIFIED"
    """Unspecified column name match option."""

    POSITION = "POSITION"
    """Matches by position. This assumes that the columns are ordered the same
    way as the schema."""

    NAME = "NAME"
    """Matches by name. This reads the header row as column names and reorders
    columns to match the field names in the schema."""
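
Because the enum subclasses str, its members compare equal to the raw strings the API uses and can be rebuilt from them. A minimal sketch (not part of this commit) of that round-trip:

from google.cloud.bigquery.enums import SourceColumnMatch

# Members of a str-based enum equal their raw string values...
assert SourceColumnMatch.NAME == "NAME"
assert SourceColumnMatch.NAME.value == "NAME"
# ...and can be reconstructed from strings returned in API payloads.
assert SourceColumnMatch("POSITION") is SourceColumnMatch.POSITION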

google/cloud/bigquery/external_config.py

Lines changed: 31 additions & 0 deletions
@@ -30,6 +30,7 @@
from google.cloud.bigquery._helpers import _int_or_none
from google.cloud.bigquery._helpers import _str_or_none
from google.cloud.bigquery import _helpers
from google.cloud.bigquery.enums import SourceColumnMatch
from google.cloud.bigquery.format_options import AvroOptions, ParquetOptions
from google.cloud.bigquery import schema
from google.cloud.bigquery.schema import SchemaField
@@ -474,6 +475,36 @@ def skip_leading_rows(self):
    def skip_leading_rows(self, value):
        self._properties["skipLeadingRows"] = str(value)

    @property
    def source_column_match(self) -> Optional[SourceColumnMatch]:
        """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls
        the strategy used to match loaded columns to the schema.

        If not set, a sensible default is chosen based on how the schema is
        provided. If autodetect is used, then columns are matched by name.
        Otherwise, columns are matched by position. This is done to keep the
        behavior backward-compatible.

        Acceptable values are:

        * SOURCE_COLUMN_MATCH_UNSPECIFIED - Unspecified column name match option.
        * POSITION - Matches by position. This assumes that the columns are
          ordered the same way as the schema.
        * NAME - Matches by name. This reads the header row as column names
          and reorders columns to match the field names in the schema.

        See
        https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#ExternalDataConfiguration.FIELDS.source_column_match
        """
        value = self._properties.get("sourceColumnMatch")
        if value is not None:
            return SourceColumnMatch(value)
        return None

    @source_column_match.setter
    def source_column_match(self, value: Optional[SourceColumnMatch]):
        if value is not None and not isinstance(value, SourceColumnMatch):
            raise TypeError(
                "value must be a google.cloud.bigquery.enums.SourceColumnMatch or None"
            )
        self._properties["sourceColumnMatch"] = value.value if value else None

    def to_api_repr(self) -> dict:
        """Build an API representation of this object.
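
Mirroring the pattern used in the new tests, a hedged sketch (bucket and URI below are placeholders, not part of this change) of configuring the option for CSV external data:

from google.cloud.bigquery import external_config
from google.cloud.bigquery.enums import SourceColumnMatch

options = external_config.CSVOptions()
options.skip_leading_rows = 1
options.source_column_match = SourceColumnMatch.NAME  # match header names to the schema

ec = external_config.ExternalConfig("CSV")
ec.source_uris = ["gs://example-bucket/data.csv"]  # placeholder URI
ec.csv_options = options

# The setter stores the enum's string value in the API payload.
assert ec.to_api_repr()["csvOptions"]["sourceColumnMatch"] == "NAME"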

google/cloud/bigquery/job/load.py

Lines changed: 37 additions & 0 deletions
@@ -18,6 +18,7 @@
from typing import FrozenSet, List, Iterable, Optional

from google.cloud.bigquery.encryption_configuration import EncryptionConfiguration
from google.cloud.bigquery.enums import SourceColumnMatch
from google.cloud.bigquery.external_config import HivePartitioningOptions
from google.cloud.bigquery.format_options import ParquetOptions
from google.cloud.bigquery import _helpers
@@ -548,6 +549,35 @@ def source_format(self):
    def source_format(self, value):
        self._set_sub_prop("sourceFormat", value)

    @property
    def source_column_match(self) -> Optional[SourceColumnMatch]:
        """Optional[google.cloud.bigquery.enums.SourceColumnMatch]: Controls
        the strategy used to match loaded columns to the schema.

        If not set, a sensible default is chosen based on how the schema is
        provided. If autodetect is used, then columns are matched by name.
        Otherwise, columns are matched by position. This is done to keep the
        behavior backward-compatible.

        Acceptable values are:

        * SOURCE_COLUMN_MATCH_UNSPECIFIED - Unspecified column name match option.
        * POSITION - Matches by position. This assumes that the columns are
          ordered the same way as the schema.
        * NAME - Matches by name. This reads the header row as column names
          and reorders columns to match the field names in the schema.

        See:
        https://cloud.google.com/bigquery/docs/reference/rest/v2/Job#JobConfigurationLoad.FIELDS.source_column_match
        """
        value = self._get_sub_prop("sourceColumnMatch")
        if value is not None:
            return SourceColumnMatch(value)
        return None

    @source_column_match.setter
    def source_column_match(self, value: Optional[SourceColumnMatch]):
        if value is not None and not isinstance(value, SourceColumnMatch):
            raise TypeError(
                "value must be a google.cloud.bigquery.enums.SourceColumnMatch or None"
            )
        self._set_sub_prop("sourceColumnMatch", value.value if value else None)

    @property
    def time_partitioning(self):
        """Optional[google.cloud.bigquery.table.TimePartitioning]: Specifies time-based
@@ -889,6 +919,13 @@ def clustering_fields(self):
        """
        return self.configuration.clustering_fields

    @property
    def source_column_match(self):
        """See
        :attr:`google.cloud.bigquery.job.LoadJobConfig.source_column_match`.
        """
        return self.configuration.source_column_match

    @property
    def schema_update_options(self):
        """See

tests/unit/job/test_load.py

Lines changed: 15 additions & 0 deletions
@@ -19,6 +19,7 @@
from .helpers import _Base
from .helpers import _make_client
from google.cloud.bigquery.enums import SourceColumnMatch


class TestLoadJob(_Base):
@@ -37,11 +38,13 @@ def _setUpConstants(self):
        self.OUTPUT_BYTES = 23456
        self.OUTPUT_ROWS = 345
        self.REFERENCE_FILE_SCHEMA_URI = "gs://path/to/reference"
        self.SOURCE_COLUMN_MATCH = "NAME"

    def _make_resource(self, started=False, ended=False):
        resource = super(TestLoadJob, self)._make_resource(started, ended)
        config = resource["configuration"]["load"]
        config["sourceUris"] = [self.SOURCE1]
        config["sourceColumnMatch"] = self.SOURCE_COLUMN_MATCH
        config["destinationTable"] = {
            "projectId": self.PROJECT,
            "datasetId": self.DS_ID,
@@ -153,6 +156,15 @@ def _verifyResourceProperties(self, job, resource):
        else:
            self.assertIsNone(job.destination_encryption_configuration)

        if "sourceColumnMatch" in config:
            # job.source_column_match will be an Enum, config[...] is a string
            self.assertEqual(
                job.source_column_match.value,
                config["sourceColumnMatch"],
            )
        else:
            self.assertIsNone(job.source_column_match)

    def test_ctor(self):
        client = _make_client(project=self.PROJECT)
        job = self._make_one(self.JOB_ID, [self.SOURCE1], self.TABLE_REF, client)
@@ -194,6 +206,7 @@ def test_ctor(self):
        self.assertIsNone(job.clustering_fields)
        self.assertIsNone(job.schema_update_options)
        self.assertIsNone(job.reference_file_schema_uri)
        self.assertIsNone(job.source_column_match)

    def test_ctor_w_config(self):
        from google.cloud.bigquery.schema import SchemaField
@@ -571,6 +584,7 @@ def test_begin_w_alternate_client(self):
                ]
            },
            "schemaUpdateOptions": [SchemaUpdateOption.ALLOW_FIELD_ADDITION],
            "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
        }
        RESOURCE["configuration"]["load"] = LOAD_CONFIGURATION
        conn1 = make_connection()
@@ -599,6 +613,7 @@ def test_begin_w_alternate_client(self):
        config.write_disposition = WriteDisposition.WRITE_TRUNCATE
        config.schema_update_options = [SchemaUpdateOption.ALLOW_FIELD_ADDITION]
        config.reference_file_schema_uri = "gs://path/to/reference"
        config.source_column_match = SourceColumnMatch(self.SOURCE_COLUMN_MATCH)
        with mock.patch(
            "google.cloud.bigquery.opentelemetry_tracing._get_final_span_attributes"
        ) as final_attributes:

tests/unit/job/test_load_config.py

Lines changed: 29 additions & 0 deletions
@@ -828,6 +828,35 @@ def test_write_disposition_setter(self):
            config._properties["load"]["writeDisposition"], write_disposition
        )

    def test_source_column_match_missing(self):
        config = self._get_target_class()()
        self.assertIsNone(config.source_column_match)

    def test_source_column_match_hit(self):
        from google.cloud.bigquery.enums import SourceColumnMatch

        option_enum = SourceColumnMatch.NAME
        config = self._get_target_class()()
        # Assume API stores the string value of the enum
        config._properties["load"]["sourceColumnMatch"] = option_enum.value
        self.assertEqual(config.source_column_match, option_enum)

    def test_source_column_match_setter(self):
        from google.cloud.bigquery.enums import SourceColumnMatch

        option_enum = SourceColumnMatch.POSITION
        config = self._get_target_class()()
        config.source_column_match = option_enum
        # Assert that the string value of the enum is stored
        self.assertEqual(
            config._properties["load"]["sourceColumnMatch"], option_enum.value
        )

    def test_source_column_match_setter_invalid_type(self):
        config = self._get_target_class()()
        with self.assertRaises(TypeError):
            config.source_column_match = "INVALID_STRING_TYPE"

    def test_parquet_options_missing(self):
        config = self._get_target_class()()
        self.assertIsNone(config.parquet_options)

tests/unit/test_external_config.py

Lines changed: 27 additions & 0 deletions
@@ -19,12 +19,14 @@
from google.cloud.bigquery import external_config
from google.cloud.bigquery import schema
from google.cloud.bigquery.enums import SourceColumnMatch

import pytest


class TestExternalConfig(unittest.TestCase):
    SOURCE_URIS = ["gs://foo", "gs://bar"]
    SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME

    BASE_RESOURCE = {
        "sourceFormat": "",
@@ -121,6 +123,20 @@ def test_schema_empty(self):
        want = {"sourceFormat": "", "schema": {"fields": []}}
        self.assertEqual(got, want)

    def test_source_column_match_None(self):
        ec = external_config.ExternalConfig("")
        ec.source_column_match = None
        expected = None
        result = ec.source_column_match
        self.assertEqual(expected, result)

    def test_source_column_match_valid_input(self):
        ec = external_config.ExternalConfig("")
        ec.source_column_match = SourceColumnMatch.NAME
        expected = "NAME"
        result = ec.source_column_match
        self.assertEqual(expected, result)

    def _verify_base(self, ec):
        self.assertEqual(ec.autodetect, True)
        self.assertEqual(ec.compression, "compression")
@@ -251,6 +267,7 @@ def test_from_api_repr_csv(self):
                    "allowJaggedRows": False,
                    "encoding": "encoding",
                    "preserveAsciiControlCharacters": False,
                    "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
                },
            },
        )
@@ -267,6 +284,10 @@ def test_from_api_repr_csv(self):
        self.assertEqual(ec.options.allow_jagged_rows, False)
        self.assertEqual(ec.options.encoding, "encoding")
        self.assertEqual(ec.options.preserve_ascii_control_characters, False)
        self.assertEqual(
            ec.options.source_column_match,
            self.SOURCE_COLUMN_MATCH,
        )

        got_resource = ec.to_api_repr()

@@ -288,6 +309,7 @@ def test_to_api_repr_csv(self):
        options.skip_leading_rows = 123
        options.allow_jagged_rows = False
        options.preserve_ascii_control_characters = False
        options.source_column_match = self.SOURCE_COLUMN_MATCH
        ec.csv_options = options

        exp_resource = {
@@ -300,6 +322,7 @@ def test_to_api_repr_csv(self):
                "allowJaggedRows": False,
                "encoding": "encoding",
                "preserveAsciiControlCharacters": False,
                "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
            },
        }

@@ -852,6 +875,8 @@ def test_to_api_repr(self):


class CSVOptions(unittest.TestCase):
    SOURCE_COLUMN_MATCH = SourceColumnMatch.NAME

    def test_to_api_repr(self):
        options = external_config.CSVOptions()
        options.field_delimiter = "\t"
861886
options.allow_jagged_rows = False
862887
options.encoding = "UTF-8"
863888
options.preserve_ascii_control_characters = False
889+
options.source_column_match = self.SOURCE_COLUMN_MATCH
864890

865891
resource = options.to_api_repr()
866892

@@ -874,6 +900,7 @@ def test_to_api_repr(self):
                "allowJaggedRows": False,
                "encoding": "UTF-8",
                "preserveAsciiControlCharacters": False,
                "sourceColumnMatch": self.SOURCE_COLUMN_MATCH,
            },
        )