Commit a00eb5b

Support BigQuery custom schemas for external data using CSV / NDJSON (#3717) (#2264)

* support custom external_data_configuration.schema for CSV and NDJSON formats
* fix linting error
* fix mixed indentation
* improve on documentation

Signed-off-by: Modular Magician <[email protected]>
1 parent 6c0ab3d commit a00eb5b

File tree

4 files changed: +139 additions, -18 deletions

.changelog/3717.txt

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+```release-note:enhancement
+bigquery: added support for BigQuery custom schemas for external data using CSV / NDJSON
+```

google-beta/resource_bigquery_table.go

Lines changed: 41 additions & 5 deletions
@@ -109,6 +109,21 @@ func resourceBigQueryTable() *schema.Resource {
 				Default:     "NONE",
 				Description: `The compression type of the data source. Valid values are "NONE" or "GZIP".`,
 			},
+			// Schema: [Optional] The schema for the data.
+			// Schema is required for CSV and JSON formats if autodetect is not on.
+			// Schema is disallowed for Google Cloud Bigtable, Cloud Datastore backups, Avro, ORC and Parquet formats.
+			"schema": {
+				Type:         schema.TypeString,
+				Optional:     true,
+				Computed:     true,
+				ForceNew:     true,
+				ValidateFunc: validation.ValidateJsonString,
+				StateFunc: func(v interface{}) string {
+					json, _ := structure.NormalizeJsonString(v)
+					return json
+				},
+				Description: `A JSON schema for the external table. Schema is required for CSV and JSON formats and is disallowed for Google Cloud Bigtable, Cloud Datastore backups, and Avro formats when using external tables.`,
+			},
 			// CsvOptions: [Optional] Additional properties to set if
 			// sourceFormat is set to CSV.
 			"csv_options": {
@@ -275,9 +290,6 @@ func resourceBigQueryTable() *schema.Resource {
 			},

 			// Schema: [Optional] Describes the schema of this table.
-			// Schema is required for external tables in CSV and JSON formats
-			// and disallowed for Google Cloud Bigtable, Cloud Datastore backups,
-			// and Avro formats.
 			"schema": {
 				Type:     schema.TypeString,
 				Optional: true,
@@ -287,7 +299,7 @@ func resourceBigQueryTable() *schema.Resource {
 					json, _ := structure.NormalizeJsonString(v)
 					return json
 				},
-				Description: `A JSON schema for the table. Schema is required for CSV and JSON formats and is disallowed for Google Cloud Bigtable, Cloud Datastore backups, and Avro formats when using external tables.`,
+				Description: `A JSON schema for the table.`,
 			},

 			// View: [Optional] If specified, configures this table as a view.
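
Both the new `external_data_configuration.schema` field and the existing top-level `schema` field run the configured JSON through `structure.NormalizeJsonString` in their `StateFunc`, so formatting-only differences are never stored as drift. A minimal stand-alone sketch of that normalization idea, using only the Go standard library (illustrative; the SDK's `structure.NormalizeJsonString` is the real implementation):

```go
package main

import (
	"encoding/json"
	"fmt"
)

// normalize re-marshals a JSON string into a compact form with sorted object
// keys (encoding/json sorts map keys when marshaling), so two strings that
// differ only in whitespace or key order compare equal.
func normalize(s string) (string, error) {
	var v interface{}
	if err := json.Unmarshal([]byte(s), &v); err != nil {
		return s, err
	}
	b, err := json.Marshal(v)
	return string(b), err
}

func main() {
	a := `[{"name": "name",  "type": "STRING"}]`
	b := `[{"type":"STRING","name":"name"}]`
	na, _ := normalize(a)
	nb, _ := normalize(b)
	fmt.Println(na == nb) // true: both normalize to the same stored value
}
```

Paired with `ValidateFunc: validation.ValidateJsonString`, malformed JSON is rejected at plan time instead of surfacing later as an API error.
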
@@ -636,7 +648,6 @@ func resourceBigQueryTableCreate(d *schema.ResourceData, meta interface{}) error {
 	}

 	log.Printf("[INFO] BigQuery table %s has been created", res.Id)
-
 	d.SetId(fmt.Sprintf("projects/%s/datasets/%s/tables/%s", res.TableReference.ProjectId, res.TableReference.DatasetId, res.TableReference.TableId))

 	return resourceBigQueryTableRead(d, meta)
@@ -683,6 +694,24 @@ func resourceBigQueryTableRead(d *schema.ResourceData, meta interface{}) error {
 			return err
 		}

+		if v, ok := d.GetOk("external_data_configuration"); ok {
+			// The API response doesn't return the `external_data_configuration.schema`
+			// used when creating the table, and it cannot be queried.
+			// After creation, a computed schema is stored in the top-level `schema`,
+			// which combines `external_data_configuration.schema`
+			// with any hive partitioning fields found in the `source_uri_prefix`.
+			// So just assume the configured schema has been applied after successful
+			// creation by copying the configured value back into the resource schema.
+			// This prevents the read-back of this field from being flagged as a change.
+			// The `ForceNew=true` on `external_data_configuration.schema` ensures
+			// the user's expectation that changing the configured input schema will
+			// recreate the resource.
+			edc := v.([]interface{})[0].(map[string]interface{})
+			if edc["schema"] != nil {
+				externalDataConfiguration[0]["schema"] = edc["schema"]
+			}
+		}
+
 		d.Set("external_data_configuration", externalDataConfiguration)
 	}
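
The hunk above is an instance of a general pattern for write-only API fields: the value is sent on create but never returned on read, so the provider copies the configured value back into state to keep plans clean. A self-contained sketch of the pattern (the names here are invented for illustration and are not the provider's helpers):

```go
package main

import "fmt"

// mergeWriteOnly copies a field that the API accepts on create but never
// returns from the configured state over the freshly-read API state, so the
// next plan sees no spurious diff for it.
func mergeWriteOnly(configured, fromAPI map[string]interface{}, key string) {
	if v, ok := configured[key]; ok && v != nil {
		fromAPI[key] = v
	}
}

func main() {
	configured := map[string]interface{}{
		"schema": `[{"name":"name","type":"STRING"}]`,
	}
	// The API response carries everything except the write-only "schema".
	fromAPI := map[string]interface{}{
		"source_format": "NEWLINE_DELIMITED_JSON",
	}
	mergeWriteOnly(configured, fromAPI, "schema")
	fmt.Println(fromAPI["schema"]) // the configured schema survives the read
}
```

Since drift genuinely cannot be detected for such a field, it also carries `ForceNew: true`: any change to the configured schema recreates the table rather than attempting an in-place update the API cannot express.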

@@ -804,6 +833,13 @@ func expandExternalDataConfiguration(cfg interface{}) (*bigquery.ExternalDataConfiguration, error) {
 	if v, ok := raw["max_bad_records"]; ok {
 		edc.MaxBadRecords = int64(v.(int))
 	}
+	if v, ok := raw["schema"]; ok {
+		schema, err := expandSchema(v)
+		if err != nil {
+			return nil, err
+		}
+		edc.Schema = schema
+	}
 	if v, ok := raw["source_format"]; ok {
 		edc.SourceFormat = v.(string)
 	}
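
`expandSchema` already exists in this file for the top-level `schema` field; reusing it here means `external_data_configuration.schema` accepts exactly the JSON format users already write for `google_bigquery_table.schema`. Conceptually it unmarshals the configured JSON list of field definitions into the API's schema type, roughly like this sketch (assuming `google.golang.org/api/bigquery/v2`; the provider's actual helper may differ in detail):

```go
package main

import (
	"encoding/json"
	"fmt"

	bigquery "google.golang.org/api/bigquery/v2"
)

// expandSchemaSketch converts a JSON string describing a list of fields into
// the API's TableSchema type. Illustrative only; see expandSchema in
// resource_bigquery_table.go for the real code path.
func expandSchemaSketch(raw interface{}) (*bigquery.TableSchema, error) {
	var fields []*bigquery.TableFieldSchema
	if err := json.Unmarshal([]byte(raw.(string)), &fields); err != nil {
		return nil, err
	}
	return &bigquery.TableSchema{Fields: fields}, nil
}

func main() {
	s, err := expandSchemaSketch(`[{"name":"name","type":"STRING"},{"name":"last_modification","type":"DATE"}]`)
	if err != nil {
		panic(err)
	}
	fmt.Println(len(s.Fields), s.Fields[0].Name) // 2 name
}
```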

google-beta/resource_bigquery_table_test.go

Lines changed: 82 additions & 8 deletions
@@ -119,6 +119,31 @@ func TestAccBigQueryTable_HivePartitioning(t *testing.T) {
 	})
 }

+func TestAccBigQueryTable_HivePartitioningCustomSchema(t *testing.T) {
+	t.Parallel()
+	bucketName := testBucketName(t)
+	resourceName := "google_bigquery_table.test"
+	datasetID := fmt.Sprintf("tf_test_%s", randString(t, 10))
+	tableID := fmt.Sprintf("tf_test_%s", randString(t, 10))
+
+	vcrTest(t, resource.TestCase{
+		PreCheck:     func() { testAccPreCheck(t) },
+		Providers:    testAccProviders,
+		CheckDestroy: testAccCheckBigQueryTableDestroyProducer(t),
+		Steps: []resource.TestStep{
+			{
+				Config: testAccBigQueryTableHivePartitioningCustomSchema(bucketName, datasetID, tableID),
+			},
+			{
+				ResourceName:            resourceName,
+				ImportState:             true,
+				ImportStateVerify:       true,
+				ImportStateVerifyIgnore: []string{"external_data_configuration.0.schema"},
+			},
+		},
+	})
+}
+
 func TestAccBigQueryTable_RangePartitioning(t *testing.T) {
 	t.Parallel()
 	resourceName := "google_bigquery_table.test"
@@ -480,23 +505,72 @@ resource "google_storage_bucket_object" "test" {
 }

 resource "google_bigquery_dataset" "test" {
-  dataset_id = "%s"
+  dataset_id = "%s"
 }

 resource "google_bigquery_table" "test" {
   table_id   = "%s"
   dataset_id = google_bigquery_dataset.test.dataset_id

   external_data_configuration {
-    source_format = "CSV"
-    autodetect = true
-    source_uris= ["gs://${google_storage_bucket.test.name}/*"]
+    source_format = "CSV"
+    autodetect    = true
+    source_uris   = ["gs://${google_storage_bucket.test.name}/*"]

-    hive_partitioning_options {
-      mode = "AUTO"
-      source_uri_prefix = "gs://${google_storage_bucket.test.name}/"
-    }
+    hive_partitioning_options {
+      mode              = "AUTO"
+      source_uri_prefix = "gs://${google_storage_bucket.test.name}/"
+    }

+  }
+  depends_on = ["google_storage_bucket_object.test"]
+}
+`, bucketName, datasetID, tableID)
+}
+
+func testAccBigQueryTableHivePartitioningCustomSchema(bucketName, datasetID, tableID string) string {
+	return fmt.Sprintf(`
+resource "google_storage_bucket" "test" {
+  name          = "%s"
+  force_destroy = true
+}
+
+resource "google_storage_bucket_object" "test" {
+  name    = "key1=20200330/data.json"
+  content = "{\"name\":\"test\", \"last_modification\":\"2020-04-01\"}"
+  bucket  = google_storage_bucket.test.name
+}
+
+resource "google_bigquery_dataset" "test" {
+  dataset_id = "%s"
+}
+
+resource "google_bigquery_table" "test" {
+  table_id   = "%s"
+  dataset_id = google_bigquery_dataset.test.dataset_id
+
+  external_data_configuration {
+    source_format = "NEWLINE_DELIMITED_JSON"
+    autodetect    = false
+    source_uris   = ["gs://${google_storage_bucket.test.name}/*"]
+
+    hive_partitioning_options {
+      mode              = "CUSTOM"
+      source_uri_prefix = "gs://${google_storage_bucket.test.name}/{key1:STRING}"
+    }
+
+    schema = <<EOH
+[
+  {
+    "name": "name",
+    "type": "STRING"
+  },
+  {
+    "name": "last_modification",
+    "type": "DATE"
+  }
+]
+EOH
   }
   depends_on = ["google_storage_bucket_object.test"]
 }

website/docs/r/bigquery_table.html.markdown

Lines changed: 13 additions & 5 deletions
@@ -112,11 +112,7 @@ The following arguments are supported:

 * `labels` - (Optional) A mapping of labels to assign to the resource.

-* `schema` - (Optional) A JSON schema for the table. Schema is required
-  for CSV and JSON formats and is disallowed for Google Cloud
-  Bigtable, Cloud Datastore backups, and Avro formats when using
-  external tables. For more information see the
-  [BigQuery API documentation](https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#resource).
+* `schema` - (Optional) A JSON schema for the table.
   ~>**NOTE**: Because this field expects a JSON string, any changes to the
   string will create a diff, even if the JSON itself hasn't changed.
   If the API returns a different value for the same schema, e.g. it
@@ -167,6 +163,18 @@ The `external_data_configuration` block supports:
 * `max_bad_records` (Optional) - The maximum number of bad records that
   BigQuery can ignore when reading data.

+* `schema` - (Optional) A JSON schema for the external table. Schema is required
+  for CSV and JSON formats if autodetect is not on. Schema is disallowed
+  for Google Cloud Bigtable, Cloud Datastore backups, Avro, ORC and Parquet formats.
+  ~>**NOTE**: Because this field expects a JSON string, any changes to the
+  string will create a diff, even if the JSON itself hasn't changed.
+  Furthermore, drift for this field cannot be detected because BigQuery
+  only uses this schema to compute the effective schema for the table; therefore
+  any changes to the configured value will force the table to be recreated.
+  This schema is effectively only applied when creating a table from an external
+  datasource; after creation, the computed schema will be stored in
+  `google_bigquery_table.schema`.
+
 * `source_format` (Required) - The data format. Supported values are:
   "CSV", "GOOGLE_SHEETS", "NEWLINE_DELIMITED_JSON", "AVRO", "PARQUET",
   and "DATASTORE_BACKUP". To use "GOOGLE_SHEETS"
