19 | 19 | import com.google.cloud.bigquery.BigQuery; |
20 | 20 | import com.google.cloud.bigquery.Dataset; |
21 | 21 | import com.google.cloud.bigquery.DatasetId; |
| 22 | +import com.google.cloud.bigquery.Table; |
22 | 23 | import com.google.cloud.hadoop.io.bigquery.BigQueryConfiguration; |
23 | 24 | import com.google.cloud.hadoop.io.bigquery.output.BigQueryTableFieldSchema; |
24 | 25 | import com.google.cloud.kms.v1.CryptoKeyName; |
|
47 | 48 | import org.slf4j.LoggerFactory; |
48 | 49 |  |
49 | 50 | import java.io.IOException; |
| 51 | +import java.util.ArrayList; |
| 52 | +import java.util.Arrays; |
| 53 | +import java.util.HashSet; |
50 | 54 | import java.util.List; |
| 55 | +import java.util.Set; |
51 | 56 | import java.util.UUID; |
52 | 57 | import java.util.stream.Collectors; |
53 | 58 |  |
@@ -145,14 +150,27 @@ public void transform(StructuredRecord input, Emitter<KeyValue<StructuredRecord, |
145 | 150 | */ |
146 | 151 | protected final void initOutput(BatchSinkContext context, BigQuery bigQuery, String outputName, String fqn, |
147 | 152 | String tableName, @Nullable Schema tableSchema, String bucket, |
148 | | - FailureCollector collector, @Nullable String marker) throws IOException { |
| 153 | + FailureCollector collector, @Nullable String marker, |
| 154 | + Table table) throws IOException { |
149 | 155 | LOG.debug("Init output for table '{}' with schema: {}", tableName, tableSchema); |
150 | 156 |  |
151 | 157 | List<BigQueryTableFieldSchema> fields = BigQuerySinkUtils.getBigQueryTableFields(bigQuery, tableName, tableSchema, |
152 | 158 | getConfig().isAllowSchemaRelaxation(), getConfig().getDatasetProject(), |
153 | 159 | getConfig().getDataset(), getConfig().isTruncateTableSet(), collector); |
154 | 160 |  |
155 | 161 | Configuration configuration = new Configuration(baseConfiguration); |
| 162 | + if (table != null) { |
| 163 | + com.google.cloud.bigquery.Schema bqSchema = table.getDefinition().getSchema(); |
| 164 | + if (bqSchema != null) { |
| 165 | + String jsonStringFields = BigQuerySinkUtils.getJsonStringFieldsFromBQSchema(bqSchema); |
| 166 | + configuration.set(BigQueryConstants.CONFIG_JSON_STRING_FIELDS, jsonStringFields); |
| 167 | + BigQuerySinkUtils.setJsonStringFields(fields, jsonStringFields); |
| 168 | + } |
| 169 | + } |
| 170 | + |
| 171 | + if (getConfig().getJsonStringFields() != null && !getConfig().getJsonStringFields().isEmpty()) { |
| 172 | + BigQuerySinkUtils.setJsonStringFields(fields, getConfig().getJsonStringFields()); |
| 173 | + } |
156 | 174 |  |
157 | 175 | // Build GCS storage path for this bucket output. |
158 | 176 | String temporaryGcsPath = BigQuerySinkUtils.getTemporaryGcsPath(bucket, runUUID.toString(), tableName); |
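
Note: `BigQuerySinkUtils.getJsonStringFieldsFromBQSchema` is not part of this diff. As a rough sketch of the idea it encapsulates, detecting JSON-typed columns on an existing table could look like the following (hypothetical helper, top-level columns only; assumes a google-cloud-bigquery version that exposes `StandardSQLTypeName.JSON`):

```java
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.Schema;
import com.google.cloud.bigquery.StandardSQLTypeName;
import java.util.stream.Collectors;

// Hypothetical sketch; not the plugin's actual implementation.
final class JsonColumnDetection {
  private JsonColumnDetection() { }

  // Collects the names of all top-level JSON-typed columns into the same
  // comma-separated format that CONFIG_JSON_STRING_FIELDS carries.
  static String jsonColumnsOf(Schema bqSchema) {
    return bqSchema.getFields().stream()
        .filter(field -> field.getType().getStandardType() == StandardSQLTypeName.JSON)
        .map(Field::getName)
        .collect(Collectors.joining(","));
  }
}
```
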
@@ -229,6 +247,7 @@ private Configuration getBaseConfiguration(@Nullable CryptoKeyName cmekKeyName) |
229 | 247 | config.isAllowSchemaRelaxation()); |
230 | 248 | baseConfiguration.setStrings(BigQueryConfiguration.OUTPUT_TABLE_WRITE_DISPOSITION.getKey(), |
231 | 249 | config.getWriteDisposition().name()); |
| 250 | + baseConfiguration.setStrings(BigQueryConstants.CONFIG_JSON_STRING_FIELDS, config.getJsonStringFields()); |
232 | 251 | // this setting is needed because gcs has default chunk size of 64MB. This is large default chunk size which can |
233 | 252 | // cause OOM issue if there are many tables being written. See this - CDAP-16670 |
234 | 253 | String gcsChunkSize = "8388608"; |
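
The property written here is a plain comma-separated string, so a consumer on the output-format side would presumably read it back and split it. A minimal, hypothetical sketch (the actual consumer is not shown in this diff):

```java
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;

// Hypothetical reader-side helper; not part of this change.
final class JsonFieldsConfigReader {
  private JsonFieldsConfigReader() { }

  // Reads the comma-separated field list back out of the job configuration,
  // returning an empty set when the property is unset or blank.
  static Set<String> readJsonFields(Configuration conf, String key) {
    String value = conf.get(key, "");
    return value.isEmpty()
        ? Collections.emptySet()
        : new HashSet<>(Arrays.asList(value.split(",")));
  }
}
```
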
@@ -310,4 +329,82 @@ protected Configuration getOutputConfiguration() throws IOException { |
310 | 329 | return configuration; |
311 | 330 | } |
312 | 331 |  |
| 332 | + /** |
| 333 | + * Validates that the fields to be converted to JSON strings are present in the output schema and are of type 'STRING'. |
| 334 | + * @param schema output schema to validate against. |
| 335 | + * @param jsonStringFields comma-separated list of the fields to be converted to JSON strings. |
| 336 | + * @param collector failure collector used to report validation failures. |
| 337 | + */ |
| 338 | + public void validateJsonStringFields(Schema schema, |
| 339 | + String jsonStringFields, FailureCollector collector) { |
| 340 | + Set<String> jsonFields = new HashSet<>(Arrays.asList(jsonStringFields.split(","))); |
| 341 | + Set<String> jsonFieldsValidated = new HashSet<>(); |
| 342 | + validateJsonStringFields(schema, jsonFields, new ArrayList<>(), collector, jsonFieldsValidated); |
| 343 | + jsonFields.removeAll(jsonFieldsValidated); |
| 344 | + if (!jsonFields.isEmpty()) { |
| 345 | + collector.addFailure(String.format("Field(s) '%s' are not present in the Output Schema.", |
| 346 | + String.join(", ", jsonFields)), |
| 347 | + "Remove the field(s) from the list of fields to be converted to JSON strings.") |
| 348 | + .withConfigProperty(AbstractBigQuerySinkConfig.NAME_JSON_STRING_FIELDS); |
| 349 | + } |
| 350 | + } |
| 351 | + |
| 352 | + private void validateJsonStringFields(Schema schema, Set<String> jsonFields, ArrayList<String> path, |
| 353 | + FailureCollector collector, Set<String> jsonFieldsValidated) { |
| 354 | + String fieldPath = String.join(".", path); |
| 355 | + String actionMessage = "Only type 'STRING' is supported."; |
| 356 | + |
| 357 | + Schema.LogicalType logicalType = schema.isNullable() ? schema.getNonNullable().getLogicalType() : |
| 358 | + schema.getLogicalType(); |
| 359 | + if (logicalType != null && jsonFields.contains(fieldPath)) { |
| 360 | + collector.addFailure( |
| 361 | + String.format("Field '%s' is of type '%s' which is not supported for conversion to JSON string.", |
| 362 | + fieldPath, logicalType), |
| 363 | + actionMessage).withConfigProperty(AbstractBigQuerySinkConfig.NAME_JSON_STRING_FIELDS); |
| 364 | + return; |
| 365 | + } |
| 366 | + Schema.Type type = getEffectiveType(schema); |
| 367 | + List<Schema.Field> fields = getEffectiveFields(schema); |
| 368 | + String errorMessage = String.format( |
| 369 | + "Field '%s' is of type '%s' which is not supported for conversion to JSON string.", fieldPath, type); |
| 370 | + |
| 371 | + if (type == Schema.Type.RECORD && fields != null) { |
| 372 | + if (jsonFields.contains(fieldPath)) { |
| 373 | + collector.addFailure(errorMessage, actionMessage) |
| 374 | + .withConfigProperty(AbstractBigQuerySinkConfig.NAME_JSON_STRING_FIELDS); |
| 375 | + } |
| 376 | + for (Schema.Field field : fields) { |
| 377 | + path.add(field.getName()); |
| 378 | + validateJsonStringFields(field.getSchema(), jsonFields, path, collector, jsonFieldsValidated); |
| 379 | + path.remove(path.size() - 1); |
| 380 | + } |
| 381 | + } else { |
| 382 | + jsonFieldsValidated.add(fieldPath); |
| 383 | + if (type != Schema.Type.STRING && jsonFields.contains(fieldPath)) { |
| 384 | + collector.addFailure(errorMessage, actionMessage) |
| 385 | + .withConfigProperty(AbstractBigQuerySinkConfig.NAME_JSON_STRING_FIELDS); |
| 386 | + } |
| 387 | + } |
| 388 | + } |
| 389 | + |
| 390 | + private static Schema.Type getEffectiveType(Schema schema) { |
| 391 | + Schema nonNullableSchema = schema.isNullable() ? schema.getNonNullable() : schema; |
| 392 | + if (nonNullableSchema.getType() == Schema.Type.ARRAY && nonNullableSchema.getComponentSchema() != null) { |
| 393 | + return nonNullableSchema.getComponentSchema().isNullable() ? |
| 394 | + nonNullableSchema.getComponentSchema().getNonNullable().getType() : |
| 395 | + nonNullableSchema.getComponentSchema().getType(); |
| 396 | + } |
| 397 | + return nonNullableSchema.getType(); |
| 398 | + } |
| 399 | + |
| 400 | + private static List<Schema.Field> getEffectiveFields(Schema schema) { |
| 401 | + Schema nonNullableSchema = schema.isNullable() ? schema.getNonNullable() : schema; |
| 402 | + if (nonNullableSchema.getType() == Schema.Type.ARRAY && nonNullableSchema.getComponentSchema() != null) { |
| 403 | + return nonNullableSchema.getComponentSchema().isNullable() ? |
| 404 | + nonNullableSchema.getComponentSchema().getNonNullable().getFields() : |
| 405 | + nonNullableSchema.getComponentSchema().getFields(); |
| 406 | + } |
| 407 | + return nonNullableSchema.getFields(); |
| 408 | + } |
| 409 | + |
313 | 410 | } |
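
A usage sketch for the new validation (assuming the enclosing class is the plugin's `AbstractBigQuerySink`, and using a mock failure collector as in CDAP plugin tests; `sink` is a hypothetical configured instance). Nested fields are referenced by dot-separated paths, since the recursive walk joins path segments with `'.'`:

```java
import io.cdap.cdap.api.data.schema.Schema;
import io.cdap.cdap.etl.mock.validation.MockFailureCollector;

// Hypothetical usage example; names are illustrative only.
class ValidateJsonStringFieldsExample {
  static void example(AbstractBigQuerySink sink) {
    // Output schema with a nested record; "attributes.payload" is a STRING
    // leaf and therefore a valid JSON-string candidate.
    Schema schema = Schema.recordOf("output",
        Schema.Field.of("id", Schema.of(Schema.Type.LONG)),
        Schema.Field.of("attributes", Schema.recordOf("attributes",
            Schema.Field.of("payload", Schema.of(Schema.Type.STRING)))));

    MockFailureCollector ok = new MockFailureCollector("bqsink");
    sink.validateJsonStringFields(schema, "attributes.payload", ok);
    // No failures: the path resolves to a STRING leaf.

    MockFailureCollector bad = new MockFailureCollector("bqsink");
    sink.validateJsonStringFields(schema, "missing", bad);
    // One failure: "missing" is not present in the output schema.
  }
}
```
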