
Commit a6dc6f0

Add support to configure BigQuery clustering (#265)
1 parent 5e2dd7c commit a6dc6f0
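For orientation before the diff, here is a minimal sketch of how a sink task might opt into the new behaviour. Only the clusteringPartitionFieldNames key is defined in this commit; the timestampPartitionFieldName and bigQueryPartitionDecorator key names are assumed from the existing validation message, and the field values are purely illustrative.

import java.util.HashMap;
import java.util.Map;

public class ClusteringConfigSketch {
  public static void main(String[] args) {
    // Hypothetical subset of the sink connector properties; all other
    // required connector settings are omitted for brevity.
    Map<String, String> props = new HashMap<>();
    // Clustering is only accepted on a partitioned table, so pair it with a
    // timestamp partitioning field (key name taken from the validation message).
    props.put("timestampPartitionFieldName", "event_time");
    // New in this commit: up to four comma-separated clustering field names.
    props.put("clusteringPartitionFieldNames", "customer_id,region");
    props.forEach((key, value) -> System.out.println(key + "=" + value));
  }
}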

File tree: 5 files changed (+168, -23 lines)

kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/BigQuerySinkTask.java

Lines changed: 14 additions & 12 deletions

@@ -58,6 +58,7 @@
 import java.time.Instant;
 import java.util.Collection;
 import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.UUID;
@@ -143,17 +144,17 @@ private PartitionedTableId getRecordTable(SinkRecord record) {
     TableId baseTableId = topicsToBaseTableIds.get(record.topic());

     PartitionedTableId.Builder builder = new PartitionedTableId.Builder(baseTableId);
-    if(usePartitionDecorator) {
-
-      if (useMessageTimeDatePartitioning) {
-        if (record.timestampType() == TimestampType.NO_TIMESTAMP_TYPE) {
-          throw new ConnectException(
-              "Message has no timestamp type, cannot use message timestamp to partition.");
-        }
-        builder.setDayPartition(record.timestamp());
-      } else {
-        builder.setDayPartitionForNow();
-      }
+    if (usePartitionDecorator) {
+
+      if (useMessageTimeDatePartitioning) {
+        if (record.timestampType() == TimestampType.NO_TIMESTAMP_TYPE) {
+          throw new ConnectException(
+              "Message has no timestamp type, cannot use message timestamp to partition.");
+        }
+        builder.setDayPartition(record.timestamp());
+      } else {
+        builder.setDayPartitionForNow();
+      }
     }

     return builder.build();
@@ -266,8 +267,9 @@ private SchemaManager getSchemaManager(BigQuery bigQuery) {
     Optional<String> kafkaKeyFieldName = config.getKafkaKeyFieldName();
     Optional<String> kafkaDataFieldName = config.getKafkaDataFieldName();
     Optional<String> timestampPartitionFieldName = config.getTimestampPartitionFieldName();
+    Optional<List<String>> clusteringFieldName = config.getClusteringPartitionFieldName();
     return new SchemaManager(schemaRetriever, schemaConverter, bigQuery, kafkaKeyFieldName,
-        kafkaDataFieldName, timestampPartitionFieldName);
+        kafkaDataFieldName, timestampPartitionFieldName, clusteringFieldName);
   }

   private BigQueryWriter getBigQueryWriter() {

kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/SchemaManager.java

Lines changed: 17 additions & 5 deletions

@@ -7,6 +7,7 @@
 import com.google.cloud.bigquery.StandardTableDefinition;
 import com.google.cloud.bigquery.TableId;
 import com.google.cloud.bigquery.TableInfo;
+import com.google.cloud.bigquery.Clustering;
 import com.google.cloud.bigquery.TimePartitioning;
 import com.google.cloud.bigquery.TimePartitioning.Type;
 import com.wepay.kafka.connect.bigquery.api.KafkaSchemaRecordType;
@@ -34,6 +35,7 @@ public class SchemaManager {
   private final Optional<String> kafkaKeyFieldName;
   private final Optional<String> kafkaDataFieldName;
   private final Optional<String> timestampPartitionFieldName;
+  private final Optional<List<String>> clusteringFieldName;

   /**
    * @param schemaRetriever Used to determine the Kafka Connect Schema that should be used for a
@@ -51,13 +53,15 @@ public SchemaManager(
       BigQuery bigQuery,
       Optional<String> kafkaKeyFieldName,
       Optional<String> kafkaDataFieldName,
-      Optional<String> timestampPartitionFieldName) {
+      Optional<String> timestampPartitionFieldName,
+      Optional<List<String>> clusteringFieldName) {
     this.schemaRetriever = schemaRetriever;
     this.schemaConverter = schemaConverter;
     this.bigQuery = bigQuery;
     this.kafkaKeyFieldName = kafkaKeyFieldName;
     this.kafkaDataFieldName = kafkaDataFieldName;
     this.timestampPartitionFieldName = timestampPartitionFieldName;
+    this.clusteringFieldName = clusteringFieldName;
   }

   /**
@@ -90,14 +94,22 @@ TableInfo constructTableInfo(TableId table, Schema kafkaKeySchema, Schema kafkaV
     com.google.cloud.bigquery.Schema bigQuerySchema = getBigQuerySchema(kafkaKeySchema, kafkaValueSchema);

     TimePartitioning timePartitioning = TimePartitioning.of(Type.DAY);
-    if (timestampPartitionFieldName.isPresent()){
+    if (timestampPartitionFieldName.isPresent()) {
       timePartitioning = timePartitioning.toBuilder().setField(timestampPartitionFieldName.get()).build();
     }

-    StandardTableDefinition tableDefinition = StandardTableDefinition.newBuilder()
+    StandardTableDefinition.Builder builder = StandardTableDefinition.newBuilder()
         .setSchema(bigQuerySchema)
-        .setTimePartitioning(timePartitioning)
-        .build();
+        .setTimePartitioning(timePartitioning);
+
+    if (timestampPartitionFieldName.isPresent() && clusteringFieldName.isPresent()) {
+      Clustering clustering = Clustering.newBuilder()
+          .setFields(clusteringFieldName.get())
+          .build();
+      builder.setClustering(clustering);
+    }
+
+    StandardTableDefinition tableDefinition = builder.build();
     TableInfo.Builder tableInfoBuilder =
         TableInfo.newBuilder(table, tableDefinition);
     if (kafkaValueSchema.doc() != null) {
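For readers less familiar with the google-cloud-bigquery builder API used above, the following is a minimal standalone sketch of how a day-partitioned, clustered table definition is assembled. The dataset, table, and column names are illustrative, and the real SchemaManager derives the schema from the Kafka key/value schemas rather than declaring fields by hand.

import com.google.cloud.bigquery.BigQuery;
import com.google.cloud.bigquery.BigQueryOptions;
import com.google.cloud.bigquery.Clustering;
import com.google.cloud.bigquery.Field;
import com.google.cloud.bigquery.LegacySQLTypeName;
import com.google.cloud.bigquery.Schema;
import com.google.cloud.bigquery.StandardTableDefinition;
import com.google.cloud.bigquery.TableId;
import com.google.cloud.bigquery.TableInfo;
import com.google.cloud.bigquery.TimePartitioning;

import java.util.Arrays;

public class ClusteredTableSketch {
  public static void main(String[] args) {
    BigQuery bigQuery = BigQueryOptions.getDefaultInstance().getService();

    // Illustrative schema; the connector converts the Kafka Connect schema instead.
    Schema schema = Schema.of(
        Field.of("event_time", LegacySQLTypeName.TIMESTAMP),
        Field.of("customer_id", LegacySQLTypeName.STRING),
        Field.of("region", LegacySQLTypeName.STRING));

    // Day partitioning on a named timestamp field, mirroring the
    // timestampPartitionFieldName path in constructTableInfo.
    TimePartitioning timePartitioning = TimePartitioning.of(TimePartitioning.Type.DAY)
        .toBuilder()
        .setField("event_time")
        .build();

    // Clustering is only valid on a partitioned table; BigQuery accepts at most four fields.
    Clustering clustering = Clustering.newBuilder()
        .setFields(Arrays.asList("customer_id", "region"))
        .build();

    StandardTableDefinition tableDefinition = StandardTableDefinition.newBuilder()
        .setSchema(schema)
        .setTimePartitioning(timePartitioning)
        .setClustering(clustering)
        .build();

    // The connector builds the TableInfo here and creates the table elsewhere.
    TableInfo tableInfo =
        TableInfo.newBuilder(TableId.of("my_dataset", "my_table"), tableDefinition).build();
    bigQuery.create(tableInfo);
  }
}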

kcbq-connector/src/main/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkTaskConfig.java

Lines changed: 45 additions & 3 deletions

@@ -18,6 +18,7 @@
  */


+import java.util.List;
 import java.util.Optional;
 import org.apache.kafka.common.config.ConfigDef;
 import org.apache.kafka.common.config.ConfigException;
@@ -118,6 +119,14 @@ public class BigQuerySinkTaskConfig extends BigQuerySinkConfig {
       + " and enable timestamp partitioning for each table. Leave this configuration blank,"
       + " to enable ingestion time partitioning for each table.";

+  public static final String BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG = "clusteringPartitionFieldNames";
+  private static final ConfigDef.Type BIGQUERY_CLUSTERING_FIELD_NAMES_TYPE = ConfigDef.Type.LIST;
+  private static final List<String> BIGQUERY_CLUSTERING_FIELD_NAMES_DEFAULT = null;
+  private static final ConfigDef.Importance BIGQUERY_CLUSTERING_FIELD_NAMES_IMPORTANCE =
+      ConfigDef.Importance.LOW;
+  private static final String BIGQUERY_CLUSTERING_FIELD_NAMES_DOC =
+      "List of fields on which data should be clustered in BigQuery, separated by commas";
+
   static {
     config = BigQuerySinkConfig.getConfig()
         .define(
@@ -172,6 +181,12 @@ public class BigQuerySinkTaskConfig extends BigQuerySinkConfig {
         BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_DEFAULT,
         BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_IMPORTANCE,
         BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_DOC
+    ).define(
+        BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG,
+        BIGQUERY_CLUSTERING_FIELD_NAMES_TYPE,
+        BIGQUERY_CLUSTERING_FIELD_NAMES_DEFAULT,
+        BIGQUERY_CLUSTERING_FIELD_NAMES_IMPORTANCE,
+        BIGQUERY_CLUSTERING_FIELD_NAMES_DOC
     );
   }

@@ -200,18 +215,44 @@ public Optional<String> getTimestampPartitionFieldName() {
     return Optional.ofNullable(getString(BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG));
   }

+  /**
+   * Returns the field names to use for clustering.
+   * @return List of Strings that represent the field names.
+   */
+  public Optional<List<String>> getClusteringPartitionFieldName() {
+    return Optional.ofNullable(getList(BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG));
+  }
+
   /**
    * Check the validity of table partitioning configs.
    */
-  private void checkPartitionCofigs() {
-    if (getTimestampPartitionFieldName().isPresent() && getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)){
+  private void checkPartitionConfigs() {
+    if (getTimestampPartitionFieldName().isPresent() && getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)) {
       throw new ConfigException(
           "Only one partitioning configuration mode may be specified for the connector. "
           + "Use either bigQueryPartitionDecorator OR timestampPartitionFieldName."
       );
     }
   }

+  /**
+   * Check the validity of table clustering configs.
+   */
+  private void checkClusteringConfigs() {
+    if (getClusteringPartitionFieldName().isPresent()) {
+      if (!getTimestampPartitionFieldName().isPresent() && !getBoolean(BIGQUERY_PARTITION_DECORATOR_CONFIG)) {
+        throw new ConfigException(
+            "Clustering field names may be specified only on a partitioned table."
+        );
+      }
+      if (getClusteringPartitionFieldName().get().size() > 4) {
+        throw new ConfigException(
+            "You can only specify up to four clustering field names."
+        );
+      }
+    }
+  }
+
   public static ConfigDef getConfig() {
     return config;
   }
@@ -222,6 +263,7 @@ public static ConfigDef getConfig() {
   public BigQuerySinkTaskConfig(Map<String, String> properties) {
     super(config, properties);
     checkAutoUpdateSchemas();
-    checkPartitionCofigs();
+    checkPartitionConfigs();
+    checkClusteringConfigs();
   }
 }

kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/SchemaManagerTest.java

Lines changed: 25 additions & 2 deletions

@@ -37,6 +37,8 @@
 import org.junit.Before;
 import org.junit.Test;

+import java.util.Arrays;
+import java.util.List;
 import java.util.Optional;

 public class SchemaManagerTest {
@@ -68,7 +70,7 @@ public void testBQTableDescription() {
     Optional<String> kafkaKeyFieldName = Optional.of("kafkaKey");
     Optional<String> kafkaDataFieldName = Optional.of("kafkaData");
     SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter,
-        mockBigQuery, kafkaKeyFieldName, kafkaDataFieldName, Optional.empty());
+        mockBigQuery, kafkaKeyFieldName, kafkaDataFieldName, Optional.empty(), Optional.empty());

     when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema);
     when(mockKafkaSchema.doc()).thenReturn(testDoc);
@@ -86,7 +88,7 @@ public void testBQTableDescription() {
   public void testTimestampPartitionSet() {
     Optional<String> testField = Optional.of("testField");
     SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter,
-        mockBigQuery, Optional.empty(), Optional.empty(), testField);
+        mockBigQuery, Optional.empty(), Optional.empty(), testField, Optional.empty());

     when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema);
     when(mockKafkaSchema.doc()).thenReturn(testDoc);
@@ -101,4 +103,25 @@ public void testTimestampPartitionSet() {
         ((StandardTableDefinition) tableInfo.getDefinition()).getTimePartitioning().getField());
   }

+  @Test
+  public void testClusteringPartitionSet() {
+    Optional<String> timestampPartitionFieldName = Optional.of("testField");
+    Optional<List<String>> testField = Optional.of(Arrays.asList("column1", "column2"));
+    SchemaManager schemaManager = new SchemaManager(mockSchemaRetriever, mockSchemaConverter,
+        mockBigQuery, Optional.empty(), Optional.empty(), timestampPartitionFieldName, testField);
+
+    when(mockSchemaConverter.convertSchema(mockKafkaSchema)).thenReturn(fakeBigQuerySchema);
+    when(mockKafkaSchema.doc()).thenReturn(testDoc);
+
+    TableInfo tableInfo = schemaManager
+        .constructTableInfo(tableId, mockKafkaSchema, mockKafkaSchema);
+
+    Assert.assertEquals("Kafka doc does not match BigQuery table description",
+        testDoc, tableInfo.getDescription());
+    StandardTableDefinition definition = tableInfo.getDefinition();
+    Assert.assertNotNull(definition.getClustering());
+    Assert.assertEquals("The field names do not match the configured clustering fields",
+        testField.get(),
+        definition.getClustering().getFields());
+  }
 }

kcbq-connector/src/test/java/com/wepay/kafka/connect/bigquery/config/BigQuerySinkTaskConfigTest.java

Lines changed: 67 additions & 1 deletion

@@ -18,9 +18,9 @@
  */


+import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;

 import com.wepay.kafka.connect.bigquery.SinkTaskPropertiesFactory;

@@ -29,7 +29,11 @@
 import org.junit.Before;
 import org.junit.Test;

+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
 import java.util.Map;
+import java.util.Optional;

 public class BigQuerySinkTaskConfigTest {
   private SinkTaskPropertiesFactory propertiesFactory;
@@ -97,6 +101,68 @@ public void testTimestampPartitionFieldName() {
     assertFalse(testConfig.getBoolean(BigQuerySinkTaskConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG));
   }

+  /**
+   * Test that the clustering field names default is not present.
+   */
+  @Test
+  public void testEmptyClusteringFieldNames() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    BigQuerySinkTaskConfig testConfig = new BigQuerySinkTaskConfig(configProperties);
+    assertFalse(testConfig.getClusteringPartitionFieldName().isPresent());
+  }
+
+  /**
+   * Test that non-empty clustering field names without any partitioning configuration raise an error.
+   */
+  @Test (expected = ConfigException.class)
+  public void testClusteringFieldNamesWithoutTimestampPartitionError() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG, null);
+    configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "false");
+    configProperties.put(
+        BigQuerySinkTaskConfig.BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG,
+        "column1,column2"
+    );
+    new BigQuerySinkTaskConfig(configProperties);
+  }
+
+  /**
+   * Test that more than four clustering field names raise an error.
+   */
+  @Test (expected = ConfigException.class)
+  public void testClusteringPartitionFieldNamesWithMoreThanFourFieldsError() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "true");
+    configProperties.put(
+        BigQuerySinkTaskConfig.BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG,
+        "column1,column2,column3,column4,column5"
+    );
+    new BigQuerySinkTaskConfig(configProperties);
+  }
+
+  /**
+   * Test that non-empty clustering field names together with a timestamp partitioning field work correctly.
+   */
+  @Test
+  public void testClusteringFieldNames() {
+    Map<String, String> configProperties = propertiesFactory.getProperties();
+    configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_TIMESTAMP_PARTITION_FIELD_NAME_CONFIG, "name");
+    configProperties.put(BigQuerySinkTaskConfig.BIGQUERY_PARTITION_DECORATOR_CONFIG, "false");
+    configProperties.put(
+        BigQuerySinkTaskConfig.BIGQUERY_CLUSTERING_FIELD_NAMES_CONFIG,
+        "column1,column2"
+    );
+
+    ArrayList<String> expectedClusteringPartitionFieldName = new ArrayList<>(
+        Arrays.asList("column1", "column2")
+    );
+
+    BigQuerySinkTaskConfig testConfig = new BigQuerySinkTaskConfig(configProperties);
+    Optional<List<String>> testClusteringPartitionFieldName = testConfig.getClusteringPartitionFieldName();
+    assertTrue(testClusteringPartitionFieldName.isPresent());
+    assertEquals(expectedClusteringPartitionFieldName, testClusteringPartitionFieldName.get());
+  }
+
   @Test(expected = ConfigException.class)
   public void testAutoSchemaUpdateWithoutRetriever() {
     Map<String, String> badConfigProperties = propertiesFactory.getProperties();