[FLINK-36913][pipeline-connector][kafka] Add sink.table.mapping option to allow user to define a custom map for upstream TableId to downstream topic name.

lvyanquan · lvyanquan · commit 399b286950ff · 2024-12-17T21:16:06.000+08:00
diff --git a/docs/content.zh/docs/connectors/pipeline-connectors/kafka.md b/docs/content.zh/docs/connectors/pipeline-connectors/kafka.md
@@ -143,6 +143,13 @@ Pipeline 连接器配置项
       <td>String</td>
       <td>Kafka 记录自定义的 Header。每个 Header 使用 ','分割， 键值使用 ':' 分割。举例来说，可以使用这种方式 'key1:value1,key2:value2'。 </td>
     </tr>
+    <tr>
+      <td>sink.table.mapping</td>
+      <td>optional</td>
+      <td style="word-wrap: break-word;">(none)</td>
+      <td>String</td>
+      <td>自定义的上游表名到下游 Kafka Topic 名的映射关系。 每个映射关系由 `;` 分割，上游表的 TableId 和下游 Kafka 的 Topic 名由 `:` 分割。 举个例子，我们可以配置 `sink.table.mapping` 的值为 `mydb.mytable1:topic1;mydb.mytable2:topic2`。 </td>
+    </tr>
     </tbody>
 </table>    
 </div>
diff --git a/docs/content/docs/connectors/pipeline-connectors/kafka.md b/docs/content/docs/connectors/pipeline-connectors/kafka.md
@@ -141,6 +141,13 @@ Pipeline Connector Options
       <td>String</td>
       <td>custom headers for each kafka record. Each header are separated by ',', separate key and value by ':'. For example, we can set headers like 'key1:value1,key2:value2'. </td>
     </tr>
+    <tr>
+      <td>sink.table.mapping</td>
+      <td>optional</td>
+      <td style="word-wrap: break-word;">(none)</td>
+      <td>String</td>
+      <td>Custom table mappings for each table from upstream tableId to downstream Kafka topic. Each mapping is separated by `;`, separate upstream tableId and downstream Kafka topic by `:`, For example, we can set `sink.table.mapping` like `mydb.mytable1:topic1;mydb.mytable2:topic2`. </td>
+    </tr>
     </tbody>
 </table>    
 </div>
diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/main/java/org/apache/flink/cdc/connectors/kafka/sink/KafkaDataSink.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/main/java/org/apache/flink/cdc/connectors/kafka/sink/KafkaDataSink.java
@@ -53,6 +53,8 @@ public class KafkaDataSink implements DataSink {
 
     final String customHeaders;
 
+    final String tableMapping;
+
     public KafkaDataSink(
             DeliveryGuarantee deliveryGuarantee,
             Properties kafkaProperties,
@@ -62,7 +64,8 @@ public KafkaDataSink(
             SerializationSchema<Event> valueSerialization,
             String topic,
             boolean addTableToHeaderEnabled,
-            String customHeaders) {
+            String customHeaders,
+            String tableMapping) {
         this.deliveryGuarantee = deliveryGuarantee;
         this.kafkaProperties = kafkaProperties;
         this.partitionStrategy = partitionStrategy;
@@ -72,6 +75,7 @@ public KafkaDataSink(
         this.topic = topic;
         this.addTableToHeaderEnabled = addTableToHeaderEnabled;
         this.customHeaders = customHeaders;
+        this.tableMapping = tableMapping;
     }
 
     @Override
@@ -92,7 +96,8 @@ public EventSinkProvider getEventSinkProvider() {
                                         valueSerialization,
                                         topic,
                                         addTableToHeaderEnabled,
-                                        customHeaders))
+                                        customHeaders,
+                                        tableMapping))
                         .build());
     }
 
diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/main/java/org/apache/flink/cdc/connectors/kafka/sink/KafkaDataSinkFactory.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/main/java/org/apache/flink/cdc/connectors/kafka/sink/KafkaDataSinkFactory.java
@@ -41,6 +41,7 @@
 import static org.apache.flink.cdc.connectors.kafka.sink.KafkaDataSinkOptions.PROPERTIES_PREFIX;
 import static org.apache.flink.cdc.connectors.kafka.sink.KafkaDataSinkOptions.SINK_ADD_TABLEID_TO_HEADER_ENABLED;
 import static org.apache.flink.cdc.connectors.kafka.sink.KafkaDataSinkOptions.SINK_CUSTOM_HEADER;
+import static org.apache.flink.cdc.connectors.kafka.sink.KafkaDataSinkOptions.SINK_TABLE_MAPPING;
 import static org.apache.flink.cdc.connectors.kafka.sink.KafkaDataSinkOptions.TOPIC;
 import static org.apache.flink.cdc.connectors.kafka.sink.KafkaDataSinkOptions.VALUE_FORMAT;
 
@@ -92,6 +93,7 @@ public DataSink createDataSink(Context context) {
                 context.getFactoryConfiguration().get(KafkaDataSinkOptions.SINK_CUSTOM_HEADER);
         PartitionStrategy partitionStrategy =
                 context.getFactoryConfiguration().get(KafkaDataSinkOptions.PARTITION_STRATEGY);
+        String tableMapping = context.getFactoryConfiguration().get(SINK_TABLE_MAPPING);
         return new KafkaDataSink(
                 deliveryGuarantee,
                 kafkaProperties,
@@ -101,7 +103,8 @@ public DataSink createDataSink(Context context) {
                 valueSerialization,
                 topic,
                 addTableToHeaderEnabled,
-                customHeaders);
+                customHeaders,
+                tableMapping);
     }
 
     @Override
@@ -124,6 +127,7 @@ public Set<ConfigOption<?>> optionalOptions() {
         options.add(SINK_ADD_TABLEID_TO_HEADER_ENABLED);
         options.add(SINK_CUSTOM_HEADER);
         options.add(KafkaDataSinkOptions.DELIVERY_GUARANTEE);
+        options.add(SINK_TABLE_MAPPING);
         return options;
     }
 }
diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/main/java/org/apache/flink/cdc/connectors/kafka/sink/KafkaDataSinkOptions.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/main/java/org/apache/flink/cdc/connectors/kafka/sink/KafkaDataSinkOptions.java
@@ -18,6 +18,7 @@
 package org.apache.flink.cdc.connectors.kafka.sink;
 
 import org.apache.flink.cdc.common.configuration.ConfigOption;
+import org.apache.flink.cdc.common.configuration.description.Description;
 import org.apache.flink.cdc.connectors.kafka.json.JsonSerializationType;
 import org.apache.flink.connector.base.DeliveryGuarantee;
 
@@ -29,6 +30,10 @@ public class KafkaDataSinkOptions {
     // Prefix for Kafka specific properties.
     public static final String PROPERTIES_PREFIX = "properties.";
 
+    public static final String DILIMITER_TABLE_MAPPINGS = ";";
+
+    public static final String DILIMITER_SELECTOR_TABLEID = ":";
+
     public static final ConfigOption<DeliveryGuarantee> DELIVERY_GUARANTEE =
             key("sink.delivery-guarantee")
                     .enumType(DeliveryGuarantee.class)
@@ -79,4 +84,20 @@ public class KafkaDataSinkOptions {
                     .defaultValue("")
                     .withDescription(
                             "custom headers for each kafka record. Each header are separated by ',', separate key and value by ':'. For example, we can set headers like 'key1:value1,key2:value2'.");
+
+    public static final ConfigOption<String> SINK_TABLE_MAPPING =
+            key("sink.table.mapping")
+                    .stringType()
+                    .noDefaultValue()
+                    .withDescription(
+                            Description.builder()
+                                    .text(
+                                            "Custom table mappings for each table from upstream tableId to downstream Kafka topic. Each mapping is separated by ")
+                                    .text(DILIMITER_TABLE_MAPPINGS)
+                                    .text(
+                                            ", separate upstream tableId and downstream Kafka topic by ")
+                                    .text(DILIMITER_SELECTOR_TABLEID)
+                                    .text(
+                                            ". For example, we can set sink.table.mapping like 'mydb.mytable1:topic1;mydb.mytable2:topic2'.")
+                                    .build());
 }
diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/main/java/org/apache/flink/cdc/connectors/kafka/sink/PipelineKafkaRecordSerializationSchema.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/main/java/org/apache/flink/cdc/connectors/kafka/sink/PipelineKafkaRecordSerializationSchema.java
@@ -22,6 +22,8 @@
 import org.apache.flink.cdc.common.event.Event;
 import org.apache.flink.cdc.common.event.SchemaChangeEvent;
 import org.apache.flink.cdc.common.event.TableId;
+import org.apache.flink.cdc.common.schema.Selectors;
+import org.apache.flink.cdc.connectors.kafka.utils.KafkaSinkUtils;
 import org.apache.flink.connector.kafka.sink.KafkaRecordSerializationSchema;
 
 import org.apache.kafka.clients.producer.ProducerRecord;
@@ -57,6 +59,13 @@ public class PipelineKafkaRecordSerializationSchema
     // key value pairs to be put into Kafka Record Header.
     public final Map<String, String> customHeaders;
 
+    private final String tableMapping;
+
+    private Map<Selectors, TableId> tableMappings;
+
+    // A cache to speed up TableId to Topic mapping.
+    private Map<TableId, String> tableIdToTopicCache;
+
     public static final String NAMESPACE_HEADER_KEY = "namespace";
 
     public static final String SCHEMA_NAME_HEADER_KEY = "schemaName";
@@ -69,7 +78,8 @@ public class PipelineKafkaRecordSerializationSchema
             SerializationSchema<Event> valueSerialization,
             String unifiedTopic,
             boolean addTableToHeaderEnabled,
-            String customHeaderString) {
+            String customHeaderString,
+            String tableMapping) {
         this.keySerialization = keySerialization;
         this.valueSerialization = checkNotNull(valueSerialization);
         this.unifiedTopic = unifiedTopic;
@@ -90,6 +100,7 @@ public class PipelineKafkaRecordSerializationSchema
             }
         }
         partition = partitionStrategy.equals(PartitionStrategy.ALL_TO_ZERO) ? 0 : null;
+        this.tableMapping = tableMapping;
     }
 
     @Override
@@ -102,7 +113,7 @@ public ProducerRecord<byte[], byte[]> serialize(
             // skip sending SchemaChangeEvent.
             return null;
         }
-        String topic = unifiedTopic == null ? changeEvent.tableId().toString() : unifiedTopic;
+        String topic = inferTopicName(changeEvent.tableId());
         RecordHeaders recordHeaders = new RecordHeaders();
         if (addTableToHeaderEnabled) {
             String namespace =
@@ -128,10 +139,30 @@ public ProducerRecord<byte[], byte[]> serialize(
                 topic, partition, null, keySerialized, valueSerialized, recordHeaders);
     }
 
+    private String inferTopicName(TableId tableId) {
+        return tableIdToTopicCache.computeIfAbsent(
+                tableId,
+                (table -> {
+                    if (unifiedTopic != null && !unifiedTopic.isEmpty()) {
+                        return unifiedTopic;
+                    }
+                    if (tableMappings != null && !tableMappings.isEmpty()) {
+                        for (Map.Entry<Selectors, TableId> entry : tableMappings.entrySet()) {
+                            if (entry.getKey().isMatch(tableId)) {
+                                return entry.getValue().toString();
+                            }
+                        }
+                    }
+                    return table.toString();
+                }));
+    }
+
     @Override
     public void open(
             SerializationSchema.InitializationContext context, KafkaSinkContext sinkContext)
             throws Exception {
+        this.tableMappings = KafkaSinkUtils.parseSelectorsToTableIdMapping(tableMapping);
+        this.tableIdToTopicCache = new HashMap<>();
         valueSerialization.open(context);
     }
 }
diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/main/java/org/apache/flink/cdc/connectors/kafka/utils/KafkaSinkUtils.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/main/java/org/apache/flink/cdc/connectors/kafka/utils/KafkaSinkUtils.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.flink.cdc.connectors.kafka.utils;
+
+import org.apache.flink.cdc.common.event.TableId;
+import org.apache.flink.cdc.common.schema.Selectors;
+import org.apache.flink.cdc.common.utils.Preconditions;
+import org.apache.flink.cdc.connectors.kafka.sink.KafkaDataSinkOptions;
+
+import java.util.LinkedHashMap;
+import java.util.Map;
+
+import static org.apache.flink.cdc.connectors.kafka.sink.KafkaDataSinkOptions.DILIMITER_SELECTOR_TABLEID;
+import static org.apache.flink.cdc.connectors.kafka.sink.KafkaDataSinkOptions.DILIMITER_TABLE_MAPPINGS;
+
+/** Util class for {@link org.apache.flink.cdc.connectors.kafka.sink.KafkaDataSink}. */
+public class KafkaSinkUtils {
+
+    /** Parse the mapping text to a map from Selectors to TableId. */
+    public static Map<Selectors, TableId> parseSelectorsToTableIdMapping(String tableMappings) {
+        // Keep the order.
+        Map<Selectors, TableId> result = new LinkedHashMap<>();
+        if (tableMappings == null || tableMappings.isEmpty()) {
+            return result;
+        }
+        for (String mapping : tableMappings.split(DILIMITER_TABLE_MAPPINGS)) {
+            String[] selectorsAndTableId = mapping.split(DILIMITER_SELECTOR_TABLEID);
+            Preconditions.checkArgument(
+                    selectorsAndTableId.length == 2,
+                    "Please check you configuration of " + KafkaDataSinkOptions.SINK_TABLE_MAPPING);
+            Selectors selectors =
+                    new Selectors.SelectorsBuilder().includeTables(selectorsAndTableId[0]).build();
+            result.put(selectors, TableId.parse(selectorsAndTableId[1]));
+        }
+        return result;
+    }
+}
diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/test/java/org/apache/flink/cdc/connectors/kafka/sink/KafkaDataSinkFactoryTest.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/test/java/org/apache/flink/cdc/connectors/kafka/sink/KafkaDataSinkFactoryTest.java
@@ -39,6 +39,9 @@ void testCreateDataSink() {
         Assertions.assertThat(sinkFactory).isInstanceOf(KafkaDataSinkFactory.class);
 
         Configuration conf = Configuration.fromMap(ImmutableMap.<String, String>builder().build());
+        conf.set(
+                KafkaDataSinkOptions.SINK_TABLE_MAPPING,
+                "mydb.mytable1:topic1;mydb.mytable2:topic2");
         DataSink dataSink =
                 sinkFactory.createDataSink(
                         new FactoryHelper.DefaultContext(
diff --git a/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/test/java/org/apache/flink/cdc/connectors/kafka/sink/KafkaDataSinkITCase.java b/flink-cdc-connect/flink-cdc-pipeline-connectors/flink-cdc-pipeline-connector-kafka/src/test/java/org/apache/flink/cdc/connectors/kafka/sink/KafkaDataSinkITCase.java
@@ -560,6 +560,68 @@ void testTopicAndHeaderOption() throws Exception {
         checkProducerLeak();
     }
 
+    @Test
+    void testSINKTABLEMAPPING() throws Exception {
+        final StreamExecutionEnvironment env = new LocalStreamEnvironment();
+        env.enableCheckpointing(1000L);
+        env.setRestartStrategy(RestartStrategies.noRestart());
+        final DataStream<Event> source =
+                env.fromCollection(createSourceEvents(), new EventTypeInfo());
+        Map<String, String> config = new HashMap<>();
+        config.put(
+                KafkaDataSinkOptions.SINK_TABLE_MAPPING.key(),
+                "default_namespace.default_schema_copy.\\.*:test_topic_mapping_copy;default_namespace.default_schema.\\.*:test_topic_mapping");
+        Properties properties = getKafkaClientConfiguration();
+        properties.forEach(
+                (key, value) ->
+                        config.put(
+                                KafkaDataSinkOptions.PROPERTIES_PREFIX + key.toString(),
+                                value.toString()));
+        source.sinkTo(
+                ((FlinkSinkProvider)
+                                (new KafkaDataSinkFactory()
+                                        .createDataSink(
+                                                new FactoryHelper.DefaultContext(
+                                                        Configuration.fromMap(config),
+                                                        Configuration.fromMap(new HashMap<>()),
+                                                        this.getClass().getClassLoader()))
+                                        .getEventSinkProvider()))
+                        .getSink());
+        env.execute();
+
+        final List<ConsumerRecord<byte[], byte[]>> collectedRecords =
+                drainAllRecordsFromTopic("test_topic_mapping", false, 0);
+        final long recordsCount = 5;
+        assertThat(recordsCount).isEqualTo(collectedRecords.size());
+        ObjectMapper mapper =
+                JacksonMapperFactory.createObjectMapper()
+                        .configure(JsonGenerator.Feature.WRITE_BIGDECIMAL_AS_PLAIN, false);
+        List<JsonNode> expected =
+                Arrays.asList(
+                        mapper.readTree(
+                                String.format(
+                                        "{\"before\":null,\"after\":{\"col1\":\"1\",\"col2\":\"1\"},\"op\":\"c\",\"source\":{\"db\":\"default_schema\",\"table\":\"%s\"}}",
+                                        table1.getTableName())),
+                        mapper.readTree(
+                                String.format(
+                                        "{\"before\":null,\"after\":{\"col1\":\"2\",\"col2\":\"2\"},\"op\":\"c\",\"source\":{\"db\":\"default_schema\",\"table\":\"%s\"}}",
+                                        table1.getTableName())),
+                        mapper.readTree(
+                                String.format(
+                                        "{\"before\":null,\"after\":{\"col1\":\"3\",\"col2\":\"3\"},\"op\":\"c\",\"source\":{\"db\":\"default_schema\",\"table\":\"%s\"}}",
+                                        table1.getTableName())),
+                        mapper.readTree(
+                                String.format(
+                                        "{\"before\":{\"col1\":\"1\",\"newCol3\":\"1\"},\"after\":null,\"op\":\"d\",\"source\":{\"db\":\"default_schema\",\"table\":\"%s\"}}",
+                                        table1.getTableName())),
+                        mapper.readTree(
+                                String.format(
+                                        "{\"before\":{\"col1\":\"2\",\"newCol3\":\"\"},\"after\":{\"col1\":\"2\",\"newCol3\":\"x\"},\"op\":\"u\",\"source\":{\"db\":\"default_schema\",\"table\":\"%s\"}}",
+                                        table1.getTableName())));
+        assertThat(deserializeValues(collectedRecords)).containsAll(expected);
+        checkProducerLeak();
+    }
+
     private List<ConsumerRecord<byte[], byte[]>> drainAllRecordsFromTopic(
             String topic, boolean committed, int... partitionArr) {
         Properties properties = getKafkaClientConfiguration();