Commit 7367108

CC-7162: Add offset recovery and deterministic seeding (#49)
* Add deterministic seed configuration
* Add offset recovery after task failure
* Add tests for determinism
* Depend on avro-random-generator:0.3.0 for reentrancy features
* Migrate from constructor to builder for Avro Random Generator
* Update documentation for new feature & clarify iterations behavior

Signed-off-by: Greg Harris <[email protected]>
1 parent d90337e commit 7367108

File tree: 7 files changed (+150, -20 lines)

README.md

Lines changed: 1 addition & 1 deletion
@@ -120,7 +120,7 @@ Parameter | Description | Default
 -|-|-
 `kafka.topic` | Topic to write to |
 `max.interval` | Max interval between messages (ms) | 500
-`iterations` | Number of messages to send, or less than 1 for unlimited | -1
+`iterations` | Number of messages to send from each task, or less than 1 for unlimited | -1
 `schema.filename` | Filename of schema to use
 `schema.keyfield` | Name of field to use as the message key
 `quickstart` | Name of [quickstart](https://github.com/confluentinc/kafka-connect-datagen/tree/master/src/main/resources) to use

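A hedged illustration of the parameters above as a connector configuration, written here as the plain `Map<String, String>` that Kafka Connect hands to the connector. The values and the combination chosen are illustrative, not part of this commit; note that `iterations` now counts messages per task, so with `tasks.max=3` and `iterations=100` the topic receives up to 300 messages in total.

```java
import java.util.HashMap;
import java.util.Map;

public class DatagenPropsExample {
  public static void main(String[] args) {
    Map<String, String> props = new HashMap<>();
    props.put("connector.class", "io.confluent.kafka.connect.datagen.DatagenConnector");
    props.put("kafka.topic", "test-topic");
    props.put("quickstart", "orders");
    props.put("max.interval", "500");
    // Per the clarified doc above: each task sends this many messages.
    props.put("iterations", "100");
    props.put("tasks.max", "3");
    // New in this commit: a fixed seed makes the generated data reproducible.
    props.put("random.seed", "42");

    props.forEach((key, value) -> System.out.println(key + "=" + value));
  }
}
```
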
pom.xml

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@
   <properties>
     <connect-runtime-version>2.0.0</connect-runtime-version>
     <confluent.version>5.1.0</confluent.version>
-    <confluent.avro.generator.version>0.2.0</confluent.avro.generator.version>
+    <confluent.avro.generator.version>0.3.0</confluent.avro.generator.version>
     <junit.version>4.12</junit.version>
     <avro.version>1.8.1</avro.version>
     <licenses.version>5.1.0</licenses.version>

src/main/java/io/confluent/kafka/connect/datagen/DatagenConnector.java

Lines changed: 4 additions & 1 deletion
@@ -17,6 +17,7 @@
 package io.confluent.kafka.connect.datagen;
 
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
@@ -60,7 +61,9 @@ public Class<? extends Task> taskClass() {
   public List<Map<String, String>> taskConfigs(int maxTasks) {
     List<Map<String, String>> taskConfigs = new ArrayList<>();
     for (int i = 0; i < maxTasks; i++) {
-      taskConfigs.add(this.props);
+      Map<String, String> taskConfig = new HashMap<>(this.props);
+      taskConfig.put(DatagenTask.TASK_ID, Integer.toString(i));
+      taskConfigs.add(taskConfig);
     }
     return taskConfigs;
   }

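The important detail in the `taskConfigs` change is that each task now receives its own copy of the connector properties plus a distinct `task.id`; putting the id into the shared `this.props` map would have left every task with the same, last-written value. A self-contained sketch of the same copy-then-tag pattern, with illustrative names such as `baseProps`:

```java
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class TaskConfigSketch {
  public static void main(String[] args) {
    // Illustrative connector-level properties shared by all tasks.
    Map<String, String> baseProps = new HashMap<>();
    baseProps.put("kafka.topic", "test-topic");
    baseProps.put("iterations", "100");

    int maxTasks = 3;
    List<Map<String, String>> taskConfigs = new ArrayList<>();
    for (int i = 0; i < maxTasks; i++) {
      // Copy first, then tag: each task gets a private map with its own task.id.
      Map<String, String> taskConfig = new HashMap<>(baseProps);
      taskConfig.put("task.id", Integer.toString(i));
      taskConfigs.add(taskConfig);
    }

    // Prints task.id=0, 1, 2 alongside identical base settings.
    taskConfigs.forEach(System.out::println);
  }
}
```
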
src/main/java/io/confluent/kafka/connect/datagen/DatagenConnectorConfig.java

Lines changed: 12 additions & 3 deletions
@@ -30,14 +30,18 @@ public class DatagenConnectorConfig extends AbstractConfig {
   public static final String MAXINTERVAL_CONF = "max.interval";
   private static final String MAXINTERVAL_DOC = "Max interval between messages (ms)";
   public static final String ITERATIONS_CONF = "iterations";
-  private static final String ITERATIONS_DOC = "Number of messages to send, or less than 1 for "
-      + "unlimited";
+  private static final String ITERATIONS_DOC = "Number of messages to send from each task, "
+      + "or less than 1 for unlimited";
   public static final String SCHEMA_FILENAME_CONF = "schema.filename";
   private static final String SCHEMA_FILENAME_DOC = "Filename of schema to use";
   public static final String SCHEMA_KEYFIELD_CONF = "schema.keyfield";
   private static final String SCHEMA_KEYFIELD_DOC = "Name of field to use as the message key";
   public static final String QUICKSTART_CONF = "quickstart";
   private static final String QUICKSTART_DOC = "Name of quickstart to use";
+  public static final String RANDOM_SEED_CONF = "random.seed";
+  private static final String RANDOM_SEED_DOC = "Numeric seed for generating random data. "
+      + "Two connectors started with the same seed will deterministically produce the same data. "
+      + "Each task will generate different data than the other tasks in the same connector.";
 
   public DatagenConnectorConfig(ConfigDef config, Map<String, String> parsedConfig) {
     super(config, parsedConfig);
@@ -54,7 +58,8 @@ public static ConfigDef conf() {
         .define(ITERATIONS_CONF, Type.INT, -1, Importance.HIGH, ITERATIONS_DOC)
         .define(SCHEMA_FILENAME_CONF, Type.STRING, "", Importance.HIGH, SCHEMA_FILENAME_DOC)
         .define(SCHEMA_KEYFIELD_CONF, Type.STRING, "", Importance.HIGH, SCHEMA_KEYFIELD_DOC)
-        .define(QUICKSTART_CONF, Type.STRING, "", Importance.HIGH, QUICKSTART_DOC);
+        .define(QUICKSTART_CONF, Type.STRING, "", Importance.HIGH, QUICKSTART_DOC)
+        .define(RANDOM_SEED_CONF, Type.LONG, null, Importance.LOW, RANDOM_SEED_DOC);
   }
 
   public String getKafkaTopic() {
@@ -81,5 +86,9 @@ public String getQuickstart() {
     return this.getString(QUICKSTART_CONF);
   }
 
+  public Long getRandomSeed() {
+    return this.getLong(RANDOM_SEED_CONF);
+  }
+
 }
 

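Because `random.seed` is defined as `Type.LONG` with a `null` default and `Importance.LOW`, `getRandomSeed()` returns `null` whenever the property is unset, which is exactly what `DatagenTask.start()` checks before seeding. A minimal sketch of reading the config both ways (property values are illustrative):

```java
import io.confluent.kafka.connect.datagen.DatagenConnectorConfig;

import java.util.HashMap;
import java.util.Map;

public class RandomSeedConfigSketch {
  public static void main(String[] args) {
    Map<String, String> props = new HashMap<>();
    props.put("kafka.topic", "test-topic");
    props.put("quickstart", "orders");

    // Without random.seed the Long-typed config falls back to its null default,
    // so the task seeds its Random from entropy and output is not reproducible.
    DatagenConnectorConfig unseeded = new DatagenConnectorConfig(DatagenConnectorConfig.conf(), props);
    System.out.println(unseeded.getRandomSeed()); // null

    // With random.seed set, connectors started with this value produce the same data.
    props.put("random.seed", "42");
    DatagenConnectorConfig seeded = new DatagenConnectorConfig(DatagenConnectorConfig.conf(), props);
    System.out.println(seeded.getRandomSeed()); // 42
  }
}
```
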
src/main/java/io/confluent/kafka/connect/datagen/DatagenTask.java

Lines changed: 55 additions & 12 deletions
@@ -20,6 +20,7 @@
 import java.io.FileInputStream;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
@@ -45,8 +46,10 @@ public class DatagenTask extends SourceTask {
   static final Logger log = LoggerFactory.getLogger(DatagenTask.class);
 
   private static final Schema KEY_SCHEMA = Schema.STRING_SCHEMA;
-  private static final Map<String, ?> SOURCE_PARTITION = Collections.emptyMap();
-  private static final Map<String, ?> SOURCE_OFFSET = Collections.emptyMap();
+  public static final String TASK_ID = "task.id";
+  public static final String TASK_GENERATION = "task.generation";
+  public static final String CURRENT_ITERATION = "current.iteration";
+  public static final String RANDOM_SEED = "random.seed";
 
 
   private DatagenConnectorConfig config;
@@ -61,6 +64,10 @@ public class DatagenTask extends SourceTask {
   private org.apache.avro.Schema avroSchema;
   private org.apache.kafka.connect.data.Schema ksqlSchema;
   private AvroData avroData;
+  private int taskId;
+  private Map<String, Object> sourcePartition;
+  private int taskGeneration;
+  private Random random;
 
   protected enum Quickstart {
     CLICKSTREAM_CODES("clickstream_codes_schema.avro", "code"),
@@ -103,6 +110,26 @@ public void start(Map<String, String> props) {
     maxRecords = config.getIterations();
     schemaFilename = config.getSchemaFilename();
     schemaKeyField = config.getSchemaKeyfield();
+    taskId = Integer.parseInt(props.get(TASK_ID));
+    sourcePartition = Collections.singletonMap(TASK_ID, taskId);
+
+    random = new Random();
+    if (config.getRandomSeed() != null) {
+      random.setSeed(config.getRandomSeed());
+      // Each task will now deterministically advance its random source.
+      // This makes it such that each task will generate different data.
+      for (int i = 0; i < taskId; i++) {
+        random.setSeed(random.nextLong());
+      }
+    }
+
+    Map<String, Object> offset = context.offsetStorageReader().offset(sourcePartition);
+    if (offset != null) {
+      // The offset as it is stored contains our next state, so restore it as-is.
+      taskGeneration = ((Long) offset.get(TASK_GENERATION)).intValue();
+      count = ((Long) offset.get(CURRENT_ITERATION));
+      random.setSeed((Long) offset.get(RANDOM_SEED));
+    }
 
     String quickstartName = config.getQuickstart();
     if (quickstartName != "") {
@@ -112,10 +139,11 @@ public void start(Map<String, String> props) {
       schemaFilename = quickstart.getSchemaFilename();
       schemaKeyField = quickstart.getSchemaKeyField();
       try {
-        generator = new Generator(
-            getClass().getClassLoader().getResourceAsStream(schemaFilename),
-            new Random()
-        );
+        generator = new Generator.Builder()
+            .schemaStream(getClass().getClassLoader().getResourceAsStream(schemaFilename))
+            .random(random)
+            .generation(count)
+            .build();
       } catch (IOException e) {
         throw new ConnectException("Unable to read the '"
             + schemaFilename + "' schema file", e);
@@ -126,10 +154,11 @@ public void start(Map<String, String> props) {
       }
     } else {
       try {
-        generator = new Generator(
-            new FileInputStream(schemaFilename),
-            new Random()
-        );
+        generator = new Generator.Builder()
+            .schemaStream(new FileInputStream(schemaFilename))
+            .random(random)
+            .generation(count)
+            .build();
       } catch (IOException e) {
         throw new ConnectException("Unable to read the '"
             + schemaFilename + "' schema file", e);
@@ -195,10 +224,24 @@ public List<SourceRecord> poll() throws InterruptedException {
       );
     }
 
+    // Re-seed the random each time so that we can save the seed to the source offsets.
+    long seed = random.nextLong();
+    random.setSeed(seed);
+
+    // The source offsets will be the values that the next task lifetime will restore from.
+    // Essentially, the "next" state of the connector after this loop completes.
+    Map<String, Object> sourceOffset = new HashMap<>();
+    // The next lifetime will be a member of the next generation.
+    sourceOffset.put(TASK_GENERATION, (long) (taskGeneration + 1));
+    // We will have produced this record.
+    sourceOffset.put(CURRENT_ITERATION, count + 1);
+    // This is the seed that we just re-seeded for our own next iteration.
+    sourceOffset.put(RANDOM_SEED, seed);
+
     final List<SourceRecord> records = new ArrayList<>();
     SourceRecord record = new SourceRecord(
-        SOURCE_PARTITION,
-        SOURCE_OFFSET,
+        sourcePartition,
+        sourceOffset,
         topic,
         KEY_SCHEMA,
         keyString,

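The determinism scheme rests on two uses of `java.util.Random` visible above: in `start()`, each task derives its own stream by advancing the connector-level seed `taskId` times; in `poll()`, the task re-seeds itself with a freshly drawn seed on every iteration and writes that seed (plus the generation and iteration counters) into the source offset, so a restarted task resumes exactly where it left off. A self-contained sketch of both mechanisms using plain `java.util.Random` only; a single `nextLong()` stands in for the many draws the Avro generator makes per record:

```java
import java.util.Random;

public class SeedRecoverySketch {
  public static void main(String[] args) {
    long connectorSeed = 42L;

    // Per-task seeding, as in DatagenTask.start(): advance the connector-level seed once per
    // task id, so tasks in the same connector produce different (but reproducible) streams.
    for (int taskId = 0; taskId < 3; taskId++) {
      Random random = new Random();
      random.setSeed(connectorSeed);
      for (int i = 0; i < taskId; i++) {
        random.setSeed(random.nextLong());
      }
      System.out.println("task " + taskId + " first draw: " + random.nextLong());
    }

    // Offset recovery, as in DatagenTask.poll(): after producing a record, draw a fresh seed,
    // re-seed with it, and store it in the source offset. Restoring that seed later puts a new
    // Random into exactly the state that will produce the *next* record.
    Random live = new Random();
    live.setSeed(connectorSeed);
    long storedSeed = 0L;
    long expectedNext = 0L;
    for (int iteration = 0; iteration < 5; iteration++) {
      long record = live.nextLong();                 // stands in for generator.generate()
      long seed = live.nextLong();
      live.setSeed(seed);                            // the "next" state of this task
      if (iteration == 2) {
        storedSeed = seed;                           // what poll() writes to the source offset
        expectedNext = new Random(seed).nextLong();  // the record iteration 3 will produce
      }
    }

    Random restored = new Random();
    restored.setSeed(storedSeed);                    // what start() does when an offset is found
    System.out.println(restored.nextLong() == expectedNext);  // true
  }
}
```
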
src/test/java/io/confluent/kafka/connect/datagen/DatagenConnectorTest.java

Lines changed: 5 additions & 2 deletions
@@ -67,8 +67,11 @@ protected void assertTaskConfigs(int maxTasks) {
     List<Map<String, String>> taskConfigs = connector.taskConfigs(maxTasks);
     assertEquals(maxTasks, taskConfigs.size());
     // All task configs should match the connector config
-    for (Map<String, String> taskConfig : taskConfigs) {
-      assertEquals(config, taskConfig);
+    for (int i = 0; i < taskConfigs.size(); i++) {
+      Map<String, String> taskConfig = taskConfigs.get(i);
+      Map<String, String> expectedTaskConfig = new HashMap<>(config);
+      expectedTaskConfig.put(DatagenTask.TASK_ID, Integer.toString(i));
+      assertEquals(expectedTaskConfig, taskConfig);
     }
   }
 

src/test/java/io/confluent/kafka/connect/datagen/DatagenTaskTest.java

Lines changed: 72 additions & 0 deletions
@@ -16,8 +16,11 @@
 
 package io.confluent.kafka.connect.datagen;
 
+import io.confluent.kafka.connect.datagen.DatagenTask.Quickstart;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
@@ -27,11 +30,15 @@
 import io.confluent.avro.random.generator.Generator;
 import io.confluent.connect.avro.AvroData;
 
+import java.util.function.Function;
+import java.util.stream.Collectors;
 import org.apache.kafka.connect.data.Field;
 import org.apache.kafka.connect.data.Schema;
 import org.apache.kafka.connect.data.Struct;
 import org.apache.kafka.connect.errors.ConnectException;
 import org.apache.kafka.connect.source.SourceRecord;
+import org.apache.kafka.connect.source.SourceTaskContext;
+import org.apache.kafka.connect.storage.OffsetStorageReader;
 import org.junit.After;
 import org.junit.Before;
 import org.junit.Test;
@@ -46,6 +53,7 @@ public class DatagenTaskTest {
   private static final String TOPIC = "my-topic";
   private static final int NUM_MESSAGES = 100;
   private static final int MAX_INTERVAL_MS = 0;
+  private static final int TASK_ID = 0;
 
   private static final AvroData AVRO_DATA = new AvroData(20);
 
@@ -54,11 +62,13 @@ public class DatagenTaskTest {
   private List<SourceRecord> records;
   private Schema expectedValueConnectSchema;
   private Schema expectedKeyConnectSchema;
+  private Map<String, Object> sourceOffsets;
 
   @Before
   public void setUp() throws Exception {
     config = new HashMap<>();
     records = new ArrayList<>();
+    sourceOffsets = null;
   }
 
   @After
@@ -112,6 +122,36 @@ public void shouldGenerateFilesForStockTradesQuickstart() throws Exception {
     generateAndValidateRecordsFor(DatagenTask.Quickstart.STOCK_TRADES);
   }
 
+  @Test
+  public void shouldRestoreFromSourceOffsets() throws Exception {
+    // Give the task an arbitrary source offset
+    sourceOffsets = new HashMap<>();
+    sourceOffsets.put(DatagenTask.RANDOM_SEED, 100L);
+    sourceOffsets.put(DatagenTask.CURRENT_ITERATION, 50L);
+    sourceOffsets.put(DatagenTask.TASK_GENERATION, 0L);
+    createTaskWith(Quickstart.ORDERS);
+
+    // poll once to advance the generator
+    SourceRecord firstPoll = task.poll().get(0);
+    // poll a second time to predict the future
+    SourceRecord pollA = task.poll().get(0);
+    // extract the offsets after the first poll to restore to the next task instance
+    //noinspection unchecked
+    sourceOffsets = (Map<String, Object>) firstPoll.sourceOffset();
+    createTaskWith(Quickstart.ORDERS);
+    // poll once after the restore
+    SourceRecord pollB = task.poll().get(0);
+
+    // the generation should have incremented, but the remaining details of the record should be identical
+    assertEquals(1L, pollA.sourceOffset().get(DatagenTask.TASK_GENERATION));
+    assertEquals(2L, pollB.sourceOffset().get(DatagenTask.TASK_GENERATION));
+    assertEquals(pollA.sourceOffset().get(DatagenTask.TASK_ID), pollB.sourceOffset().get(DatagenTask.TASK_ID));
+    assertEquals(pollA.sourceOffset().get(DatagenTask.CURRENT_ITERATION), pollB.sourceOffset().get(DatagenTask.CURRENT_ITERATION));
+    assertEquals(pollA.sourcePartition(), pollB.sourcePartition());
+    assertEquals(pollA.valueSchema(), pollB.valueSchema());
+    assertEquals(pollA.value(), pollB.value());
+  }
+
   @Test
   public void shouldFailToGenerateMoreRecordsThanSpecified() throws Exception {
     // Generate the expected number of records
@@ -229,8 +269,40 @@ private void createTask() {
     config.putIfAbsent(DatagenConnectorConfig.KAFKA_TOPIC_CONF, TOPIC);
     config.putIfAbsent(DatagenConnectorConfig.ITERATIONS_CONF, Integer.toString(NUM_MESSAGES));
     config.putIfAbsent(DatagenConnectorConfig.MAXINTERVAL_CONF, Integer.toString(MAX_INTERVAL_MS));
+    config.putIfAbsent(DatagenTask.TASK_ID, Integer.toString(TASK_ID));
 
     task = new DatagenTask();
+    // Initialize an offsetStorageReader that returns mocked sourceOffsets.
+    task.initialize(new SourceTaskContext() {
+      @Override
+      public Map<String, String> configs() {
+        return config;
+      }
+
+      @Override
+      public OffsetStorageReader offsetStorageReader() {
+        return new OffsetStorageReader() {
+          @Override
+          public <T> Map<String, Object> offset(final Map<String, T> partition) {
+            return offsets(Collections.singletonList(partition)).get(partition);
+          }
+
+          @Override
+          public <T> Map<Map<String, T>, Map<String, Object>> offsets(
+              final Collection<Map<String, T>> partitions) {
+            if (sourceOffsets == null) {
+              return Collections.emptyMap();
+            }
+            return partitions
+                .stream()
+                .collect(Collectors.toMap(
                    Function.identity(),
                    ignored -> sourceOffsets
                ));
+          }
+        };
+      }
+    });
     task.start(config);
   }
 
