Commit a4c752e

jexp authored and moxious committed
Fixes #227 Introduce batch parallelization control (#232)
* issue #227 add parallelizeBatches config option
* document neo4j.batch.parallelize
* New implementation of parallel sink processing, parallelization and error handling
* Resolve feedback from PR
* Update readme.adoc
1 parent 3fd6176 commit a4c752e

File tree

4 files changed, +72 -31 lines

* kafka-connect-neo4j/docker/readme.adoc
* kafka-connect-neo4j/src/main/kotlin/streams/kafka/connect/sink/Neo4jService.kt
* kafka-connect-neo4j/src/main/kotlin/streams/kafka/connect/sink/Neo4jSinkConnectorConfig.kt
* kafka-connect-neo4j/src/main/resources/kafka-connect-sink.properties

kafka-connect-neo4j/docker/readme.adoc

Lines changed: 2 additions & 1 deletion
@@ -23,6 +23,7 @@ You can set the following configuration values via Confluent Connect UI, or via
 |neo4j.connection.liveness.check.timeout.msecs|Long| The max Neo4j liveness check timeout (default 1 hour)
 |neo4j.connection.max.pool.size|Int| The max pool size (default 100)
 |neo4j.load.balance.strategy|enum[ROUND_ROBIN, LEAST_CONNECTED]| The Neo4j load balance strategy (default LEAST_CONNECTED)
+|neo4j.batch.parallelize|Boolean| Process batches concurrently (default true). Concurrent batch processing improves throughput but may apply events out of order; set to `false` if messages must be applied in strict order, e.g. for change-data-capture (CDC) events.
 |===
 
 === Configuring the stack
@@ -530,7 +531,7 @@ CREATE INDEX ON :Person(name)
 
 [source,cypher]
 ----
-CREATE INDEX ON :Family(surname)
+CREATE INDEX ON :Family(name)
 ----
 
 Please type:
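
For sink deployments that need the strict ordering called out in the new table row, the flag is just another entry in the connector configuration. A minimal sketch follows; the connector name, topic, credentials, and Cypher mapping are illustrative, not part of this commit:

```properties
name=neo4j-sink-orders
connector.class=streams.kafka.connect.sink.Neo4jSinkConnector
topics=orders
neo4j.server.uri=bolt://localhost:7687
neo4j.authentication.basic.username=neo4j
neo4j.authentication.basic.password=password
# Illustrative topic-to-Cypher mapping
neo4j.topic.cypher.orders=MERGE (o:Order {id: event.id})
# Apply batches sequentially so events reach Neo4j in Kafka order
neo4j.batch.parallelize=false
```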

kafka-connect-neo4j/src/main/kotlin/streams/kafka/connect/sink/Neo4jService.kt

Lines changed: 61 additions & 28 deletions
@@ -1,10 +1,7 @@
 package streams.kafka.connect.sink
 
-import kotlinx.coroutines.Dispatchers
-import kotlinx.coroutines.async
+import kotlinx.coroutines.*
 import kotlinx.coroutines.channels.ticker
-import kotlinx.coroutines.coroutineScope
-import kotlinx.coroutines.runBlocking
 import kotlinx.coroutines.selects.whileSelect
 import org.apache.kafka.common.config.ConfigException
 import org.apache.kafka.connect.errors.ConnectException
@@ -24,7 +21,11 @@ import streams.service.TopicType
 import streams.service.TopicTypeGroup
 import streams.utils.StreamsUtils
 import streams.utils.retryForException
+import java.lang.RuntimeException
+import java.util.concurrent.CompletionException
+import java.util.concurrent.CopyOnWriteArraySet
 import java.util.concurrent.TimeUnit
+import java.util.concurrent.TimeoutException
 
 
 class Neo4jService(private val config: Neo4jSinkConnectorConfig):
@@ -117,35 +118,67 @@ class Neo4jService(private val config: Neo4jSinkConnectorConfig):
         }
     }
 
-    // perhaps better? https://stackoverflow.com/questions/52192752/kotlin-how-to-run-n-coroutines-and-wait-for-first-m-results-or-timeout
-    suspend fun writeData(data: Map<String, List<List<StreamsSinkEntity>>>) = coroutineScope {
-        val timeout = config.batchTimeout
-        val ticker = ticker(timeout)
-        val deferredList = data
-                .flatMap { (topic, records) ->
-                    records.map { async(Dispatchers.IO) { writeForTopic(topic, it) } }
-                }
-        // loops while select<Boolean> returns true
+    // taken from https://stackoverflow.com/questions/52192752/kotlin-how-to-run-n-coroutines-and-wait-for-first-m-results-or-timeout
+    @ObsoleteCoroutinesApi
+    @ExperimentalCoroutinesApi
+    suspend fun <T> List<Deferred<T>>.awaitAll(timeoutMs: Long): List<T> {
+        val jobs = CopyOnWriteArraySet<Deferred<T>>(this)
+        val result = ArrayList<T>(size)
+        val timeout = ticker(timeoutMs)
+
         whileSelect {
-            ticker.onReceive {
-                if (log.isDebugEnabled) {
-                    log.debug("Timeout $timeout occurred while executing queries")
+            jobs.forEach { deferred ->
+                deferred.onAwait {
+                    jobs.remove(deferred)
+                    result.add(it)
+                    result.size != size
                 }
-                deferredList.forEach { deferred -> deferred.cancel() }
-                false // Stops the whileSelect
             }
-            val isAllCompleted = deferredList.all { it.isCompleted } // true when all are completed
-            // selects first that is done and returns !false==true for whileSelect until it's !true when all/last is done
-            deferredList.forEach {
-                it.onAwait { !isAllCompleted } // Stops the whileSelect
+
+            timeout.onReceive {
+                jobs.forEach { it.cancel() }
+                throw TimeoutException("Tasks $size cancelled after timeout of $timeoutMs ms.")
             }
         }
-        val exceptionMessages = deferredList
-                .mapNotNull { it.getCompletionExceptionOrNull() }
-                .map { it.message }
-                .joinToString("\n")
-        if (exceptionMessages.isNotBlank()) {
-            throw ConnectException(exceptionMessages)
+
+        return result
+    }
+
+    @ExperimentalCoroutinesApi
+    fun <T> Deferred<T>.errors() = when {
+        isCompleted -> getCompletionExceptionOrNull()
+        isCancelled -> getCompletionExceptionOrNull() // was getCancellationException()
+        isActive -> RuntimeException("Job $this still active")
+        else -> null
+    }
+
+    suspend fun writeData(data: Map<String, List<List<StreamsSinkEntity>>>) {
+        val errors = if (config.parallelBatches) writeDataAsync(data) else writeDataSync(data)
+
+        if (errors.isNotEmpty()) {
+            throw ConnectException(errors.map { it.message }.distinct().joinToString("\n", "Errors executing ${data.values.map { it.size }.sum()} jobs:\n"))
         }
     }
+
+    @ExperimentalCoroutinesApi
+    @ObsoleteCoroutinesApi
+    suspend fun writeDataAsync(data: Map<String, List<List<StreamsSinkEntity>>>) = coroutineScope {
+        val jobs = data
+                .flatMap { (topic, records) ->
+                    records.map { async(Dispatchers.IO) { writeForTopic(topic, it) } }
+                }
+
+        jobs.awaitAll(config.batchTimeout)
+        jobs.mapNotNull { it.errors() }
+    }
+
+    fun writeDataSync(data: Map<String, List<List<StreamsSinkEntity>>>) =
+            data.flatMap { (topic, records) ->
+                records.map {
+                    try {
+                        writeForTopic(topic, it)
+                    } catch (e: Exception) {
+                        e
+                    }
+                }.filterIsInstance<Throwable>()
+            }
 }
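
For readers unfamiliar with the select-based await pattern the new `awaitAll` extension relies on, here is a self-contained sketch of the same technique. Names such as `awaitAllWithTimeout` are illustrative, not the connector's own:

```kotlin
import kotlinx.coroutines.*
import kotlinx.coroutines.channels.ticker
import kotlinx.coroutines.selects.whileSelect
import java.util.concurrent.CopyOnWriteArraySet
import java.util.concurrent.TimeoutException

@ObsoleteCoroutinesApi
@ExperimentalCoroutinesApi
suspend fun <T> List<Deferred<T>>.awaitAllWithTimeout(timeoutMs: Long): List<T> {
    val pending = CopyOnWriteArraySet<Deferred<T>>(this) // jobs still running
    val results = ArrayList<T>(size)
    val timeout = ticker(timeoutMs) // emits its first tick after timeoutMs

    // Each iteration selects whichever clause fires first; the loop
    // continues while the selected clause returns true.
    whileSelect {
        pending.forEach { deferred ->
            deferred.onAwait { value ->
                pending.remove(deferred)
                results.add(value)
                results.size != size // false (all done) stops the loop
            }
        }
        timeout.onReceive {
            pending.forEach { it.cancel() } // abandon the stragglers
            throw TimeoutException("$size tasks cancelled after $timeoutMs ms")
        }
    }
    timeout.cancel() // stop the ticker once all results are in
    return results
}

@ObsoleteCoroutinesApi
@ExperimentalCoroutinesApi
fun main() = runBlocking {
    val jobs = (1..5).map { n ->
        async(Dispatchers.IO) { delay(n * 100L); n * n }
    }
    println(jobs.awaitAllWithTimeout(1_000)) // [1, 4, 9, 16, 25]
}
```

The trade-off against plain `kotlinx.coroutines.awaitAll` is the timeout clause: the ticker cancels whatever is still pending instead of waiting indefinitely, which is how `writeDataAsync` can honour `config.batchTimeout`.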

kafka-connect-neo4j/src/main/kotlin/streams/kafka/connect/sink/Neo4jSinkConnectorConfig.kt

Lines changed: 7 additions & 0 deletions
@@ -62,6 +62,7 @@ class Neo4jSinkConnectorConfig(originals: Map<*, *>): AbstractConfig(config(), o
 
     val batchTimeout: Long
     val batchSize: Int
+    val parallelBatches: Boolean
 
 //    val cdcTopics: Map<TopicType, Set<String>>
 //
@@ -107,6 +108,7 @@ class Neo4jSinkConnectorConfig(originals: Map<*, *>): AbstractConfig(config(), o
         topics = Topics.from(originals, "streams.sink.", "neo4j.")
         strategyMap = TopicUtils.toStrategyMap(topics, sourceIdStrategyConfig)
 
+        parallelBatches = getBoolean(BATCH_PARALLELIZE)
         validateAllTopics(originals)
     }
 
@@ -149,6 +151,7 @@ class Neo4jSinkConnectorConfig(originals: Map<*, *>): AbstractConfig(config(), o
 
         const val BATCH_SIZE = "neo4j.batch.size"
         const val BATCH_TIMEOUT_MSECS = "neo4j.batch.timeout.msecs"
+        const val BATCH_PARALLELIZE = "neo4j.batch.parallelize"
 
         const val RETRY_BACKOFF_MSECS = "neo4j.retry.backoff.msecs"
         const val RETRY_MAX_ATTEMPTS = "neo4j.retry.max.attemps"
@@ -331,6 +334,10 @@ class Neo4jSinkConnectorConfig(originals: Map<*, *>): AbstractConfig(config(), o
                         .documentation(PropertiesUtil.getProperty(TOPIC_CDC_SCHEMA)).importance(ConfigDef.Importance.HIGH)
                         .defaultValue("").group(ConfigGroup.TOPIC_CYPHER_MAPPING)
                         .build())
+                .define(ConfigKeyBuilder.of(BATCH_PARALLELIZE, ConfigDef.Type.BOOLEAN)
+                        .documentation(PropertiesUtil.getProperty(BATCH_PARALLELIZE)).importance(ConfigDef.Importance.MEDIUM)
+                        .defaultValue(true).group(ConfigGroup.BATCH)
+                        .build())
         }
     }
 }
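
For context, `ConfigKeyBuilder` is a project-local helper around Kafka's standard `ConfigDef` API. A minimal sketch of the equivalent plain wiring for a boolean key; the `SinkConfig` class and documentation string below are illustrative:

```kotlin
import org.apache.kafka.common.config.AbstractConfig
import org.apache.kafka.common.config.ConfigDef

const val BATCH_PARALLELIZE = "neo4j.batch.parallelize"

val CONFIG: ConfigDef = ConfigDef()
        .define(BATCH_PARALLELIZE,
                ConfigDef.Type.BOOLEAN,
                true, // default: concurrent batch processing on
                ConfigDef.Importance.MEDIUM,
                "Whether sink batches may be processed concurrently")

class SinkConfig(originals: Map<*, *>) : AbstractConfig(CONFIG, originals) {
    // getBoolean() coerces "true"/"false" strings and falls back to the default
    val parallelBatches: Boolean = getBoolean(BATCH_PARALLELIZE)
}

fun main() {
    println(SinkConfig(emptyMap<String, Any>()).parallelBatches)             // true (default)
    println(SinkConfig(mapOf(BATCH_PARALLELIZE to "false")).parallelBatches) // false
}
```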

kafka-connect-neo4j/src/main/resources/kafka-connect-sink.properties

Lines changed: 2 additions & 2 deletions
@@ -13,7 +13,6 @@
 # limitations under the License.
 ##
 
-error.reporting=Type: enum[logging, throwing, deadletter];\nDescription: Error Reporting Mode, one of: logging, throwing, deadletter
 neo4j.server.uri=Type: String;\nDescription: The Bolt URI (default bolt://localhost:7687)
 neo4j.authentication.type=Type: enum[NONE, BASIC, KERBEROS];\nDescription: The authentication type (default BASIC)
 neo4j.batch.size=Type: Int;\nDescription: The max number of events processed by the Cypher query (default 1000)
@@ -35,4 +34,5 @@ neo4j.retry.max.attemps=Type: Int;\nDescription: The maximum number of times to
 neo4j.topic.cdc.sourceId=Type: String;\nDescription: The topic that manages CDC events with the `SourceId` strategy
 neo4j.topic.cdc.sourceId.labelName=Type: String;\nDescription: The label name attached to the events with the `SourceId` strategy (default SourceEvent)
 neo4j.topic.cdc.sourceId.idName=Type: String;\nDescription: The id property name attached to the events with the `SourceId` strategy (default sourceId)
-neo4j.topic.cdc.schema=Type: String;\nDescription: The topic that manages CDC events with the `Schema` strategy
\ No newline at end of file
+neo4j.topic.cdc.schema=Type: String;\nDescription: The topic that manages CDC events with the `Schema` strategy
+neo4j.batch.parallelize=Type: Boolean;\nDescription: If enabled, messages are processed concurrently in the sink. Non-concurrent execution preserves in-order processing, e.g. for CDC (default true)
