Commit d221e44

BillFarber and rjrudin authored
Can now run a flow using DHF 5.2.0 (#24)
* Updating the AWS quickstart document.

* Updating the AWS quickstart document. (#19)

* Can now run a flow using DHF 5.2.0 (#21)

  The key changes are in MarkLogicSinkTask and RunFlowWriteBatchListener. See the comments in those classes for the changes. There are 3 new properties for using this - a DHF flow name, an optional set of steps, and an option for logging each flow response. If the flow name is set, then that flow will be run. Otherwise, only the regular ingestion happens.

  I had to make one plumbing change - DatabaseClientCreator is now DatabaseClientConfigBuilder, as I needed to reuse the DatabaseClientConfig object. That's a small plumbing change, though, that could be made independently of this change.

  Co-authored-by: Rob Rudin <[email protected]>

* Updating the AWS quickstart document. (#23)

Co-authored-by: rjrudin <[email protected]>
Co-authored-by: Rob Rudin <[email protected]>
1 parent a02e836 commit d221e44

13 files changed: 539 additions, 58 deletions
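
The commit message above mentions three new sink properties; they are defined in MarkLogicSinkConfig and documented in config/marklogic-sink.properties later in this diff. As a quick orientation, here is a minimal sketch (not part of the commit) of how those keys might appear in the config map handed to the sink task. The class and package names below are made up for illustration, and the flow and step values are the placeholder examples from the properties file.

    package com.marklogic.kafka.connect.sink.demo;

    import java.util.HashMap;
    import java.util.Map;

    // Sketch only: the three Data Hub flow keys added by this commit, using the example
    // values from config/marklogic-sink.properties. Connection and DMSDK settings are omitted.
    public class FlowConfigSketch {

        public static void main(String[] args) {
            Map<String, String> config = new HashMap<>();
            config.put("ml.datahub.flow.name", "ingestion_mapping_mastering-flow"); // flow to run after each ingested batch
            config.put("ml.datahub.flow.steps", "2,3,4");                           // optional; omit to run every step in the flow
            config.put("ml.datahub.flow.logResponse", "true");                      // log the RunFlowResponse at the info level
            config.forEach((key, value) -> System.out.println(key + "=" + value));
        }
    }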

README.md

Lines changed: 2 additions & 2 deletions
@@ -10,10 +10,10 @@ This is a connector for subscribing to Kafka queues and pushing messages to Mark
 #### To try this out locally:
 
 1. Configure kafkaHome in gradle-local.properties - e.g. kafkaHome=/Users/myusername/tools/kafka_2.11-2.1.0
-1. Run "./gradlew deploy" to build a jar and copy it and the below property files into the appropriate Kafka directories
+1. Run "./gradlew clean deploy" to build a jar and copy it and the below property files into the appropriate Kafka directories
 
 #### To try this out on a remote Kafka server
-1. Run "./gradlew jar" to build the jar.
+1. Run "./gradlew clean jar" to build the jar.
 1. Copy the jar to the <kafkaHome>/libs on the remote server.
 1. Copy the two properties (config/marklogic-connect-distributed.properties config/marklogic-sink.properties) to <kafkaHome>/config on the remote server.

build.gradle

Lines changed: 10 additions & 5 deletions
@@ -18,11 +18,16 @@ configurations {
 
 dependencies {
     compileOnly "org.apache.kafka:connect-api:2.3.0"
-    compile ("com.marklogic:ml-javaclient-util:3.13.4") {
-        // These are excluded simply to reduce the size of the connector; if included, they do not cause any issues
-        exclude module: "jdom2"
-        exclude module: "marklogic-xcc"
-        exclude module: "spring-context"
+
+    compile ("com.marklogic:marklogic-data-hub:5.2.0") {
+        // Excluding these because there's no need for them
+        exclude module: "spring-boot-autoconfigure"
+        exclude module: "spring-integration-http"
+        exclude module: "jaeger-core"
+        exclude module: "jaeger-thrift"
+
+        // Excluding because it causes Kafka Connect to complain mightily if included
+        exclude module: "logback-classic"
     }
 
     testCompile "org.junit.jupiter:junit-jupiter-api:5.3.0"

config/marklogic-sink.properties

Lines changed: 18 additions & 1 deletion
@@ -78,5 +78,22 @@ ml.document.uriPrefix=/kafka-data/
 # Optional - a suffix to append to each URI
 ml.document.uriSuffix=.json
 
+# Optional - name of a REST transform to use when writing documents
+# For Data Hub, can use mlRunIngest
 ml.dmsdk.transform=
-ml.dmsdk.transformParams=
+
+# Optional - delimited set of transform names and values
+# Data Hub example = flow-name,ingestion_mapping_mastering-flow,step,1
+ml.dmsdk.transformParams=
+
+# Optional - delimiter for transform parameter names and values
+ml.dmsdk.transformParamsDelimiter=,
+
+# Properties for running a Data Hub flow
+# Using examples/dh-5-example in the DH project, could use the following config:
+# ml.datahub.flow.name=ingestion_mapping_mastering-flow
+# ml.datahub.flow.steps=2,3,4
+ml.datahub.flow.name=
+ml.datahub.flow.steps=
+# Whether or not the response data from running a flow should be logged at the info level
+ml.datahub.flow.logResponse=true

src/main/java/com/marklogic/kafka/connect/DatabaseClientConfigBuilder.java

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+package com.marklogic.kafka.connect;
+
+import com.marklogic.client.ext.DatabaseClientConfig;
+
+import java.util.Map;
+
+/**
+ * Defines how a map of properties read in by Kafka are used to build an instance of DatabaseClientConfig.
+ */
+public interface DatabaseClientConfigBuilder {
+
+    DatabaseClientConfig buildDatabaseClientConfig(Map<String, String> kafkaConfig);
+
+}
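
For context on how this interface is consumed, the MarkLogicSinkTask change later in this commit pairs the builder with DefaultConfiguredDatabaseClientFactory from ml-javaclient-util. A minimal sketch of that pairing follows; the wrapper class, package, and method names are illustrative only.

    package com.marklogic.kafka.connect.demo;

    import com.marklogic.client.DatabaseClient;
    import com.marklogic.client.ext.DatabaseClientConfig;
    import com.marklogic.client.ext.DefaultConfiguredDatabaseClientFactory;
    import com.marklogic.kafka.connect.DefaultDatabaseClientConfigBuilder;

    import java.util.Map;

    // Sketch of the pattern used by MarkLogicSinkTask.start(): build a reusable DatabaseClientConfig
    // from the Kafka config map, then turn it into a DatabaseClient. Keeping the config object around
    // is what lets RunFlowWriteBatchListener connect to the Data Hub app servers later.
    public class ClientConfigBuilderSketch {

        public static DatabaseClient createClient(Map<String, String> kafkaConfig) {
            DatabaseClientConfig clientConfig =
                new DefaultDatabaseClientConfigBuilder().buildDatabaseClientConfig(kafkaConfig);
            return new DefaultConfiguredDatabaseClientFactory().newDatabaseClient(clientConfig);
        }
    }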

src/main/java/com/marklogic/kafka/connect/DatabaseClientCreator.java

Lines changed: 0 additions & 17 deletions
This file was deleted.

src/main/java/com/marklogic/kafka/connect/DefaultDatabaseClientCreator.java renamed to src/main/java/com/marklogic/kafka/connect/DefaultDatabaseClientConfigBuilder.java

Lines changed: 4 additions & 25 deletions
@@ -1,35 +1,19 @@
 package com.marklogic.kafka.connect;
 
 import com.marklogic.client.DatabaseClient;
-import com.marklogic.client.ext.ConfiguredDatabaseClientFactory;
+import com.marklogic.client.DatabaseClientFactory;
 import com.marklogic.client.ext.DatabaseClientConfig;
-import com.marklogic.client.ext.DefaultConfiguredDatabaseClientFactory;
 import com.marklogic.client.ext.SecurityContextType;
 import com.marklogic.kafka.connect.sink.MarkLogicSinkConfig;
 
 import javax.net.ssl.SSLContext;
 import java.security.NoSuchAlgorithmException;
 import java.util.Map;
 
-public class DefaultDatabaseClientCreator implements DatabaseClientCreator {
+public class DefaultDatabaseClientConfigBuilder implements DatabaseClientConfigBuilder {
 
-    private ConfiguredDatabaseClientFactory configuredDatabaseClientFactory;
-
-    public DefaultDatabaseClientCreator() {
-        this.configuredDatabaseClientFactory = new DefaultConfiguredDatabaseClientFactory();
-    }
-
-    /**
-     * @param kafkaConfig
-     * @return
-     */
     @Override
-    public DatabaseClient createDatabaseClient(Map<String, String> kafkaConfig) {
-        DatabaseClientConfig clientConfig = buildDatabaseClientConfig(kafkaConfig);
-        return configuredDatabaseClientFactory.newDatabaseClient(clientConfig);
-    }
-
-    protected DatabaseClientConfig buildDatabaseClientConfig(Map<String, String> kafkaConfig) {
+    public DatabaseClientConfig buildDatabaseClientConfig(Map<String, String> kafkaConfig) {
         DatabaseClientConfig clientConfig = new DatabaseClientConfig();
         clientConfig.setCertFile(kafkaConfig.get(MarkLogicSinkConfig.CONNECTION_CERT_FILE));
         clientConfig.setCertPassword(kafkaConfig.get(MarkLogicSinkConfig.CONNECTION_CERT_PASSWORD));
@@ -76,11 +60,6 @@ protected void configureSimpleSsl(DatabaseClientConfig clientConfig) {
             throw new RuntimeException("Unable to get default SSLContext: " + e.getMessage(), e);
         }
 
-        clientConfig.setSslHostnameVerifier((hostname, cns, subjectAlts) -> {
-        });
-    }
-
-    public void setConfiguredDatabaseClientFactory(ConfiguredDatabaseClientFactory configuredDatabaseClientFactory) {
-        this.configuredDatabaseClientFactory = configuredDatabaseClientFactory;
+        clientConfig.setSslHostnameVerifier(DatabaseClientFactory.SSLHostnameVerifier.ANY);
     }
 }

src/main/java/com/marklogic/kafka/connect/sink/MarkLogicSinkConfig.java

Lines changed: 7 additions & 0 deletions
@@ -24,6 +24,10 @@ public class MarkLogicSinkConfig extends AbstractConfig {
     public static final String CONNECTION_CERT_PASSWORD = "ml.connection.certPassword";
     public static final String CONNECTION_EXTERNAL_NAME = "ml.connection.externalName";
 
+    public static final String DATAHUB_FLOW_NAME = "ml.datahub.flow.name";
+    public static final String DATAHUB_FLOW_STEPS = "ml.datahub.flow.steps";
+    public static final String DATAHUB_FLOW_LOG_RESPONSE = "ml.datahub.flow.logResponse";
+
     public static final String DMSDK_BATCH_SIZE = "ml.dmsdk.batchSize";
     public static final String DMSDK_THREAD_COUNT = "ml.dmsdk.threadCount";
     public static final String DMSDK_TRANSFORM = "ml.dmsdk.transform";
@@ -50,6 +54,9 @@ public class MarkLogicSinkConfig extends AbstractConfig {
         .define(CONNECTION_CERT_FILE, Type.STRING, Importance.LOW, "Path to a certificate file")
         .define(CONNECTION_CERT_PASSWORD, Type.STRING, Importance.LOW, "Password for the certificate file")
         .define(CONNECTION_EXTERNAL_NAME, Type.STRING, Importance.LOW, "External name for Kerberos authentication")
+        .define(DATAHUB_FLOW_NAME, Type.STRING, null, Importance.MEDIUM, "Name of a Data Hub flow to run")
+        .define(DATAHUB_FLOW_STEPS, Type.STRING, null, Importance.MEDIUM, "Comma-delimited names of steps to run")
+        .define(DATAHUB_FLOW_LOG_RESPONSE, Type.BOOLEAN, false, Importance.LOW, "If set to true, the response from running a flow on each ingested batch will be logged at the info level")
         .define(DMSDK_BATCH_SIZE, Type.INT, 100, Importance.HIGH, "Number of documents to write in each batch")
         .define(DMSDK_THREAD_COUNT, Type.INT, 8, Importance.HIGH, "Number of threads for DMSDK to use")
         .define(DMSDK_TRANSFORM, Type.STRING, Importance.MEDIUM, "Name of a REST transform to use when writing documents")

src/main/java/com/marklogic/kafka/connect/sink/MarkLogicSinkTask.java

Lines changed: 38 additions & 2 deletions
@@ -4,13 +4,17 @@
 import com.marklogic.client.datamovement.DataMovementManager;
 import com.marklogic.client.datamovement.WriteBatcher;
 import com.marklogic.client.document.ServerTransform;
-import com.marklogic.kafka.connect.DefaultDatabaseClientCreator;
+import com.marklogic.client.ext.DatabaseClientConfig;
+import com.marklogic.client.ext.DefaultConfiguredDatabaseClientFactory;
+import com.marklogic.kafka.connect.DefaultDatabaseClientConfigBuilder;
 import org.apache.kafka.connect.sink.SinkRecord;
 import org.apache.kafka.connect.sink.SinkTask;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.util.Arrays;
 import java.util.Collection;
+import java.util.List;
 import java.util.Map;
 
 /**
@@ -32,7 +36,9 @@ public void start(final Map<String, String> config) {
 
         sinkRecordConverter = new DefaultSinkRecordConverter(config);
 
-        databaseClient = new DefaultDatabaseClientCreator().createDatabaseClient(config);
+        DatabaseClientConfig databaseClientConfig = new DefaultDatabaseClientConfigBuilder().buildDatabaseClientConfig(config);
+        databaseClient = new DefaultConfiguredDatabaseClientFactory().newDatabaseClient(databaseClientConfig);
+
         dataMovementManager = databaseClient.newDataMovementManager();
         writeBatcher = dataMovementManager.newWriteBatcher()
             .withBatchSize(Integer.parseInt(config.get(MarkLogicSinkConfig.DMSDK_BATCH_SIZE)))
@@ -43,11 +49,41 @@ public void start(final Map<String, String> config) {
             writeBatcher.withTransform(transform);
         }
 
+        final String flowName = config.get(MarkLogicSinkConfig.DATAHUB_FLOW_NAME);
+        if (flowName != null && flowName.trim().length() > 0) {
+            writeBatcher.onBatchSuccess(buildSuccessListener(flowName, config, databaseClientConfig));
+        }
+
         dataMovementManager.startJob(writeBatcher);
 
         logger.info("Started");
     }
 
+    /**
+     * This is all specific to Kafka, as it involves reading inputs from the Kafka config map and then using them to
+     * construct the reusable RunFlowWriteBatchListener.
+     *
+     * @param flowName
+     * @param kafkaConfig
+     * @param databaseClientConfig
+     */
+    protected RunFlowWriteBatchListener buildSuccessListener(String flowName, Map<String, String> kafkaConfig, DatabaseClientConfig databaseClientConfig) {
+        String logMessage = String.format("After ingesting a batch, will run flow '%s'", flowName);
+        final String flowSteps = kafkaConfig.get(MarkLogicSinkConfig.DATAHUB_FLOW_STEPS);
+        List<String> steps = null;
+        if (flowSteps != null && flowSteps.trim().length() > 0) {
+            steps = Arrays.asList(flowSteps.split(","));
+            logMessage += String.format(" with steps '%s' constrained to the URIs in that batch", steps.toString());
+        }
+        logger.info(logMessage);
+
+        RunFlowWriteBatchListener listener = new RunFlowWriteBatchListener(flowName, steps, databaseClientConfig);
+        if (kafkaConfig.containsKey(MarkLogicSinkConfig.DATAHUB_FLOW_LOG_RESPONSE)) {
+            listener.setLogResponse(Boolean.parseBoolean(kafkaConfig.get(MarkLogicSinkConfig.DATAHUB_FLOW_LOG_RESPONSE)));
+        }
+        return listener;
+    }
+
     /**
      * Builds a REST ServerTransform object based on the DMSDK parameters in the given config. If no transform name
      * is configured, then null will be returned.

src/main/java/com/marklogic/kafka/connect/sink/RunFlowWriteBatchListener.java

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
+package com.marklogic.kafka.connect.sink;
+
+import com.marklogic.client.datamovement.WriteBatch;
+import com.marklogic.client.datamovement.WriteBatchListener;
+import com.marklogic.client.datamovement.WriteEvent;
+import com.marklogic.client.ext.DatabaseClientConfig;
+import com.marklogic.client.ext.helper.LoggingObject;
+import com.marklogic.hub.flow.FlowInputs;
+import com.marklogic.hub.flow.FlowRunner;
+import com.marklogic.hub.flow.RunFlowResponse;
+import com.marklogic.hub.flow.impl.FlowRunnerImpl;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * This class is not specific to Kafka and theoretically could be moved to DHF.
+ */
+public class RunFlowWriteBatchListener extends LoggingObject implements WriteBatchListener {
+
+    private String flowName;
+    private List<String> steps;
+    private DatabaseClientConfig databaseClientConfig;
+    private boolean logResponse;
+
+    /**
+     * The flowName and steps are assumed to have been read in by the client that is reading from system configuration
+     * - in the Kafka case, this will be from the Kafka config map that is passed to a source task.
+     * <p>
+     * The DatabaseClientConfig object is needed because it's not yet possible for DHF to reuse the DatabaseClient that
+     * Kafka constructs. While it's assumed that that DatabaseClient will write to staging, DHF needs to be able to
+     * connect to the staging, final, and job app servers. And in order to do that, it needs all of the authentication
+     * information that can be held by a DatabaseClientConfig. Though as of 5.2.0, DHF only supports basic/digest
+     * authentication, and thus it's assumed that username/password will be used for authentication.
+     *
+     * @param flowName             required name of the flow to run
+     * @param steps                optional list of steps
+     * @param databaseClientConfig
+     */
+    public RunFlowWriteBatchListener(String flowName, List<String> steps, DatabaseClientConfig databaseClientConfig) {
+        this.databaseClientConfig = databaseClientConfig;
+        this.flowName = flowName;
+        this.steps = steps;
+    }
+
+    /**
+     * None of this is specific to Kafka. It assumes a pattern of - given the URIs that were just ingested (and are
+     * available in the given WriteBatch), override the source query for each step to be executed with a document query
+     * that constrains on those URIs.
+     * <p>
+     * The need to construct a source query is unfortunate. When DHF executes a non-ingestion step, it always runs the
+     * collector. Thus, it's not yet possible to tell DHF - just process these URIs (specifically, it's not yet
+     * possible to do that via FlowRunner). So it's necessary to use the URIs to construct a document query and override
+     * each step's source query with that. Ideally, DHF can be enhanced here so a client can just pass in the URIs to
+     * process, and then there's no call to the collector nor need to override the source query.
+     *
+     * @param batch
+     */
+    @Override
+    public void processEvent(WriteBatch batch) {
+        FlowInputs inputs = buildFlowInputs(batch);
+
+        // DHF 5.2.0 only supports basic/digest auth, so this can safely be done.
+        FlowRunner flowRunner = new FlowRunnerImpl(
+            databaseClientConfig.getHost(),
+            databaseClientConfig.getUsername(),
+            databaseClientConfig.getPassword()
+        );
+
+        RunFlowResponse response = flowRunner.runFlow(inputs);
+        flowRunner.awaitCompletion();
+        if (logResponse) {
+            logger.info(format("Flow response for batch number %d:\n%s", batch.getJobBatchNumber(), response.toJson()));
+        }
+    }
+
+    protected FlowInputs buildFlowInputs(WriteBatch batch) {
+        FlowInputs inputs = new FlowInputs(flowName);
+        if (steps != null) {
+            inputs.setSteps(steps);
+        }
+        inputs.setJobId(batch.getBatcher().getJobId() + "-" + batch.getJobBatchNumber());
+
+        Map<String, Object> options = new HashMap<>();
+        options.put("sourceQuery", buildSourceQuery(batch));
+        inputs.setOptions(options);
+
+        return inputs;
+    }
+
+    protected String buildSourceQuery(WriteBatch batch) {
+        StringBuilder sb = new StringBuilder("cts.documentQuery([");
+        boolean firstOne = true;
+        for (WriteEvent event : batch.getItems()) {
+            if (!firstOne) {
+                sb.append(",");
+            }
+            sb.append(String.format("'%s'", event.getTargetUri()));
+            firstOne = false;
+        }
+        return sb.append("])").toString();
+    }
+
+    public void setLogResponse(boolean logResponse) {
+        this.logResponse = logResponse;
+    }
+
+    public String getFlowName() {
+        return flowName;
+    }
+
+    public List<String> getSteps() {
+        return steps;
+    }
+
+    public DatabaseClientConfig getDatabaseClientConfig() {
+        return databaseClientConfig;
+    }
+
+    public boolean isLogResponse() {
+        return logResponse;
+    }
+}
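
To see where this listener plugs in, MarkLogicSinkTask (earlier in this diff) registers it on its WriteBatcher via onBatchSuccess. A standalone sketch of that wiring follows; the wrapper class is illustrative only, the flow name and steps are placeholders, and the batch size and thread count are simply the defaults from MarkLogicSinkConfig.

    package com.marklogic.kafka.connect.sink.demo;

    import com.marklogic.client.DatabaseClient;
    import com.marklogic.client.datamovement.DataMovementManager;
    import com.marklogic.client.datamovement.WriteBatcher;
    import com.marklogic.client.ext.DatabaseClientConfig;
    import com.marklogic.client.ext.DefaultConfiguredDatabaseClientFactory;
    import com.marklogic.kafka.connect.sink.RunFlowWriteBatchListener;

    import java.util.Arrays;

    // Sketch of the pattern MarkLogicSinkTask uses: after each batch is written to staging,
    // RunFlowWriteBatchListener runs the configured flow against only the URIs in that batch.
    public class RunFlowListenerWiringSketch {

        public static WriteBatcher buildBatcher(DatabaseClientConfig config) {
            DatabaseClient client = new DefaultConfiguredDatabaseClientFactory().newDatabaseClient(config);
            DataMovementManager dataMovementManager = client.newDataMovementManager();

            // Placeholder flow name and steps; real values come from the ml.datahub.flow.* properties
            RunFlowWriteBatchListener listener = new RunFlowWriteBatchListener(
                "ingestion_mapping_mastering-flow", Arrays.asList("2", "3"), config);
            listener.setLogResponse(true);

            WriteBatcher writeBatcher = dataMovementManager.newWriteBatcher()
                .withBatchSize(100)
                .withThreadCount(8)
                .onBatchSuccess(listener);
            dataMovementManager.startJob(writeBatcher);
            return writeBatcher;
        }
    }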
