
Commit c1667f8

Added support for Bulk Data Services
Resolves #3. Did a lot of refactoring (described below) to support this, as my initial attempt to cram this into MarkLogicSinkTask resulted in a too-large, too-complicated class.

- Extracted AbstractSinkTask and WriteBatcherSinkTask to capture what MarkLogicSinkTask previously did
- Added BulkDataServicesSinkTask to capture the new functionality
- MarkLogicSinkConnector now looks at the user config to determine what type of task to create, and all of our integration tests now start with this class
- Moved WriteFailureHandler from its own class into WriteBatcherSinkTask since it is not intended to be used by any other class
- Renamed BuildSuccessListenerTest to BuildRunFlowListenerTest to make it more self-documenting
- Moved a couple of tests from ConvertSinkRecordTest into HandleInvalidSinkRecordTest because they really needed to be integration tests
1 parent 60d66fa commit c1667f8
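
The commit message above says that MarkLogicSinkConnector now inspects the user config to decide which task type to create, but that class is not among the diffs shown below. A minimal sketch of how such a selection could look, keyed off MarkLogicSinkConfig.BULK_DS_API_URI; the class name, the version constant, and the exact selection logic are illustrative assumptions, not necessarily what the commit contains:

package com.marklogic.kafka.connect.sink;

import org.apache.kafka.common.config.ConfigDef;
import org.apache.kafka.connect.connector.Task;
import org.apache.kafka.connect.sink.SinkConnector;
import org.springframework.util.StringUtils;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

// Illustrative sketch only; the committed MarkLogicSinkConnector may differ in its details.
public class ExampleSinkConnector extends SinkConnector {

    public static final String MARKLOGIC_SINK_CONNECTOR_VERSION = "1.8.0"; // hypothetical version

    private Map<String, String> config;

    @Override
    public void start(Map<String, String> config) {
        // Keep the raw config so taskClass() and taskConfigs() can use it
        this.config = config;
    }

    @Override
    public Class<? extends Task> taskClass() {
        // If a Bulk Data Services API URI is configured, use the Bulk Data Services task;
        // otherwise fall back to the existing DMSDK/WriteBatcher task.
        String apiUri = config.get(MarkLogicSinkConfig.BULK_DS_API_URI);
        return StringUtils.hasText(apiUri) ? BulkDataServicesSinkTask.class : WriteBatcherSinkTask.class;
    }

    @Override
    public List<Map<String, String>> taskConfigs(int maxTasks) {
        List<Map<String, String>> configs = new ArrayList<>();
        for (int i = 0; i < maxTasks; i++) {
            configs.add(config);
        }
        return configs;
    }

    @Override
    public void stop() {
    }

    @Override
    public ConfigDef config() {
        return MarkLogicSinkConfig.CONFIG_DEF;
    }

    @Override
    public String version() {
        return MARKLOGIC_SINK_CONNECTOR_VERSION;
    }
}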

21 files changed: +733 −194 lines

gradle.properties

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ confluentHome=
 
 # Only used for testing
 mlConfigPaths=src/test/ml-config
+mlModulePaths=src/test/ml-modules
 mlRestPort=8018
 mlTestRestPort=8019
 mlAppName=kafka-test
src/main/java/com/marklogic/kafka/connect/sink/AbstractSinkTask.java

Lines changed: 107 additions & 0 deletions
@@ -0,0 +1,107 @@
package com.marklogic.kafka.connect.sink;

import com.marklogic.kafka.connect.ConfigUtil;
import org.apache.kafka.connect.sink.SinkRecord;
import org.apache.kafka.connect.sink.SinkTask;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;

/**
 * Base class for concrete SinkTask implementations, providing some generic functionality.
 */
abstract class AbstractSinkTask extends SinkTask {

    protected final Logger logger = LoggerFactory.getLogger(getClass());

    private boolean logKeys = false;
    private boolean logHeaders = false;

    /**
     * Subclasses implement this to pull their necessary config from Kafka. Invoked by the {@code start} method.
     *
     * @param parsedConfig
     */
    protected abstract void onStart(Map<String, Object> parsedConfig);

    /**
     * Subclasses implement this to determine how to write each {@code SinkRecord}. This is invoked by the
     * {@code put} method, which subclasses can override if necessary - e.g. to provide their own behavior after all
     * records have been processed.
     *
     * @param sinkRecord
     */
    protected abstract void writeSinkRecord(SinkRecord sinkRecord);

    /**
     * Required for a Kafka task.
     *
     * @return
     */
    @Override
    public String version() {
        return MarkLogicSinkConnector.MARKLOGIC_SINK_CONNECTOR_VERSION;
    }

    /**
     * Invoked by Kafka when the connector is started by Kafka Connect.
     *
     * @param config initial configuration
     */
    @Override
    public final void start(Map<String, String> config) {
        logger.info("Starting");
        Map<String, Object> parsedConfig = MarkLogicSinkConfig.CONFIG_DEF.parse(config);
        logKeys = ConfigUtil.getBoolean(MarkLogicSinkConfig.LOGGING_RECORD_KEY, parsedConfig);
        logHeaders = ConfigUtil.getBoolean(MarkLogicSinkConfig.LOGGING_RECORD_HEADERS, parsedConfig);
        this.onStart(parsedConfig);
        logger.info("Started");
    }

    /**
     * Invoked by Kafka each time it determines that it has data to send to a connector.
     *
     * @param records the set of records to send
     */
    @Override
    public void put(Collection<SinkRecord> records) {
        records.forEach(record -> {
            // It is not known if either of these scenarios will ever occur; it would seem that Kafka would never pass
            // a null record nor a record with a null value to a connector.
            if (record == null) {
                logger.debug("Skipping null record");
            } else if (record.value() == null) {
                logger.debug("Skipping record with null value");
            } else {
                logRecordBeforeWriting(record);
                try {
                    this.writeSinkRecord(record);
                } catch (Exception ex) {
                    // Including the stacktrace here as this could happen for a variety of reasons
                    throw new RuntimeException("Unable to write sink record; record offset: " + record.kafkaOffset() +
                        "; cause: " + ex.getMessage(), ex);
                }
            }
        });
    }

    private void logRecordBeforeWriting(SinkRecord record) {
        if (logKeys && record.key() != null) {
            logger.info("Record key {}", record.key());
        }
        if (logHeaders) {
            List<String> headers = new ArrayList<>();
            record.headers().forEach(header -> {
                headers.add(String.format("%s:%s", header.key(), header.value().toString()));
            });
            logger.info("Record headers: {}", headers);
        }
        if (logger.isDebugEnabled()) {
            logger.debug("Processing record value {} in topic {}", record.value(), record.topic());
        }
    }
}
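
AbstractSinkTask is a template: start() parses the Kafka config and hands it to onStart(), while put() skips null records, optionally logs keys and headers, and hands each remaining record to writeSinkRecord(). A minimal hypothetical subclass showing what the contract requires - illustrative only, not part of this commit, and the "ml.example.logPrefix" property is made up:

package com.marklogic.kafka.connect.sink;

import org.apache.kafka.connect.sink.SinkRecord;

import java.util.Map;

// Hypothetical example: the smallest useful AbstractSinkTask subclass, which only logs each record value.
class LoggingOnlySinkTask extends AbstractSinkTask {

    private String prefix;

    @Override
    protected void onStart(Map<String, Object> parsedConfig) {
        // Pull whatever config the task needs; here just a made-up prefix with a default.
        Object value = parsedConfig.get("ml.example.logPrefix"); // hypothetical property
        this.prefix = value != null ? value.toString() : "record";
    }

    @Override
    protected void writeSinkRecord(SinkRecord sinkRecord) {
        // A real task would write to MarkLogic here; this example only logs the value.
        logger.info("{}: {}", prefix, sinkRecord.value());
    }

    @Override
    public void stop() {
        logger.info("Stopping");
    }
}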
src/main/java/com/marklogic/kafka/connect/sink/BulkDataServicesSinkTask.java

Lines changed: 204 additions & 0 deletions
@@ -0,0 +1,204 @@
package com.marklogic.kafka.connect.sink;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.marklogic.client.DatabaseClient;
import com.marklogic.client.dataservices.IOEndpoint;
import com.marklogic.client.dataservices.InputCaller;
import com.marklogic.client.document.DocumentWriteOperation;
import com.marklogic.client.ext.DatabaseClientConfig;
import com.marklogic.client.ext.DefaultConfiguredDatabaseClientFactory;
import com.marklogic.client.io.BytesHandle;
import com.marklogic.client.io.Format;
import com.marklogic.client.io.JacksonHandle;
import com.marklogic.client.io.StringHandle;
import com.marklogic.client.io.marker.AbstractWriteHandle;
import com.marklogic.kafka.connect.DefaultDatabaseClientConfigBuilder;
import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.connect.sink.SinkRecord;
import org.springframework.util.StringUtils;

import java.util.Map;

/**
 * Uses Bulk Data Services - https://github.com/marklogic/java-client-api/wiki/Bulk-Data-Services - to allow the user
 * to provide their own endpoint implementation, thus giving the user full control over how data is written to
 * MarkLogic.
 */
class BulkDataServicesSinkTask extends AbstractSinkTask {

    private DatabaseClient databaseClient;
    private InputCaller.BulkInputCaller<JsonNode> bulkInputCaller;
    private ObjectMapper objectMapper;
    private SinkRecordConverter sinkRecordConverter;

    public BulkDataServicesSinkTask() {
        this.objectMapper = new ObjectMapper();
    }

    @Override
    protected void onStart(Map<String, Object> parsedConfig) {
        DatabaseClientConfig databaseClientConfig = new DefaultDatabaseClientConfigBuilder().buildDatabaseClientConfig(parsedConfig);
        this.databaseClient = new DefaultConfiguredDatabaseClientFactory().newDatabaseClient(databaseClientConfig);

        JacksonHandle modulesHandle = readApiDeclarationFromMarkLogic(parsedConfig, databaseClientConfig);
        InputCaller<JsonNode> inputCaller = InputCaller.on(databaseClient, modulesHandle, new JacksonHandle().withFormat(Format.JSON));

        IOEndpoint.CallContext callContext = inputCaller.newCallContext()
            .withEndpointConstants(new JacksonHandle(buildEndpointConstants(parsedConfig)));
        this.bulkInputCaller = inputCaller.bulkCaller(callContext);
        this.configureErrorListenerOnBulkInputCaller();

        this.sinkRecordConverter = new DefaultSinkRecordConverter(parsedConfig);
    }

    /**
     * When Kafka calls this method - the frequency of which can be controlled by the user - perform a synchronous
     * flush of any records waiting to be written to MarkLogic. {@code BulkInputCaller} does not yet have an
     * asynchronous flush like DMSDK does, but a synchronous flush seems appropriate here - i.e. Kafka appears to be
     * fine with a synchronous call in {@code flush}, while {@code put} is expected to be asynchronous.
     * <p>
     * For a good reference, see https://stackoverflow.com/questions/44871377/put-vs-flush-in-kafka-connector-sink-task
     *
     * @param currentOffsets
     */
    @Override
    public void flush(Map<TopicPartition, OffsetAndMetadata> currentOffsets) {
        if (bulkInputCaller != null) {
            logger.info("Flushing BulkInputCaller");
            bulkInputCaller.awaitCompletion();
            logger.info("Finished flushing BulkInputCaller");
        }
    }

    @Override
    public void stop() {
        flush(null);
        if (databaseClient != null) {
            databaseClient.release();
        }
    }

    /**
     * Queues up the sink record for writing to MarkLogic. Once the batch size, as defined in the Bulk API declaration,
     * is reached, the {@code BulkInputCaller} will write the data to MarkLogic.
     *
     * @param sinkRecord
     */
    @Override
    protected void writeSinkRecord(SinkRecord sinkRecord) {
        DocumentWriteOperation writeOp = sinkRecordConverter.convert(sinkRecord);
        JsonNode input = buildBulkDataServiceInput(writeOp, sinkRecord);
        bulkInputCaller.accept(input);
    }

    /**
     * Bulk Data Services requires that the API declaration exist in the modules database associated with the app
     * server that the connector will talk to.
     *
     * @param parsedConfig
     * @param databaseClientConfig
     * @return
     */
    private JacksonHandle readApiDeclarationFromMarkLogic(Map<String, Object> parsedConfig, DatabaseClientConfig databaseClientConfig) {
        final String bulkApiUri = (String) parsedConfig.get(MarkLogicSinkConfig.BULK_DS_API_URI);
        final String modulesDatabase = (String) parsedConfig.get(MarkLogicSinkConfig.CONNECTION_MODULES_DATABASE);
        if (!StringUtils.hasText(modulesDatabase)) {
            throw new IllegalArgumentException("Cannot read Bulk Data Services API declaration at URI: " + bulkApiUri
                + "; no modules database configured. Please set the "
                + MarkLogicSinkConfig.CONNECTION_MODULES_DATABASE + " property.");
        }

        final String originalDatabase = databaseClientConfig.getDatabase();
        try {
            databaseClientConfig.setDatabase(modulesDatabase);
            DatabaseClient modulesClient = new DefaultConfiguredDatabaseClientFactory().newDatabaseClient(databaseClientConfig);
            return modulesClient.newJSONDocumentManager().read(bulkApiUri, new JacksonHandle().withFormat(Format.JSON));
        } catch (Exception ex) {
            // The stacktrace isn't of any value here for a user; the message below will provide sufficient information
            // for debugging
            throw new RuntimeException("Unable to read Bulk Data Services API declaration at URI: " + bulkApiUri +
                "; modules database: " + modulesDatabase + "; cause: " + ex.getMessage());
        } finally {
            databaseClientConfig.setDatabase(originalDatabase);
        }
    }

    /**
     * When using Bulk Data Services, include all "ml.document" config options in the endpoint constants in case the
     * endpoint developer wishes to use these.
     *
     * @param parsedConfig
     * @return
     */
    private ObjectNode buildEndpointConstants(Map<String, Object> parsedConfig) {
        ObjectNode endpointConstants = this.objectMapper.createObjectNode();
        for (String key : parsedConfig.keySet()) {
            if (key.startsWith("ml.document")) {
                Object value = parsedConfig.get(key);
                if (value != null) {
                    endpointConstants.put(key, value.toString());
                }
            }
        }
        return endpointConstants;
    }

    /**
     * An envelope structure is used so that both the content and Kafka metadata from the sink record can be sent to
     * the endpoint.
     *
     * @param writeOp
     * @param sinkRecord
     * @return
     */
    private JsonNode buildBulkDataServiceInput(DocumentWriteOperation writeOp, SinkRecord sinkRecord) {
        AbstractWriteHandle handle = writeOp.getContent();
        // This assumes that the SinkRecordConverter always constructs either a BytesHandle or StringHandle. This is an
        // implementation detail not exposed to the user, and sufficient testing should ensure that this assumption
        // holds up over time.
        String content;
        if (handle instanceof BytesHandle) {
            content = new String(((BytesHandle) handle).get());
        } else {
            content = ((StringHandle) handle).get();
        }
        ObjectNode input = new ObjectMapper().createObjectNode();
        input.put("content", content);

        ObjectNode kafkaMetadata = input.putObject("kafka-metadata");
        kafkaMetadata.put("topic", sinkRecord.topic());
        Object key = sinkRecord.key();
        if (key != null) {
            kafkaMetadata.put("key", key.toString());
        }
        kafkaMetadata.put("offset", sinkRecord.kafkaOffset());
        Integer partition = sinkRecord.kafkaPartition();
        if (partition != null) {
            kafkaMetadata.put("partition", partition);
        }
        Long timestamp = sinkRecord.timestamp();
        if (timestamp != null) {
            kafkaMetadata.put("timestamp", timestamp);
        }
        return input;
    }

    /**
     * For the initial release of this capability, the "skip" approach is applied, which behaves in the same manner as
     * the existing WriteBatcher approach - i.e. log the failure and keep processing other records/batches. This can
     * be made configurable in the future if a client wants "stop all calls" support.
     */
    private void configureErrorListenerOnBulkInputCaller() {
        this.bulkInputCaller.setErrorListener((retryCount, throwable, callContext, input) -> {
            // The stacktrace is not included here, as it will only contain references to Bulk Data Services code and
            // connector code, which won't help with debugging. The MarkLogic error log will be of much more value,
            // along with seeing the error message here.
            logger.error("Skipping failed write; cause: " + throwable.getMessage() + "; check the MarkLogic error " +
                "log file for additional information as to the cause of the failed write");
            return IOEndpoint.BulkIOEndpointCaller.ErrorDisposition.SKIP_CALL;
        });
    }
}
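
Each item passed to the BulkInputCaller is the envelope built by buildBulkDataServiceInput, so an endpoint module receives a JSON object with a "content" string and a "kafka-metadata" object. A small standalone sketch that mirrors that structure and prints it, which may help when developing or debugging an endpoint; the topic, key, offset, partition, and timestamp values are made up:

package com.marklogic.kafka.connect.sink;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ObjectNode;

// Standalone illustration of the envelope shape built by buildBulkDataServiceInput above.
public class EnvelopeShapeExample {

    public static void main(String[] args) throws Exception {
        ObjectMapper mapper = new ObjectMapper();
        ObjectNode input = mapper.createObjectNode();
        // "content" holds the converted sink record content as a string
        input.put("content", "{\"hello\":\"world\"}");

        // "kafka-metadata" carries the Kafka record coordinates for use by the endpoint
        ObjectNode kafkaMetadata = input.putObject("kafka-metadata");
        kafkaMetadata.put("topic", "example-topic");
        kafkaMetadata.put("key", "example-key");
        kafkaMetadata.put("offset", 42L);
        kafkaMetadata.put("partition", 0);
        kafkaMetadata.put("timestamp", 1662000000000L);

        System.out.println(mapper.writerWithDefaultPrettyPrinter().writeValueAsString(input));
    }
}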

src/main/java/com/marklogic/kafka/connect/sink/MarkLogicSinkConfig.java

Lines changed: 10 additions & 0 deletions
@@ -15,6 +15,7 @@ public class MarkLogicSinkConfig extends AbstractConfig {
     public static final String CONNECTION_HOST = "ml.connection.host";
     public static final String CONNECTION_PORT = "ml.connection.port";
     public static final String CONNECTION_DATABASE = "ml.connection.database";
+    public static final String CONNECTION_MODULES_DATABASE = "ml.connection.modulesDatabase";
     public static final String CONNECTION_SECURITY_CONTEXT_TYPE = "ml.connection.securityContextType";
     public static final String CONNECTION_USERNAME = "ml.connection.username";
     public static final String CONNECTION_PASSWORD = "ml.connection.password";
@@ -35,6 +36,8 @@ public class MarkLogicSinkConfig extends AbstractConfig {
     public static final String DMSDK_TRANSFORM_PARAMS_DELIMITER = "ml.dmsdk.transformParamsDelimiter";
     public static final String DMSDK_INCLUDE_KAFKA_METADATA = "ml.dmsdk.includeKafkaMetadata";
 
+    public static final String BULK_DS_API_URI = "ml.sink.bulkds.apiUri";
+
     public static final String DOCUMENT_COLLECTIONS_ADD_TOPIC = "ml.document.addTopicToCollections";
     public static final String DOCUMENT_COLLECTIONS = "ml.document.collections";
     public static final String DOCUMENT_TEMPORAL_COLLECTION = "ml.document.temporalCollection";
@@ -74,6 +77,8 @@ public class MarkLogicSinkConfig extends AbstractConfig {
             "External name for 'KERBEROS' authentication")
         .define(CONNECTION_DATABASE, Type.STRING, null, Importance.LOW,
             "Name of a database to connect to. If your REST API server has a content database matching that of the one that you want to write documents to, you do not need to set this.")
+        .define(CONNECTION_MODULES_DATABASE, Type.STRING, null, Importance.MEDIUM,
+            "Name of the modules database associated with the app server; required if using Bulk Data Services so that the API module can be retrieved")
         .define(CONNECTION_TYPE, Type.STRING, null, Importance.MEDIUM,
             "Set to 'GATEWAY' when the host identified by ml.connection.host is a load balancer. See https://docs.marklogic.com/guide/java/data-movement#id_26583 for more information.")
         // Boolean fields must have a default value of null; otherwise, Confluent Platform, at least in version 7.2.1,
@@ -124,6 +129,11 @@ public class MarkLogicSinkConfig extends AbstractConfig {
         .define(DMSDK_INCLUDE_KAFKA_METADATA, Type.BOOLEAN, null, Importance.LOW,
             "Set to true so that Kafka record metadata is added to document metadata before it is written. If the document fails to be written, the Kafka record metadata will be logged as well.")
 
+        // TODO Need more info here on the API declaration itself?
+        .define(BULK_DS_API_URI, Type.STRING, null, Importance.LOW,
+            "Defines the URI of a Bulk Data Services API declaration. If set, all DMSDK properties will be ignored as Bulk Data Services will be used instead of DMSDK. " +
+                "Also, ml.connection.modulesDatabase must be defined so that the API declaration can be retrieved from the modules database.")
+
         .define(LOGGING_RECORD_KEY, Type.BOOLEAN, null, Importance.LOW,
             "Set to true to log at the info level the key of each record")
         .define(LOGGING_RECORD_HEADERS, Type.BOOLEAN, null, Importance.LOW,
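
Taken together, switching the sink from DMSDK to Bulk Data Services only requires setting ml.sink.bulkds.apiUri and ml.connection.modulesDatabase alongside the usual connection properties. A hypothetical example follows; all values are placeholders, and only the property names come from MarkLogicSinkConfig:

package com.marklogic.kafka.connect.sink;

import java.util.HashMap;
import java.util.Map;

// Hypothetical configuration values for illustration only.
public class BulkDataServicesConfigExample {

    public static Map<String, String> exampleConfig() {
        Map<String, String> config = new HashMap<>();
        config.put(MarkLogicSinkConfig.CONNECTION_HOST, "localhost");
        config.put(MarkLogicSinkConfig.CONNECTION_PORT, "8018");
        config.put(MarkLogicSinkConfig.CONNECTION_SECURITY_CONTEXT_TYPE, "DIGEST");
        config.put(MarkLogicSinkConfig.CONNECTION_USERNAME, "kafka-user");
        config.put(MarkLogicSinkConfig.CONNECTION_PASSWORD, "changeme");
        // Selecting Bulk Data Services: the API declaration URI plus the modules database it lives in.
        config.put(MarkLogicSinkConfig.BULK_DS_API_URI, "/example/bulk-endpoint.api");
        config.put(MarkLogicSinkConfig.CONNECTION_MODULES_DATABASE, "kafka-test-modules");
        return config;
    }
}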
