
Commit 2127634

Reworked docs for config options
Resolves #57.

- Updated docs for each config option so they're the same in all 3 places - README, marklogic-sink.properties, and MarkLogicSinkConfig
- Updated the CONTRIBUTING guide for testing with Apache Kafka
- Updated the README so it is focused on using the connectors as opposed to developing/testing them (more updates will happen to this before 1.7.0 is released)
1 parent 9d81f46 commit 2127634

File tree

11 files changed: +357 -334 lines changed


AWS-CloudFormation/s3Resources/marklogic-sink.properties

Lines changed: 1 addition & 85 deletions
@@ -2,105 +2,21 @@
 
 name=marklogic-sink
 connector.class=com.marklogic.kafka.connect.sink.MarkLogicSinkConnector
-
-# Should only need one task since it's using a WriteBatcher, which is multi-threaded
 tasks.max=1
-
 # Topics to consume from [comma separated list for multiple topics]
 topics=marklogic
 
-
 # MarkLogic connector-specific properties
+# See ./config/marklogic-sink.properties for information on each of these
 
-# A MarkLogic host to connect to. The connector uses the Data Movement SDK, and thus it will connect to each of the
-# hosts in a cluster.
 ml.connection.host=172.31.48.57
-
-# The port of a REST API server to connect to.
 ml.connection.port=8003
-
-# Optional - the name of a database to connect to. If your REST API server has a content database matching that of the
-# one that you want to write documents to, you do not need to set this.
 ml.connection.database=Kafka
-
-# Optional - set to "gateway" when using a load balancer, else leave blank.
-# See https://docs.marklogic.com/guide/java/data-movement#id_26583 for more information.
-ml.connection.type=
-
-# Either DIGEST, BASIC, CERTIFICATE, KERBEROS, or NONE
 ml.connection.securityContextType=DIGEST
-
-# Set these based on the security context type defined above
 ml.connection.username=admin
 ml.connection.password=admin
-ml.connection.certFile=
-ml.connection.certPassword=
-ml.connection.externalName=
-
-# Set "ml.connection.simpleSsl" to "true" for a "simple" SSL strategy that uses the JVM's default SslContext and
-# X509TrustManager and a "trust everything" HostnameVerifier. Further customization of an SSL connection via properties
-# is not supported. If you need to do so, consider using the source code for this connector as a starting point.
-ml.connection.simpleSsl=false
-# You must also ensure that the server cert or the signing CA cert is imported in the JVMs cacerts file.
-# These commands may be used to get the server cert and to import it into your cacerts file.
-# Don't forget to customize the commands for your particular case.
-# openssl x509 -in <(openssl s_client -connect <server>:8004 -prexit 2>/dev/null) -out ~/example.crt
-# sudo keytool -importcert -file ~/example.crt -alias <server> -keystore /path/to/java/lib/security/cacerts -storepass <storepass-password>
-
-# Sets the number of documents to be written in a batch to MarkLogic. This may not have any impact depending on the
-# connector receives data from Kafka, as the connector calls flushAsync on the DMSDK WriteBatcher after processing every
-# collection of records. Thus, if the connector never receives at one time more than the value of this property, then
-# the value of this property will have no impact.
-ml.dmsdk.batchSize=100
-
-# Sets the number of threads used by the Data Movement SDK for parallelizing writes to MarkLogic. Similar to the batch
-# size property above, this may never come into play depending on how many records the connector receives at once.
-ml.dmsdk.threadCount=8
-
-# Optional - a comma-separated list of collections that each document should be written to
 ml.document.collections=kafka-data
-
-# Optional - set this to true so that the name of the topic that the connector reads from is added as a collection to each document inserted by the connector
-ml.document.addTopicToCollections=false
-
-# Optional - specify the format of each document; either JSON, XML, BINARY, TEXT, or UNKNOWN
 ml.document.format=JSON
-
-# Optional - specify a mime type for each document; typically the format property above will be used instead of this
-ml.document.mimeType=
-
-# Optional - a comma-separated list of roles and capabilities that define the permissions for each document written to MarkLogic
 ml.document.permissions=rest-reader,read,rest-writer,update
-
-# Optional - a prefix to prepend to each URI; the URI itself is a UUID
 ml.document.uriPrefix=/kafka-data/
-
-# Optional - a suffix to append to each URI
 ml.document.uriSuffix=.json
-
-# Optional - name of a REST transform to use when writing documents
-# For Data Hub, can use mlRunIngest
-ml.dmsdk.transform=
-
-# Optional - delimited set of transform names and values
-# Data Hub example = flow-name,ingestion_mapping_mastering-flow,step,1
-ml.dmsdk.transformParams=
-
-# Optional - delimiter for transform parameter names and values
-ml.dmsdk.transformParamsDelimiter=,
-
-# Properties for running a Data Hub flow
-# Using examples/dh-5-example in the DH project, could use the following config:
-# ml.datahub.flow.name=ingestion_mapping_mastering-flow
-# ml.datahub.flow.steps=2,3,4
-ml.datahub.flow.name=
-ml.datahub.flow.steps=
-# Whether or not the response data from running a flow should be logged at the info level
-ml.datahub.flow.logResponse=true
-
-ml.id.strategy=
-ml.id.strategy.paths=
-ml.connection.enableCustomSsl=false
-ml.connection.customSsl.tlsVersion=
-ml.connection.customSsl.hostNameVerifier=
-ml.connection.customSsl.mutualAuth=false
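For quick reference, this is the trimmed configuration the hunk leaves behind, assembled only from the context and added lines above (the file's first line falls outside the hunk and is not shown); per the added comment, the per-option documentation now lives in ./config/marklogic-sink.properties rather than here:

```properties
name=marklogic-sink
connector.class=com.marklogic.kafka.connect.sink.MarkLogicSinkConnector
tasks.max=1
# Topics to consume from [comma separated list for multiple topics]
topics=marklogic

# MarkLogic connector-specific properties
# See ./config/marklogic-sink.properties for information on each of these

ml.connection.host=172.31.48.57
ml.connection.port=8003
ml.connection.database=Kafka
ml.connection.securityContextType=DIGEST
ml.connection.username=admin
ml.connection.password=admin
ml.document.collections=kafka-data
ml.document.format=JSON
ml.document.permissions=rest-reader,read,rest-writer,update
ml.document.uriPrefix=/kafka-data/
ml.document.uriSuffix=.json
```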

CONTRIBUTING.md

Lines changed: 56 additions & 4 deletions
@@ -1,4 +1,5 @@
-This guide describes how to develop and contribute pull requests to this connector.
+This guide describes how to develop and contribute pull requests to this connector. The focus is currently on how to
+develop and test the connector, either via a local install of Confluent Platform or of the regular Kafka distribution.
 
 # Running the test suite
 
@@ -27,8 +28,11 @@ Alternatively, you can import this project into an IDE such as IntelliJ and run
 # Testing with Confluent Platform
 
 [Confluent Platform](https://docs.confluent.io/platform/7.2.1/overview.html) provides an easy mechanism for running
-Kafka locally via a single application. To try out the MarkLogic Kafka connector via the Confluent Platform, follow
-the steps below.
+Kafka locally via a single application. A primary benefit of testing with Confluent Platform is to test configuring the
+MarkLogic Kafka connector via the [Confluent Control Center](https://docs.confluent.io/platform/current/control-center/index.html)
+web application.
+
+To try out the MarkLogic Kafka connector via the Confluent Platform, follow the steps below.
 
 ## Install Confluent Platform with the MarkLogic Kafka connector
 
@@ -173,4 +177,52 @@ services (sometimes Schema Registry, sometimes Control Center) usually stops wor
 
 # Testing with Apache Kafka
 
-TODO, will borrow a lot of content from the README.
+The primary reason to test the MarkLogic Kafka connector via a regular Kafka distribution is that the development
+cycle is much faster and more reliable - i.e. you can repeatedly redeploy the connector and restart Kafka Connect to
+test changes, and Kafka Connect will continue to work fine. This is particularly useful when the changes you're testing
+do not require testing the GUI provided by Confluent Control Center.
+
+To get started, these instructions assume that you already have an instance of Apache Kafka installed; the
+[Kafka Quickstart](https://kafka.apache.org/quickstart) instructions provide an easy way of accomplishing this. Perform
+step 1 of these instructions before proceeding.
+
+Next, configure your Gradle properties to point to your Kafka installation and deploy the connector there:
+
+1. Configure `kafkaHome` in gradle-local.properties - e.g. `kafkaHome=/Users/myusername/kafka_2.13-2.8.1`
+2. Configure `kafkaMlUsername` and `kafkaMlPassword` in gradle-local.properties, setting these to a MarkLogic user that
+   is able to write documents to MarkLogic. These values will be used to populate the
+   `ml.connection.username` and `ml.connection.password` connector properties.
+3. Run `./gradlew clean deploy` to build a jar and copy it and the config property files to your Kafka installation
+
+[Step 2 in the Kafka Quickstart guide](https://kafka.apache.org/quickstart) provides the instructions for starting the
+separate Zookeeper and Kafka server processes. You'll need to run these commands from your Kafka installation
+directory. As of August 2022, those commands are (these seem very unlikely to change and thus are included here for
+convenience):
+
+    bin/zookeeper-server-start.sh config/zookeeper.properties
+
+and
+
+    bin/kafka-server-start.sh config/server.properties
+
+Next, start the Kafka connector in standalone mode (also from the Kafka home directory):
+
+    bin/connect-standalone.sh config/marklogic-connect-standalone.properties config/marklogic-sink.properties
+
+You'll see a fair amount of logging from Kafka itself; near the end of the logging, look for messages from
+`MarkLogicSinkTask` and MarkLogic Java Client classes such as `WriteBatcherImpl` to ensure that the connector has
+started up correctly.
+
+To test out the connector, you can use the following command to enter a CLI that allows you to manually send
+messages to the `marklogic` topic that the connector is configured by default to read from:
+
+    bin/kafka-console-producer.sh --broker-list localhost:9092 --topic marklogic
+
+Be sure that the messages you send are consistent with your configuration properties - i.e. if you've set a format of
+JSON, you should send properly formed JSON objects.
+
+When a document is received and written by the connector, you'll see logging like this:
+
+```
+[2018-12-20 12:54:13,561] INFO flushing 1 queued docs (com.marklogic.client.datamovement.impl.WriteBatcherImpl:549)
+```
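To make the console-producer step concrete, here is a minimal sketch of a test session; the JSON payload is an invented example, and any well-formed JSON object will do since the sink properties above set `ml.document.format=JSON`. Given `ml.document.uriPrefix=/kafka-data/` and `ml.document.uriSuffix=.json`, each message should be written to MarkLogic as a document with a URI of the form `/kafka-data/<UUID>.json` in the `kafka-data` collection.

```
bin/kafka-console-producer.sh --broker-list localhost:9092 --topic marklogic
>{"id": "test-1", "source": "kafka-console-producer", "content": "hello MarkLogic"}
```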
