MLE-26420 Can now perform incremental writes

rjrudin · rjrudin · commit f6cfa96d8170 · 2025-12-31T10:03:20.000-05:00
IncrementalWriteFilter is the entry point, with a Builder for customizing its behavior.
diff --git a/marklogic-client-api/build.gradle b/marklogic-client-api/build.gradle
@@ -37,6 +37,11 @@ dependencies {
 	implementation "com.fasterxml.jackson.core:jackson-databind:${jacksonVersion}"
 	implementation "com.fasterxml.jackson.dataformat:jackson-dataformat-csv:${jacksonVersion}"
 
+	// Dependencies for hash generation. These can be safely omitted if the incremental write feature is not used, but
+	// neither has any transitive dependencies, so their impact on the dependency tree is minimal.
+	implementation "io.github.erdtman:java-json-canonicalization:1.1"
+	implementation "net.openhft:zero-allocation-hashing:0.27ea1"
+
 	// Only used by extras (which some examples then depend on)
 	compileOnly 'org.jdom:jdom2:2.0.6.1'
 	compileOnly 'org.dom4j:dom4j:2.2.0'
diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
+ */
+package com.marklogic.client.datamovement.filter;
+
+import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.marklogic.client.datamovement.DocumentWriteSetFilter;
+import com.marklogic.client.document.DocumentWriteOperation;
+import com.marklogic.client.document.DocumentWriteSet;
+import com.marklogic.client.io.JacksonHandle;
+
+import java.util.function.Consumer;
+
+/**
+ * Uses server-side JavaScript code to get the existing hash values for a set of URIs.
+ *
+ * @since 8.1.0
+ */
+class IncrementalWriteEvalFilter extends IncrementalWriteFilter {
+
+	// Server-side JavaScript that expects two external variables, "fieldName" and "uris", and returns an object
+	// whose keys are URIs and whose values are the corresponding field values.
+	private static final String EVAL_SCRIPT = """
+		const tuples = cts.valueTuples([cts.uriReference(), cts.fieldReference(fieldName)], null, cts.documentQuery(uris));
+		const response = {};
+		for (var tuple of tuples) {
+		  response[tuple[0]] = tuple[1];
+		}
+		response
+		""";
+
+	// Shared instance so that a new ObjectMapper is not constructed for every batch; it is only used here to
+	// create empty array nodes.
+	private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
+
+	/**
+	 * @param fieldName                the name of the MarkLogic field holding the hash value
+	 * @param canonicalizeJson         whether JSON content is canonicalized before hashing
+	 * @param skippedDocumentsConsumer receives the documents skipped in a batch; may be null
+	 */
+	IncrementalWriteEvalFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
+		super(fieldName, canonicalizeJson, skippedDocumentsConsumer);
+	}
+
+	/**
+	 * Collects the URIs of all DOCUMENT_WRITE operations in the batch, evaluates {@link #EVAL_SCRIPT} to fetch the
+	 * existing hash for each of those URIs, and then delegates to {@code filterDocuments} to drop unchanged documents.
+	 *
+	 * @param context supplies the write set being filtered and the client used to run the eval
+	 * @return the filtered write set
+	 */
+	@Override
+	public DocumentWriteSet apply(DocumentWriteSetFilter.Context context) {
+		ArrayNode uris = OBJECT_MAPPER.createArrayNode();
+		for (DocumentWriteOperation doc : context.getDocumentWriteSet()) {
+			if (DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(doc.getOperationType())) {
+				uris.add(doc.getUri());
+			}
+		}
+
+		JsonNode response = context.getDatabaseClient().newServerEval().javascript(EVAL_SCRIPT)
+			.addVariable("fieldName", fieldName)
+			.addVariable("uris", new JacksonHandle(uris))
+			.evalAs(JsonNode.class);
+
+		// A URI missing from the response has no existing hash, which is signalled to filterDocuments with null.
+		return filterDocuments(context, uri -> response.has(uri) ? response.get(uri).asText() : null);
+	}
+}
diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java
@@ -0,0 +1,187 @@
+/*
+ * Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
+ */
+package com.marklogic.client.datamovement.filter;
+
+import com.marklogic.client.datamovement.DocumentWriteSetFilter;
+import com.marklogic.client.document.DocumentWriteOperation;
+import com.marklogic.client.document.DocumentWriteSet;
+import com.marklogic.client.impl.DocumentWriteOperationImpl;
+import com.marklogic.client.impl.HandleAccessor;
+import com.marklogic.client.io.BaseHandle;
+import com.marklogic.client.io.DocumentMetadataHandle;
+import com.marklogic.client.io.Format;
+import net.openhft.hashing.LongHashFunction;
+import org.erdtman.jcs.JsonCanonicalizer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.Consumer;
+import java.util.function.Function;
+
+/**
+ * A DocumentWriteSetFilter that skips writing documents whose content has not changed since the last write
+ * based on a hash value stored in a MarkLogic field.
+ *
+ * @since 8.1.0
+ */
+public abstract class IncrementalWriteFilter implements DocumentWriteSetFilter {
+
+	protected final Logger logger = LoggerFactory.getLogger(this.getClass());
+
+	public static Builder newBuilder() {
+		return new Builder();
+	}
+
+	public static class Builder {
+
+		private String fieldName = "incrementalWriteHash";
+		private boolean canonicalizeJson = true;
+		private boolean useEvalQuery = false;
+		private Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer;
+
+		/**
+		 * @param fieldName the name of the MarkLogic field that will hold the hash value; defaults to "incrementalWriteHash".
+		 */
+		public Builder fieldName(String fieldName) {
+			this.fieldName = fieldName;
+			return this;
+		}
+
+		/**
+		 * @param canonicalizeJson whether to canonicalize JSON content before hashing; defaults to true.
+		 *                         Delegates to https://github.com/erdtman/java-json-canonicalization for canonicalization.
+		 */
+		public Builder canonicalizeJson(boolean canonicalizeJson) {
+			this.canonicalizeJson = canonicalizeJson;
+			return this;
+		}
+
+		/**
+		 * @param useEvalQuery if true, evaluate server-side JavaScript instead of an Optic query for retrieving hash values; defaults to false.
+		 */
+		public Builder useEvalQuery(boolean useEvalQuery) {
+			this.useEvalQuery = useEvalQuery;
+			return this;
+		}
+
+		/**
+		 * @param skippedDocumentsConsumer a consumer that will be called with any documents in a batch that were skipped because their content had not changed.
+		 */
+		public Builder onDocumentsSkipped(Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
+			this.skippedDocumentsConsumer = skippedDocumentsConsumer;
+			return this;
+		}
+
+		public IncrementalWriteFilter build() {
+			if (useEvalQuery) {
+				return new IncrementalWriteEvalFilter(fieldName, canonicalizeJson, skippedDocumentsConsumer);
+			}
+			return new IncrementalWriteOpticFilter(fieldName, canonicalizeJson, skippedDocumentsConsumer);
+		}
+	}
+
+	protected final String fieldName;
+	private final boolean canonicalizeJson;
+	private final Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer;
+
+	// Hardcoding this for now, with a good general purpose hashing function.
+	// See https://xxhash.com for benchmarks.
+	private final LongHashFunction hashFunction = LongHashFunction.xx3();
+
+	public IncrementalWriteFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
+		this.fieldName = fieldName;
+		this.canonicalizeJson = canonicalizeJson;
+		this.skippedDocumentsConsumer = skippedDocumentsConsumer;
+	}
+
+	protected final DocumentWriteSet filterDocuments(Context context, Function<String, String> hashRetriever) {
+		final DocumentWriteSet newWriteSet = context.getDatabaseClient().newDocumentManager().newWriteSet();
+		final List<DocumentWriteOperation> skippedDocuments = new ArrayList<>();
+
+		for (DocumentWriteOperation doc : context.getDocumentWriteSet()) {
+			if (!DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(doc.getOperationType())) {
+				newWriteSet.add(doc);
+				continue;
+			}
+
+			final String contentHash = serializeContent(doc);
+			final String existingHash = hashRetriever.apply(doc.getUri());
+			if (logger.isTraceEnabled()) {
+				logger.trace("URI: {}, existing Hash: {}, new Hash: {}", doc.getUri(), existingHash, contentHash);
+			}
+
+			if (existingHash != null) {
+				if (!existingHash.equals(contentHash)) {
+					newWriteSet.add(addHashToMetadata(doc, fieldName, contentHash));
+				} else if (skippedDocumentsConsumer != null) {
+					skippedDocuments.add(doc);
+				} else {
+					// No consumer, so skip the document silently.
+				}
+			} else {
+				newWriteSet.add(addHashToMetadata(doc, fieldName, contentHash));
+			}
+		}
+
+		if (!skippedDocuments.isEmpty()) {
+			skippedDocumentsConsumer.accept(skippedDocuments.toArray(new DocumentWriteOperation[0]));
+		}
+
+		return newWriteSet;
+	}
+
+	private String serializeContent(DocumentWriteOperation doc) {
+		String content = HandleAccessor.contentAsString(doc.getContent());
+
+		Format format = null;
+		if (doc.getContent() instanceof BaseHandle<?, ?> baseHandle) {
+			format = baseHandle.getFormat();
+		}
+
+		if (canonicalizeJson && (Format.JSON.equals(format) || isPossiblyJsonContent(content))) {
+			JsonCanonicalizer jc;
+			try {
+				jc = new JsonCanonicalizer(content);
+				return jc.getEncodedString();
+			} catch (IOException e) {
+				// Going to improve this in the next PR, as I think we can throw an exception if Format = JSON.
+				logger.warn("Unable to canonicalize JSON content for URI {}, using original content for hashing; cause: {}",
+					doc.getUri(), e.getMessage());
+			}
+		}
+
+		return content;
+	}
+
+	private boolean isPossiblyJsonContent(String content) {
+		// This isn't 100% reliable, as the content could be text that just happens to start with { or [, and so
+		// we'll still need to catch an exception if we try to canonicalize non-JSON content.
+		String trimmed = content.trim();
+		return trimmed.startsWith("{") || trimmed.startsWith("[");
+	}
+
+	private String computeHash(String content) {
+		byte[] bytes = content.getBytes(StandardCharsets.UTF_8);
+		long hash = hashFunction.hashBytes(bytes);
+		return Long.toHexString(hash);
+	}
+
+	protected static DocumentWriteOperation addHashToMetadata(DocumentWriteOperation op, String fieldName, String hash) {
+		DocumentMetadataHandle newMetadata = new DocumentMetadataHandle();
+		if (op.getMetadata() != null) {
+			DocumentMetadataHandle originalMetadata = (DocumentMetadataHandle) op.getMetadata();
+			newMetadata.setPermissions(originalMetadata.getPermissions());
+			newMetadata.setCollections(originalMetadata.getCollections());
+			newMetadata.setQuality(originalMetadata.getQuality());
+			newMetadata.setProperties(originalMetadata.getProperties());
+			newMetadata.getMetadataValues().putAll(originalMetadata.getMetadataValues());
+		}
+		newMetadata.getMetadataValues().put(fieldName, hash);
+		return new DocumentWriteOperationImpl(op.getUri(), newMetadata, op.getContent(), op.getTemporalDocumentURI());
+	}
+}
diff --git a/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java b/marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
+ */
+package com.marklogic.client.datamovement.filter;
+
+import com.marklogic.client.document.DocumentWriteOperation;
+import com.marklogic.client.document.DocumentWriteSet;
+import com.marklogic.client.row.RowTemplate;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.function.Consumer;
+
+/**
+ * Uses an Optic query to get the existing hash values for a set of URIs.
+ *
+ * @since 8.1.0
+ */
+class IncrementalWriteOpticFilter extends IncrementalWriteFilter {
+
+	IncrementalWriteOpticFilter(String fieldName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer) {
+		super(fieldName, canonicalizeJson, skippedDocumentsConsumer);
+	}
+
+	@Override
+	public DocumentWriteSet apply(Context context) {
+		final String[] uris = context.getDocumentWriteSet().stream()
+			.filter(op -> DocumentWriteOperation.OperationType.DOCUMENT_WRITE.equals(op.getOperationType()))
+			.map(DocumentWriteOperation::getUri)
+			.toArray(String[]::new);
+
+		// It doesn't seem possible yet to use a DSL query and bind an array of strings to a "uris" param, so using
+		// a serialized query instead. That doesn't allow a user to override the query though.
+		Map<String, String> existingHashes = new RowTemplate(context.getDatabaseClient()).query(op ->
+				op.fromLexicons(Map.of(
+					"uri", op.cts.uriReference(),
+					"hash", op.cts.fieldReference(super.fieldName)
+				)).where(
+					op.cts.documentQuery(op.xs.stringSeq(uris))
+				),
+
+			rows -> {
+				Map<String, String> map = new HashMap<>();
+				rows.forEach(row -> {
+					String uri = row.getString("uri");
+					String existingHash = row.getString("hash");
+					map.put(uri, existingHash);
+				});
+				return map;
+			}
+		);
+
+		return filterDocuments(context, uri -> existingHashes.get(uri));
+	}
+}
diff --git a/marklogic-client-api/src/test/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilterTest.java b/marklogic-client-api/src/test/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilterTest.java
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2010-2025 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
+ */
+package com.marklogic.client.datamovement.filter;
+
+import com.marklogic.client.document.DocumentWriteOperation;
+import com.marklogic.client.impl.DocumentWriteOperationImpl;
+import com.marklogic.client.io.DocumentMetadataHandle;
+import com.marklogic.client.io.StringHandle;
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+
+/**
+ * Unit tests that make no connection to MarkLogic.
+ */
+class IncrementalWriteFilterTest {
+
+	/**
+	 * Verifies that when a hash is added, a new metadata object is created so that a doc-specific hash field can be
+	 * added without affecting any other document that might be sharing the same metadata object.
+	 */
+	@Test
+	void addHashToMetadata() {
+		DocumentMetadataHandle metadata = new DocumentMetadataHandle()
+			.withCollections("c1")
+			.withPermission("rest-reader", DocumentMetadataHandle.Capability.READ)
+			.withQuality(2)
+			.withProperty("prop1", "value1")
+			.withMetadataValue("meta1", "value1");
+
+		// Both operations deliberately share the same metadata handle.
+		DocumentWriteOperation doc1 = new DocumentWriteOperationImpl("/1.xml", metadata, new StringHandle("<doc1/>"));
+		DocumentWriteOperation doc2 = new DocumentWriteOperationImpl("/2.xml", metadata, new StringHandle("<doc2/>"));
+
+		doc2 = IncrementalWriteFilter.addHashToMetadata(doc2, "theField", "abc123");
+
+		assertEquals(metadata, doc1.getMetadata(), "doc1 should still have the original metadata object");
+
+		// The shared handle started with exactly one metadata value; a second one would mean the hash leaked into it.
+		assertEquals(1, metadata.getMetadataValues().size(), "the shared metadata object should not have been modified");
+		assertEquals("value1", metadata.getMetadataValues().get("meta1"), "the shared metadata value should be unchanged");
+
+		DocumentMetadataHandle metadata2 = (DocumentMetadataHandle) doc2.getMetadata();
+		assertEquals("c1", metadata2.getCollections().iterator().next(), "collection should be preserved");
+		assertEquals(DocumentMetadataHandle.Capability.READ, metadata2.getPermissions().get("rest-reader").iterator().next(), "permission should be preserved");
+		assertEquals(2, metadata2.getQuality(), "quality should be preserved");
+		assertEquals("value1", metadata2.getProperties().get("prop1"), "property should be preserved");
+
+		assertEquals("value1", metadata2.getMetadataValues().get("meta1"), "metadata value should be preserved");
+		assertEquals("abc123", metadata2.getMetadataValues().get("theField"), "hash field should be added");
+	}
+}
diff --git a/marklogic-client-api/src/test/java/com/marklogic/client/datamovement/filter/IncrementalWriteTest.java b/marklogic-client-api/src/test/java/com/marklogic/client/datamovement/filter/IncrementalWriteTest.java
diff --git a/test-app/src/main/ml-config/databases/content-database.json b/test-app/src/main/ml-config/databases/content-database.json