Skip to content

Commit 86f5aa7

Browse files
committed
MLE-26427 Initial exclusion support for XML
Error handling and unhappy-path cases will be tackled next. More refactoring remains to be done as well.
1 parent acdeb0c commit 86f5aa7

File tree

9 files changed

+352
-116
lines changed

9 files changed

+352
-116
lines changed

marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/ContentExclusionUtil.java

Lines changed: 76 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,35 @@
88
import com.fasterxml.jackson.databind.JsonNode;
99
import com.fasterxml.jackson.databind.ObjectMapper;
1010
import com.fasterxml.jackson.databind.node.ObjectNode;
11+
import com.marklogic.client.impl.XmlFactories;
1112
import org.slf4j.Logger;
1213
import org.slf4j.LoggerFactory;
14+
import org.w3c.dom.Document;
15+
import org.w3c.dom.Node;
16+
import org.w3c.dom.NodeList;
17+
18+
import javax.xml.namespace.QName;
19+
import javax.xml.parsers.DocumentBuilder;
20+
import javax.xml.transform.OutputKeys;
21+
import javax.xml.transform.Transformer;
22+
import javax.xml.transform.TransformerException;
23+
import javax.xml.transform.dom.DOMSource;
24+
import javax.xml.transform.stream.StreamResult;
25+
import javax.xml.xpath.XPath;
26+
import javax.xml.xpath.XPathConstants;
27+
import javax.xml.xpath.XPathExpression;
28+
import javax.xml.xpath.XPathExpressionException;
29+
import java.io.ByteArrayInputStream;
30+
import java.io.StringWriter;
31+
import java.nio.charset.StandardCharsets;
1332

1433
/**
1534
* Utility class for applying content exclusions to documents before hash calculation.
1635
* Supports removing specific paths from JSON and XML documents using JSON Pointer and XPath expressions.
1736
*
1837
* @since 8.1.0
1938
*/
20-
public class ContentExclusionUtil {
39+
class ContentExclusionUtil {
2140

2241
private static final Logger logger = LoggerFactory.getLogger(ContentExclusionUtil.class);
2342
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
@@ -31,7 +50,7 @@ public class ContentExclusionUtil {
3150
* @return the modified JSON content with specified paths removed
3251
* @throws JsonProcessingException if the JSON content cannot be parsed or serialized
3352
*/
34-
public static String applyJsonExclusions(String uri, String jsonContent, String[] jsonPointers) throws JsonProcessingException {
53+
static String applyJsonExclusions(String uri, String jsonContent, String[] jsonPointers) throws JsonProcessingException {
3554
if (jsonPointers == null || jsonPointers.length == 0) {
3655
return jsonContent;
3756
}
@@ -72,6 +91,59 @@ private static void removeNodeAtPointer(String uri, JsonNode rootNode, String js
7291
}
7392
}
7493

75-
// Future method for XML exclusions
76-
// public static String applyXmlExclusions(String xmlContent, String[] xpaths) { ... }
94+
/**
95+
* Applies XPath exclusions to XML content by removing the specified elements.
96+
*
97+
* @param uri the document URI (used for logging purposes)
98+
* @param xmlContent the XML content as a string
99+
* @param xpathExpressions array of XPath expressions identifying elements to exclude
100+
* @return the modified XML content with specified elements removed
101+
* @throws Exception if the XML content cannot be parsed or serialized
102+
*/
103+
static String applyXmlExclusions(String uri, String xmlContent, String... xpathExpressions) throws Exception {
104+
if (xpathExpressions == null || xpathExpressions.length == 0) {
105+
return xmlContent;
106+
}
107+
108+
DocumentBuilder builder = XmlFactories.getDocumentBuilderFactory().newDocumentBuilder();
109+
Document document = builder.parse(new ByteArrayInputStream(xmlContent.getBytes(StandardCharsets.UTF_8)));
110+
applyXmlExclusions(uri, document, xpathExpressions);
111+
return serializeDocument(document);
112+
}
113+
114+
private static void applyXmlExclusions(String uri, Document document, String[] xpathExpressions) {
115+
final XPath xpath = XmlFactories.getXPathFactory().newXPath();
116+
for (String xpathExpression : xpathExpressions) {
117+
try {
118+
XPathExpression expr = xpath.compile(xpathExpression);
119+
QName returnType = XPathConstants.NODESET;
120+
NodeList nodes = (NodeList) expr.evaluate(document, returnType);
121+
122+
if (nodes.getLength() == 0) {
123+
logger.debug("XPath '{}' does not match any nodes in document {}, skipping", xpathExpression, uri);
124+
continue;
125+
}
126+
127+
// Remove nodes in reverse order to avoid index issues
128+
for (int i = nodes.getLength() - 1; i >= 0; i--) {
129+
Node node = nodes.item(i);
130+
Node parent = node.getParentNode();
131+
if (parent != null) {
132+
parent.removeChild(node);
133+
}
134+
}
135+
} catch (XPathExpressionException e) {
136+
logger.warn("Invalid XPath expression '{}' for document {}: {}", xpathExpression, uri, e.getMessage());
137+
}
138+
}
139+
}
140+
141+
private static String serializeDocument(Document document) throws TransformerException {
142+
Transformer transformer = XmlFactories.newTransformer();
143+
transformer.setOutputProperty(OutputKeys.OMIT_XML_DECLARATION, "yes");
144+
transformer.setOutputProperty(OutputKeys.INDENT, "no");
145+
StringWriter writer = new StringWriter();
146+
transformer.transform(new DOMSource(document), new StreamResult(writer));
147+
return writer.toString();
148+
}
77149
}

marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteEvalFilter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ class IncrementalWriteEvalFilter extends IncrementalWriteFilter {
3131
""";
3232

3333
IncrementalWriteEvalFilter(String hashKeyName, String timestampKeyName, boolean canonicalizeJson,
34-
Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer, String[] jsonExclusions) {
35-
super(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions);
34+
Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer, String[] jsonExclusions, String[] xmlExclusions) {
35+
super(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions, xmlExclusions);
3636
}
3737

3838
@Override

marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteFilter.java

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ public static class Builder {
4646
private boolean useEvalQuery = false;
4747
private Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer;
4848
private String[] jsonExclusions;
49+
private String[] xmlExclusions;
4950

5051
/**
5152
* @param keyName the name of the MarkLogic metadata key that will hold the hash value; defaults to "incrementalWriteHash".
@@ -103,11 +104,20 @@ public Builder jsonExclusions(String... jsonPointers) {
103104
return this;
104105
}
105106

107+
/**
108+
* @param xpathExpressions XPath expressions identifying XML elements to exclude from hash calculation.
109+
* For example, "//timestamp" or "//metadata/lastModified".
110+
*/
111+
public Builder xmlExclusions(String... xpathExpressions) {
112+
this.xmlExclusions = xpathExpressions;
113+
return this;
114+
}
115+
106116
public IncrementalWriteFilter build() {
107117
if (useEvalQuery) {
108-
return new IncrementalWriteEvalFilter(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions);
118+
return new IncrementalWriteEvalFilter(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions, xmlExclusions);
109119
}
110-
return new IncrementalWriteOpticFilter(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions);
120+
return new IncrementalWriteOpticFilter(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions, xmlExclusions);
111121
}
112122
}
113123

@@ -116,17 +126,19 @@ public IncrementalWriteFilter build() {
116126
private final boolean canonicalizeJson;
117127
private final Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer;
118128
private final String[] jsonExclusions;
129+
private final String[] xmlExclusions;
119130

120131
// Hardcoding this for now, with a good general purpose hashing function.
121132
// See https://xxhash.com for benchmarks.
122133
private final LongHashFunction hashFunction = LongHashFunction.xx3();
123134

124-
public IncrementalWriteFilter(String hashKeyName, String timestampKeyName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer, String[] jsonExclusions) {
135+
public IncrementalWriteFilter(String hashKeyName, String timestampKeyName, boolean canonicalizeJson, Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer, String[] jsonExclusions, String[] xmlExclusions) {
125136
this.hashKeyName = hashKeyName;
126137
this.timestampKeyName = timestampKeyName;
127138
this.canonicalizeJson = canonicalizeJson;
128139
this.skippedDocumentsConsumer = skippedDocumentsConsumer;
129140
this.jsonExclusions = jsonExclusions;
141+
this.xmlExclusions = xmlExclusions;
130142
}
131143

132144
protected final DocumentWriteSet filterDocuments(Context context, Function<String, String> hashRetriever) {
@@ -178,7 +190,6 @@ private String serializeContent(DocumentWriteOperation doc) {
178190
JsonCanonicalizer jc;
179191
try {
180192
if (jsonExclusions != null && jsonExclusions.length > 0) {
181-
// TBD on error handling here, want to get XML supported first.
182193
content = ContentExclusionUtil.applyJsonExclusions(doc.getUri(), content, jsonExclusions);
183194
}
184195
jc = new JsonCanonicalizer(content);
@@ -190,6 +201,13 @@ private String serializeContent(DocumentWriteOperation doc) {
190201
logger.warn("Unable to canonicalize JSON content for URI {}, using original content for hashing; cause: {}",
191202
doc.getUri(), e.getMessage());
192203
}
204+
} else if (xmlExclusions != null && xmlExclusions.length > 0) {
205+
try {
206+
content = ContentExclusionUtil.applyXmlExclusions(doc.getUri(), content, xmlExclusions);
207+
} catch (Exception e) {
208+
logger.warn("Unable to apply XML exclusions for URI {}, using original content for hashing; cause: {}",
209+
doc.getUri(), e.getMessage());
210+
}
193211
}
194212

195213
return content;

marklogic-client-api/src/main/java/com/marklogic/client/datamovement/filter/IncrementalWriteOpticFilter.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,8 @@
2020
class IncrementalWriteOpticFilter extends IncrementalWriteFilter {
2121

2222
IncrementalWriteOpticFilter(String hashKeyName, String timestampKeyName, boolean canonicalizeJson,
23-
Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer, String[] jsonExclusions) {
24-
super(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions);
23+
Consumer<DocumentWriteOperation[]> skippedDocumentsConsumer, String[] jsonExclusions, String[] xmlExclusions) {
24+
super(hashKeyName, timestampKeyName, canonicalizeJson, skippedDocumentsConsumer, jsonExclusions, xmlExclusions);
2525
}
2626

2727
@Override

marklogic-client-api/src/main/java/com/marklogic/client/impl/XmlFactories.java

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,10 @@
1212
import javax.xml.stream.FactoryConfigurationError;
1313
import javax.xml.stream.XMLInputFactory;
1414
import javax.xml.stream.XMLOutputFactory;
15+
import javax.xml.transform.Transformer;
1516
import javax.xml.transform.TransformerConfigurationException;
1617
import javax.xml.transform.TransformerFactory;
18+
import javax.xml.xpath.XPathFactory;
1719
import java.lang.ref.SoftReference;
1820
import java.util.function.Supplier;
1921

@@ -27,6 +29,12 @@ public final class XmlFactories {
2729
private static final CachedInstancePerThreadSupplier<DocumentBuilderFactory> cachedDocumentBuilderFactory =
2830
new CachedInstancePerThreadSupplier<>(XmlFactories::makeNewDocumentBuilderFactory);
2931

32+
private static final CachedInstancePerThreadSupplier<XPathFactory> cachedXPathFactory =
33+
new CachedInstancePerThreadSupplier<>(XPathFactory::newInstance);
34+
35+
private static final CachedInstancePerThreadSupplier<TransformerFactory> cachedTransformerFactory =
36+
new CachedInstancePerThreadSupplier<>(XmlFactories::makeNewTransformerFactory);
37+
3038
private XmlFactories() {} // preventing instances of utility class
3139

3240
/**
@@ -152,6 +160,47 @@ public static XMLOutputFactory getOutputFactory() {
152160
return cachedOutputFactory.get();
153161
}
154162

163+
/**
164+
* Returns a shared {@link XPathFactory}.
165+
* <p>
166+
* Creating XML factories is potentially a pretty expensive operation. Using a shared instance helps to amortize
167+
* this initialization cost via reuse.
168+
*
169+
* @return a {@link XPathFactory}
170+
*
171+
* @since 8.1.0
172+
*/
173+
public static XPathFactory getXPathFactory() {
174+
return cachedXPathFactory.get();
175+
}
176+
177+
/**
178+
* Returns a shared {@link TransformerFactory} configured with secure defaults.
179+
* <p>
180+
* Creating XML factories is potentially a pretty expensive operation. Using a shared instance helps to amortize
181+
* this initialization cost via reuse.
182+
*
183+
* @return a securely configured {@link TransformerFactory}
184+
*
185+
* @since 8.1.0
186+
*/
187+
public static TransformerFactory getTransformerFactory() {
188+
return cachedTransformerFactory.get();
189+
}
190+
191+
/**
192+
* Creates a new {@link Transformer} from the shared {@link TransformerFactory}.
193+
*
194+
* @since 8.1.0
195+
*/
196+
public static Transformer newTransformer() {
197+
try {
198+
return getTransformerFactory().newTransformer();
199+
} catch (TransformerConfigurationException e) {
200+
throw new RuntimeException("Unable to create new Transformer from TransformerFactory", e);
201+
}
202+
}
203+
155204
/**
156205
* A supplier that caches results per thread.
157206
* <p>

marklogic-client-api/src/main/java/com/marklogic/client/io/DOMHandle.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,10 @@
1818
import javax.xml.namespace.QName;
1919
import javax.xml.parsers.DocumentBuilderFactory;
2020
import javax.xml.parsers.ParserConfigurationException;
21-
import javax.xml.xpath.*;
21+
import javax.xml.xpath.XPath;
22+
import javax.xml.xpath.XPathConstants;
23+
import javax.xml.xpath.XPathExpression;
24+
import javax.xml.xpath.XPathExpressionException;
2225
import java.io.*;
2326
import java.nio.charset.StandardCharsets;
2427

@@ -205,7 +208,7 @@ public void setFactory(DocumentBuilderFactory factory) {
205208
*/
206209
public XPath getXPathProcessor() {
207210
if (xpathProcessor == null)
208-
xpathProcessor = makeXPathProcessorFactory().newXPath();
211+
xpathProcessor = XmlFactories.getXPathFactory().newXPath();
209212
return xpathProcessor;
210213
}
211214
/**
@@ -216,9 +219,6 @@ public XPath getXPathProcessor() {
216219
public void setXPathProcessor(XPath xpathProcessor) {
217220
this.xpathProcessor = xpathProcessor;
218221
}
219-
protected XPathFactory makeXPathProcessorFactory() {
220-
return XPathFactory.newInstance();
221-
}
222222

223223
/**
224224
* Evaluate a string XPath expression against the retrieved document.
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
/*
2+
* Copyright (c) 2010-2026 Progress Software Corporation and/or its subsidiaries or affiliates. All Rights Reserved.
3+
*/
4+
package com.marklogic.client.datamovement.filter;
5+
6+
import com.fasterxml.jackson.databind.ObjectMapper;
7+
import com.marklogic.client.document.DocumentWriteOperation;
8+
import com.marklogic.client.io.DocumentMetadataHandle;
9+
import com.marklogic.client.test.AbstractClientTest;
10+
import com.marklogic.client.test.Common;
11+
import org.junit.jupiter.api.BeforeEach;
12+
13+
import java.util.ArrayList;
14+
import java.util.List;
15+
import java.util.concurrent.atomic.AtomicInteger;
16+
import java.util.concurrent.atomic.AtomicReference;
17+
18+
abstract class AbstractIncrementalWriteTest extends AbstractClientTest {
19+
20+
static final DocumentMetadataHandle METADATA = new DocumentMetadataHandle()
21+
.withCollections("incremental-test")
22+
.withPermission("rest-reader", DocumentMetadataHandle.Capability.READ, DocumentMetadataHandle.Capability.UPDATE);
23+
24+
AtomicInteger writtenCount = new AtomicInteger();
25+
AtomicInteger skippedCount = new AtomicInteger();
26+
AtomicReference<Throwable> batchFailure = new AtomicReference<>();
27+
ObjectMapper objectMapper = new ObjectMapper();
28+
29+
List<DocumentWriteOperation> docs = new ArrayList<>();
30+
IncrementalWriteFilter filter;
31+
32+
@BeforeEach
33+
void setup() {
34+
// Need a user with eval privileges so that the eval filter can be tested.
35+
Common.client = Common.newEvalClient();
36+
37+
// Default filter implementation, should be suitable for most tests.
38+
filter = IncrementalWriteFilter.newBuilder()
39+
.onDocumentsSkipped(docs -> skippedCount.addAndGet(docs.length))
40+
.build();
41+
}
42+
43+
final void writeDocs(List<DocumentWriteOperation> docs) {
44+
new WriteBatcherTemplate(Common.client).runWriteJob(
45+
writeBatcher -> writeBatcher
46+
.withDocumentWriteSetFilter(filter)
47+
.onBatchSuccess(batch -> writtenCount.addAndGet(batch.getItems().length))
48+
.onBatchFailure((batch, failure) -> batchFailure.set(failure)),
49+
50+
writeBatcher -> docs.forEach(writeBatcher::add)
51+
);
52+
}
53+
}

0 commit comments

Comments
 (0)