From 40b51c9172b9e54b1147d626b11dafdfdc22330e Mon Sep 17 00:00:00 2001 From: Rob Rudin Date: Fri, 26 Sep 2025 15:27:51 -0400 Subject: [PATCH] MLE-24374 Now using server-specific default namespace for embeddings --- .copyrightconfig | 2 +- .../java/com/marklogic/spark/Options.java | 4 ++- .../main/java/com/marklogic/spark/Util.java | 7 ++++++ .../spark/core/embedding/XmlChunkConfig.java | 2 +- .../spark/core/splitter/ChunkConfig.java | 9 ++----- .../spark/dom/NamespaceContextFactory.java | 1 + .../spark/AbstractIntegrationTest.java | 1 + .../embedding/AddEmbeddingsFromTextTest.java | 2 +- .../embedding/AddEmbeddingsToXmlTest.java | 25 +++++++++++-------- .../ml-schemas-12/tde/xml-vector-chunks.json | 2 +- 10 files changed, 32 insertions(+), 23 deletions(-) diff --git a/.copyrightconfig b/.copyrightconfig index 86775827..509012b6 100644 --- a/.copyrightconfig +++ b/.copyrightconfig @@ -11,4 +11,4 @@ startyear: 2023 # - Dotfiles already skipped automatically # Enable by removing the leading '# ' from the next line and editing values. # filesexcluded: third_party/*, docs/generated/*.md, assets/*.png, scripts/temp_*.py, vendor/lib.js -filesexcluded: .github/*, README.md, CONTRIBUTING.md, Jenkinsfile, gradle/*, docker-compose.yml, *.gradle, gradle.properties, gradlew, gradlew.bat, **/test/resources/**, docs/** +filesexcluded: .github/*, README.md, CONTRIBUTING.md, Jenkinsfile, gradle/*, docker-compose.yml, *.gradle, gradle.properties, gradlew, gradlew.bat, **/test/resources/**, docs/**, *.json diff --git a/marklogic-spark-connector/src/main/java/com/marklogic/spark/Options.java b/marklogic-spark-connector/src/main/java/com/marklogic/spark/Options.java index fd859228..3d85d467 100644 --- a/marklogic-spark-connector/src/main/java/com/marklogic/spark/Options.java +++ b/marklogic-spark-connector/src/main/java/com/marklogic/spark/Options.java @@ -453,7 +453,9 @@ public abstract class Options { public static final String WRITE_EMBEDDER_EMBEDDING_NAME = WRITE_EMBEDDER_PREFIX + "embedding.name"; /** - * Allows for an optional namespace to be assigned to the embedding element in an XML chunk. + * Allows for an optional namespace to be assigned to the embedding element in an XML chunk. As of 3.0.0, if not + * set, then embedding elements in XML documents will be in the MarkLogic-specific vector namespace of + * "http://marklogic.com/vector". * * @since 2.5.0 */ diff --git a/marklogic-spark-connector/src/main/java/com/marklogic/spark/Util.java b/marklogic-spark-connector/src/main/java/com/marklogic/spark/Util.java index e1f9bc9c..385bd938 100644 --- a/marklogic-spark-connector/src/main/java/com/marklogic/spark/Util.java +++ b/marklogic-spark-connector/src/main/java/com/marklogic/spark/Util.java @@ -26,6 +26,13 @@ public interface Util { String DEFAULT_XML_NAMESPACE = "http://marklogic.com/appservices/model"; + /** + * Aligns with the default vector namespace in the server. + * + * @since 3.0.0 + */ + String DEFAULT_VECTOR_NAMESPACE = "http://marklogic.com/vector"; + static boolean hasOption(Map properties, String... options) { return Stream.of(options) .anyMatch(option -> properties.get(option) != null && !properties.get(option).trim().isEmpty()); diff --git a/marklogic-spark-connector/src/main/java/com/marklogic/spark/core/embedding/XmlChunkConfig.java b/marklogic-spark-connector/src/main/java/com/marklogic/spark/core/embedding/XmlChunkConfig.java index e09e4e0d..907e9bda 100644 --- a/marklogic-spark-connector/src/main/java/com/marklogic/spark/core/embedding/XmlChunkConfig.java +++ b/marklogic-spark-connector/src/main/java/com/marklogic/spark/core/embedding/XmlChunkConfig.java @@ -31,7 +31,7 @@ public XmlChunkConfig(String embeddingNamespace) { public XmlChunkConfig(String textExpression, String embeddingName, String embeddingNamespace, NamespaceContext namespaceContext, boolean base64EncodeVectors) { this.textExpression = textExpression != null ? textExpression : DEFAULT_TEXT_EXPRESSION; this.embeddingName = embeddingName != null ? embeddingName : "embedding"; - this.embeddingNamespace = embeddingNamespace != null ? embeddingNamespace : Util.DEFAULT_XML_NAMESPACE; + this.embeddingNamespace = embeddingNamespace != null ? embeddingNamespace : Util.DEFAULT_VECTOR_NAMESPACE; this.namespaceContext = namespaceContext; this.base64EncodeVectors = base64EncodeVectors; } diff --git a/marklogic-spark-connector/src/main/java/com/marklogic/spark/core/splitter/ChunkConfig.java b/marklogic-spark-connector/src/main/java/com/marklogic/spark/core/splitter/ChunkConfig.java index 8b279fbe..69edd971 100644 --- a/marklogic-spark-connector/src/main/java/com/marklogic/spark/core/splitter/ChunkConfig.java +++ b/marklogic-spark-connector/src/main/java/com/marklogic/spark/core/splitter/ChunkConfig.java @@ -43,18 +43,13 @@ public static class Builder { private String documentType; private String rootName; private String xmlNamespace = Util.DEFAULT_XML_NAMESPACE; - private String embeddingXmlNamespace; + private String embeddingXmlNamespace = Util.DEFAULT_VECTOR_NAMESPACE; private String uriPrefix; private String uriSuffix; private boolean base64EncodeVectors = false; public ChunkConfig build() { - String tempNamespace = embeddingXmlNamespace; - if (tempNamespace == null) { - // If no embedding XML namespace is specified, default to the chunk namespace is defined. - tempNamespace = xmlNamespace != null ? xmlNamespace : Util.DEFAULT_XML_NAMESPACE; - } - return new ChunkConfig(metadata, maxChunks, documentType, rootName, xmlNamespace, tempNamespace, uriPrefix, uriSuffix, base64EncodeVectors); + return new ChunkConfig(metadata, maxChunks, documentType, rootName, xmlNamespace, embeddingXmlNamespace, uriPrefix, uriSuffix, base64EncodeVectors); } public Builder withMetadata(DocumentMetadataHandle metadata) { diff --git a/marklogic-spark-connector/src/main/java/com/marklogic/spark/dom/NamespaceContextFactory.java b/marklogic-spark-connector/src/main/java/com/marklogic/spark/dom/NamespaceContextFactory.java index 3b6a1f29..f2773587 100644 --- a/marklogic-spark-connector/src/main/java/com/marklogic/spark/dom/NamespaceContextFactory.java +++ b/marklogic-spark-connector/src/main/java/com/marklogic/spark/dom/NamespaceContextFactory.java @@ -15,6 +15,7 @@ public interface NamespaceContextFactory { static NamespaceContext makeDefaultNamespaceContext() { Map prefixesToNamespaces = new HashMap<>(); prefixesToNamespaces.put("model", Util.DEFAULT_XML_NAMESPACE); + prefixesToNamespaces.put("vec", Util.DEFAULT_VECTOR_NAMESPACE); return new XPathNamespaceContext(prefixesToNamespaces); } diff --git a/marklogic-spark-connector/src/test/java/com/marklogic/spark/AbstractIntegrationTest.java b/marklogic-spark-connector/src/test/java/com/marklogic/spark/AbstractIntegrationTest.java index 40a9f077..97757077 100644 --- a/marklogic-spark-connector/src/test/java/com/marklogic/spark/AbstractIntegrationTest.java +++ b/marklogic-spark-connector/src/test/java/com/marklogic/spark/AbstractIntegrationTest.java @@ -185,6 +185,7 @@ protected XmlNode readXmlDocument(String uri) { // Registers frequently used namespaces in tests. return readXmlDocument(uri, Namespace.getNamespace("model", "http://marklogic.com/appservices/model"), + Namespace.getNamespace("vec", "http://marklogic.com/vector"), Namespace.getNamespace("ex", "org:example"), Namespace.getNamespace("acme", "org:acme"), Namespace.getNamespace("xml", "http://www.w3.org/XML/1998/namespace"), diff --git a/marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsFromTextTest.java b/marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsFromTextTest.java index d0d818fb..35479a75 100644 --- a/marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsFromTextTest.java +++ b/marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsFromTextTest.java @@ -72,7 +72,7 @@ void xmlSidecarDocuments() { XmlNode doc = readXmlDocument(uri); doc.assertElementCount("/node()/model:chunks/model:chunk", 1); String textValue = doc.getElementValue("/node()/model:chunks/model:chunk/model:text"); - String embeddingValue = doc.getElementValue("/node()/model:chunks/model:chunk/model:embedding"); + String embeddingValue = doc.getElementValue("/node()/model:chunks/model:chunk/vec:embedding"); if (!chunkEmbeddings.containsKey(textValue)) { chunkEmbeddings.put(textValue, embeddingValue); } diff --git a/marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsToXmlTest.java b/marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsToXmlTest.java index 70f5ab72..209c48f0 100644 --- a/marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsToXmlTest.java +++ b/marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsToXmlTest.java @@ -88,8 +88,8 @@ void sidecarWithNamespace() { doc.assertElementCount("/ex:sidecar/ex:chunks/ex:chunk", 4); for (XmlNode chunk : doc.getXmlNodes("/ex:sidecar/ex:chunks/ex:chunk")) { chunk.assertElementExists("/ex:chunk/ex:text"); - chunk.assertElementExists("When a namespace is specified for the sidecar XML document, that should " + - "override the default namespace for the embedding element.", "/ex:chunk/ex:embedding"); + chunk.assertElementExists("As of 3.0.0, the embedding namespace should be the MarkLogic-specific one, " + + "unless the user explicitly overrides it.", "/ex:chunk/vec:embedding"); } } @@ -164,8 +164,8 @@ void sidecarWithNoNamespace() { XmlNode doc = readXmlDocument("/split-test.xml-chunks-1.xml"); doc.assertElementValue("/root/source-uri", "/split-test.xml"); doc.assertElementExists("/root/chunks/chunk[1]/text"); - doc.assertElementExists("Since a namespace is specified for the document - no namespace - it should be " + - "applied to the embedding element too.", "/root/chunks/chunk[1]/embedding"); + doc.assertElementExists("As of 3.0.0, the embedding element should default to the MarkLogic-specific " + + "namespace unless the user explicitly sets it.", "/root/chunks/chunk[1]/vec:embedding"); } @Test @@ -183,7 +183,7 @@ void customChunks() { XmlNode doc = readXmlDocument("/split-test.xml"); doc.assertElementCount("Each of the 2 custom chunks should have an 'embedding' element.", - "/envelope/my-chunks/my-chunk[my-text and model:embedding]", 2); + "/envelope/my-chunks/my-chunk[my-text and vec:embedding]", 2); } @Test @@ -201,8 +201,10 @@ void namespacedCustomChunks() { .save(); XmlNode doc = readXmlDocument("/split-test.xml"); - doc.assertElementCount("Each of the 2 custom chunks should have an 'embedding' element.", - "/ex:envelope/ex:my-chunks/ex:my-chunk[ex:my-text and model:embedding]", 2); + doc.assertElementCount("As of 3.0.0, the embedding element should default to the MarkLogic-specific " + + "namespace unless the user explicitly sets it. In this test, the chunks are in a custom namespace, " + + "but the embedding is not.", + "/ex:envelope/ex:my-chunks/ex:my-chunk[ex:my-text and vec:embedding]", 2); } @Test @@ -299,8 +301,9 @@ private void verifyEachChunkOnDocumentHasAnEmbedding(String uri) { doc.getXmlNodes("/node()/model:chunks/model:chunk").forEach(chunk -> { chunk.assertElementExists("/model:chunk/model:text"); chunk.assertElementExists( - "As of the 2.7.0 release, the embedding should have the zxx lang to disable stemming by MarkLogic.", - "/model:chunk/model:embedding[@xml:lang='zxx']" + "As of the 2.7.0 release, the embedding should have the zxx lang to disable stemming by MarkLogic. " + + "And as of the 3.0.0 release, the embedding should default to the MarkLogic-specific vector namespace.", + "/model:chunk/vec:embedding[@xml:lang='zxx']" ); }); } @@ -391,12 +394,12 @@ private void verifyDocumentHasTwoChunksWithEncodedVectors(XmlNode doc) { doc.assertElementCount("/root/model:chunks/model:chunk", 2); for (XmlNode chunk : doc.getXmlNodes("/root/model:chunks/model:chunk")) { - String embeddingValue = chunk.getElementValue("/model:chunk/model:embedding"); + String embeddingValue = chunk.getElementValue("/model:chunk/vec:embedding"); assertEquals("AAAAAAMAAADD9UhAH4XLP5qZKUA=", embeddingValue, "Base64 encoded vector should match expected encoding for test vector [3.14, 1.59, 2.65]"); chunk.assertElementExists("xml:lang attribute should be 'zxx' to disable stemming", - "/model:chunk/model:embedding[@xml:lang='zxx']"); + "/model:chunk/vec:embedding[@xml:lang='zxx']"); } } } diff --git a/test-app/src/main/ml-schemas-12/tde/xml-vector-chunks.json b/test-app/src/main/ml-schemas-12/tde/xml-vector-chunks.json index 386d2c65..c55e04ab 100644 --- a/test-app/src/main/ml-schemas-12/tde/xml-vector-chunks.json +++ b/test-app/src/main/ml-schemas-12/tde/xml-vector-chunks.json @@ -23,7 +23,7 @@ { "name": "embedding", "scalarType": "vector", - "val": "vec:vector(model:embedding)", + "val": "vec:vector(vec:embedding)", "dimension": "384", "invalidValues": "reject", "nullable": true