Skip to content

Commit 40b51c9

Browse files
committed
MLE-24374 Now using server-specific default namespace for embeddings
1 parent f02f1f2 commit 40b51c9

File tree

10 files changed

+32
-23
lines changed

10 files changed

+32
-23
lines changed

.copyrightconfig

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,4 @@ startyear: 2023
1111
# - Dotfiles already skipped automatically
1212
# Enable by removing the leading '# ' from the next line and editing values.
1313
# filesexcluded: third_party/*, docs/generated/*.md, assets/*.png, scripts/temp_*.py, vendor/lib.js
14-
filesexcluded: .github/*, README.md, CONTRIBUTING.md, Jenkinsfile, gradle/*, docker-compose.yml, *.gradle, gradle.properties, gradlew, gradlew.bat, **/test/resources/**, docs/**
14+
filesexcluded: .github/*, README.md, CONTRIBUTING.md, Jenkinsfile, gradle/*, docker-compose.yml, *.gradle, gradle.properties, gradlew, gradlew.bat, **/test/resources/**, docs/**, *.json

marklogic-spark-connector/src/main/java/com/marklogic/spark/Options.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -453,7 +453,9 @@ public abstract class Options {
453453
public static final String WRITE_EMBEDDER_EMBEDDING_NAME = WRITE_EMBEDDER_PREFIX + "embedding.name";
454454

455455
/**
456-
* Allows for an optional namespace to be assigned to the embedding element in an XML chunk.
456+
* Allows for an optional namespace to be assigned to the embedding element in an XML chunk. As of 3.0.0, if not
457+
* set, then embedding elements in XML documents will be in the MarkLogic-specific vector namespace of
458+
* "http://marklogic.com/vector".
457459
*
458460
* @since 2.5.0
459461
*/

marklogic-spark-connector/src/main/java/com/marklogic/spark/Util.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,13 @@ public interface Util {
2626

2727
String DEFAULT_XML_NAMESPACE = "http://marklogic.com/appservices/model";
2828

29+
/**
30+
* Aligns with the default vector namespace in the server.
31+
*
32+
* @since 3.0.0
33+
*/
34+
String DEFAULT_VECTOR_NAMESPACE = "http://marklogic.com/vector";
35+
2936
static boolean hasOption(Map<String, String> properties, String... options) {
3037
return Stream.of(options)
3138
.anyMatch(option -> properties.get(option) != null && !properties.get(option).trim().isEmpty());

marklogic-spark-connector/src/main/java/com/marklogic/spark/core/embedding/XmlChunkConfig.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ public XmlChunkConfig(String embeddingNamespace) {
3131
public XmlChunkConfig(String textExpression, String embeddingName, String embeddingNamespace, NamespaceContext namespaceContext, boolean base64EncodeVectors) {
3232
this.textExpression = textExpression != null ? textExpression : DEFAULT_TEXT_EXPRESSION;
3333
this.embeddingName = embeddingName != null ? embeddingName : "embedding";
34-
this.embeddingNamespace = embeddingNamespace != null ? embeddingNamespace : Util.DEFAULT_XML_NAMESPACE;
34+
this.embeddingNamespace = embeddingNamespace != null ? embeddingNamespace : Util.DEFAULT_VECTOR_NAMESPACE;
3535
this.namespaceContext = namespaceContext;
3636
this.base64EncodeVectors = base64EncodeVectors;
3737
}

marklogic-spark-connector/src/main/java/com/marklogic/spark/core/splitter/ChunkConfig.java

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -43,18 +43,13 @@ public static class Builder {
4343
private String documentType;
4444
private String rootName;
4545
private String xmlNamespace = Util.DEFAULT_XML_NAMESPACE;
46-
private String embeddingXmlNamespace;
46+
private String embeddingXmlNamespace = Util.DEFAULT_VECTOR_NAMESPACE;
4747
private String uriPrefix;
4848
private String uriSuffix;
4949
private boolean base64EncodeVectors = false;
5050

5151
public ChunkConfig build() {
52-
String tempNamespace = embeddingXmlNamespace;
53-
if (tempNamespace == null) {
54-
// If no embedding XML namespace is specified, default to the chunk namespace if one is defined.
55-
tempNamespace = xmlNamespace != null ? xmlNamespace : Util.DEFAULT_XML_NAMESPACE;
56-
}
57-
return new ChunkConfig(metadata, maxChunks, documentType, rootName, xmlNamespace, tempNamespace, uriPrefix, uriSuffix, base64EncodeVectors);
52+
return new ChunkConfig(metadata, maxChunks, documentType, rootName, xmlNamespace, embeddingXmlNamespace, uriPrefix, uriSuffix, base64EncodeVectors);
5853
}
5954

6055
public Builder withMetadata(DocumentMetadataHandle metadata) {

marklogic-spark-connector/src/main/java/com/marklogic/spark/dom/NamespaceContextFactory.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ public interface NamespaceContextFactory {
1515
static NamespaceContext makeDefaultNamespaceContext() {
1616
Map<String, String> prefixesToNamespaces = new HashMap<>();
1717
prefixesToNamespaces.put("model", Util.DEFAULT_XML_NAMESPACE);
18+
prefixesToNamespaces.put("vec", Util.DEFAULT_VECTOR_NAMESPACE);
1819
return new XPathNamespaceContext(prefixesToNamespaces);
1920
}
2021

marklogic-spark-connector/src/test/java/com/marklogic/spark/AbstractIntegrationTest.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ protected XmlNode readXmlDocument(String uri) {
185185
// Registers frequently used namespaces in tests.
186186
return readXmlDocument(uri,
187187
Namespace.getNamespace("model", "http://marklogic.com/appservices/model"),
188+
Namespace.getNamespace("vec", "http://marklogic.com/vector"),
188189
Namespace.getNamespace("ex", "org:example"),
189190
Namespace.getNamespace("acme", "org:acme"),
190191
Namespace.getNamespace("xml", "http://www.w3.org/XML/1998/namespace"),

marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsFromTextTest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ void xmlSidecarDocuments() {
7272
XmlNode doc = readXmlDocument(uri);
7373
doc.assertElementCount("/node()/model:chunks/model:chunk", 1);
7474
String textValue = doc.getElementValue("/node()/model:chunks/model:chunk/model:text");
75-
String embeddingValue = doc.getElementValue("/node()/model:chunks/model:chunk/model:embedding");
75+
String embeddingValue = doc.getElementValue("/node()/model:chunks/model:chunk/vec:embedding");
7676
if (!chunkEmbeddings.containsKey(textValue)) {
7777
chunkEmbeddings.put(textValue, embeddingValue);
7878
}

marklogic-spark-connector/src/test/java/com/marklogic/spark/writer/embedding/AddEmbeddingsToXmlTest.java

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,8 @@ void sidecarWithNamespace() {
8888
doc.assertElementCount("/ex:sidecar/ex:chunks/ex:chunk", 4);
8989
for (XmlNode chunk : doc.getXmlNodes("/ex:sidecar/ex:chunks/ex:chunk")) {
9090
chunk.assertElementExists("/ex:chunk/ex:text");
91-
chunk.assertElementExists("When a namespace is specified for the sidecar XML document, that should " +
92-
"override the default namespace for the embedding element.", "/ex:chunk/ex:embedding");
91+
chunk.assertElementExists("As of 3.0.0, the embedding namespace should be the MarkLogic-specific one, " +
92+
"unless the user explicitly overrides it.", "/ex:chunk/vec:embedding");
9393
}
9494
}
9595

@@ -164,8 +164,8 @@ void sidecarWithNoNamespace() {
164164
XmlNode doc = readXmlDocument("/split-test.xml-chunks-1.xml");
165165
doc.assertElementValue("/root/source-uri", "/split-test.xml");
166166
doc.assertElementExists("/root/chunks/chunk[1]/text");
167-
doc.assertElementExists("Since a namespace is specified for the document - no namespace - it should be " +
168-
"applied to the embedding element too.", "/root/chunks/chunk[1]/embedding");
167+
doc.assertElementExists("As of 3.0.0, the embedding element should default to the MarkLogic-specific " +
168+
"namespace unless the user explicitly sets it.", "/root/chunks/chunk[1]/vec:embedding");
169169
}
170170

171171
@Test
@@ -183,7 +183,7 @@ void customChunks() {
183183

184184
XmlNode doc = readXmlDocument("/split-test.xml");
185185
doc.assertElementCount("Each of the 2 custom chunks should have an 'embedding' element.",
186-
"/envelope/my-chunks/my-chunk[my-text and model:embedding]", 2);
186+
"/envelope/my-chunks/my-chunk[my-text and vec:embedding]", 2);
187187
}
188188

189189
@Test
@@ -201,8 +201,10 @@ void namespacedCustomChunks() {
201201
.save();
202202

203203
XmlNode doc = readXmlDocument("/split-test.xml");
204-
doc.assertElementCount("Each of the 2 custom chunks should have an 'embedding' element.",
205-
"/ex:envelope/ex:my-chunks/ex:my-chunk[ex:my-text and model:embedding]", 2);
204+
doc.assertElementCount("As of 3.0.0, the embedding element should default to the MarkLogic-specific " +
205+
"namespace unless the user explicitly sets it. In this test, the chunks are in a custom namespace, " +
206+
"but the embedding is not.",
207+
"/ex:envelope/ex:my-chunks/ex:my-chunk[ex:my-text and vec:embedding]", 2);
206208
}
207209

208210
@Test
@@ -299,8 +301,9 @@ private void verifyEachChunkOnDocumentHasAnEmbedding(String uri) {
299301
doc.getXmlNodes("/node()/model:chunks/model:chunk").forEach(chunk -> {
300302
chunk.assertElementExists("/model:chunk/model:text");
301303
chunk.assertElementExists(
302-
"As of the 2.7.0 release, the embedding should have the zxx lang to disable stemming by MarkLogic.",
303-
"/model:chunk/model:embedding[@xml:lang='zxx']"
304+
"As of the 2.7.0 release, the embedding should have the zxx lang to disable stemming by MarkLogic. " +
305+
"And as of the 3.0.0 release, the embedding should default to the MarkLogic-specific vector namespace.",
306+
"/model:chunk/vec:embedding[@xml:lang='zxx']"
304307
);
305308
});
306309
}
@@ -391,12 +394,12 @@ private void verifyDocumentHasTwoChunksWithEncodedVectors(XmlNode doc) {
391394
doc.assertElementCount("/root/model:chunks/model:chunk", 2);
392395

393396
for (XmlNode chunk : doc.getXmlNodes("/root/model:chunks/model:chunk")) {
394-
String embeddingValue = chunk.getElementValue("/model:chunk/model:embedding");
397+
String embeddingValue = chunk.getElementValue("/model:chunk/vec:embedding");
395398
assertEquals("AAAAAAMAAADD9UhAH4XLP5qZKUA=", embeddingValue,
396399
"Base64 encoded vector should match expected encoding for test vector [3.14, 1.59, 2.65]");
397400

398401
chunk.assertElementExists("xml:lang attribute should be 'zxx' to disable stemming",
399-
"/model:chunk/model:embedding[@xml:lang='zxx']");
402+
"/model:chunk/vec:embedding[@xml:lang='zxx']");
400403
}
401404
}
402405
}

test-app/src/main/ml-schemas-12/tde/xml-vector-chunks.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
{
2424
"name": "embedding",
2525
"scalarType": "vector",
26-
"val": "vec:vector(model:embedding)",
26+
"val": "vec:vector(vec:embedding)",
2727
"dimension": "384",
2828
"invalidValues": "reject",
2929
"nullable": true

0 commit comments

Comments
 (0)