Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .copyrightconfig
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ startyear: 2023
# - Dotfiles already skipped automatically
# Enable by removing the leading '# ' from the next line and editing values.
# filesexcluded: third_party/*, docs/generated/*.md, assets/*.png, scripts/temp_*.py, vendor/lib.js
filesexcluded: .github/*, README.md, CONTRIBUTING.md, Jenkinsfile, gradle/*, docker-compose.yml, *.gradle, gradle.properties, gradlew, gradlew.bat, **/test/resources/**, docs/**
filesexcluded: .github/*, README.md, CONTRIBUTING.md, Jenkinsfile, gradle/*, docker-compose.yml, *.gradle, gradle.properties, gradlew, gradlew.bat, **/test/resources/**, docs/**, *.json
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,9 @@ public abstract class Options {
public static final String WRITE_EMBEDDER_EMBEDDING_NAME = WRITE_EMBEDDER_PREFIX + "embedding.name";

/**
* Allows for an optional namespace to be assigned to the embedding element in an XML chunk.
* Allows for an optional namespace to be assigned to the embedding element in an XML chunk. As of 3.0.0, if not
* set, then embedding elements in XML documents will be in the MarkLogic-specific vector namespace of
* "http://marklogic.com/vector".
*
* @since 2.5.0
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@ public interface Util {

String DEFAULT_XML_NAMESPACE = "http://marklogic.com/appservices/model";

/**
* Aligns with the default vector namespace in the server.
*
* @since 3.0.0
*/
String DEFAULT_VECTOR_NAMESPACE = "http://marklogic.com/vector";

static boolean hasOption(Map<String, String> properties, String... options) {
return Stream.of(options)
.anyMatch(option -> properties.get(option) != null && !properties.get(option).trim().isEmpty());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ public XmlChunkConfig(String embeddingNamespace) {
public XmlChunkConfig(String textExpression, String embeddingName, String embeddingNamespace, NamespaceContext namespaceContext, boolean base64EncodeVectors) {
this.textExpression = textExpression != null ? textExpression : DEFAULT_TEXT_EXPRESSION;
this.embeddingName = embeddingName != null ? embeddingName : "embedding";
this.embeddingNamespace = embeddingNamespace != null ? embeddingNamespace : Util.DEFAULT_XML_NAMESPACE;
this.embeddingNamespace = embeddingNamespace != null ? embeddingNamespace : Util.DEFAULT_VECTOR_NAMESPACE;
this.namespaceContext = namespaceContext;
this.base64EncodeVectors = base64EncodeVectors;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,18 +43,13 @@ public static class Builder {
private String documentType;
private String rootName;
private String xmlNamespace = Util.DEFAULT_XML_NAMESPACE;
private String embeddingXmlNamespace;
private String embeddingXmlNamespace = Util.DEFAULT_VECTOR_NAMESPACE;
private String uriPrefix;
private String uriSuffix;
private boolean base64EncodeVectors = false;

public ChunkConfig build() {
String tempNamespace = embeddingXmlNamespace;
if (tempNamespace == null) {
// If no embedding XML namespace is specified, default to the chunk namespace if one is defined, otherwise to the default XML namespace.
tempNamespace = xmlNamespace != null ? xmlNamespace : Util.DEFAULT_XML_NAMESPACE;
}
return new ChunkConfig(metadata, maxChunks, documentType, rootName, xmlNamespace, tempNamespace, uriPrefix, uriSuffix, base64EncodeVectors);
return new ChunkConfig(metadata, maxChunks, documentType, rootName, xmlNamespace, embeddingXmlNamespace, uriPrefix, uriSuffix, base64EncodeVectors);
}

public Builder withMetadata(DocumentMetadataHandle metadata) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ public interface NamespaceContextFactory {
/**
 * Creates a namespace context pre-populated with two prefix bindings:
 * "model" mapped to {@code Util.DEFAULT_XML_NAMESPACE} and "vec" mapped to
 * {@code Util.DEFAULT_VECTOR_NAMESPACE}.
 *
 * @return a new {@code XPathNamespaceContext} built from those two bindings
 */
static NamespaceContext makeDefaultNamespaceContext() {
    final Map<String, String> bindings = new HashMap<>();
    // The two entries are independent; a HashMap does not retain insertion order.
    bindings.put("vec", Util.DEFAULT_VECTOR_NAMESPACE);
    bindings.put("model", Util.DEFAULT_XML_NAMESPACE);
    return new XPathNamespaceContext(bindings);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ protected XmlNode readXmlDocument(String uri) {
// Registers frequently used namespaces in tests.
return readXmlDocument(uri,
Namespace.getNamespace("model", "http://marklogic.com/appservices/model"),
Namespace.getNamespace("vec", "http://marklogic.com/vector"),
Namespace.getNamespace("ex", "org:example"),
Namespace.getNamespace("acme", "org:acme"),
Namespace.getNamespace("xml", "http://www.w3.org/XML/1998/namespace"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ void xmlSidecarDocuments() {
XmlNode doc = readXmlDocument(uri);
doc.assertElementCount("/node()/model:chunks/model:chunk", 1);
String textValue = doc.getElementValue("/node()/model:chunks/model:chunk/model:text");
String embeddingValue = doc.getElementValue("/node()/model:chunks/model:chunk/model:embedding");
String embeddingValue = doc.getElementValue("/node()/model:chunks/model:chunk/vec:embedding");
if (!chunkEmbeddings.containsKey(textValue)) {
chunkEmbeddings.put(textValue, embeddingValue);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,8 +88,8 @@ void sidecarWithNamespace() {
doc.assertElementCount("/ex:sidecar/ex:chunks/ex:chunk", 4);
for (XmlNode chunk : doc.getXmlNodes("/ex:sidecar/ex:chunks/ex:chunk")) {
chunk.assertElementExists("/ex:chunk/ex:text");
chunk.assertElementExists("When a namespace is specified for the sidecar XML document, that should " +
"override the default namespace for the embedding element.", "/ex:chunk/ex:embedding");
chunk.assertElementExists("As of 3.0.0, the embedding namespace should be the MarkLogic-specific one, " +
"unless the user explicitly overrides it.", "/ex:chunk/vec:embedding");
}
}

Expand Down Expand Up @@ -164,8 +164,8 @@ void sidecarWithNoNamespace() {
XmlNode doc = readXmlDocument("/split-test.xml-chunks-1.xml");
doc.assertElementValue("/root/source-uri", "/split-test.xml");
doc.assertElementExists("/root/chunks/chunk[1]/text");
doc.assertElementExists("Since a namespace is specified for the document - no namespace - it should be " +
"applied to the embedding element too.", "/root/chunks/chunk[1]/embedding");
doc.assertElementExists("As of 3.0.0, the embedding element should default to the MarkLogic-specific " +
"namespace unless the user explicitly sets it.", "/root/chunks/chunk[1]/vec:embedding");
}

@Test
Expand All @@ -183,7 +183,7 @@ void customChunks() {

XmlNode doc = readXmlDocument("/split-test.xml");
doc.assertElementCount("Each of the 2 custom chunks should have an 'embedding' element.",
"/envelope/my-chunks/my-chunk[my-text and model:embedding]", 2);
"/envelope/my-chunks/my-chunk[my-text and vec:embedding]", 2);
}

@Test
Expand All @@ -201,8 +201,10 @@ void namespacedCustomChunks() {
.save();

XmlNode doc = readXmlDocument("/split-test.xml");
doc.assertElementCount("Each of the 2 custom chunks should have an 'embedding' element.",
"/ex:envelope/ex:my-chunks/ex:my-chunk[ex:my-text and model:embedding]", 2);
doc.assertElementCount("As of 3.0.0, the embedding element should default to the MarkLogic-specific " +
"namespace unless the user explicitly sets it. In this test, the chunks are in a custom namespace, " +
"but the embedding is not.",
"/ex:envelope/ex:my-chunks/ex:my-chunk[ex:my-text and vec:embedding]", 2);
}

@Test
Expand Down Expand Up @@ -299,8 +301,9 @@ private void verifyEachChunkOnDocumentHasAnEmbedding(String uri) {
doc.getXmlNodes("/node()/model:chunks/model:chunk").forEach(chunk -> {
chunk.assertElementExists("/model:chunk/model:text");
chunk.assertElementExists(
"As of the 2.7.0 release, the embedding should have the zxx lang to disable stemming by MarkLogic.",
"/model:chunk/model:embedding[@xml:lang='zxx']"
"As of the 2.7.0 release, the embedding should have the zxx lang to disable stemming by MarkLogic. " +
"And as of the 3.0.0 release, the embedding should default to the MarkLogic-specific vector namespace.",
"/model:chunk/vec:embedding[@xml:lang='zxx']"
);
});
}
Expand Down Expand Up @@ -391,12 +394,12 @@ private void verifyDocumentHasTwoChunksWithEncodedVectors(XmlNode doc) {
doc.assertElementCount("/root/model:chunks/model:chunk", 2);

for (XmlNode chunk : doc.getXmlNodes("/root/model:chunks/model:chunk")) {
String embeddingValue = chunk.getElementValue("/model:chunk/model:embedding");
String embeddingValue = chunk.getElementValue("/model:chunk/vec:embedding");
assertEquals("AAAAAAMAAADD9UhAH4XLP5qZKUA=", embeddingValue,
"Base64 encoded vector should match expected encoding for test vector [3.14, 1.59, 2.65]");

chunk.assertElementExists("xml:lang attribute should be 'zxx' to disable stemming",
"/model:chunk/model:embedding[@xml:lang='zxx']");
"/model:chunk/vec:embedding[@xml:lang='zxx']");
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
{
"name": "embedding",
"scalarType": "vector",
"val": "vec:vector(model:embedding)",
"val": "vec:vector(vec:embedding)",
"dimension": "384",
"invalidValues": "reject",
"nullable": true
Expand Down