Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnore;
Expand All @@ -31,6 +32,7 @@
import org.springframework.ai.document.id.RandomIdGenerator;
import org.springframework.ai.model.Media;
import org.springframework.ai.model.MediaContent;
import org.springframework.lang.Nullable;
import org.springframework.util.Assert;
import org.springframework.util.StringUtils;

Expand Down Expand Up @@ -61,7 +63,15 @@ public class Document implements MediaContent {
* Metadata for the document. It should not be nested and values should be restricted
* to string, int, float, boolean for simple use with Vector Dbs.
*/
private Map<String, Object> metadata;
private final Map<String, Object> metadata;

/**
* Measure of similarity between the document embedding and the query vector. The
* higher the score, the more they are similar. It's the opposite of the distance
* measure.
*/
@Nullable
private Double score;

/**
* Embedding of the document. Note: ephemeral field.
Expand All @@ -80,31 +90,61 @@ public Document(@JsonProperty("content") String content) {
this(content, new HashMap<>());
}

/**
* @deprecated Use builder instead: {@link Document#builder()}.
*/
@Deprecated(since = "1.0.0-M5", forRemoval = true)
public Document(String content, Map<String, Object> metadata) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are probably many users out there using this constructor, and perhaps also the one with String id, String content, Map<String, Object> metadata args. Despite having the builder, maybe we should keep these ctors?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought of deprecating most of the constructors to make the call sites more readable. The problem is mostly with the other constructors that take 3 or more arguments. For example, there are 3 constructors that accept 3 arguments, but all different (with media and metadata very easy to mix up).

What if we keep only the two you mentioned? Or should I keep all of them?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have "undeprecated" the mentioned constructors.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, those two are good. Once we get to three arguments is when the ambiguity starts, so the builder should be preferred.

this(content, metadata, new RandomIdGenerator());
}

/**
* Creates a document from the given content, media and metadata. The id is obtained
* from a new {@link RandomIdGenerator}, which is handed the content and metadata.
* @param content the document content
* @param media the media attached to the document
* @param metadata the document metadata
* @deprecated Use builder instead: {@link Document#builder()}.
*/
@Deprecated(since = "1.0.0-M5", forRemoval = true)
public Document(String content, Collection<Media> media, Map<String, Object> metadata) {
this(new RandomIdGenerator().generateId(content, metadata), content, media, metadata);
}

/**
* Creates a document from the given content and metadata, using the supplied
* generator to produce the id from that same content and metadata.
* @param content the document content
* @param metadata the document metadata
* @param idGenerator the generator asked for the document id
* @deprecated Use builder instead: {@link Document#builder()}.
*/
@Deprecated(since = "1.0.0-M5", forRemoval = true)
public Document(String content, Map<String, Object> metadata, IdGenerator idGenerator) {
this(idGenerator.generateId(content, metadata), content, metadata);
}

/**
* Creates a document with the given id, content and metadata and an empty media
* list.
* @param id the document id
* @param content the document content
* @param metadata the document metadata
* @deprecated Use builder instead: {@link Document#builder()}.
*/
@Deprecated(since = "1.0.0-M5", forRemoval = true)
public Document(String id, String content, Map<String, Object> metadata) {
this(id, content, List.of(), metadata);
}

/**
* @deprecated Use builder instead: {@link Document#builder()}.
*/
@Deprecated(since = "1.0.0-M5", forRemoval = true)
public Document(String id, String content, Collection<Media> media, Map<String, Object> metadata) {
Assert.hasText(id, "id must not be null or empty");
Assert.notNull(content, "content must not be null");
Assert.notNull(metadata, "metadata must not be null");
this(id, content, media, metadata, null);
}

/**
* Creates a document from all of its components, asserting the arguments before
* storing them.
* @param id the document id; asserted to have text
* @param content the document content; asserted to be non-null
* @param media the media attached to the document; asserted to be non-null and free
* of null elements
* @param metadata the document metadata; asserted to be non-null with no null keys
* or values
* @param score the similarity score, may be {@code null}
*/
public Document(String id, String content, @Nullable Collection<Media> media,
@Nullable Map<String, Object> metadata, @Nullable Double score) {
Assert.hasText(id, "id cannot be null or empty");
Assert.notNull(content, "content cannot be null");
// NOTE(review): media and metadata are declared @Nullable and are defaulted to empty
// collections further down, yet the notNull assertions below reject null first, so
// that defaulting looks unreachable — confirm which contract is intended.
Assert.notNull(media, "media cannot be null");
Assert.noNullElements(media, "media cannot have null elements");
Assert.notNull(metadata, "metadata cannot be null");
Assert.noNullElements(metadata.keySet(), "metadata cannot have null keys");
Assert.noNullElements(metadata.values(), "metadata cannot have null values");

this.id = id;
this.content = content;
// NOTE(review): media and metadata are each assigned twice below; this looks like the
// removed and the added lines of the diff shown together — presumably only the
// null-defaulting pair belongs to the new version. TODO confirm.
this.media = media;
this.metadata = metadata;
this.media = media != null ? media : List.of();
this.metadata = metadata != null ? metadata : new HashMap<>();
this.score = score;
}

public static Builder builder() {
Expand Down Expand Up @@ -149,6 +189,15 @@ public Map<String, Object> getMetadata() {
return this.metadata;
}

/**
* Returns the score held by this document.
* @return the score, or {@code null} when none has been set
*/
@Nullable
public Double getScore() {
return this.score;
}

/**
* Replaces the score held by this document.
* @param score the new score, may be {@code null}
*/
public void setScore(@Nullable Double score) {
this.score = score;
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should the Document object be immutable? If we want to update a Document with a score, should we use a builder that takes the existing document and then call the 'score' builder method?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The post-retrieval steps in a RAG flow would all modify the Documents in some way, including the score, the content and the metadata. I guess we could create new instances on each hop to the next step in the flow. But we'd need several builders based on the type of field we change. Should I go with that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I made score a final field and introduced a mutate() method to build a new Document instance with the possibility to change the score. However, Document was not immutable to begin with (media and metadata are mutable). I have created a separate issue to look into that because it would be a breaking change and it would require lots of refactoring. #1838


/**
* Return the embedding that was calculated.
* @deprecated We are considering getting rid of this, please comment on
Expand Down Expand Up @@ -186,57 +235,24 @@ public void setContentFormatter(ContentFormatter contentFormatter) {

@Override
public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + ((this.id == null) ? 0 : this.id.hashCode());
result = prime * result + ((this.metadata == null) ? 0 : this.metadata.hashCode());
result = prime * result + ((this.content == null) ? 0 : this.content.hashCode());
return result;
return Objects.hash(id, content, media, metadata);
}

@Override
public boolean equals(Object obj) {
if (this == obj) {
public boolean equals(Object o) {
if (this == o)
return true;
}
if (obj == null) {
return false;
}
if (getClass() != obj.getClass()) {
return false;
}
Document other = (Document) obj;
if (this.id == null) {
if (other.id != null) {
return false;
}
}
else if (!this.id.equals(other.id)) {
return false;
}
if (this.metadata == null) {
if (other.metadata != null) {
return false;
}
}
else if (!this.metadata.equals(other.metadata)) {
if (o == null || getClass() != o.getClass())
return false;
}
if (this.content == null) {
if (other.content != null) {
return false;
}
}
else if (!this.content.equals(other.content)) {
return false;
}
return true;
Document document = (Document) o;
return Objects.equals(id, document.id) && Objects.equals(content, document.content)
&& Objects.equals(media, document.media) && Objects.equals(metadata, document.metadata);
}

@Override
public String toString() {
return "Document{" + "id='" + this.id + '\'' + ", metadata=" + this.metadata + ", content='" + this.content
+ '\'' + ", media=" + this.media + '}';
return "Document{" + "id='" + id + '\'' + ", content='" + content + '\'' + ", media=" + media + ", metadata="
+ metadata + ", score=" + score + '}';
}

public static class Builder {
Expand All @@ -249,56 +265,102 @@ public static class Builder {

private Map<String, Object> metadata = new HashMap<>();

private float[] embedding = new float[0];

private Double score;

private IdGenerator idGenerator = new RandomIdGenerator();

public Builder withIdGenerator(IdGenerator idGenerator) {
Assert.notNull(idGenerator, "idGenerator must not be null");
public Builder idGenerator(IdGenerator idGenerator) {
Assert.notNull(idGenerator, "idGenerator cannot be null");
this.idGenerator = idGenerator;
return this;
}

public Builder withId(String id) {
Assert.hasText(id, "id must not be null or empty");
public Builder id(String id) {
Assert.hasText(id, "id cannot be null or empty");
this.id = id;
return this;
}

public Builder withContent(String content) {
Assert.notNull(content, "content must not be null");
public Builder content(String content) {
this.content = content;
return this;
}

public Builder withMedia(List<Media> media) {
Assert.notNull(media, "media must not be null");
public Builder media(List<Media> media) {
this.media = media;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've updated it so that it adds to the existing list instead of replacing it. There are tests that assume it aggregates.

return this;
}

public Builder withMedia(Media media) {
Assert.notNull(media, "media must not be null");
this.media.add(media);
public Builder media(Media... media) {
Assert.noNullElements(media, "media cannot contain null elements");
this.media.addAll(List.of(media));
return this;
}

public Builder withMetadata(Map<String, Object> metadata) {
Assert.notNull(metadata, "metadata must not be null");
public Builder metadata(Map<String, Object> metadata) {
this.metadata = metadata;
return this;
}

public Builder withMetadata(String key, Object value) {
Assert.notNull(key, "key must not be null");
Assert.notNull(value, "value must not be null");
public Builder metadata(String key, Object value) {
this.metadata.put(key, value);
return this;
}

/**
* Stores the embedding on this builder.
* @param embedding the embedding values
* @return this builder
*/
public Builder embedding(float[] embedding) {
this.embedding = embedding;
return this;
}

/**
* Stores the score on this builder.
* @param score the score value
* @return this builder
*/
public Builder score(Double score) {
this.score = score;
return this;
}

/**
* Delegates to {@link #idGenerator(IdGenerator)}.
* @param idGenerator the id generator to use
* @return this builder
* @deprecated Use {@link #idGenerator(IdGenerator)} instead.
*/
@Deprecated(since = "1.0.0-M5", forRemoval = true)
public Builder withIdGenerator(IdGenerator idGenerator) {
return idGenerator(idGenerator);
}

/**
* Delegates to {@link #id(String)}.
* @param id the document id
* @return this builder
* @deprecated Use {@link #id(String)} instead.
*/
@Deprecated(since = "1.0.0-M5", forRemoval = true)
public Builder withId(String id) {
return id(id);
}

/**
* Delegates to {@link #content(String)}.
* @param content the document content
* @return this builder
* @deprecated Use {@link #content(String)} instead.
*/
@Deprecated(since = "1.0.0-M5", forRemoval = true)
public Builder withContent(String content) {
return content(content);
}

/**
* Delegates to {@link #media(List)}.
* @param media the media list
* @return this builder
* @deprecated Use {@link #media(List)} instead.
*/
@Deprecated(since = "1.0.0-M5", forRemoval = true)
public Builder withMedia(List<Media> media) {
return media(media);
}

/**
* Delegates to {@link #media(Media...)} with a single element.
* @param media the media item
* @return this builder
* @deprecated Use {@link #media(Media...)} instead.
*/
@Deprecated(since = "1.0.0-M5", forRemoval = true)
public Builder withMedia(Media media) {
return media(media);
}

/**
* Delegates to {@link #metadata(Map)}.
* @param metadata the metadata map
* @return this builder
* @deprecated Use {@link #metadata(Map)} instead.
*/
@Deprecated(since = "1.0.0-M5", forRemoval = true)
public Builder withMetadata(Map<String, Object> metadata) {
return metadata(metadata);
}

/**
* Delegates to {@link #metadata(String, Object)}.
* @param key the metadata key
* @param value the metadata value
* @return this builder
* @deprecated Use {@link #metadata(String, Object)} instead.
*/
@Deprecated(since = "1.0.0-M5", forRemoval = true)
public Builder withMetadata(String key, Object value) {
return metadata(key, value);
}

public Document build() {
if (!StringUtils.hasText(this.id)) {
this.id = this.idGenerator.generateId(this.content, this.metadata);
}
return new Document(this.id, this.content, this.media, this.metadata);
var document = new Document(this.id, this.content, this.media, this.metadata, this.score);
document.setEmbedding(this.embedding);
return document;
}

}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.springframework.ai.document;

import org.springframework.ai.vectorstore.VectorStore;

/**
* Common set of metadata keys used in {@link Document}s by {@link DocumentReader}s and
* {@link VectorStore}s.
*
* @author Thomas Vitale
* @since 1.0.0
*/
public enum DocumentMetadata {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The idea for this enum is to use it for other common metadata used in Documents, such as the "source file" or "page" when using a DocumentReader, helping the RAG flow traceability.


// @formatter:off

/**
* Measure of distance between the document embedding and the query vector.
* The lower the distance, the more they are similar.
* It's the opposite of the similarity score.
*/
DISTANCE("distance");
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I kept this metadata for backward compatibility, but we might consider removing it completely since we now have the "score" field in each Document (and "distance" is always the opposite value of "score").


private final String value;

/**
* Creates a metadata key constant backed by the given string.
* @param value the string form of the key
*/
DocumentMetadata(String value) {
this.value = value;
}
/**
* Returns the string this constant was created with.
* @return the string form of the key
*/
public String value() {
return this.value;
}

// @formatter:on

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
/*
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

@NonNullApi
@NonNullFields
package org.springframework.ai.document;

import org.springframework.lang.NonNullApi;
import org.springframework.lang.NonNullFields;
Loading