-
Notifications
You must be signed in to change notification settings - Fork 2k
Support similarity scores in Document API #1794
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,6 +21,7 @@ | |
| import java.util.HashMap; | ||
| import java.util.List; | ||
| import java.util.Map; | ||
| import java.util.Objects; | ||
|
|
||
| import com.fasterxml.jackson.annotation.JsonCreator; | ||
| import com.fasterxml.jackson.annotation.JsonIgnore; | ||
|
|
@@ -31,6 +32,7 @@ | |
| import org.springframework.ai.document.id.RandomIdGenerator; | ||
| import org.springframework.ai.model.Media; | ||
| import org.springframework.ai.model.MediaContent; | ||
| import org.springframework.lang.Nullable; | ||
| import org.springframework.util.Assert; | ||
| import org.springframework.util.StringUtils; | ||
|
|
||
|
|
@@ -61,7 +63,15 @@ public class Document implements MediaContent { | |
| * Metadata for the document. It should not be nested and values should be restricted | ||
| * to string, int, float, boolean for simple use with Vector Dbs. | ||
| */ | ||
| private Map<String, Object> metadata; | ||
| private final Map<String, Object> metadata; | ||
|
|
||
| /** | ||
| * Measure of similarity between the document embedding and the query vector. The | ||
| * higher the score, the more they are similar. It's the opposite of the distance | ||
| * measure. | ||
| */ | ||
| @Nullable | ||
| private Double score; | ||
|
|
||
| /** | ||
| * Embedding of the document. Note: ephemeral field. | ||
|
|
@@ -80,31 +90,61 @@ public Document(@JsonProperty("content") String content) { | |
| this(content, new HashMap<>()); | ||
| } | ||
|
|
||
| /** | ||
| * @deprecated Use builder instead: {@link Document#builder()}. | ||
| */ | ||
| @Deprecated(since = "1.0.0-M5", forRemoval = true) | ||
| public Document(String content, Map<String, Object> metadata) { | ||
| this(content, metadata, new RandomIdGenerator()); | ||
| } | ||
|
|
||
| /** | ||
| * @deprecated Use builder instead: {@link Document#builder()}. | ||
| */ | ||
| @Deprecated(since = "1.0.0-M5", forRemoval = true) | ||
| public Document(String content, Collection<Media> media, Map<String, Object> metadata) { | ||
| this(new RandomIdGenerator().generateId(content, metadata), content, media, metadata); | ||
| } | ||
|
|
||
| /** | ||
| * @deprecated Use builder instead: {@link Document#builder()}. | ||
| */ | ||
| @Deprecated(since = "1.0.0-M5", forRemoval = true) | ||
| public Document(String content, Map<String, Object> metadata, IdGenerator idGenerator) { | ||
| this(idGenerator.generateId(content, metadata), content, metadata); | ||
| } | ||
|
|
||
| /** | ||
| * @deprecated Use builder instead: {@link Document#builder()}. | ||
| */ | ||
| @Deprecated(since = "1.0.0-M5", forRemoval = true) | ||
| public Document(String id, String content, Map<String, Object> metadata) { | ||
| this(id, content, List.of(), metadata); | ||
| } | ||
|
|
||
| /** | ||
| * @deprecated Use builder instead: {@link Document#builder()}. | ||
| */ | ||
| @Deprecated(since = "1.0.0-M5", forRemoval = true) | ||
| public Document(String id, String content, Collection<Media> media, Map<String, Object> metadata) { | ||
| Assert.hasText(id, "id must not be null or empty"); | ||
| Assert.notNull(content, "content must not be null"); | ||
| Assert.notNull(metadata, "metadata must not be null"); | ||
| this(id, content, media, metadata, null); | ||
| } | ||
|
|
||
| public Document(String id, String content, @Nullable Collection<Media> media, | ||
| @Nullable Map<String, Object> metadata, @Nullable Double score) { | ||
| Assert.hasText(id, "id cannot be null or empty"); | ||
| Assert.notNull(content, "content cannot be null"); | ||
| Assert.notNull(media, "media cannot be null"); | ||
| Assert.noNullElements(media, "media cannot have null elements"); | ||
| Assert.notNull(metadata, "metadata cannot be null"); | ||
| Assert.noNullElements(metadata.keySet(), "metadata cannot have null keys"); | ||
| Assert.noNullElements(metadata.values(), "metadata cannot have null values"); | ||
|
|
||
| this.id = id; | ||
| this.content = content; | ||
| this.media = media; | ||
| this.metadata = metadata; | ||
| this.media = media != null ? media : List.of(); | ||
| this.metadata = metadata != null ? metadata : new HashMap<>(); | ||
| this.score = score; | ||
| } | ||
|
|
||
| public static Builder builder() { | ||
|
|
@@ -149,6 +189,15 @@ public Map<String, Object> getMetadata() { | |
| return this.metadata; | ||
| } | ||
|
|
||
| @Nullable | ||
| public Double getScore() { | ||
| return this.score; | ||
| } | ||
|
|
||
| public void setScore(@Nullable Double score) { | ||
| this.score = score; | ||
| } | ||
|
||
|
|
||
| /** | ||
| * Return the embedding that were calculated. | ||
| * @deprecated We are considering getting rid of this, please comment on | ||
|
|
@@ -186,57 +235,24 @@ public void setContentFormatter(ContentFormatter contentFormatter) { | |
|
|
||
| @Override | ||
| public int hashCode() { | ||
| final int prime = 31; | ||
| int result = 1; | ||
| result = prime * result + ((this.id == null) ? 0 : this.id.hashCode()); | ||
| result = prime * result + ((this.metadata == null) ? 0 : this.metadata.hashCode()); | ||
| result = prime * result + ((this.content == null) ? 0 : this.content.hashCode()); | ||
| return result; | ||
| return Objects.hash(id, content, media, metadata); | ||
| } | ||
|
|
||
| @Override | ||
| public boolean equals(Object obj) { | ||
| if (this == obj) { | ||
| public boolean equals(Object o) { | ||
| if (this == o) | ||
| return true; | ||
| } | ||
| if (obj == null) { | ||
| return false; | ||
| } | ||
| if (getClass() != obj.getClass()) { | ||
| return false; | ||
| } | ||
| Document other = (Document) obj; | ||
| if (this.id == null) { | ||
| if (other.id != null) { | ||
| return false; | ||
| } | ||
| } | ||
| else if (!this.id.equals(other.id)) { | ||
| return false; | ||
| } | ||
| if (this.metadata == null) { | ||
| if (other.metadata != null) { | ||
| return false; | ||
| } | ||
| } | ||
| else if (!this.metadata.equals(other.metadata)) { | ||
| if (o == null || getClass() != o.getClass()) | ||
| return false; | ||
| } | ||
| if (this.content == null) { | ||
| if (other.content != null) { | ||
| return false; | ||
| } | ||
| } | ||
| else if (!this.content.equals(other.content)) { | ||
| return false; | ||
| } | ||
| return true; | ||
| Document document = (Document) o; | ||
| return Objects.equals(id, document.id) && Objects.equals(content, document.content) | ||
| && Objects.equals(media, document.media) && Objects.equals(metadata, document.metadata); | ||
| } | ||
|
|
||
| @Override | ||
| public String toString() { | ||
| return "Document{" + "id='" + this.id + '\'' + ", metadata=" + this.metadata + ", content='" + this.content | ||
| + '\'' + ", media=" + this.media + '}'; | ||
| return "Document{" + "id='" + id + '\'' + ", content='" + content + '\'' + ", media=" + media + ", metadata=" | ||
| + metadata + ", score=" + score + '}'; | ||
| } | ||
|
|
||
| public static class Builder { | ||
|
|
@@ -249,56 +265,102 @@ public static class Builder { | |
|
|
||
| private Map<String, Object> metadata = new HashMap<>(); | ||
|
|
||
| private float[] embedding = new float[0]; | ||
|
|
||
| private Double score; | ||
|
|
||
| private IdGenerator idGenerator = new RandomIdGenerator(); | ||
|
|
||
| public Builder withIdGenerator(IdGenerator idGenerator) { | ||
| Assert.notNull(idGenerator, "idGenerator must not be null"); | ||
| public Builder idGenerator(IdGenerator idGenerator) { | ||
| Assert.notNull(idGenerator, "idGenerator cannot be null"); | ||
| this.idGenerator = idGenerator; | ||
| return this; | ||
| } | ||
|
|
||
| public Builder withId(String id) { | ||
| Assert.hasText(id, "id must not be null or empty"); | ||
| public Builder id(String id) { | ||
| Assert.hasText(id, "id cannot be null or empty"); | ||
| this.id = id; | ||
| return this; | ||
| } | ||
|
|
||
| public Builder withContent(String content) { | ||
| Assert.notNull(content, "content must not be null"); | ||
| public Builder content(String content) { | ||
| this.content = content; | ||
| return this; | ||
| } | ||
|
|
||
| public Builder withMedia(List<Media> media) { | ||
| Assert.notNull(media, "media must not be null"); | ||
| public Builder media(List<Media> media) { | ||
| this.media = media; | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I've updated it so that it adds to the existing list instead of replacing it. There are tests that assume it aggregates. |
||
| return this; | ||
| } | ||
|
|
||
| public Builder withMedia(Media media) { | ||
| Assert.notNull(media, "media must not be null"); | ||
| this.media.add(media); | ||
| public Builder media(Media... media) { | ||
| Assert.noNullElements(media, "media cannot contain null elements"); | ||
| this.media.addAll(List.of(media)); | ||
| return this; | ||
| } | ||
|
|
||
| public Builder withMetadata(Map<String, Object> metadata) { | ||
| Assert.notNull(metadata, "metadata must not be null"); | ||
| public Builder metadata(Map<String, Object> metadata) { | ||
| this.metadata = metadata; | ||
| return this; | ||
| } | ||
|
|
||
| public Builder withMetadata(String key, Object value) { | ||
| Assert.notNull(key, "key must not be null"); | ||
| Assert.notNull(value, "value must not be null"); | ||
| public Builder metadata(String key, Object value) { | ||
| this.metadata.put(key, value); | ||
| return this; | ||
| } | ||
|
|
||
| public Builder embedding(float[] embedding) { | ||
| this.embedding = embedding; | ||
| return this; | ||
| } | ||
|
|
||
| public Builder score(Double score) { | ||
| this.score = score; | ||
| return this; | ||
| } | ||
|
|
||
| @Deprecated(since = "1.0.0-M5", forRemoval = true) | ||
| public Builder withIdGenerator(IdGenerator idGenerator) { | ||
| return idGenerator(idGenerator); | ||
| } | ||
|
|
||
| @Deprecated(since = "1.0.0-M5", forRemoval = true) | ||
| public Builder withId(String id) { | ||
| return id(id); | ||
| } | ||
|
|
||
| @Deprecated(since = "1.0.0-M5", forRemoval = true) | ||
| public Builder withContent(String content) { | ||
| return content(content); | ||
| } | ||
|
|
||
| @Deprecated(since = "1.0.0-M5", forRemoval = true) | ||
| public Builder withMedia(List<Media> media) { | ||
| return media(media); | ||
| } | ||
|
|
||
| @Deprecated(since = "1.0.0-M5", forRemoval = true) | ||
| public Builder withMedia(Media media) { | ||
| return media(media); | ||
| } | ||
|
|
||
| @Deprecated(since = "1.0.0-M5", forRemoval = true) | ||
| public Builder withMetadata(Map<String, Object> metadata) { | ||
| return metadata(metadata); | ||
| } | ||
|
|
||
| @Deprecated(since = "1.0.0-M5", forRemoval = true) | ||
| public Builder withMetadata(String key, Object value) { | ||
| return metadata(key, value); | ||
| } | ||
|
|
||
| public Document build() { | ||
| if (!StringUtils.hasText(this.id)) { | ||
| this.id = this.idGenerator.generateId(this.content, this.metadata); | ||
| } | ||
| return new Document(this.id, this.content, this.media, this.metadata); | ||
| var document = new Document(this.id, this.content, this.media, this.metadata, this.score); | ||
| document.setEmbedding(this.embedding); | ||
| return document; | ||
| } | ||
|
|
||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,50 @@ | ||
| /* | ||
| * Copyright 2023-2024 the original author or authors. | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * https://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.springframework.ai.document; | ||
|
|
||
| import org.springframework.ai.vectorstore.VectorStore; | ||
|
|
||
| /** | ||
| * Common set of metadata keys used in {@link Document}s by {@link DocumentReader}s and | ||
| * {@link VectorStore}s. | ||
| * | ||
| * @author Thomas Vitale | ||
| * @since 1.0.0 | ||
| */ | ||
| public enum DocumentMetadata { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The idea for this enum is to use it for other common metadata used in Documents, such as the "source file" or "page" when using a |
||
|
|
||
| // @formatter:off | ||
|
|
||
| /** | ||
| * Measure of distance between the document embedding and the query vector. | ||
| * The lower the distance, the more they are similar. | ||
| * It's the opposite of the similarity score. | ||
| */ | ||
| DISTANCE("distance"); | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I kept this metadata for backward compatibility, but we might consider removing it completely since we now have the "score" field in each Document (and "distance" is always the opposite value of "score"). |
||
|
|
||
| private final String value; | ||
|
|
||
| DocumentMetadata(String value) { | ||
| this.value = value; | ||
| } | ||
| public String value() { | ||
| return this.value; | ||
| } | ||
|
|
||
| // @formatter:on | ||
|
|
||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,22 @@ | ||
| /* | ||
| * Copyright 2023-2024 the original author or authors. | ||
| * | ||
| * Licensed under the Apache License, Version 2.0 (the "License"); | ||
| * you may not use this file except in compliance with the License. | ||
| * You may obtain a copy of the License at | ||
| * | ||
| * https://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| @NonNullApi | ||
| @NonNullFields | ||
| package org.springframework.ai.document; | ||
|
|
||
| import org.springframework.lang.NonNullApi; | ||
| import org.springframework.lang.NonNullFields; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are probably many users out there using this constructor, and perhaps the one with
`String id, String content, Map<String, Object> metadata` args. Despite having the builder, maybe we keep these ctors? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought of deprecating most of the constructors to make the callers more readable; the problem is mostly with the other constructors that take 3 or more arguments. For example, there are 3 constructors that accept 3 arguments, but all different (with media and metadata very easy to mix up).
What if we keep only the two you mentioned? Or should I keep all of them?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have "undeprecated" the mentioned constructors.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, the two are good; once we get to three is when the ambiguity starts, so the builder should be preferred.