Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
746a233
Introduce a new setting and a feature flag for the synthetic vectors …
jimczi Jul 1, 2025
309e4c3
Introduce a synthetic vectors source loader and a synthetic vectors f…
jimczi Jul 1, 2025
9bfea9c
Handle the new setting in search and get api to exclude vectors when …
jimczi Jul 1, 2025
6c584e9
handle recovery and translog when synthetic vectors is on
jimczi Jul 1, 2025
9357340
Add support for synthetic vectors in dense vector field mapper
jimczi Jul 1, 2025
4ff2123
Update docs/changelog/130382.yaml
jimczi Jul 1, 2025
883af2c
Merge branch 'main' into synthetic_vectors
jimczi Jul 1, 2025
62c98c3
Propagate feature flag where it's needed
jimczi Jul 1, 2025
ba65346
Merge remote-tracking branch 'origin/synthetic_vectors' into syntheti…
jimczi Jul 1, 2025
fa02743
add yml tests for partial updates and get API
jimczi Jul 1, 2025
e419b76
Merge remote-tracking branch 'upstream/main' into synthetic_vectors
jimczi Jul 1, 2025
56d7b75
fix propagation of leaf reader
jimczi Jul 1, 2025
52d9278
fix RcsCcsCommonYamlTestSuiteIT
jimczi Jul 1, 2025
1d64c86
add yaml tests with the fields option and patch the vectors as list t…
jimczi Jul 1, 2025
bacc3fe
Merge remote-tracking branch 'upstream/main' into synthetic_vectors
jimczi Jul 1, 2025
bf61ed2
Merge branch 'main' into synthetic_vectors
jimczi Jul 2, 2025
9b383d3
Merge remote-tracking branch 'upstream/main' into synthetic_vectors
jimczi Jul 2, 2025
6cdf89b
empty line
jimczi Jul 2, 2025
439dae1
Merge branch 'main' into synthetic_vectors
jimczi Jul 3, 2025
6554aee
Update 130382.yaml
jimczi Jul 4, 2025
27c3a3b
apply review comments
jimczi Jul 4, 2025
5845c27
Add comments and make the code more readable
jimczi Jul 4, 2025
26ab174
Merge remote-tracking branch 'upstream/main' into synthetic_vectors
jimczi Jul 4, 2025
efaf6bd
Merge branch 'main' into synthetic_vectors
jimczi Jul 4, 2025
f0cbd8b
Merge remote-tracking branch 'upstream/main' into synthetic_vectors
jimczi Jul 7, 2025
e494584
Merge remote-tracking branch 'origin/synthetic_vectors' into syntheti…
jimczi Jul 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/130382.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 130382
summary: Remove vectors from `_source` transparently
area: "Mapping, Vector Search"
type: enhancement
issues: []
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
setup:
- requires:
reason: 'synthetic vectors are required'
test_runner_features: [ capabilities ]
capabilities:
- method: GET
path: /_search
capabilities: [ synthetic_vectors_setting ]
- skip:
features: "headers"

- do:
indices.create:
index: test
body:
settings:
index.mapping.synthetic_vectors: true
mappings:
properties:
name:
type: keyword
vector:
type: dense_vector
dims: 5
similarity: l2_norm

nested:
type: nested
properties:
paragraph_id:
type: keyword
vector:
type: dense_vector
dims: 5
similarity: l2_norm

- do:
index:
index: test
id: "1"
body:
name: cow.jpg
vector: [36, 267, -311, 12, -202]

- do:
index:
index: test
id: "2"
body:
name: moose.jpg
nested:
- paragraph_id: 0
vector: [-0.5, 100.0, -13, 14.8, -156.0]
- paragraph_id: 2
vector: [0, 100.0, 0, 14.8, -156.0]
- paragraph_id: 3
vector: [0, 1.0, 0, 1.8, -15.0]

- do:
index:
index: test
id: "3"
body:
name: rabbit.jpg
vector: [-0.5, 100.0, -13, 14.8, -156.0]

- do:
index:
index: test
id: "4"
body:
name: zoolander.jpg
nested:
- paragraph_id: 0
vector: [ -0.5, 100.0, -13, 14.8, -156.0 ]
- paragraph_id: 1
- paragraph_id: 2
vector: [ -9.8, 109, 32, 14.8, 23 ]


- do:
indices.refresh: {}

---
"exclude synthetic vectors":
- do:
search:
index: test
body:
sort: ["name"]

- match: { hits.hits.0._id: "1"}
- match: { hits.hits.0._source.name: "cow.jpg"}
- not_exists: hits.hits.0._source.vector

- match: { hits.hits.1._id: "2"}
- match: { hits.hits.1._source.name: "moose.jpg"}
- length: { hits.hits.1._source.nested: 3 }
- not_exists: hits.hits.1._source.nested.0.vector
- match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
- not_exists: hits.hits.1._source.nested.1.vector
- match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
- not_exists: hits.hits.1._source.nested.2.vector
- match: { hits.hits.1._source.nested.2.paragraph_id: 3 }

- match: { hits.hits.2._id: "3" }
- match: { hits.hits.2._source.name: "rabbit.jpg" }
- not_exists: hits.hits.2._source.vector

- match: { hits.hits.3._id: "4" }
- match: { hits.hits.3._source.name: "zoolander.jpg" }
- length: { hits.hits.3._source.nested: 3 }
- not_exists: hits.hits.3._source.nested.0.vector
- match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
- match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
- not_exists: hits.hits.3._source.nested.2.vector
- match: { hits.hits.3._source.nested.2.paragraph_id: 2 }

---
"include synthetic vectors":
- do:
search:
index: test
body:
_source:
exclude_vectors: false
sort: ["name"]

- match: { hits.hits.0._id: "1"}
- match: { hits.hits.0._source.name: "cow.jpg"}
- exists: hits.hits.0._source.vector

- match: { hits.hits.1._id: "2"}
- match: { hits.hits.1._source.name: "moose.jpg"}
- length: { hits.hits.1._source.nested: 3 }
- exists: hits.hits.1._source.nested.0.vector
- match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
- exists: hits.hits.1._source.nested.1.vector
- match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
- exists: hits.hits.1._source.nested.2.vector
- match: { hits.hits.1._source.nested.2.paragraph_id: 3 }

- match: { hits.hits.2._id: "3" }
- match: { hits.hits.2._source.name: "rabbit.jpg" }
- exists: hits.hits.2._source.vector

- match: { hits.hits.3._id: "4" }
- match: { hits.hits.3._source.name: "zoolander.jpg" }
- length: { hits.hits.3._source.nested: 3 }
- exists: hits.hits.3._source.nested.0.vector
- match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
import java.util.Map;
import java.util.Set;

import static org.elasticsearch.index.IndexSettings.SYNTHETIC_VECTORS;

/**
* Encapsulates all valid index level settings.
* @see Property#IndexScope
Expand Down Expand Up @@ -240,6 +242,9 @@ public final class IndexScopedSettings extends AbstractScopedSettings {
if (IndexSettings.DOC_VALUES_SKIPPER) {
settings.add(IndexSettings.USE_DOC_VALUES_SKIPPER);
}
if (SYNTHETIC_VECTORS) {
settings.add(IndexSettings.INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING);
}
BUILT_IN_INDEX_SETTINGS = Collections.unmodifiableSet(settings);
};

Expand Down
10 changes: 9 additions & 1 deletion server/src/main/java/org/elasticsearch/index/IndexSettings.java
Original file line number Diff line number Diff line change
Expand Up @@ -847,6 +847,14 @@ private static String getIgnoreAboveDefaultValue(final Settings settings) {
Property.Final
);

public static final boolean SYNTHETIC_VECTORS = new FeatureFlag("mapping_synthetic_vectors").isEnabled();
public static final Setting<Boolean> INDEX_MAPPING_SOURCE_SYNTHETIC_VECTORS_SETTING = Setting.boolSetting(
"index.mapping.synthetic_vectors",
false,
Property.IndexScope,
Property.Final
);

private final Index index;
private final IndexVersion version;
private final Logger logger;
Expand Down Expand Up @@ -890,7 +898,7 @@ private static String getIgnoreAboveDefaultValue(final Settings settings) {
private final boolean logsdbRouteOnSortFields;
private final boolean logsdbSortOnHostName;
private final boolean logsdbAddHostNameField;

private volatile boolean searchExcludeVectors;
private volatile long retentionLeaseMillis;

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,16 @@
import org.elasticsearch.index.fieldvisitor.FieldsVisitor;
import org.elasticsearch.index.mapper.MapperService;
import org.elasticsearch.index.mapper.SourceFieldMapper;
import org.elasticsearch.index.mapper.SourceLoader;
import org.elasticsearch.index.mapper.SourceLoader.SyntheticVectorsLoader;
import org.elasticsearch.index.translog.Translog;
import org.elasticsearch.search.lookup.Source;
import org.elasticsearch.transport.Transports;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

/**
* A {@link Translog.Snapshot} from changes in a Lucene index
Expand All @@ -42,6 +47,8 @@ public final class LuceneChangesSnapshot extends SearchBasedChangesSnapshot {

private int storedFieldsReaderOrd = -1;
private StoredFieldsReader storedFieldsReader = null;
private final SyntheticVectorsLoader syntheticVectorPatchLoader;
private SyntheticVectorsLoader.Leaf syntheticVectorPatchLoaderLeaf;

private final Thread creationThread; // for assertion

Expand Down Expand Up @@ -76,6 +83,7 @@ public LuceneChangesSnapshot(
this.lastSeenSeqNo = fromSeqNo - 1;
final TopDocs topDocs = nextTopDocs();
this.maxDocIndex = topDocs.scoreDocs.length;
this.syntheticVectorPatchLoader = mapperService.mappingLookup().getMapping().syntheticVectorsLoader(null);
fillParallelArray(topDocs.scoreDocs, parallelArray);
}

Expand Down Expand Up @@ -218,7 +226,7 @@ private Translog.Operation readDocAsOp(int docIndex) throws IOException {
if (leaf.reader() instanceof SequentialStoredFieldsLeafReader) {
storedFieldsReader = ((SequentialStoredFieldsLeafReader) leaf.reader()).getSequentialStoredFieldsReader();
storedFieldsReaderOrd = leaf.ord;
setNextSourceMetadataReader(leaf);
setNextSyntheticFieldsReader(leaf);
} else {
storedFieldsReader = null;
storedFieldsReaderOrd = -1;
Expand All @@ -232,10 +240,12 @@ private Translog.Operation readDocAsOp(int docIndex) throws IOException {
assert storedFieldsReaderOrd == leaf.ord : storedFieldsReaderOrd + " != " + leaf.ord;
storedFieldsReader.document(segmentDocID, fields);
} else {
setNextSourceMetadataReader(leaf);
setNextSyntheticFieldsReader(leaf);
leaf.reader().storedFields().document(segmentDocID, fields);
}
final BytesReference source = fields.source() != null ? addSourceMetadata(fields.source(), segmentDocID) : null;
final BytesReference source = fields.source() != null && fields.source().length() > 0
? addSyntheticFields(Source.fromBytes(fields.source()), segmentDocID).internalSourceRef()
: fields.source();

final Translog.Operation op;
final boolean isTombstone = parallelArray.isTombStone[docIndex];
Expand Down Expand Up @@ -281,6 +291,27 @@ private Translog.Operation readDocAsOp(int docIndex) throws IOException {
return op;
}

@Override
protected void setNextSyntheticFieldsReader(LeafReaderContext context) throws IOException {
    // No synthetic-vector loader configured for this mapping: nothing to prepare.
    if (syntheticVectorPatchLoader == null) {
        return;
    }
    // Bind the per-leaf loader so subsequent addSyntheticFields calls can read
    // vector patches for documents in this segment.
    syntheticVectorPatchLoaderLeaf = syntheticVectorPatchLoader.leaf(context);
}

/**
 * Re-attaches synthetic vector values to the provided {@code _source} before
 * delegating to the superclass (which may add further synthetic fields, e.g.
 * inference metadata).
 *
 * <p>If no per-leaf patch loader is bound (see {@code setNextSyntheticFieldsReader})
 * or the document has no vector patches, the source is passed through unchanged.</p>
 *
 * @param source       the original source for the document
 * @param segmentDocID the segment-local document id used to load vector patches
 * @return the source with synthetic vectors applied, plus any fields added by the superclass
 * @throws IOException if loading the vector patches fails
 */
@Override
protected Source addSyntheticFields(Source source, int segmentDocID) throws IOException {
    if (syntheticVectorPatchLoaderLeaf == null) {
        return super.addSyntheticFields(source, segmentDocID);
    }
    List<SourceLoader.SyntheticVectorPatch> patches = new ArrayList<>();
    syntheticVectorPatchLoaderLeaf.load(segmentDocID, patches);
    // isEmpty() over size() == 0: idiomatic and O(1) regardless of list implementation.
    if (patches.isEmpty()) {
        return super.addSyntheticFields(source, segmentDocID);
    }
    var patchedSource = SourceLoader.applySyntheticVectors(source, patches);
    return super.addSyntheticFields(patchedSource, segmentDocID);
}

private static final class ParallelArray {
final LeafReaderContext[] leafReaderContexts;
final int[] docID;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ var record = documentRecords.get(j);
int[] nextDocIdArray = nextDocIds.toArray();
leafFieldLoader = storedFieldLoader.getLoader(leafReaderContext, nextDocIdArray);
leafSourceLoader = sourceLoader.leaf(leafReaderContext.reader(), nextDocIdArray);
setNextSourceMetadataReader(leafReaderContext);
setNextSyntheticFieldsReader(leafReaderContext);
}
int segmentDocID = docRecord.docID() - docBase;
leafFieldLoader.advanceTo(segmentDocID);
Expand Down Expand Up @@ -255,13 +255,13 @@ private Translog.Operation createOperation(
return null;
}
}
var sourceBytes = addSourceMetadata(sourceLoader.source(fieldLoader, segmentDocID).internalSourceRef(), segmentDocID);
var source = addSyntheticFields(sourceLoader.source(fieldLoader, segmentDocID), segmentDocID);
return new Translog.Index(
fieldLoader.id(),
docRecord.seqNo(),
docRecord.primaryTerm(),
docRecord.version(),
sourceBytes,
source.internalSourceRef(),
fieldLoader.routing(),
-1 // autogenerated timestamp
);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TopFieldCollectorManager;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.lucene.Lucene;
import org.elasticsearch.common.lucene.search.Queries;
import org.elasticsearch.core.IOUtils;
Expand Down Expand Up @@ -199,42 +198,40 @@ protected TopDocs nextTopDocs() throws IOException {
}

/**
* Sets the reader context to enable reading metadata that was removed from the {@code _source}.
* Sets the reader context to enable reading synthetic fields that were removed from the {@code _source}.
* This method sets up the {@code sourceMetadataFetcher} with the provided {@link LeafReaderContext},
* ensuring it is ready to fetch metadata for subsequent operations.
*
* <p>Note: This method should be called before {@link #addSourceMetadata(BytesReference, int)} at the start of every leaf
* <p>Note: This method should be called before {@link #addSyntheticFields(Source, int)} at the start of every leaf
* to ensure the metadata fetcher is properly initialized.</p>
*/
protected void setNextSourceMetadataReader(LeafReaderContext context) {
protected void setNextSyntheticFieldsReader(LeafReaderContext context) throws IOException {
if (sourceMetadataFetcher != null) {
sourceMetadataFetcher.setNextReader(context);
}
}

/**
* Creates a new {@link Source} object by combining the provided {@code originalSource}
* with additional metadata fields. If the {@code sourceMetadataFetcher} is null or no metadata
* with additional synthetic fields. If the {@code sourceMetadataFetcher} is null or no metadata
* fields are fetched, the original source is returned unchanged.
*
* @param originalSourceBytes the original source bytes
* @param originalSource the original source
* @param segmentDocID the document ID used to fetch metadata fields
* @return a new {@link Source} instance containing the original data and additional metadata,
* or the original source if no metadata is added
* @throws IOException if an error occurs while fetching metadata values
* @throws IOException if an error occurs while fetching synthetic values
*/
protected BytesReference addSourceMetadata(BytesReference originalSourceBytes, int segmentDocID) throws IOException {
protected Source addSyntheticFields(Source originalSource, int segmentDocID) throws IOException {
if (sourceMetadataFetcher == null) {
return originalSourceBytes;
return originalSource;
}
var originalSource = Source.fromBytes(originalSourceBytes);
List<Object> values = sourceMetadataFetcher.fetchValues(originalSource, segmentDocID, List.of());
if (values.isEmpty()) {
return originalSourceBytes;
return originalSource;
}
var map = originalSource.source();
map.put(InferenceMetadataFieldsMapper.NAME, values.get(0));
return Source.fromMap(map, originalSource.sourceContentType()).internalSourceRef();
originalSource.source().put(InferenceMetadataFieldsMapper.NAME, values.get(0));
return Source.fromMap(originalSource.source(), originalSource.sourceContentType());
}

static IndexSearcher newIndexSearcher(Engine.Searcher engineSearcher) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ public boolean assertSameIndexOperation(Translog.Index o1, Translog.Index o2) th
return true;
}
if (engineConfig.getIndexSettings().isRecoverySourceSyntheticEnabled()
|| engineConfig.getMapperService().mappingLookup().inferenceFields().isEmpty() == false) {
|| engineConfig.getMapperService().mappingLookup().inferenceFields().isEmpty() == false
|| engineConfig.getMapperService().mappingLookup().syntheticVectorFields().isEmpty() == false) {
return super.assertSameIndexOperation(synthesizeSource(engineConfig, o1), o2)
|| super.assertSameIndexOperation(o1, synthesizeSource(engineConfig, o2));
}
Expand Down
Loading
Loading