Skip to content

Commit 74182d5

Browse files
committed
Add option to include or exclude vectors from _source retrieval
This PR introduces a new include_vectors option to the _source retrieval context. When set to false, vectors are excluded from the returned _source. This is especially efficient when used with synthetic source, as it avoids loading vector fields entirely. By default, vectors remain included unless explicitly excluded.
1 parent b688080 commit 74182d5

File tree

7 files changed

+317
-17
lines changed

7 files changed

+317
-17
lines changed
Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
setup:
2+
- requires:
3+
reason: 'include_vectors option is required'
4+
test_runner_features: [ capabilities ]
5+
capabilities:
6+
- method: GET
7+
path: /_search
8+
capabilities: [ include_vectors_param ]
9+
- skip:
10+
features: "headers"
11+
12+
- do:
13+
indices.create:
14+
index: test
15+
body:
16+
mappings:
17+
properties:
18+
name:
19+
type: keyword
20+
sparse_vector:
21+
type: sparse_vector
22+
vector:
23+
type: dense_vector
24+
dims: 5
25+
similarity: l2_norm
26+
27+
nested:
28+
type: nested
29+
properties:
30+
paragraph_id:
31+
type: keyword
32+
vector:
33+
type: dense_vector
34+
dims: 5
35+
similarity: l2_norm
36+
sparse_vector:
37+
type: sparse_vector
38+
39+
- do:
40+
index:
41+
index: test
42+
id: "1"
43+
body:
44+
name: cow.jpg
45+
vector: [36, 267, -311, 12, -202]
46+
47+
- do:
48+
index:
49+
index: test
50+
id: "2"
51+
body:
52+
name: moose.jpg
53+
nested:
54+
- paragraph_id: 0
55+
vector: [-0.5, 100.0, -13, 14.8, -156.0]
56+
- paragraph_id: 2
57+
vector: [0, 100.0, 0, 14.8, -156.0]
58+
- paragraph_id: 3
59+
vector: [0, 1.0, 0, 1.8, -15.0]
60+
61+
- do:
62+
index:
63+
index: test
64+
id: "3"
65+
body:
66+
name: rabbit.jpg
67+
vector: [-0.5, 100.0, -13, 14.8, -156.0]
68+
sparse_vector:
69+
running: 3
70+
good: 17
71+
run: 22
72+
73+
- do:
74+
index:
75+
index: test
76+
id: "4"
77+
body:
78+
name: zoolander.jpg
79+
nested:
80+
- paragraph_id: 0
81+
vector: [ -0.5, 100.0, -13, 14.8, -156.0 ]
82+
sparse_vector:
83+
running: 3
84+
good: 17
85+
run: 22
86+
- paragraph_id: 1
87+
sparse_vector:
88+
modeling: 32
89+
model: 20
90+
mode: 54
91+
- paragraph_id: 2
92+
vector: [ -9.8, 109, 32, 14.8, 23 ]
93+
94+
95+
- do:
96+
indices.refresh: {}
97+
98+
---
99+
"exclude vectors":
100+
- do:
101+
search:
102+
index: test
103+
body:
104+
_source:
105+
include_vectors: false
106+
sort: ["name"]
107+
108+
- match: { hits.hits.0._id: "1"}
109+
- match: { hits.hits.0._source.name: "cow.jpg"}
110+
- not_exists: hits.hits.0._source.vector
111+
112+
- match: { hits.hits.1._id: "2"}
113+
- match: { hits.hits.1._source.name: "moose.jpg"}
114+
- length: { hits.hits.1._source.nested: 3 }
115+
- not_exists: hits.hits.1._source.nested.0.vector
116+
- match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
117+
- not_exists: hits.hits.1._source.nested.1.vector
118+
- match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
119+
- not_exists: hits.hits.1._source.nested.2.vector
120+
- match: { hits.hits.1._source.nested.2.paragraph_id: 3 }
121+
122+
- match: { hits.hits.2._id: "3" }
123+
- match: { hits.hits.2._source.name: "rabbit.jpg" }
124+
- not_exists: hits.hits.2._source.vector
125+
- not_exists: hits.hits.2._source.sparse_vector
126+
127+
- match: { hits.hits.3._id: "4" }
128+
- match: { hits.hits.3._source.name: "zoolander.jpg" }
129+
- length: { hits.hits.3._source.nested: 3 }
130+
- not_exists: hits.hits.3._source.nested.0.vector
131+
- not_exists: hits.hits.3._source.nested.0.sparse_vector
132+
- match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
133+
- not_exists: hits.hits.3._source.nested.1.sparse_vector
134+
- match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
135+
- not_exists: hits.hits.3._source.nested.2.vector
136+
- match: { hits.hits.3._source.nested.2.paragraph_id: 2 }
137+
138+
---
139+
"include vectors":
140+
- do:
141+
search:
142+
index: test
143+
body:
144+
_source:
145+
include_vectors: true
146+
sort: ["name"]
147+
148+
- match: { hits.hits.0._id: "1"}
149+
- match: { hits.hits.0._source.name: "cow.jpg"}
150+
- exists: hits.hits.0._source.vector
151+
152+
- match: { hits.hits.1._id: "2"}
153+
- match: { hits.hits.1._source.name: "moose.jpg"}
154+
- length: { hits.hits.1._source.nested: 3 }
155+
- exists: hits.hits.1._source.nested.0.vector
156+
- match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
157+
- exists: hits.hits.1._source.nested.1.vector
158+
- match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
159+
- exists: hits.hits.1._source.nested.2.vector
160+
- match: { hits.hits.1._source.nested.2.paragraph_id: 3 }
161+
162+
- match: { hits.hits.2._id: "3" }
163+
- match: { hits.hits.2._source.name: "rabbit.jpg" }
164+
- exists: hits.hits.2._source.vector
165+
- exists: hits.hits.2._source.sparse_vector
166+
167+
- match: { hits.hits.3._id: "4" }
168+
- match: { hits.hits.3._source.name: "zoolander.jpg" }
169+
- length: { hits.hits.3._source.nested: 3 }
170+
- exists: hits.hits.3._source.nested.0.vector
171+
- exists: hits.hits.3._source.nested.0.sparse_vector
172+
- match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
173+
- exists: hits.hits.3._source.nested.1.sparse_vector
174+
- match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
175+
- exists: hits.hits.3._source.nested.2.vector
176+
- match: { hits.hits.3._source.nested.2.paragraph_id: 2 }

server/src/main/java/org/elasticsearch/TransportVersions.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,7 @@ static TransportVersion def(int id) {
184184
public static final TransportVersion ML_INFERENCE_SAGEMAKER_CHAT_COMPLETION_8_19 = def(8_841_0_37);
185185
public static final TransportVersion ML_INFERENCE_VERTEXAI_CHATCOMPLETION_ADDED_8_19 = def(8_841_0_38);
186186
public static final TransportVersion INFERENCE_CUSTOM_SERVICE_ADDED_8_19 = def(8_841_0_39);
187+
public static final TransportVersion SEARCH_SOURCE_INCLUDE_VECTORS_PARAM_8_19 = def(8_841_0_40);
187188
public static final TransportVersion V_9_0_0 = def(9_000_0_09);
188189
public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_1 = def(9_000_0_10);
189190
public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_2 = def(9_000_0_11);
@@ -273,7 +274,7 @@ static TransportVersion def(int id) {
273274
public static final TransportVersion INFERENCE_CUSTOM_SERVICE_ADDED = def(9_084_0_00);
274275
public static final TransportVersion ESQL_LIMIT_ROW_SIZE = def(9_085_0_00);
275276
public static final TransportVersion ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY = def(9_086_0_00);
276-
277+
public static final TransportVersion SEARCH_SOURCE_INCLUDE_VECTORS_PARAM = def(9_087_0_00);
277278
/*
278279
* STOP! READ THIS FIRST! No, really,
279280
* ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _

server/src/main/java/org/elasticsearch/rest/action/search/SearchCapabilities.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ private SearchCapabilities() {}
5050

5151
private static final String SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB = "significant_terms_background_filter_as_sub";
5252

53+
private static final String INCLUDE_VECTORS_PARAM = "include_vectors_param";
54+
5355
public static final Set<String> CAPABILITIES;
5456
static {
5557
HashSet<String> capabilities = new HashSet<>();
@@ -69,6 +71,7 @@ private SearchCapabilities() {}
6971
capabilities.add(HIGHLIGHT_MAX_ANALYZED_OFFSET_DEFAULT);
7072
capabilities.add(INDEX_SELECTOR_SYNTAX);
7173
capabilities.add(SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB);
74+
capabilities.add(INCLUDE_VECTORS_PARAM);
7275
CAPABILITIES = Set.copyOf(capabilities);
7376
}
7477
}

server/src/main/java/org/elasticsearch/search/fetch/FetchContext.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,9 @@ private static FetchSourceContext buildFetchSourceContext(SearchContext in) {
6868
if (sfc != null && sfc.fetchFields()) {
6969
for (String field : sfc.fieldNames()) {
7070
if (SourceFieldMapper.NAME.equals(field)) {
71-
fsc = fsc == null ? FetchSourceContext.of(true) : FetchSourceContext.of(true, fsc.includes(), fsc.excludes());
71+
fsc = fsc == null
72+
? FetchSourceContext.of(true)
73+
: FetchSourceContext.of(true, fsc.includeVectors(), fsc.includes(), fsc.excludes());
7274
}
7375
}
7476
}

server/src/main/java/org/elasticsearch/search/fetch/FetchPhase.java

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,21 +14,26 @@
1414
import org.apache.lucene.index.LeafReaderContext;
1515
import org.apache.lucene.search.TotalHits;
1616
import org.elasticsearch.common.bytes.BytesReference;
17+
import org.elasticsearch.common.regex.Regex;
1718
import org.elasticsearch.index.fieldvisitor.LeafStoredFieldLoader;
1819
import org.elasticsearch.index.fieldvisitor.StoredFieldLoader;
1920
import org.elasticsearch.index.mapper.IdLoader;
2021
import org.elasticsearch.index.mapper.SourceLoader;
22+
import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
23+
import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper;
2124
import org.elasticsearch.search.LeafNestedDocuments;
2225
import org.elasticsearch.search.NestedDocuments;
2326
import org.elasticsearch.search.SearchContextSourcePrinter;
2427
import org.elasticsearch.search.SearchHit;
2528
import org.elasticsearch.search.SearchHits;
2629
import org.elasticsearch.search.SearchShardTarget;
2730
import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
31+
import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
2832
import org.elasticsearch.search.fetch.subphase.InnerHitsContext;
2933
import org.elasticsearch.search.fetch.subphase.InnerHitsPhase;
3034
import org.elasticsearch.search.internal.SearchContext;
3135
import org.elasticsearch.search.lookup.Source;
36+
import org.elasticsearch.search.lookup.SourceFilter;
3237
import org.elasticsearch.search.lookup.SourceProvider;
3338
import org.elasticsearch.search.profile.ProfileResult;
3439
import org.elasticsearch.search.profile.Profilers;
@@ -45,6 +50,7 @@
4550
import java.util.List;
4651
import java.util.Map;
4752
import java.util.function.Supplier;
53+
import java.util.stream.Collectors;
4854

4955
/**
5056
* Fetch phase of a search request, used to fetch the actual top matching documents to be returned to the client, identified
@@ -111,7 +117,13 @@ public Source getSource(LeafReaderContext ctx, int doc) {
111117
}
112118

113119
private SearchHits buildSearchHits(SearchContext context, int[] docIdsToLoad, Profiler profiler, RankDocShardInfo rankDocs) {
114-
SourceLoader sourceLoader = context.newSourceLoader(null);
120+
// Optionally remove sparse and dense vector fields early to:
121+
// - Reduce the in-memory size of the source
122+
// - Speed up retrieval of the synthetic source
123+
// Note: These vectors will no longer be accessible via _source for any sub-fetch processors,
124+
// but they are typically accessed through doc values instead (e.g: re-scorer).
125+
SourceFilter sourceFilter = maybeExcludeNonSemanticTextVectors(context);
126+
SourceLoader sourceLoader = context.newSourceLoader(sourceFilter);
115127
FetchContext fetchContext = new FetchContext(context, sourceLoader);
116128

117129
PreloadedSourceProvider sourceProvider = new PreloadedSourceProvider();
@@ -432,4 +444,39 @@ public String toString() {
432444
}
433445
};
434446
}
447+
448+
/**
449+
* Determines whether vector fields should be excluded from the source based on the {@link FetchSourceContext}.
450+
* Returns {@code true} if vector fields are explicitly marked to be excluded and {@code false} otherwise.
451+
*/
452+
private static boolean shouldExcludeVectorsFromSource(SearchContext context) {
453+
if (context.fetchSourceContext() == null) {
454+
return false;
455+
}
456+
return context.fetchSourceContext().includeVectors() != null && context.fetchSourceContext().includeVectors() == false;
457+
}
458+
459+
/**
460+
* Returns a {@link SourceFilter} that excludes vector fields not associated with semantic text fields,
461+
* unless vectors are explicitly requested to be included in the source.
462+
* Returns {@code null} when vectors should not be filtered out.
463+
*/
464+
private static SourceFilter maybeExcludeNonSemanticTextVectors(SearchContext context) {
465+
if (shouldExcludeVectorsFromSource(context)) {
466+
return null;
467+
}
468+
var lookup = context.getSearchExecutionContext().getMappingLookup();
469+
List<String> inferencePatterns = lookup.inferenceFields().isEmpty() ? null : lookup.inferenceFields().keySet().stream().toList();
470+
var excludes = lookup.getFullNameToFieldType()
471+
.values()
472+
.stream()
473+
.filter(
474+
f -> f instanceof DenseVectorFieldMapper.DenseVectorFieldType || f instanceof SparseVectorFieldMapper.SparseVectorFieldType
475+
)
476+
// Exclude vectors from semantic text fields, as they are processed separately
477+
.filter(f -> Regex.simpleMatch(inferencePatterns, f.name()) == false)
478+
.map(f -> f.name())
479+
.collect(Collectors.toList());
480+
return excludes.isEmpty() ? null : new SourceFilter(new String[] {}, excludes.toArray(String[]::new));
481+
}
435482
}

0 commit comments

Comments
 (0)