Skip to content

Commit fc037eb

Browse files
committed
address review comments
1 parent 9e9e8b3 commit fc037eb

File tree

6 files changed

+126
-15
lines changed

6 files changed

+126
-15
lines changed

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/230_include_vectors_search.yml

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,3 +174,52 @@ setup:
174174
- match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
175175
- exists: hits.hits.3._source.nested.2.vector
176176
- match: { hits.hits.3._source.nested.2.paragraph_id: 2 }
177+
178+
---
179+
"exclude vectors with fields":
180+
- do:
181+
search:
182+
index: test
183+
body:
184+
_source:
185+
include_vectors: false
186+
sort: ["name"]
187+
fields: [vector, sparse_vector, nested.*]
188+
189+
- match: { hits.hits.0._id: "1"}
190+
- match: { hits.hits.0._source.name: "cow.jpg"}
191+
- not_exists: hits.hits.0._source.vector
192+
- exists: hits.hits.0.fields.vector
193+
194+
- match: { hits.hits.1._id: "2"}
195+
- match: { hits.hits.1._source.name: "moose.jpg"}
196+
- length: { hits.hits.1._source.nested: 3 }
197+
- not_exists: hits.hits.1._source.nested.0.vector
198+
- match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
199+
- not_exists: hits.hits.1._source.nested.1.vector
200+
- match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
201+
- not_exists: hits.hits.1._source.nested.2.vector
202+
- match: { hits.hits.1._source.nested.2.paragraph_id: 3 }
203+
204+
- match: { hits.hits.2._id: "3" }
205+
- match: { hits.hits.2._source.name: "rabbit.jpg" }
206+
- not_exists: hits.hits.2._source.vector
207+
- exists: hits.hits.2.fields.vector
208+
- not_exists: hits.hits.2._source.sparse_vector
209+
- exists: hits.hits.2.fields.sparse_vector
210+
211+
212+
- match: { hits.hits.3._id: "4" }
213+
- match: { hits.hits.3._source.name: "zoolander.jpg" }
214+
- length: { hits.hits.3._source.nested: 3 }
215+
- not_exists: hits.hits.3._source.nested.0.vector
216+
- exists: hits.hits.3.fields.nested.0.vector
217+
- not_exists: hits.hits.3._source.nested.0.sparse_vector
218+
- match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
219+
- exists: hits.hits.3.fields.nested.0.sparse_vector
220+
- not_exists: hits.hits.3._source.nested.1.sparse_vector
221+
- match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
222+
- exists: hits.hits.3.fields.nested.1.sparse_vector
223+
- not_exists: hits.hits.3._source.nested.2.vector
224+
- match: { hits.hits.3._source.nested.2.paragraph_id: 2 }
225+
- exists: hits.hits.3.fields.nested.2.vector

server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,15 @@ public boolean isDimension() {
195195
return false;
196196
}
197197

198+
/**
199+
* Vector embeddings are typically large and not intended for human consumption, so such fields may be excluded from responses.
200+
*
201+
* @return true if this field contains vector embeddings.
202+
*/
203+
public boolean isVectorEmbedding() {
204+
return false;
205+
}
206+
198207
/**
199208
* @return true if field has script values.
200209
*/

server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
import org.elasticsearch.index.mapper.BlockDocValuesReader;
5959
import org.elasticsearch.index.mapper.BlockLoader;
6060
import org.elasticsearch.index.mapper.BlockSourceReader;
61+
import org.elasticsearch.index.mapper.DocValueFetcher;
6162
import org.elasticsearch.index.mapper.DocumentParserContext;
6263
import org.elasticsearch.index.mapper.FieldMapper;
6364
import org.elasticsearch.index.mapper.MappedFieldType;
@@ -75,6 +76,7 @@
7576
import org.elasticsearch.index.query.SearchExecutionContext;
7677
import org.elasticsearch.search.DocValueFormat;
7778
import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
79+
import org.elasticsearch.search.fetch.StoredFieldsSpec;
7880
import org.elasticsearch.search.lookup.Source;
7981
import org.elasticsearch.search.vectors.DenseVectorQuery;
8082
import org.elasticsearch.search.vectors.ESDiversifyingChildrenByteKnnVectorQuery;
@@ -2285,6 +2287,13 @@ public ValueFetcher valueFetcher(SearchExecutionContext context, String format)
22852287
if (format != null) {
22862288
throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] doesn't support formats.");
22872289
}
2290+
if (context.getMappingLookup().isSourceSynthetic()) {
2291+
return new DocValueFetcher(
2292+
docValueFormat(null, null),
2293+
context.getForField(this, FielddataOperation.SEARCH),
2294+
StoredFieldsSpec.NO_REQUIREMENTS
2295+
);
2296+
}
22882297
return new ArraySourceValueFetcher(name(), context) {
22892298
@Override
22902299
protected Object parseSourceValue(Object value) {
@@ -2303,6 +2312,11 @@ public boolean isAggregatable() {
23032312
return false;
23042313
}
23052314

2315+
@Override
2316+
public boolean isVectorEmbedding() {
2317+
return true;
2318+
}
2319+
23062320
@Override
23072321
public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext) {
23082322
return elementType.fielddataBuilder(this, fieldDataContext);

server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,11 @@ public String typeName() {
119119
return CONTENT_TYPE;
120120
}
121121

122+
@Override
123+
public boolean isVectorEmbedding() {
124+
return true;
125+
}
126+
122127
@Override
123128
public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext) {
124129
throw new IllegalArgumentException("[sparse_vector] fields do not support sorting, scripting or aggregating");

server/src/main/java/org/elasticsearch/search/fetch/FetchPhase.java

Lines changed: 44 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,14 @@
1313
import org.apache.logging.log4j.Logger;
1414
import org.apache.lucene.index.LeafReaderContext;
1515
import org.apache.lucene.search.TotalHits;
16+
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
1617
import org.elasticsearch.common.bytes.BytesReference;
1718
import org.elasticsearch.common.regex.Regex;
1819
import org.elasticsearch.index.fieldvisitor.LeafStoredFieldLoader;
1920
import org.elasticsearch.index.fieldvisitor.StoredFieldLoader;
2021
import org.elasticsearch.index.mapper.IdLoader;
22+
import org.elasticsearch.index.mapper.MappedFieldType;
2123
import org.elasticsearch.index.mapper.SourceLoader;
22-
import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
23-
import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper;
2424
import org.elasticsearch.search.LeafNestedDocuments;
2525
import org.elasticsearch.search.NestedDocuments;
2626
import org.elasticsearch.search.SearchContextSourcePrinter;
@@ -122,7 +122,7 @@ private SearchHits buildSearchHits(SearchContext context, int[] docIdsToLoad, Pr
122122
// - Speed up retrieval of the synthetic source
123123
// Note: These vectors will no longer be accessible via _source for any sub-fetch processors,
124124
// but they are typically accessed through doc values instead (e.g., by the re-scorer).
125-
SourceFilter sourceFilter = maybeExcludeNonSemanticTextVectors(context);
125+
SourceFilter sourceFilter = maybeExcludeNonSemanticTextVectorFields(context);
126126
SourceLoader sourceLoader = context.newSourceLoader(sourceFilter);
127127
FetchContext fetchContext = new FetchContext(context, sourceLoader);
128128

@@ -461,24 +461,53 @@ private static boolean shouldExcludeVectorsFromSource(SearchContext context) {
461461
* unless vectors are explicitly requested to be included in the source.
462462
* Returns {@code null} when vectors should not be filtered out.
463463
*/
464-
private static SourceFilter maybeExcludeNonSemanticTextVectors(SearchContext context) {
464+
private static SourceFilter maybeExcludeNonSemanticTextVectorFields(SearchContext context) {
465465
if (shouldExcludeVectorsFromSource(context) == false) {
466466
return null;
467467
}
468468
var lookup = context.getSearchExecutionContext().getMappingLookup();
469-
List<String> inferencePatterns = lookup.inferenceFields().isEmpty()
470-
? null
471-
: lookup.inferenceFields().keySet().stream().map(f -> f + "*").toList();
472-
var excludes = lookup.getFullNameToFieldType()
473-
.values()
474-
.stream()
475-
.filter(
476-
f -> f instanceof DenseVectorFieldMapper.DenseVectorFieldType || f instanceof SparseVectorFieldMapper.SparseVectorFieldType
469+
var fetchFieldsAut = context.fetchFieldsContext() != null && context.fetchFieldsContext().fields().size() > 0
470+
? new CharacterRunAutomaton(
471+
Regex.simpleMatchToAutomaton(context.fetchFieldsContext().fields().stream().map(f -> f.field).toArray(String[]::new))
477472
)
473+
: null;
474+
var inferenceFieldsAut = lookup.inferenceFields().size() > 0
475+
? new CharacterRunAutomaton(
476+
Regex.simpleMatchToAutomaton(lookup.inferenceFields().keySet().stream().map(f -> f + "*").toArray(String[]::new))
477+
)
478+
: null;
479+
480+
List<String> lateExcludes = new ArrayList<>();
481+
var excludes = lookup.getFullNameToFieldType().values().stream().filter(MappedFieldType::isVectorEmbedding).filter(f -> {
482+
// Exclude the field specified by the `fields` option
483+
if (fetchFieldsAut != null && fetchFieldsAut.run(f.name())) {
484+
lateExcludes.add(f.name());
485+
return false;
486+
}
478487
// Leave vectors that belong to semantic text fields out of the excludes list, as they are processed separately
479-
.filter(f -> Regex.simpleMatch(inferencePatterns, f.name()) == false)
480-
.map(f -> f.name())
481-
.collect(Collectors.toList());
488+
return inferenceFieldsAut == null || inferenceFieldsAut.run(f.name()) == false;
489+
}).map(f -> f.name()).collect(Collectors.toList());
490+
491+
if (lateExcludes.size() > 0) {
492+
/**
493+
* Adds the vector fields requested through the `fields` option to the excludes list of the fetch source context.
494+
* This ensures that these vector fields remain available to the fetch sub-phases, but are excluded during the {@link FetchSourcePhase}.
495+
*/
496+
if (context.fetchSourceContext() != null && context.fetchSourceContext().excludes() != null) {
497+
for (var exclude : context.fetchSourceContext().excludes()) {
498+
lateExcludes.add(exclude);
499+
}
500+
}
501+
var fetchSourceContext = context.fetchSourceContext() == null
502+
? FetchSourceContext.of(true, false, null, lateExcludes.toArray(String[]::new))
503+
: FetchSourceContext.of(
504+
context.fetchSourceContext().fetchSource(),
505+
context.fetchSourceContext().includeVectors(),
506+
context.fetchSourceContext().includes(),
507+
lateExcludes.toArray(String[]::new)
508+
);
509+
context.fetchSourceContext(fetchSourceContext);
510+
}
482511
return excludes.isEmpty() ? null : new SourceFilter(new String[] {}, excludes.toArray(String[]::new));
483512
}
484513
}

x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapper.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,11 @@ public String typeName() {
172172
return CONTENT_TYPE;
173173
}
174174

175+
@Override
176+
public boolean isVectorEmbedding() {
177+
return true;
178+
}
179+
175180
@Override
176181
public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
177182
if (format != null) {

0 commit comments

Comments
 (0)