5 changes: 5 additions & 0 deletions docs/changelog/128735.yaml
@@ -0,0 +1,5 @@
pr: 128735
summary: Add option to include or exclude vectors from `_source` retrieval
area: Vector Search
type: feature
issues: []
@@ -0,0 +1,176 @@
setup:
  - requires:
      reason: 'include_vectors option is required'
      test_runner_features: [ capabilities ]
      capabilities:
        - method: GET
          path: /_search
          capabilities: [ include_vectors_param ]
  - skip:
      features: "headers"

  - do:
      indices.create:
        index: test
        body:
          mappings:
            properties:
              name:
                type: keyword
              sparse_vector:
                type: sparse_vector
              vector:
                type: dense_vector
                dims: 5
                similarity: l2_norm

              nested:
                type: nested
                properties:
                  paragraph_id:
                    type: keyword
                  vector:
                    type: dense_vector
                    dims: 5
                    similarity: l2_norm
                  sparse_vector:
                    type: sparse_vector

  - do:
      index:
        index: test
        id: "1"
        body:
          name: cow.jpg
          vector: [36, 267, -311, 12, -202]

  - do:
      index:
        index: test
        id: "2"
        body:
          name: moose.jpg
          nested:
            - paragraph_id: 0
              vector: [-0.5, 100.0, -13, 14.8, -156.0]
            - paragraph_id: 2
              vector: [0, 100.0, 0, 14.8, -156.0]
            - paragraph_id: 3
              vector: [0, 1.0, 0, 1.8, -15.0]

  - do:
      index:
        index: test
        id: "3"
        body:
          name: rabbit.jpg
          vector: [-0.5, 100.0, -13, 14.8, -156.0]
          sparse_vector:
            running: 3
            good: 17
            run: 22

  - do:
      index:
        index: test
        id: "4"
        body:
          name: zoolander.jpg
          nested:
            - paragraph_id: 0
              vector: [ -0.5, 100.0, -13, 14.8, -156.0 ]
              sparse_vector:
                running: 3
                good: 17
                run: 22
            - paragraph_id: 1
              sparse_vector:
                modeling: 32
                model: 20
                mode: 54
            - paragraph_id: 2
              vector: [ -9.8, 109, 32, 14.8, 23 ]


  - do:
      indices.refresh: {}

---
"exclude vectors":
  - do:
      search:
        index: test
        body:
          _source:
            include_vectors: false
          sort: ["name"]

  - match: { hits.hits.0._id: "1"}
  - match: { hits.hits.0._source.name: "cow.jpg"}
  - not_exists: hits.hits.0._source.vector

  - match: { hits.hits.1._id: "2"}
  - match: { hits.hits.1._source.name: "moose.jpg"}
  - length: { hits.hits.1._source.nested: 3 }
  - not_exists: hits.hits.1._source.nested.0.vector
  - match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
  - not_exists: hits.hits.1._source.nested.1.vector
  - match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
  - not_exists: hits.hits.1._source.nested.2.vector
  - match: { hits.hits.1._source.nested.2.paragraph_id: 3 }

  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - not_exists: hits.hits.2._source.vector
  - not_exists: hits.hits.2._source.sparse_vector

  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - not_exists: hits.hits.3._source.nested.0.vector
  - not_exists: hits.hits.3._source.nested.0.sparse_vector
  - match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
  - not_exists: hits.hits.3._source.nested.1.sparse_vector
  - match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
  - not_exists: hits.hits.3._source.nested.2.vector
  - match: { hits.hits.3._source.nested.2.paragraph_id: 2 }

---
"include vectors":
  - do:
      search:
        index: test
        body:
          _source:
            include_vectors: true
          sort: ["name"]

  - match: { hits.hits.0._id: "1"}
  - match: { hits.hits.0._source.name: "cow.jpg"}
  - exists: hits.hits.0._source.vector

  - match: { hits.hits.1._id: "2"}
  - match: { hits.hits.1._source.name: "moose.jpg"}
  - length: { hits.hits.1._source.nested: 3 }
  - exists: hits.hits.1._source.nested.0.vector
  - match: { hits.hits.1._source.nested.0.paragraph_id: 0 }
  - exists: hits.hits.1._source.nested.1.vector
  - match: { hits.hits.1._source.nested.1.paragraph_id: 2 }
  - exists: hits.hits.1._source.nested.2.vector
  - match: { hits.hits.1._source.nested.2.paragraph_id: 3 }

  - match: { hits.hits.2._id: "3" }
  - match: { hits.hits.2._source.name: "rabbit.jpg" }
  - exists: hits.hits.2._source.vector
  - exists: hits.hits.2._source.sparse_vector

  - match: { hits.hits.3._id: "4" }
  - match: { hits.hits.3._source.name: "zoolander.jpg" }
  - length: { hits.hits.3._source.nested: 3 }
  - exists: hits.hits.3._source.nested.0.vector
  - exists: hits.hits.3._source.nested.0.sparse_vector
  - match: { hits.hits.3._source.nested.0.paragraph_id: 0 }
  - exists: hits.hits.3._source.nested.1.sparse_vector
  - match: { hits.hits.3._source.nested.1.paragraph_id: 1 }
  - exists: hits.hits.3._source.nested.2.vector
  - match: { hits.hits.3._source.nested.2.paragraph_id: 2 }
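For reference, the behavior exercised by the "exclude vectors" test above corresponds to an ordinary search request whose `_source` section sets `include_vectors: false`. The sketch below issues that request with the low-level Java REST client; the `test` index name matches the test setup, while the localhost endpoint and client bootstrap are illustrative assumptions.

import org.apache.http.HttpHost;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;

public class ExcludeVectorsFromSourceExample {
    public static void main(String[] args) throws Exception {
        // Assumes a node is reachable on localhost:9200; adjust for your environment.
        try (RestClient client = RestClient.builder(new HttpHost("localhost", 9200, "http")).build()) {
            Request request = new Request("POST", "/test/_search");
            // Same body as the "exclude vectors" test: dense_vector and sparse_vector fields
            // are stripped from the returned _source, while other fields come back as usual.
            request.setJsonEntity("""
                {
                  "_source": { "include_vectors": false },
                  "sort": [ "name" ]
                }
                """);
            Response response = client.performRequest(request);
            System.out.println(EntityUtils.toString(response.getEntity()));
        }
    }
}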
@@ -184,6 +184,7 @@ static TransportVersion def(int id) {
public static final TransportVersion ML_INFERENCE_SAGEMAKER_CHAT_COMPLETION_8_19 = def(8_841_0_37);
public static final TransportVersion ML_INFERENCE_VERTEXAI_CHATCOMPLETION_ADDED_8_19 = def(8_841_0_38);
public static final TransportVersion INFERENCE_CUSTOM_SERVICE_ADDED_8_19 = def(8_841_0_39);
public static final TransportVersion SEARCH_SOURCE_INCLUDE_VECTORS_PARAM_8_19 = def(8_841_0_40);
public static final TransportVersion V_9_0_0 = def(9_000_0_09);
public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_1 = def(9_000_0_10);
public static final TransportVersion INITIAL_ELASTICSEARCH_9_0_2 = def(9_000_0_11);
@@ -273,7 +274,7 @@ static TransportVersion def(int id) {
public static final TransportVersion INFERENCE_CUSTOM_SERVICE_ADDED = def(9_084_0_00);
public static final TransportVersion ESQL_LIMIT_ROW_SIZE = def(9_085_0_00);
public static final TransportVersion ESQL_REGEX_MATCH_WITH_CASE_INSENSITIVITY = def(9_086_0_00);

public static final TransportVersion SEARCH_SOURCE_INCLUDE_VECTORS_PARAM = def(9_087_0_00);
/*
* STOP! READ THIS FIRST! No, really,
* ____ _____ ___ ____ _ ____ _____ _ ____ _____ _ _ ___ ____ _____ ___ ____ ____ _____ _
@@ -50,6 +50,8 @@ private SearchCapabilities() {}

private static final String SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB = "significant_terms_background_filter_as_sub";

private static final String INCLUDE_VECTORS_PARAM = "include_vectors_param";

public static final Set<String> CAPABILITIES;
static {
HashSet<String> capabilities = new HashSet<>();
@@ -69,6 +71,7 @@ private SearchCapabilities() {}
capabilities.add(HIGHLIGHT_MAX_ANALYZED_OFFSET_DEFAULT);
capabilities.add(INDEX_SELECTOR_SYNTAX);
capabilities.add(SIGNIFICANT_TERMS_BACKGROUND_FILTER_AS_SUB);
capabilities.add(INCLUDE_VECTORS_PARAM);
CAPABILITIES = Set.copyOf(capabilities);
}
}
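Registering `include_vectors_param` here is what the REST test's `requires`/`capabilities` block checks before it runs. A client can make the same probe; the sketch below is a rough illustration against the capabilities endpoint, with the `method`, `path`, and `capabilities` parameter names mirrored from the test's `requires` block rather than taken from the API reference, and the localhost endpoint assumed.

import org.apache.http.HttpHost;
import org.apache.http.util.EntityUtils;
import org.elasticsearch.client.Request;
import org.elasticsearch.client.Response;
import org.elasticsearch.client.RestClient;

public class IncludeVectorsCapabilityProbe {
    public static void main(String[] args) throws Exception {
        try (RestClient client = RestClient.builder(new HttpHost("localhost", 9200, "http")).build()) {
            // Ask whether GET /_search advertises the include_vectors_param capability.
            Request request = new Request("GET", "/_capabilities");
            request.addParameter("method", "GET");
            request.addParameter("path", "/_search");
            request.addParameter("capabilities", "include_vectors_param");
            Response response = client.performRequest(request);
            // Expect a small JSON body indicating whether the listed capabilities are supported.
            System.out.println(EntityUtils.toString(response.getEntity()));
        }
    }
}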
@@ -68,7 +68,9 @@ private static FetchSourceContext buildFetchSourceContext(SearchContext in) {
if (sfc != null && sfc.fetchFields()) {
for (String field : sfc.fieldNames()) {
if (SourceFieldMapper.NAME.equals(field)) {
fsc = fsc == null ? FetchSourceContext.of(true) : FetchSourceContext.of(true, fsc.includes(), fsc.excludes());
fsc = fsc == null
? FetchSourceContext.of(true)
: FetchSourceContext.of(true, fsc.includeVectors(), fsc.includes(), fsc.excludes());
}
}
}
@@ -14,21 +14,26 @@
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.search.TotalHits;
import org.elasticsearch.common.bytes.BytesReference;
import org.elasticsearch.common.regex.Regex;
import org.elasticsearch.index.fieldvisitor.LeafStoredFieldLoader;
import org.elasticsearch.index.fieldvisitor.StoredFieldLoader;
import org.elasticsearch.index.mapper.IdLoader;
import org.elasticsearch.index.mapper.SourceLoader;
import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper;
import org.elasticsearch.search.LeafNestedDocuments;
import org.elasticsearch.search.NestedDocuments;
import org.elasticsearch.search.SearchContextSourcePrinter;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.SearchHits;
import org.elasticsearch.search.SearchShardTarget;
import org.elasticsearch.search.fetch.FetchSubPhase.HitContext;
import org.elasticsearch.search.fetch.subphase.FetchSourceContext;
import org.elasticsearch.search.fetch.subphase.InnerHitsContext;
import org.elasticsearch.search.fetch.subphase.InnerHitsPhase;
import org.elasticsearch.search.internal.SearchContext;
import org.elasticsearch.search.lookup.Source;
import org.elasticsearch.search.lookup.SourceFilter;
import org.elasticsearch.search.lookup.SourceProvider;
import org.elasticsearch.search.profile.ProfileResult;
import org.elasticsearch.search.profile.Profilers;
@@ -45,6 +50,7 @@
import java.util.List;
import java.util.Map;
import java.util.function.Supplier;
import java.util.stream.Collectors;

/**
* Fetch phase of a search request, used to fetch the actual top matching documents to be returned to the client, identified
@@ -111,7 +117,13 @@ public Source getSource(LeafReaderContext ctx, int doc) {
}

private SearchHits buildSearchHits(SearchContext context, int[] docIdsToLoad, Profiler profiler, RankDocShardInfo rankDocs) {
SourceLoader sourceLoader = context.newSourceLoader(null);
// Optionally remove sparse and dense vector fields early to:
// - Reduce the in-memory size of the source
// - Speed up retrieval of the synthetic source
// Note: These vectors will no longer be accessible via _source for any sub-fetch processors,
// but they are typically accessed through doc values instead (e.g., by a rescorer).
SourceFilter sourceFilter = maybeExcludeNonSemanticTextVectors(context);
SourceLoader sourceLoader = context.newSourceLoader(sourceFilter);
FetchContext fetchContext = new FetchContext(context, sourceLoader);

PreloadedSourceProvider sourceProvider = new PreloadedSourceProvider();
@@ -432,4 +444,39 @@ public String toString() {
}
};
}

    /**
     * Determines whether vector fields should be excluded from the source based on the {@link FetchSourceContext}.
     * Returns {@code true} if vector fields are explicitly marked to be excluded and {@code false} otherwise.
     */
    private static boolean shouldExcludeVectorsFromSource(SearchContext context) {
        if (context.fetchSourceContext() == null) {
            return false;
        }
        return context.fetchSourceContext().includeVectors() != null && context.fetchSourceContext().includeVectors() == false;
    }

    /**
     * Returns a {@link SourceFilter} that excludes vector fields not associated with semantic text fields,
     * unless vectors are explicitly requested to be included in the source.
     * Returns {@code null} when vectors should not be filtered out.
     */
    private static SourceFilter maybeExcludeNonSemanticTextVectors(SearchContext context) {
        if (shouldExcludeVectorsFromSource(context) == false) {
            return null;
        }
        var lookup = context.getSearchExecutionContext().getMappingLookup();
        List<String> inferencePatterns = lookup.inferenceFields().isEmpty() ? null : lookup.inferenceFields().keySet().stream().toList();
        var excludes = lookup.getFullNameToFieldType()
            .values()
            .stream()
            .filter(
                f -> f instanceof DenseVectorFieldMapper.DenseVectorFieldType || f instanceof SparseVectorFieldMapper.SparseVectorFieldType
            )
            // Exclude vectors from semantic text fields, as they are processed separately
            .filter(f -> Regex.simpleMatch(inferencePatterns, f.name()) == false)
            .map(f -> f.name())
            .collect(Collectors.toList());
        return excludes.isEmpty() ? null : new SourceFilter(new String[] {}, excludes.toArray(String[]::new));
    }
}
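To make the intent of `maybeExcludeNonSemanticTextVectors` easier to see in isolation, here is a self-contained sketch of the same idea using plain Java collections: vector-typed fields are collected into an exclude list unless they fall under an inference (semantic_text) field pattern. The field names, the `semantic_field.inference*` pattern, and the naive prefix matcher standing in for `Regex.simpleMatch` are illustrative assumptions, not the production code path.

import java.util.List;
import java.util.Map;

public class VectorExclusionSketch {

    // Naive stand-in for Regex.simpleMatch: a trailing '*' is treated as a prefix wildcard.
    static boolean simpleMatch(List<String> patterns, String field) {
        if (patterns == null) {
            return false;
        }
        for (String pattern : patterns) {
            boolean matches = pattern.endsWith("*")
                ? field.startsWith(pattern.substring(0, pattern.length() - 1))
                : pattern.equals(field);
            if (matches) {
                return true;
            }
        }
        return false;
    }

    public static void main(String[] args) {
        // Illustrative field -> type mapping; the "semantic_field.inference..." entry plays the role
        // of vectors produced by a semantic_text field, which are handled separately.
        Map<String, String> fieldTypes = Map.of(
            "name", "keyword",
            "vector", "dense_vector",
            "sparse_vector", "sparse_vector",
            "semantic_field.inference.chunks.embeddings", "sparse_vector"
        );
        List<String> inferencePatterns = List.of("semantic_field.inference*");

        // Collect vector-typed fields that are not covered by an inference field pattern,
        // mirroring how the exclude list above is built from the mapping lookup.
        List<String> excludes = fieldTypes.entrySet().stream()
            .filter(e -> e.getValue().equals("dense_vector") || e.getValue().equals("sparse_vector"))
            .filter(e -> simpleMatch(inferencePatterns, e.getKey()) == false)
            .map(Map.Entry::getKey)
            .toList();

        // Prints "vector" and "sparse_vector" (iteration order of Map.of is unspecified).
        System.out.println(excludes);
    }
}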