address review comments

jimczi · jimczi · commit fc037eb22f85 · 2025-06-06T11:11:38.000+01:00
diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/230_include_vectors_search.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/search.vectors/230_include_vectors_search.yml
@@ -174,3 +174,52 @@ setup:
   - match:      { hits.hits.3._source.nested.1.paragraph_id: 1 }
   - exists:       hits.hits.3._source.nested.2.vector
   - match:      { hits.hits.3._source.nested.2.paragraph_id: 2 }
+
+---
+"exclude vectors with fields":
+  - do:
+      search:
+        index: test
+        body:
+          _source:
+            include_vectors: false
+          sort: ["name"]
+          fields: [vector, sparse_vector, nested.*]
+
+  - match:      { hits.hits.0._id: "1"}
+  - match:      { hits.hits.0._source.name: "cow.jpg"}
+  - not_exists:   hits.hits.0._source.vector
+  - exists:       hits.hits.0.fields.vector
+
+  - match:      { hits.hits.1._id: "2"}
+  - match:      { hits.hits.1._source.name: "moose.jpg"}
+  - length:     { hits.hits.1._source.nested: 3 }
+  - not_exists:   hits.hits.1._source.nested.0.vector
+  - match:      { hits.hits.1._source.nested.0.paragraph_id: 0 }
+  - not_exists:   hits.hits.1._source.nested.1.vector
+  - match:      { hits.hits.1._source.nested.1.paragraph_id: 2 }
+  - not_exists:   hits.hits.1._source.nested.2.vector
+  - match:      { hits.hits.1._source.nested.2.paragraph_id: 3 }
+
+  - match:      { hits.hits.2._id: "3" }
+  - match:      { hits.hits.2._source.name: "rabbit.jpg" }
+  - not_exists:   hits.hits.2._source.vector
+  - exists:       hits.hits.2.fields.vector
+  - not_exists:   hits.hits.2._source.sparse_vector
+  - exists:       hits.hits.2.fields.sparse_vector
+
+
+  - match:      { hits.hits.3._id: "4" }
+  - match:      { hits.hits.3._source.name: "zoolander.jpg" }
+  - length:     { hits.hits.3._source.nested: 3 }
+  - not_exists:   hits.hits.3._source.nested.0.vector
+  - exists:       hits.hits.3.fields.nested.0.vector
+  - not_exists:   hits.hits.3._source.nested.0.sparse_vector
+  - match:      { hits.hits.3._source.nested.0.paragraph_id: 0 }
+  - exists:       hits.hits.3.fields.nested.0.sparse_vector
+  - not_exists:   hits.hits.3._source.nested.1.sparse_vector
+  - match:      { hits.hits.3._source.nested.1.paragraph_id: 1 }
+  - exists:       hits.hits.3.fields.nested.1.sparse_vector
+  - not_exists:   hits.hits.3._source.nested.2.vector
+  - match:      { hits.hits.3._source.nested.2.paragraph_id: 2 }
+  - exists:       hits.hits.3.fields.nested.2.vector
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java b/server/src/main/java/org/elasticsearch/index/mapper/MappedFieldType.java
@@ -195,6 +195,15 @@ public boolean isDimension() {
         return false;
     }
 
+    /**
+     * Vector embeddings are typically large and not intended for human consumption, so such fields may be excluded from responses.
+     *
+     * @return true if this field contains vector embeddings.
+     */
+    public boolean isVectorEmbedding() {
+        return false;
+    }
+
     /**
      * @return true if field has script values.
      */
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/DenseVectorFieldMapper.java
@@ -58,6 +58,7 @@
 import org.elasticsearch.index.mapper.BlockDocValuesReader;
 import org.elasticsearch.index.mapper.BlockLoader;
 import org.elasticsearch.index.mapper.BlockSourceReader;
+import org.elasticsearch.index.mapper.DocValueFetcher;
 import org.elasticsearch.index.mapper.DocumentParserContext;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
@@ -75,6 +76,7 @@
 import org.elasticsearch.index.query.SearchExecutionContext;
 import org.elasticsearch.search.DocValueFormat;
 import org.elasticsearch.search.aggregations.support.CoreValuesSourceType;
+import org.elasticsearch.search.fetch.StoredFieldsSpec;
 import org.elasticsearch.search.lookup.Source;
 import org.elasticsearch.search.vectors.DenseVectorQuery;
 import org.elasticsearch.search.vectors.ESDiversifyingChildrenByteKnnVectorQuery;
@@ -2285,6 +2287,13 @@ public ValueFetcher valueFetcher(SearchExecutionContext context, String format)
             if (format != null) {
                 throw new IllegalArgumentException("Field [" + name() + "] of type [" + typeName() + "] doesn't support formats.");
             }
+            if (context.getMappingLookup().isSourceSynthetic()) {
+                return new DocValueFetcher(
+                    docValueFormat(null, null),
+                    context.getForField(this, FielddataOperation.SEARCH),
+                    StoredFieldsSpec.NO_REQUIREMENTS
+                );
+            }
             return new ArraySourceValueFetcher(name(), context) {
                 @Override
                 protected Object parseSourceValue(Object value) {
@@ -2303,6 +2312,11 @@ public boolean isAggregatable() {
             return false;
         }
 
+        @Override
+        public boolean isVectorEmbedding() {
+            return true;
+        }
+
         @Override
         public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext) {
             return elementType.fielddataBuilder(this, fieldDataContext);
diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java
@@ -119,6 +119,11 @@ public String typeName() {
             return CONTENT_TYPE;
         }
 
+        @Override
+        public boolean isVectorEmbedding() {
+            return true;
+        }
+
         @Override
         public IndexFieldData.Builder fielddataBuilder(FieldDataContext fieldDataContext) {
             throw new IllegalArgumentException("[sparse_vector] fields do not support sorting, scripting or aggregating");
diff --git a/server/src/main/java/org/elasticsearch/search/fetch/FetchPhase.java b/server/src/main/java/org/elasticsearch/search/fetch/FetchPhase.java
@@ -13,14 +13,14 @@
 import org.apache.logging.log4j.Logger;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.search.TotalHits;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.elasticsearch.common.bytes.BytesReference;
 import org.elasticsearch.common.regex.Regex;
 import org.elasticsearch.index.fieldvisitor.LeafStoredFieldLoader;
 import org.elasticsearch.index.fieldvisitor.StoredFieldLoader;
 import org.elasticsearch.index.mapper.IdLoader;
+import org.elasticsearch.index.mapper.MappedFieldType;
 import org.elasticsearch.index.mapper.SourceLoader;
-import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper;
-import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper;
 import org.elasticsearch.search.LeafNestedDocuments;
 import org.elasticsearch.search.NestedDocuments;
 import org.elasticsearch.search.SearchContextSourcePrinter;
@@ -122,7 +122,7 @@ private SearchHits buildSearchHits(SearchContext context, int[] docIdsToLoad, Pr
         // - Speed up retrieval of the synthetic source
         // Note: These vectors will no longer be accessible via _source for any sub-fetch processors,
         // but they are typically accessed through doc values instead (e.g: re-scorer).
-        SourceFilter sourceFilter = maybeExcludeNonSemanticTextVectors(context);
+        SourceFilter sourceFilter = maybeExcludeNonSemanticTextVectorFields(context);
         SourceLoader sourceLoader = context.newSourceLoader(sourceFilter);
         FetchContext fetchContext = new FetchContext(context, sourceLoader);
 
@@ -461,24 +461,53 @@ private static boolean shouldExcludeVectorsFromSource(SearchContext context) {
      * unless vectors are explicitly requested to be included in the source.
      * Returns {@code null} when vectors should not be filtered out.
      */
-    private static SourceFilter maybeExcludeNonSemanticTextVectors(SearchContext context) {
+    private static SourceFilter maybeExcludeNonSemanticTextVectorFields(SearchContext context) {
         if (shouldExcludeVectorsFromSource(context) == false) {
             return null;
         }
         var lookup = context.getSearchExecutionContext().getMappingLookup();
-        List<String> inferencePatterns = lookup.inferenceFields().isEmpty()
-            ? null
-            : lookup.inferenceFields().keySet().stream().map(f -> f + "*").toList();
-        var excludes = lookup.getFullNameToFieldType()
-            .values()
-            .stream()
-            .filter(
-                f -> f instanceof DenseVectorFieldMapper.DenseVectorFieldType || f instanceof SparseVectorFieldMapper.SparseVectorFieldType
+        var fetchFieldsAut = context.fetchFieldsContext() != null && context.fetchFieldsContext().fields().size() > 0
+            ? new CharacterRunAutomaton(
+                Regex.simpleMatchToAutomaton(context.fetchFieldsContext().fields().stream().map(f -> f.field).toArray(String[]::new))
             )
+            : null;
+        var inferenceFieldsAut = lookup.inferenceFields().size() > 0
+            ? new CharacterRunAutomaton(
+                Regex.simpleMatchToAutomaton(lookup.inferenceFields().keySet().stream().map(f -> f + "*").toArray(String[]::new))
+            )
+            : null;
+
+        List<String> lateExcludes = new ArrayList<>();
+        var excludes = lookup.getFullNameToFieldType().values().stream().filter(MappedFieldType::isVectorEmbedding).filter(f -> {
+            // Keep fields requested via the `fields` option in the loaded source; they are excluded later instead
+            if (fetchFieldsAut != null && fetchFieldsAut.run(f.name())) {
+                lateExcludes.add(f.name());
+                return false;
+            }
             // Exclude vectors from semantic text fields, as they are processed separately
-            .filter(f -> Regex.simpleMatch(inferencePatterns, f.name()) == false)
-            .map(f -> f.name())
-            .collect(Collectors.toList());
+            return inferenceFieldsAut == null || inferenceFieldsAut.run(f.name()) == false;
+        }).map(f -> f.name()).collect(Collectors.toList());
+
+        if (lateExcludes.size() > 0) {
+            /*
+             * Appends the vector fields requested via `fields` to the fetch source context excludes (keeping any existing ones).
+             * This ensures that vector fields are available to sub-fetch phases, but excluded during the FetchSourcePhase.
+             */
+            if (context.fetchSourceContext() != null && context.fetchSourceContext().excludes() != null) {
+                for (var exclude : context.fetchSourceContext().excludes()) {
+                    lateExcludes.add(exclude);
+                }
+            }
+            var fetchSourceContext = context.fetchSourceContext() == null
+                ? FetchSourceContext.of(true, false, null, lateExcludes.toArray(String[]::new))
+                : FetchSourceContext.of(
+                    context.fetchSourceContext().fetchSource(),
+                    context.fetchSourceContext().includeVectors(),
+                    context.fetchSourceContext().includes(),
+                    lateExcludes.toArray(String[]::new)
+                );
+            context.fetchSourceContext(fetchSourceContext);
+        }
         return excludes.isEmpty() ? null : new SourceFilter(new String[] {}, excludes.toArray(String[]::new));
     }
 }
diff --git a/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapper.java b/x-pack/plugin/rank-vectors/src/main/java/org/elasticsearch/xpack/rank/vectors/mapper/RankVectorsFieldMapper.java
@@ -172,6 +172,11 @@ public String typeName() {
             return CONTENT_TYPE;
         }
 
+        @Override
+        public boolean isVectorEmbedding() {
+            return true;
+        }
+
         @Override
         public ValueFetcher valueFetcher(SearchExecutionContext context, String format) {
             if (format != null) {