elastic · mromaios · Aug 13, 2025 · Jul 31, 2025 · Aug 1, 2025 · Aug 5, 2025
diff --git a/...er/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/...er/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java
@@ -17,6 +17,7 @@
 import org.apache.lucene.index.LeafReader;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.store.Directory;
@@ -54,7 +55,10 @@
 import java.util.LinkedHashMap;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.TreeMap;
+import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 import static org.elasticsearch.index.IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT;
 import static org.elasticsearch.index.IndexVersions.UPGRADE_TO_LUCENE_10_0_0;
@@ -69,6 +73,22 @@
 
 public class SparseVectorFieldMapperTests extends SyntheticVectorsMapperTestCase {
 
+    public static final float STRICT_TOKENS_WEIGHT_THRESHOLD = 0.5f;
+    public static final float STRICT_TOKENS_FREQ_RATIO_THRESHOLD = 1;
+
+    private static final Map<String, Float> COMMON_TOKENS = Map.of(
+        "common1_drop_default",
+        0.1f,
+        "common2_drop_default",
+        0.1f,
+        "common3_drop_default",
+        0.1f
+    );
+
+    private static final Map<String, Float> MEDIUM_TOKENS = Map.of("medium1_keep_strict", 0.5f, "medium2_keep_default", 0.25f);
+
+    private static final Map<String, Float> RARE_TOKENS = Map.of("rare1_keep_strict", 0.9f, "rare2_keep_strict", 0.85f);
+
     @Override
     protected Object getSampleValueForDocument() {
         return new TreeMap<>(
@@ -123,7 +143,7 @@ protected void minimalMappingWithExplicitIndexOptions(XContentBuilder b) throws
             b.field("prune", true);
             b.startObject("pruning_config");
             {
-                b.field("tokens_freq_ratio_threshold", 3.0f);
+                b.field("tokens_freq_ratio_threshold", 1.0f);
                 b.field("tokens_weight_threshold", 0.5f);
             }
             b.endObject();
@@ -178,6 +198,13 @@ protected void mappingWithIndexOptionsPruneFalse(XContentBuilder b) throws IOExc
         b.endObject();
     }
 
+    private void mapping(XContentBuilder b, @Nullable Boolean prune, PruningConfig pruningConfig) throws IOException {
+        b.field("type", "sparse_vector");
+        if (prune != null) {
+            b.field("index_options", new SparseVectorFieldMapper.SparseVectorIndexOptions(prune, pruningConfig.tokenPruningConfig));
+        }
+    }
+
     @Override
     protected boolean supportsStoredFields() {
         return false;
@@ -676,14 +703,58 @@ public void testTokensWeightThresholdCorrect() {
         );
     }
 
+    private enum PruningScenario {
+        NO_PRUNING,      // No pruning applied - all tokens preserved
+        DEFAULT_PRUNING, // Default pruning configuration
+        STRICT_PRUNING   // Stricter pruning with higher thresholds
+    }
+
+    private enum PruningConfig {
+        NULL(null),
+        EXPLICIT_DEFAULT(new TokenPruningConfig()),
+        STRICT(new TokenPruningConfig(STRICT_TOKENS_FREQ_RATIO_THRESHOLD, STRICT_TOKENS_WEIGHT_THRESHOLD, false));
+
+        public final @Nullable TokenPruningConfig tokenPruningConfig;
+
+        PruningConfig(@Nullable TokenPruningConfig tokenPruningConfig) {
+            this.tokenPruningConfig = tokenPruningConfig;
+        }
+    }
+
+    private final Set<PruningOptions> validIndexPruningScenarios = Set.of(
+        new PruningOptions(false, PruningConfig.NULL),
+        new PruningOptions(true, PruningConfig.NULL),
+        new PruningOptions(true, PruningConfig.EXPLICIT_DEFAULT),
+        new PruningOptions(true, PruningConfig.STRICT),
+        new PruningOptions(null, PruningConfig.NULL)
+    );
+
+    private record PruningOptions(@Nullable Boolean prune, PruningConfig pruningConfig) {}
+
     private void withSearchExecutionContext(MapperService mapperService, CheckedConsumer<SearchExecutionContext, IOException> consumer)
         throws IOException {
         var mapper = mapperService.documentMapper();
         try (Directory directory = newDirectory()) {
             RandomIndexWriter iw = new RandomIndexWriter(random(), directory);
-            var sourceToParse = source(this::writeField);
-            ParsedDocument doc1 = mapper.parse(sourceToParse);
-            iw.addDocument(doc1.rootDoc());
+
+            int commonDocs = 20;
+            for (int i = 0; i < commonDocs; i++) {
+                iw.addDocument(mapper.parse(source(b -> b.field("field", COMMON_TOKENS))).rootDoc());
+            }
+
+            int mediumDocs = 5;
+            for (int i = 0; i < mediumDocs; i++) {
+                iw.addDocument(mapper.parse(source(b -> b.field("field", MEDIUM_TOKENS))).rootDoc());
+            }
+
+            iw.addDocument(mapper.parse(source(b -> b.field("field", RARE_TOKENS))).rootDoc());
+
+            // This will lower the averageTokenFreqRatio so that common tokens get pruned with default settings
+            Map<String, Float> uniqueDoc = new TreeMap<>();
+            for (int i = 0; i < 20; i++) {
+                uniqueDoc.put("unique" + i, 0.5f);
+            }
+            iw.addDocument(mapper.parse(source(b -> b.field("field", uniqueDoc))).rootDoc());
             iw.close();
 
             try (DirectoryReader reader = wrapInMockESDirectoryReader(DirectoryReader.open(directory))) {
@@ -693,163 +764,97 @@ private void withSearchExecutionContext(MapperService mapperService, CheckedCons
         }
     }
 
-    public void testTypeQueryFinalizationWithRandomOptions() throws Exception {
-        for (int i = 0; i < 20; i++) {
-            runTestTypeQueryFinalization(
-                randomBoolean(), // useIndexVersionBeforeIndexOptions
-                randomBoolean(), // useMapperDefaultIndexOptions
-                randomBoolean(), // setMapperIndexOptionsPruneToFalse
-                randomBoolean(), // queryOverridesPruningConfig
-                randomBoolean()  // queryOverridesPruneToBeFalse
+    public void testPruningScenarios() throws Exception {
+        for (int i = 0; i < 120; i++) {
+            assertPruningScenario(
+                randomFrom(validIndexPruningScenarios),
+                new PruningOptions(randomBoolean() ? randomBoolean() : null, randomFrom(PruningConfig.values()))
             );
         }
     }
 
-    public void testTypeQueryFinalizationDefaultsCurrentVersion() throws Exception {
-        IndexVersion version = IndexVersion.current();
-        MapperService mapperService = createMapperService(version, fieldMapping(this::minimalMapping));
-
-        // query should be pruned by default on newer index versions
-        performTypeQueryFinalizationTest(mapperService, null, null, true);
-    }
-
-    public void testTypeQueryFinalizationDefaultsPreviousVersion() throws Exception {
-        IndexVersion version = IndexVersionUtils.randomVersionBetween(
-            random(),
-            UPGRADE_TO_LUCENE_10_0_0,
-            IndexVersionUtils.getPreviousVersion(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT)
-        );
-        MapperService mapperService = createMapperService(version, fieldMapping(this::minimalMapping));
-
-        // query should _not_ be pruned by default on older index versions
-        performTypeQueryFinalizationTest(mapperService, null, null, false);
+    private XContentBuilder getIndexMapping(PruningOptions pruningOptions) throws IOException {
+        return fieldMapping(b -> mapping(b, pruningOptions.prune, pruningOptions.pruningConfig));
     }
 
-    public void testTypeQueryFinalizationWithIndexExplicit() throws Exception {
-        IndexVersion version = IndexVersion.current();
-        MapperService mapperService = createMapperService(version, fieldMapping(this::minimalMapping));
+    private void assertQueryContains(List<Query> expectedClauses, Query query) {
+        SparseVectorQueryWrapper queryWrapper = (SparseVectorQueryWrapper) query;
+        var termsQuery = queryWrapper.getTermsQuery();
+        assertNotNull(termsQuery);
+        var booleanQuery = (BooleanQuery) termsQuery;
 
-        // query should be pruned via explicit index options
-        performTypeQueryFinalizationTest(mapperService, null, null, true);
+        Collection<Query> shouldClauses = booleanQuery.getClauses(BooleanClause.Occur.SHOULD);
+        assertThat(shouldClauses, Matchers.containsInAnyOrder(expectedClauses.toArray()));
     }
 
-    public void testTypeQueryFinalizationWithIndexExplicitDoNotPrune() throws Exception {
-        IndexVersion version = IndexVersion.current();
-        MapperService mapperService = createMapperService(version, fieldMapping(this::mappingWithIndexOptionsPruneFalse));
+    private PruningScenario getEffectivePruningScenario(
+        PruningOptions indexPruningOptions,
+        PruningOptions queryPruningOptions,
+        IndexVersion indexVersion
+    ) {
+        Boolean shouldPrune = queryPruningOptions.prune;
+        if (shouldPrune == null) {
+            shouldPrune = indexPruningOptions.prune;
+        }
 
-        // query should be pruned via explicit index options
-        performTypeQueryFinalizationTest(mapperService, null, null, false);
-    }
+        if (shouldPrune == null) {
+            shouldPrune = indexVersion.onOrAfter(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT);
+        }
 
-    public void testTypeQueryFinalizationQueryOverridesPruning() throws Exception {
-        IndexVersion version = IndexVersion.current();
-        MapperService mapperService = createMapperService(version, fieldMapping(this::mappingWithIndexOptionsPruneFalse));
+        PruningScenario pruningScenario = PruningScenario.NO_PRUNING;
+        if (shouldPrune) {
+            PruningConfig pruningConfig = queryPruningOptions.pruningConfig;
+            if (pruningConfig == PruningConfig.NULL) {
+                pruningConfig = indexPruningOptions.pruningConfig;
+            }
+            pruningScenario = switch (pruningConfig) {
+                case STRICT -> PruningScenario.STRICT_PRUNING;
+                case EXPLICIT_DEFAULT, NULL -> PruningScenario.DEFAULT_PRUNING;
+            };
+        }
 
-        // query should still be pruned due to query builder setting it
-        performTypeQueryFinalizationTest(mapperService, true, new TokenPruningConfig(), true);
+        return pruningScenario;
     }
 
-    public void testTypeQueryFinalizationQueryOverridesPruningOff() throws Exception {
-        IndexVersion version = IndexVersion.current();
-        MapperService mapperService = createMapperService(version, fieldMapping(this::mappingWithIndexOptionsPruneFalse));
+    private List<Query> getExpectedQueryClauses(
+        SparseVectorFieldMapper.SparseVectorFieldType ft,
+        PruningScenario pruningScenario,
+        SearchExecutionContext searchExecutionContext
+    ) {
+        List<WeightedToken> tokens = switch (pruningScenario) {
+            case NO_PRUNING -> QUERY_VECTORS;
+            case DEFAULT_PRUNING -> QUERY_VECTORS.stream()
+                .filter(t -> t.token().startsWith("rare") || t.token().startsWith("medium"))
+                .toList();
+            case STRICT_PRUNING -> QUERY_VECTORS.stream().filter(t -> t.token().endsWith("keep_strict")).toList();
+        };
 
-        // query should not pruned due to query builder setting it
-        performTypeQueryFinalizationTest(mapperService, false, null, false);
+        return tokens.stream().map(t -> {
+            Query termQuery = ft.termQuery(t.token(), searchExecutionContext);
+            return new BoostQuery(termQuery, t.weight());
+        }).collect(Collectors.toUnmodifiableList());
     }
 
-    private void performTypeQueryFinalizationTest(
-        MapperService mapperService,
-        @Nullable Boolean queryPrune,
-        @Nullable TokenPruningConfig queryTokenPruningConfig,
-        boolean queryShouldBePruned
-    ) throws IOException {
+    private void assertPruningScenario(PruningOptions indexPruningOptions, PruningOptions queryPruningOptions) throws IOException {
+        IndexVersion indexVersion = getIndexVersionForTest(randomBoolean());
+        MapperService mapperService = createMapperService(indexVersion, getIndexMapping(indexPruningOptions));
+        PruningScenario effectivePruningScenario = getEffectivePruningScenario(indexPruningOptions, queryPruningOptions, indexVersion);
         withSearchExecutionContext(mapperService, (context) -> {
             SparseVectorFieldMapper.SparseVectorFieldType ft = (SparseVectorFieldMapper.SparseVectorFieldType) mapperService.fieldType(
                 "field"
             );
-            Query finalizedQuery = ft.finalizeSparseVectorQuery(context, "field", QUERY_VECTORS, queryPrune, queryTokenPruningConfig);
-
-            if (queryShouldBePruned) {
-                assertQueryWasPruned(finalizedQuery);
-            } else {
-                assertQueryWasNotPruned(finalizedQuery);
-            }
+            List<Query> expectedQueryClauses = getExpectedQueryClauses(ft, effectivePruningScenario, context);
+            Query finalizedQuery = ft.finalizeSparseVectorQuery(
+                context,
+                "field",
+                QUERY_VECTORS,
+                queryPruningOptions.prune,
+                queryPruningOptions.pruningConfig.tokenPruningConfig
+            );
+            assertQueryContains(expectedQueryClauses, finalizedQuery);
         });
     }
 
-    private void assertQueryWasPruned(Query query) {
-        assertQueryHasClauseCount(query, 0);
-    }
-
-    private void assertQueryWasNotPruned(Query query) {
-        assertQueryHasClauseCount(query, QUERY_VECTORS.size());
-    }
-
-    private void assertQueryHasClauseCount(Query query, int clauseCount) {
-        SparseVectorQueryWrapper queryWrapper = (SparseVectorQueryWrapper) query;
-        var termsQuery = queryWrapper.getTermsQuery();
-        assertNotNull(termsQuery);
-        var booleanQuery = (BooleanQuery) termsQuery;
-        Collection<Query> clauses = booleanQuery.getClauses(BooleanClause.Occur.SHOULD);
-        assertThat(clauses.size(), equalTo(clauseCount));
-    }
-
-    /**
-     * Runs a test of the query finalization based on various parameters
-     * that provides
-     * @param useIndexVersionBeforeIndexOptions set to true to use a previous index version before mapper index_options
-     * @param useMapperDefaultIndexOptions set to false to use an explicit, non-default mapper index_options
-     * @param setMapperIndexOptionsPruneToFalse set to true to use prune:false in the mapper index_options
-     * @param queryOverridesPruningConfig set to true to designate the query will provide a pruning_config
-     * @param queryOverridesPruneToBeFalse if true and queryOverridesPruningConfig is true, the query will provide prune:false
-     * @throws IOException
-     */
-    private void runTestTypeQueryFinalization(
-        boolean useIndexVersionBeforeIndexOptions,
-        boolean useMapperDefaultIndexOptions,
-        boolean setMapperIndexOptionsPruneToFalse,
-        boolean queryOverridesPruningConfig,
-        boolean queryOverridesPruneToBeFalse
-    ) throws IOException {
-        MapperService mapperService = getMapperServiceForTest(
-            useIndexVersionBeforeIndexOptions,
-            useMapperDefaultIndexOptions,
-            setMapperIndexOptionsPruneToFalse
-        );
-
-        // check and see if the query should explicitly override the index_options
-        Boolean shouldQueryPrune = queryOverridesPruningConfig ? (queryOverridesPruneToBeFalse == false) : null;
-
-        // get the pruning configuration for the query if it's overriding
-        TokenPruningConfig queryPruningConfig = Boolean.TRUE.equals(shouldQueryPrune) ? new TokenPruningConfig() : null;
-
-        // our logic if the results should be pruned or not
-        // we should _not_ prune if any of the following:
-        // - the query explicitly overrides the options and `prune` is set to false
-        // - the query does not override the pruning options and:
-        // - either we are using a previous index version
-        // - or the index_options explicitly sets `prune` to false
-        boolean resultShouldNotBePruned = ((queryOverridesPruningConfig && queryOverridesPruneToBeFalse)
-            || (queryOverridesPruningConfig == false && (useIndexVersionBeforeIndexOptions || setMapperIndexOptionsPruneToFalse)));
-
-        try {
-            performTypeQueryFinalizationTest(mapperService, shouldQueryPrune, queryPruningConfig, resultShouldNotBePruned == false);
-        } catch (AssertionError e) {
-            String message = "performTypeQueryFinalizationTest failed using parameters: "
-                + "useIndexVersionBeforeIndexOptions: "
-                + useIndexVersionBeforeIndexOptions
-                + ", useMapperDefaultIndexOptions: "
-                + useMapperDefaultIndexOptions
-                + ", setMapperIndexOptionsPruneToFalse: "
-                + setMapperIndexOptionsPruneToFalse
-                + ", queryOverridesPruningConfig: "
-                + queryOverridesPruningConfig
-                + ", queryOverridesPruneToBeFalse: "
-                + queryOverridesPruneToBeFalse;
-            throw new AssertionError(message, e);
-        }
-    }
-
     private IndexVersion getIndexVersionForTest(boolean usePreviousIndex) {
         return usePreviousIndex
             ? IndexVersionUtils.randomVersionBetween(
@@ -860,36 +865,10 @@ private IndexVersion getIndexVersionForTest(boolean usePreviousIndex) {
             : IndexVersionUtils.randomVersionBetween(random(), SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT, IndexVersion.current());
     }
 
-    private MapperService getMapperServiceForTest(
-        boolean usePreviousIndex,
-        boolean useIndexOptionsDefaults,
-        boolean explicitIndexOptionsDoNotPrune
-    ) throws IOException {
-        // get the index version of the test to use
-        // either a current version that supports index options, or a previous version that does not
-        IndexVersion indexVersion = getIndexVersionForTest(usePreviousIndex);
-
-        // if it's using the old index, we always use the minimal mapping without index_options
-        if (usePreviousIndex) {
-            return createMapperService(indexVersion, fieldMapping(this::minimalMapping));
-        }
-
-        // if we set explicitIndexOptionsDoNotPrune, the index_options (if present) will explicitly include "prune: false"
-        if (explicitIndexOptionsDoNotPrune) {
-            return createMapperService(indexVersion, fieldMapping(this::mappingWithIndexOptionsPruneFalse));
-        }
-
-        // either return the default (minimal) mapping or one with an explicit pruning_config
-        return useIndexOptionsDefaults
-            ? createMapperService(indexVersion, fieldMapping(this::minimalMapping))
-            : createMapperService(indexVersion, fieldMapping(this::minimalMappingWithExplicitIndexOptions));
-    }
-
-    private static List<WeightedToken> QUERY_VECTORS = List.of(
-        new WeightedToken("pugs", 0.5f),
-        new WeightedToken("cats", 0.4f),
-        new WeightedToken("is", 0.1f)
-    );
+    private static final List<WeightedToken> QUERY_VECTORS = Stream.of(RARE_TOKENS, MEDIUM_TOKENS, COMMON_TOKENS)
+        .flatMap(map -> map.entrySet().stream())
+        .map(entry -> new WeightedToken(entry.getKey(), entry.getValue()))
+        .collect(Collectors.toList());
 
     /**
      * Handles float/double conversion when reading/writing with xcontent by converting all numbers to floats.