Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
fe0757c
chore: add sparse vector pruning tests
mromaios Jul 31, 2025
fcb8ed9
Merge branch 'main' into increase_sparse_pruning_test_coverage
mromaios Aug 1, 2025
127b6f0
refactor tests with pruning scenarios
mromaios Aug 5, 2025
c49c67e
Merge branch 'increase_sparse_pruning_test_coverage' of github.com:mr…
mromaios Aug 5, 2025
75bb40a
Merge branch 'main' into increase_sparse_pruning_test_coverage
mromaios Aug 5, 2025
3546602
update yaml tests
mromaios Aug 5, 2025
ee00d63
[CI] Auto commit changes from spotless
Aug 5, 2025
64a8ed8
move things around
mromaios Aug 5, 2025
6fea090
remove TODOs
mromaios Aug 5, 2025
1c14f78
Merge branch 'main' into increase_sparse_pruning_test_coverage
mromaios Aug 5, 2025
a79f56f
[CI] Auto commit changes from spotless
Aug 5, 2025
d37bad0
address PR comments and minor refactoring
mromaios Aug 8, 2025
c5102cd
remove debug statements
mromaios Aug 8, 2025
248754f
[CI] Auto commit changes from spotless
Aug 8, 2025
63b00ff
refactor effectivepruning calculation
mromaios Aug 11, 2025
1ab8d82
Merge branch 'increase_sparse_pruning_test_coverage' of github.com:mr…
mromaios Aug 11, 2025
9d0f9b0
Merge branch 'main' into increase_sparse_pruning_test_coverage
mromaios Aug 11, 2025
3efb5cf
Merge branch 'increase_sparse_pruning_test_coverage' of github.com:mr…
mromaios Aug 11, 2025
a519058
[CI] Auto commit changes from spotless
Aug 11, 2025
a2bed06
Merge branch 'increase_sparse_pruning_test_coverage' of github.com:mr…
mromaios Aug 12, 2025
b22eb47
further cleanup, include pruningConfig in enum
mromaios Aug 12, 2025
676a764
Merge branch 'main' into increase_sparse_pruning_test_coverage
mromaios Aug 12, 2025
976f947
[CI] Auto commit changes from spotless
Aug 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.Directory;
Expand Down Expand Up @@ -54,7 +55,10 @@
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import static org.elasticsearch.index.IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT;
import static org.elasticsearch.index.IndexVersions.UPGRADE_TO_LUCENE_10_0_0;
Expand All @@ -69,6 +73,22 @@

public class SparseVectorFieldMapperTests extends SyntheticVectorsMapperTestCase {

// Threshold values used to build the STRICT entry of the PruningConfig enum in this test class.
public static final float STRICT_TOKENS_WEIGHT_THRESHOLD = 0.5f;
public static final float STRICT_TOKENS_FREQ_RATIO_THRESHOLD = 1;

// Token fixtures. The token names encode the outcome the tests expect for each token:
// the expected-clause logic keeps tokens starting with "rare" or "medium" under default pruning,
// and only tokens ending in "keep_strict" under strict pruning.

// Indexed in 20 documents by withSearchExecutionContext; expected to be dropped by both default and strict pruning.
private static final Map<String, Float> COMMON_TOKENS = Map.of(
"common1_drop_default",
0.1f,
"common2_drop_default",
0.1f,
"common3_drop_default",
0.1f
);

// Indexed in 5 documents; "medium1" is expected to survive strict pruning, "medium2" only default pruning.
private static final Map<String, Float> MEDIUM_TOKENS = Map.of("medium1_keep_strict", 0.5f, "medium2_keep_default", 0.25f);

// Indexed in a single document; both tokens are expected to survive every pruning scenario.
private static final Map<String, Float> RARE_TOKENS = Map.of("rare1_keep_strict", 0.9f, "rare2_keep_strict", 0.85f);

@Override
protected Object getSampleValueForDocument() {
return new TreeMap<>(
Expand Down Expand Up @@ -123,7 +143,7 @@ protected void minimalMappingWithExplicitIndexOptions(XContentBuilder b) throws
b.field("prune", true);
b.startObject("pruning_config");
{
b.field("tokens_freq_ratio_threshold", 3.0f);
b.field("tokens_freq_ratio_threshold", 1.0f);
b.field("tokens_weight_threshold", 0.5f);
}
b.endObject();
Expand Down Expand Up @@ -178,6 +198,13 @@ protected void mappingWithIndexOptionsPruneFalse(XContentBuilder b) throws IOExc
b.endObject();
}

/**
 * Writes a {@code sparse_vector} field mapping into the given builder.
 * <p>
 * {@code index_options} is only emitted when {@code prune} is non-null; in that case it is built
 * from {@code prune} and the token pruning config carried by {@code pruningConfig} (which may be null).
 *
 * @param b             builder positioned inside the field's mapping object
 * @param prune         explicit prune flag, or {@code null} to omit {@code index_options} entirely
 * @param pruningConfig source of the token pruning config written alongside {@code prune}
 * @throws IOException if writing to the builder fails
 */
private void mapping(XContentBuilder b, @Nullable Boolean prune, PruningConfig pruningConfig) throws IOException {
    b.field("type", "sparse_vector");
    if (prune == null) {
        // nothing explicit requested: leave index_options out of the mapping
        return;
    }
    var indexOptions = new SparseVectorFieldMapper.SparseVectorIndexOptions(prune, pruningConfig.tokenPruningConfig);
    b.field("index_options", indexOptions);
}

@Override
protected boolean supportsStoredFields() {
return false;
Expand Down Expand Up @@ -676,14 +703,58 @@ public void testTokensWeightThresholdCorrect() {
);
}

/**
 * The pruning outcome a finalized query is expected to show once index-level and query-level
 * options have been combined. Used to pick the set of query tokens expected to remain as clauses.
 */
private enum PruningScenario {
NO_PRUNING, // No pruning applied - all tokens preserved
DEFAULT_PRUNING, // Default pruning configuration
STRICT_PRUNING // Stricter pruning with higher thresholds
}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💭 I started off with these 3 scenarios for index and query configs as well, but quickly realised that those have 2 core dimensions: prune and pruningConfig. For an index you need prune=true to set a pruningConfig, but that's not the case for queries, where the user can send any combination. This led to the creation of separate enums for query and index scenarios.


/**
 * The token pruning configurations exercised by the pruning tests, each wrapping the
 * (possibly null) {@link TokenPruningConfig} that is handed to the mapping or the query.
 */
private enum PruningConfig {
// no pruning config supplied at all
NULL(null),
// a config created with the no-argument constructor
EXPLICIT_DEFAULT(new TokenPruningConfig()),
// a config built from the STRICT_* threshold constants of this test class
STRICT(new TokenPruningConfig(STRICT_TOKENS_FREQ_RATIO_THRESHOLD, STRICT_TOKENS_WEIGHT_THRESHOLD, false));

// the wrapped config; null for the NULL constant
public final @Nullable TokenPruningConfig tokenPruningConfig;

PruningConfig(@Nullable TokenPruningConfig tokenPruningConfig) {
this.tokenPruningConfig = tokenPruningConfig;
}
}

// Index-side option combinations the randomized pruning test draws from.
// A non-null pruning config only appears together with prune=true; prune=false and prune=null
// are only paired with PruningConfig.NULL.
private final Set<PruningOptions> validIndexPruningScenarios = Set.of(
new PruningOptions(false, PruningConfig.NULL),
new PruningOptions(true, PruningConfig.NULL),
new PruningOptions(true, PruningConfig.EXPLICIT_DEFAULT),
new PruningOptions(true, PruningConfig.STRICT),
new PruningOptions(null, PruningConfig.NULL)
);

/**
 * A (prune flag, pruning config) pair describing either index-level or query-level pruning options.
 * A null {@code prune} means the flag was not specified.
 */
private record PruningOptions(@Nullable Boolean prune, PruningConfig pruningConfig) {}

private void withSearchExecutionContext(MapperService mapperService, CheckedConsumer<SearchExecutionContext, IOException> consumer)
throws IOException {
var mapper = mapperService.documentMapper();
try (Directory directory = newDirectory()) {
RandomIndexWriter iw = new RandomIndexWriter(random(), directory);
var sourceToParse = source(this::writeField);
ParsedDocument doc1 = mapper.parse(sourceToParse);
iw.addDocument(doc1.rootDoc());

int commonDocs = 20;
for (int i = 0; i < commonDocs; i++) {
iw.addDocument(mapper.parse(source(b -> b.field("field", COMMON_TOKENS))).rootDoc());
}

int mediumDocs = 5;
for (int i = 0; i < mediumDocs; i++) {
iw.addDocument(mapper.parse(source(b -> b.field("field", MEDIUM_TOKENS))).rootDoc());
}

iw.addDocument(mapper.parse(source(b -> b.field("field", RARE_TOKENS))).rootDoc());

// This will lower the averageTokenFreqRatio so that common tokens get pruned with default settings
Map<String, Float> uniqueDoc = new TreeMap<>();
for (int i = 0; i < 20; i++) {
uniqueDoc.put("unique" + i, 0.5f);
}
iw.addDocument(mapper.parse(source(b -> b.field("field", uniqueDoc))).rootDoc());
iw.close();

try (DirectoryReader reader = wrapInMockESDirectoryReader(DirectoryReader.open(directory))) {
Expand All @@ -693,163 +764,97 @@ private void withSearchExecutionContext(MapperService mapperService, CheckedCons
}
}

public void testTypeQueryFinalizationWithRandomOptions() throws Exception {
for (int i = 0; i < 20; i++) {
runTestTypeQueryFinalization(
randomBoolean(), // useIndexVersionBeforeIndexOptions
randomBoolean(), // useMapperDefaultIndexOptions
randomBoolean(), // setMapperIndexOptionsPruneToFalse
randomBoolean(), // queryOverridesPruningConfig
randomBoolean() // queryOverridesPruneToBeFalse
/**
 * Randomized check of query finalization: each iteration pairs a randomly chosen valid index-side
 * option set with an arbitrary query-side combination and asserts the resulting query clauses.
 */
public void testPruningScenarios() throws Exception {
for (int i = 0; i < 120; i++) {
assertPruningScenario(
randomFrom(validIndexPruningScenarios),
// query side: prune may be null, true or false, combined with any PruningConfig value
new PruningOptions(randomBoolean() ? randomBoolean() : null, randomFrom(PruningConfig.values()))
);
}
}

public void testTypeQueryFinalizationDefaultsCurrentVersion() throws Exception {
IndexVersion version = IndexVersion.current();
MapperService mapperService = createMapperService(version, fieldMapping(this::minimalMapping));

// query should be pruned by default on newer index versions
performTypeQueryFinalizationTest(mapperService, null, null, true);
}

public void testTypeQueryFinalizationDefaultsPreviousVersion() throws Exception {
IndexVersion version = IndexVersionUtils.randomVersionBetween(
random(),
UPGRADE_TO_LUCENE_10_0_0,
IndexVersionUtils.getPreviousVersion(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT)
);
MapperService mapperService = createMapperService(version, fieldMapping(this::minimalMapping));

// query should _not_ be pruned by default on older index versions
performTypeQueryFinalizationTest(mapperService, null, null, false);
/**
 * Builds the field mapping for the given index-level pruning options by delegating to
 * {@code mapping(b, prune, pruningConfig)}.
 *
 * @param pruningOptions index-level prune flag and pruning config
 * @return the mapping builder produced by {@code fieldMapping}
 * @throws IOException if building the mapping fails
 */
private XContentBuilder getIndexMapping(PruningOptions pruningOptions) throws IOException {
return fieldMapping(b -> mapping(b, pruningOptions.prune, pruningOptions.pruningConfig));
}

public void testTypeQueryFinalizationWithIndexExplicit() throws Exception {
IndexVersion version = IndexVersion.current();
MapperService mapperService = createMapperService(version, fieldMapping(this::minimalMapping));
private void assertQueryContains(List<Query> expectedClauses, Query query) {
SparseVectorQueryWrapper queryWrapper = (SparseVectorQueryWrapper) query;
var termsQuery = queryWrapper.getTermsQuery();
assertNotNull(termsQuery);
var booleanQuery = (BooleanQuery) termsQuery;

// query should be pruned via explicit index options
performTypeQueryFinalizationTest(mapperService, null, null, true);
Collection<Query> shouldClauses = booleanQuery.getClauses(BooleanClause.Occur.SHOULD);
assertThat(shouldClauses, Matchers.containsInAnyOrder(expectedClauses.toArray()));
}

public void testTypeQueryFinalizationWithIndexExplicitDoNotPrune() throws Exception {
IndexVersion version = IndexVersion.current();
MapperService mapperService = createMapperService(version, fieldMapping(this::mappingWithIndexOptionsPruneFalse));
private PruningScenario getEffectivePruningScenario(
PruningOptions indexPruningOptions,
PruningOptions queryPruningOptions,
IndexVersion indexVersion
) {
Boolean shouldPrune = queryPruningOptions.prune;
if (shouldPrune == null) {
shouldPrune = indexPruningOptions.prune;
}

// query should be pruned via explicit index options
performTypeQueryFinalizationTest(mapperService, null, null, false);
}
if (shouldPrune == null) {
shouldPrune = indexVersion.onOrAfter(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One thing I overlooked here is that we also need to handle 8.x index versions that support default token pruning. This check needs to change to something like:

shouldPrune = indexVersion.between(IndexVersions.SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT_BACKPORT_8_X, IndexVersions.UPGRADE_TO_LUCENE_10_0_0) || indexVersion.onOrAfter(SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. Will add it, thanks!

}

public void testTypeQueryFinalizationQueryOverridesPruning() throws Exception {
IndexVersion version = IndexVersion.current();
MapperService mapperService = createMapperService(version, fieldMapping(this::mappingWithIndexOptionsPruneFalse));
PruningScenario pruningScenario = PruningScenario.NO_PRUNING;
if (shouldPrune) {
PruningConfig pruningConfig = queryPruningOptions.pruningConfig;
if (pruningConfig == PruningConfig.NULL) {
pruningConfig = indexPruningOptions.pruningConfig;
}
pruningScenario = switch (pruningConfig) {
case STRICT -> PruningScenario.STRICT_PRUNING;
case EXPLICIT_DEFAULT, NULL -> PruningScenario.DEFAULT_PRUNING;
};
}

// query should still be pruned due to query builder setting it
performTypeQueryFinalizationTest(mapperService, true, new TokenPruningConfig(), true);
return pruningScenario;
}

public void testTypeQueryFinalizationQueryOverridesPruningOff() throws Exception {
IndexVersion version = IndexVersion.current();
MapperService mapperService = createMapperService(version, fieldMapping(this::mappingWithIndexOptionsPruneFalse));
private List<Query> getExpectedQueryClauses(
SparseVectorFieldMapper.SparseVectorFieldType ft,
PruningScenario pruningScenario,
SearchExecutionContext searchExecutionContext
) {
List<WeightedToken> tokens = switch (pruningScenario) {
case NO_PRUNING -> QUERY_VECTORS;
case DEFAULT_PRUNING -> QUERY_VECTORS.stream()
.filter(t -> t.token().startsWith("rare") || t.token().startsWith("medium"))
.toList();
case STRICT_PRUNING -> QUERY_VECTORS.stream().filter(t -> t.token().endsWith("keep_strict")).toList();
};

// query should not pruned due to query builder setting it
performTypeQueryFinalizationTest(mapperService, false, null, false);
return tokens.stream().map(t -> {
Query termQuery = ft.termQuery(t.token(), searchExecutionContext);
return new BoostQuery(termQuery, t.weight());
}).collect(Collectors.toUnmodifiableList());
}

private void performTypeQueryFinalizationTest(
MapperService mapperService,
@Nullable Boolean queryPrune,
@Nullable TokenPruningConfig queryTokenPruningConfig,
boolean queryShouldBePruned
) throws IOException {
private void assertPruningScenario(PruningOptions indexPruningOptions, PruningOptions queryPruningOptions) throws IOException {
IndexVersion indexVersion = getIndexVersionForTest(randomBoolean());
MapperService mapperService = createMapperService(indexVersion, getIndexMapping(indexPruningOptions));
PruningScenario effectivePruningScenario = getEffectivePruningScenario(indexPruningOptions, queryPruningOptions, indexVersion);
withSearchExecutionContext(mapperService, (context) -> {
SparseVectorFieldMapper.SparseVectorFieldType ft = (SparseVectorFieldMapper.SparseVectorFieldType) mapperService.fieldType(
"field"
);
Query finalizedQuery = ft.finalizeSparseVectorQuery(context, "field", QUERY_VECTORS, queryPrune, queryTokenPruningConfig);

if (queryShouldBePruned) {
assertQueryWasPruned(finalizedQuery);
} else {
assertQueryWasNotPruned(finalizedQuery);
}
List<Query> expectedQueryClauses = getExpectedQueryClauses(ft, effectivePruningScenario, context);
Query finalizedQuery = ft.finalizeSparseVectorQuery(
context,
"field",
QUERY_VECTORS,
queryPruningOptions.prune,
queryPruningOptions.pruningConfig.tokenPruningConfig
);
assertQueryContains(expectedQueryClauses, finalizedQuery);
});
}

/** Asserts that the finalized query has no SHOULD clauses left, i.e. every query token was pruned. */
private void assertQueryWasPruned(Query query) {
assertQueryHasClauseCount(query, 0);
}

/** Asserts that the finalized query has one SHOULD clause per entry of {@code QUERY_VECTORS}, i.e. nothing was pruned. */
private void assertQueryWasNotPruned(Query query) {
assertQueryHasClauseCount(query, QUERY_VECTORS.size());
}

/**
 * Unwraps the given query and asserts the number of SHOULD clauses in its terms query.
 *
 * @param query       expected to be a {@code SparseVectorQueryWrapper} whose terms query is a {@code BooleanQuery}
 * @param clauseCount the expected number of SHOULD clauses
 */
private void assertQueryHasClauseCount(Query query, int clauseCount) {
    var termsQuery = ((SparseVectorQueryWrapper) query).getTermsQuery();
    assertNotNull(termsQuery);

    Collection<Query> shouldClauses = ((BooleanQuery) termsQuery).getClauses(BooleanClause.Occur.SHOULD);
    assertThat(shouldClauses.size(), equalTo(clauseCount));
}

/**
 * Runs one query-finalization check for a combination of index and query settings.
 * <p>
 * Builds the mapper service for the requested index configuration, derives the query-level
 * {@code prune} flag and pruning config, computes whether the finalized query is expected to be
 * pruned, and runs the check. An assertion failure is rethrown with all parameter values in its message.
 *
 * @param useIndexVersionBeforeIndexOptions set to true to use a previous index version before mapper index_options
 * @param useMapperDefaultIndexOptions set to false to use an explicit, non-default mapper index_options
 * @param setMapperIndexOptionsPruneToFalse set to true to use prune:false in the mapper index_options
 * @param queryOverridesPruningConfig set to true to designate the query will provide a pruning_config
 * @param queryOverridesPruneToBeFalse if true and queryOverridesPruningConfig is true, the query will provide prune:false
 * @throws IOException propagated from building the mapper service or running the finalization check
 */
private void runTestTypeQueryFinalization(
boolean useIndexVersionBeforeIndexOptions,
boolean useMapperDefaultIndexOptions,
boolean setMapperIndexOptionsPruneToFalse,
boolean queryOverridesPruningConfig,
boolean queryOverridesPruneToBeFalse
) throws IOException {
MapperService mapperService = getMapperServiceForTest(
useIndexVersionBeforeIndexOptions,
useMapperDefaultIndexOptions,
setMapperIndexOptionsPruneToFalse
);

// check and see if the query should explicitly override the index_options
// (null means the query does not specify prune at all)
Boolean shouldQueryPrune = queryOverridesPruningConfig ? (queryOverridesPruneToBeFalse == false) : null;

// get the pruning configuration for the query if it's overriding
// (only supplied when the query explicitly asks for pruning)
TokenPruningConfig queryPruningConfig = Boolean.TRUE.equals(shouldQueryPrune) ? new TokenPruningConfig() : null;

// our logic if the results should be pruned or not
// we should _not_ prune if any of the following:
// - the query explicitly overrides the options and `prune` is set to false
// - the query does not override the pruning options and:
// - either we are using a previous index version
// - or the index_options explicitly sets `prune` to false
boolean resultShouldNotBePruned = ((queryOverridesPruningConfig && queryOverridesPruneToBeFalse)
|| (queryOverridesPruningConfig == false && (useIndexVersionBeforeIndexOptions || setMapperIndexOptionsPruneToFalse)));

try {
performTypeQueryFinalizationTest(mapperService, shouldQueryPrune, queryPruningConfig, resultShouldNotBePruned == false);
} catch (AssertionError e) {
// wrap the failure so the randomized parameter combination is visible in the test output
String message = "performTypeQueryFinalizationTest failed using parameters: "
+ "useIndexVersionBeforeIndexOptions: "
+ useIndexVersionBeforeIndexOptions
+ ", useMapperDefaultIndexOptions: "
+ useMapperDefaultIndexOptions
+ ", setMapperIndexOptionsPruneToFalse: "
+ setMapperIndexOptionsPruneToFalse
+ ", queryOverridesPruningConfig: "
+ queryOverridesPruningConfig
+ ", queryOverridesPruneToBeFalse: "
+ queryOverridesPruneToBeFalse;
throw new AssertionError(message, e);
}
}

private IndexVersion getIndexVersionForTest(boolean usePreviousIndex) {
return usePreviousIndex
? IndexVersionUtils.randomVersionBetween(
Expand All @@ -860,36 +865,10 @@ private IndexVersion getIndexVersionForTest(boolean usePreviousIndex) {
: IndexVersionUtils.randomVersionBetween(random(), SPARSE_VECTOR_PRUNING_INDEX_OPTIONS_SUPPORT, IndexVersion.current());
}

/**
 * Creates the mapper service for a query-finalization test.
 * <p>
 * The mapping is chosen as follows:
 * <ul>
 *   <li>previous index version, or default index options without an explicit prune:false: minimal mapping</li>
 *   <li>explicit prune:false requested: mapping with {@code index_options} prune set to false</li>
 *   <li>otherwise: minimal mapping with explicit index options</li>
 * </ul>
 *
 * @param usePreviousIndex               true to use an index version from before index options were supported
 * @param useIndexOptionsDefaults        true to use the minimal mapping rather than explicit index options
 * @param explicitIndexOptionsDoNotPrune true to use a mapping that explicitly sets prune to false
 * @return the mapper service for the selected index version and mapping
 * @throws IOException if the mapper service cannot be created
 */
private MapperService getMapperServiceForTest(
    boolean usePreviousIndex,
    boolean useIndexOptionsDefaults,
    boolean explicitIndexOptionsDoNotPrune
) throws IOException {
    // the index version is always resolved first, for either a current or a previous index
    IndexVersion versionUnderTest = getIndexVersionForTest(usePreviousIndex);

    // old indices always get the minimal mapping; new ones get it when defaults are requested
    // and prune:false is not being forced
    boolean wantsMinimalMapping = usePreviousIndex || (explicitIndexOptionsDoNotPrune == false && useIndexOptionsDefaults);
    if (wantsMinimalMapping) {
        return createMapperService(versionUnderTest, fieldMapping(this::minimalMapping));
    }

    if (explicitIndexOptionsDoNotPrune) {
        // index_options explicitly carries "prune: false"
        return createMapperService(versionUnderTest, fieldMapping(this::mappingWithIndexOptionsPruneFalse));
    }

    // remaining case: explicit, non-default pruning_config in the mapping
    return createMapperService(versionUnderTest, fieldMapping(this::minimalMappingWithExplicitIndexOptions));
}

private static List<WeightedToken> QUERY_VECTORS = List.of(
new WeightedToken("pugs", 0.5f),
new WeightedToken("cats", 0.4f),
new WeightedToken("is", 0.1f)
);
// Query tokens used by the pruning tests: the union of all fixture token maps, each entry turned into a WeightedToken.
// Collected with Stream.toList() so the shared constant is unmodifiable and cannot be altered by one test
// to the detriment of another. Element order follows Map.of iteration order and should not be relied upon.
private static final List<WeightedToken> QUERY_VECTORS = Stream.of(RARE_TOKENS, MEDIUM_TOKENS, COMMON_TOKENS)
    .flatMap(map -> map.entrySet().stream())
    .map(entry -> new WeightedToken(entry.getKey(), entry.getValue()))
    .toList();

/**
* Handles float/double conversion when reading/writing with xcontent by converting all numbers to floats.
Expand Down
Loading