Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
79b7e72
Instead of generating snippets via highlighter, chunk and score chunk…
kderusso Aug 26, 2025
9f28c08
[CI] Auto commit changes from spotless
Aug 26, 2025
49d25a7
Add customization based on preferred chunking settings or chunk size …
kderusso Aug 27, 2025
0036271
Cleanup
kderusso Aug 27, 2025
2df2f9d
Update docs/changelog/133576.yaml
kderusso Aug 27, 2025
ad404db
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Aug 27, 2025
8b7f7f2
Refactor/Rename SnippetScorer to MemoryIndexChunkScorer
kderusso Aug 27, 2025
80f4434
PR feedback on MemoryIndexChunkScorer
kderusso Aug 27, 2025
9872258
Update API and code to rename snippets to chunks
kderusso Aug 27, 2025
8c4ab1e
Missed some snippet renames
kderusso Aug 28, 2025
fc706a8
Handle case where no matches were found to score chunks
kderusso Sep 2, 2025
246dfa2
PR feedback on MemoryIndexChunkScorer, add tests
kderusso Sep 8, 2025
6355282
Rename num_chunks to size
kderusso Sep 8, 2025
d03a0f1
Merge from main
kderusso Sep 8, 2025
ed13074
[CI] Auto commit changes from spotless
Sep 8, 2025
6a06b84
Fix error in merge
kderusso Sep 8, 2025
3c695ac
Fix transport version issues after they were consolidated in main
kderusso Sep 8, 2025
386f3b8
[CI] Auto commit changes from spotless
Sep 8, 2025
9d35f6c
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 8, 2025
7131b30
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 8, 2025
9c6041b
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 8, 2025
ed4859e
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
elasticmachine Sep 8, 2025
8ac71ef
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 8, 2025
5129275
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
elasticmachine Sep 8, 2025
5815c9b
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 9, 2025
ee106a6
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 9, 2025
dfcefc5
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 9, 2025
a0cad00
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 9, 2025
92a060f
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 9, 2025
a172b6c
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 9, 2025
a6c2364
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 9, 2025
b3f95f9
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
elasticmachine Sep 9, 2025
e55ccfe
Add feature flag to InferenceUpgradeTestCase
kderusso Sep 10, 2025
68ea8cb
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 10, 2025
68af14f
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 10, 2025
37eca54
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
elasticmachine Sep 10, 2025
b66a58e
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 10, 2025
7a4ccff
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 10, 2025
ca597fa
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 10, 2025
ef024cf
Yolo see if this fixes the test
kderusso Sep 11, 2025
c208845
Real fix for upgrade IT
kderusso Sep 11, 2025
cc1e913
[CI] Auto commit changes from spotless
Sep 11, 2025
2651fbd
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 11, 2025
d0813b2
Another ignore
kderusso Sep 11, 2025
d0c2139
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 11, 2025
187106f
Revert "Another ignore"
kderusso Sep 11, 2025
af95d57
let's try reverting the renamed feature flag. If this is the cause of…
kderusso Sep 11, 2025
6f8b5fe
Merge branch 'main' into kderusso/text-similarity-reranking-now-with-…
kderusso Sep 11, 2025
d0dd688
Remove ignored test
kderusso Sep 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/changelog/133576.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 133576
summary: Text similarity reranker chunks and scores snippets
area: Relevance
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ public class CcsCommonYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
.feature(FeatureFlag.SUB_OBJECTS_AUTO_ENABLED)
.feature(FeatureFlag.IVF_FORMAT)
.feature(FeatureFlag.SYNTHETIC_VECTORS)
.feature(FeatureFlag.RERANK_SNIPPETS);
.feature(FeatureFlag.RERANK_RESCORE_CHUNKS);

private static ElasticsearchCluster remoteCluster = ElasticsearchCluster.local()
.name(REMOTE_CLUSTER_NAME)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ public class RcsCcsCommonYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
.feature(FeatureFlag.SUB_OBJECTS_AUTO_ENABLED)
.feature(FeatureFlag.IVF_FORMAT)
.feature(FeatureFlag.SYNTHETIC_VECTORS)
.feature(FeatureFlag.RERANK_SNIPPETS)
.feature(FeatureFlag.RERANK_RESCORE_CHUNKS)
.user("test_admin", "x-pack-test-password");

private static ElasticsearchCluster fulfillingCluster = ElasticsearchCluster.local()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public class SmokeTestMultiNodeClientYamlTestSuiteIT extends ESClientYamlSuiteTe
.feature(FeatureFlag.USE_LUCENE101_POSTINGS_FORMAT)
.feature(FeatureFlag.IVF_FORMAT)
.feature(FeatureFlag.SYNTHETIC_VECTORS)
.feature(FeatureFlag.RERANK_SNIPPETS)
.feature(FeatureFlag.RERANK_RESCORE_CHUNKS)
.build();

public SmokeTestMultiNodeClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public class ClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase {
.feature(FeatureFlag.USE_LUCENE101_POSTINGS_FORMAT)
.feature(FeatureFlag.IVF_FORMAT)
.feature(FeatureFlag.SYNTHETIC_VECTORS)
.feature(FeatureFlag.RERANK_SNIPPETS)
.feature(FeatureFlag.RERANK_RESCORE_CHUNKS)
.build();

public ClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,7 @@ static TransportVersion def(int id) {
public static final TransportVersion ESQL_SAMPLE_OPERATOR_STATUS = def(9_127_0_00);
public static final TransportVersion ESQL_TOPN_TIMINGS = def(9_128_0_00);
public static final TransportVersion NODE_WEIGHTS_ADDED_TO_NODE_BALANCE_STATS = def(9_129_0_00);
public static final TransportVersion RERANK_SNIPPETS = def(9_130_0_00);
public static final TransportVersion RERANK_RESCORE_CHUNKS = def(9_130_0_00);
public static final TransportVersion PIPELINE_TRACKING_INFO = def(9_131_0_00);
public static final TransportVersion COMPONENT_TEMPLATE_TRACKING_INFO = def(9_132_0_00);
public static final TransportVersion TO_CHILD_BLOCK_JOIN_QUERY = def(9_133_0_00);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,6 @@ public interface ChunkingSettings extends ToXContentObject, VersionedNamedWritea
ChunkingStrategy getChunkingStrategy();

Map<String, Object> asMap();

default void validate() {}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ public RankFeatureDoc(int doc, float score, int shardIndex) {

public RankFeatureDoc(StreamInput in) throws IOException {
super(in);
if (in.getTransportVersion().onOrAfter(TransportVersions.RERANK_SNIPPETS)) {
if (in.getTransportVersion().onOrAfter(TransportVersions.RERANK_RESCORE_CHUNKS)) {
featureData = in.readOptionalStringCollectionAsList();
} else {
String featureDataString = in.readOptionalString();
Expand All @@ -55,7 +55,7 @@ public void featureData(List<String> featureData) {

@Override
protected void doWriteTo(StreamOutput out) throws IOException {
if (out.getTransportVersion().onOrAfter(TransportVersions.RERANK_SNIPPETS)) {
if (out.getTransportVersion().onOrAfter(TransportVersions.RERANK_RESCORE_CHUNKS)) {
out.writeOptionalStringCollection(featureData);
} else {
out.writeOptionalString(featureData.get(0));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public enum FeatureFlag {
LOGS_STREAM("es.logs_stream_feature_flag_enabled=true", Version.fromString("9.1.0"), null),
PATTERNED_TEXT("es.patterned_text_feature_flag_enabled=true", Version.fromString("9.1.0"), null),
SYNTHETIC_VECTORS("es.mapping_synthetic_vectors=true", Version.fromString("9.2.0"), null),
RERANK_SNIPPETS("es.text_similarity_reranker_snippets=true", Version.fromString("9.2.0"), null);
RERANK_RESCORE_CHUNKS("es.text_similarity_reranker_rescore_chunks=true", Version.fromString("9.2.0"), null);

public final String systemProperty;
public final Version from;
Expand Down
1 change: 1 addition & 0 deletions x-pack/plugin/core/src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -231,6 +231,7 @@
exports org.elasticsearch.xpack.core.watcher.watch;
exports org.elasticsearch.xpack.core.watcher;
exports org.elasticsearch.xpack.core.security.authc.apikey;
exports org.elasticsearch.xpack.core.common.chunks;

provides org.elasticsearch.action.admin.cluster.node.info.ComponentVersionNumber
with
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.core.common.chunks;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.QueryBuilder;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
* Utility class for scoring pre-determined chunks using an in-memory Lucene index.
*/
public class MemoryIndexChunkScorer {

    private static final String CONTENT_FIELD = "content";

    private final StandardAnalyzer analyzer;

    public MemoryIndexChunkScorer() {
        // TODO: Allow analyzer to be customizable and/or read from the field mapping
        this.analyzer = new StandardAnalyzer();
    }

    /**
     * Indexes the given chunks into a temporary in-memory index, runs the inference text against it as a
     * boolean OR query, and returns the matching chunks with their scores, in the order the searcher
     * returns them.
     * <p>
     * An empty list is returned when there is nothing to score: no chunks, a null or blank inference text,
     * a non-positive {@code maxResults}, or an inference text that yields no query terms after analysis.
     *
     * @param chunks the list of text chunks to score
     * @param inferenceText the query text to compare against
     * @param maxResults maximum number of results to return
     * @return list of scored chunks ordered by relevance; chunks that do not match the query are omitted
     * @throws IOException on failure scoring chunks
     */
    public List<ScoredChunk> scoreChunks(List<String> chunks, String inferenceText, int maxResults) throws IOException {
        if (chunks == null || chunks.isEmpty() || inferenceText == null || inferenceText.trim().isEmpty()) {
            return new ArrayList<>();
        }
        // Asking the searcher for zero or fewer hits is not a meaningful request, so answer it directly.
        if (maxResults <= 0) {
            return new ArrayList<>();
        }

        try (Directory directory = new ByteBuffersDirectory()) {
            IndexWriterConfig config = new IndexWriterConfig(analyzer);
            try (IndexWriter writer = new IndexWriter(directory, config)) {
                for (String chunk : chunks) {
                    Document doc = new Document();
                    // Store the text so the original chunk can be read back from each hit.
                    doc.add(new TextField(CONTENT_FIELD, chunk, Field.Store.YES));
                    writer.addDocument(doc);
                }
                writer.commit();
            }

            try (DirectoryReader reader = DirectoryReader.open(directory)) {
                IndexSearcher searcher = new IndexSearcher(reader);

                QueryBuilder qb = new QueryBuilder(analyzer);
                Query query = qb.createBooleanQuery(CONTENT_FIELD, inferenceText, BooleanClause.Occur.SHOULD);
                if (query == null) {
                    // The analyzer produced no terms (for example punctuation-only text), so nothing can match.
                    return new ArrayList<>();
                }

                int numResults = Math.min(maxResults, chunks.size());
                TopDocs topDocs = searcher.search(query, numResults);

                List<ScoredChunk> scoredChunks = new ArrayList<>(topDocs.scoreDocs.length);
                for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                    Document doc = reader.storedFields().document(scoreDoc.doc);
                    String content = doc.get(CONTENT_FIELD);
                    scoredChunks.add(new ScoredChunk(content, scoreDoc.score));
                }

                return scoredChunks;
            }
        }
    }

    /**
     * Represents a chunk with its relevance score.
     */
    public record ScoredChunk(String content, float score) {}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0; you may not use this file except in compliance with the Elastic License
* 2.0.
*/

package org.elasticsearch.xpack.core.common.chunks;

import org.elasticsearch.test.ESTestCase;

import java.io.IOException;
import java.util.Arrays;
import java.util.List;

import static org.hamcrest.Matchers.greaterThan;

public class MemoryIndexChunkScorerTests extends ESTestCase {

    /**
     * Scores five chunks against a three-term query and checks that the top three results come back
     * in the expected order, each with a positive score, and that scores never increase down the list.
     */
    public void testScoreChunks() throws IOException {
        List<String> chunks = Arrays.asList(
            "Cats like to sleep all day and play with mice",
            "Dogs are loyal companions and great pets",
            "The weather today is very sunny and warm",
            "Dogs love to play with toys and go for walks",
            "Elasticsearch is a great search engine"
        );
        String queryText = "dogs play walk";
        int limit = 3;

        // The two chunks about dogs are expected first, followed by the chunk about cats
        String[] expectedOrder = {
            "Dogs love to play with toys and go for walks",
            "Dogs are loyal companions and great pets",
            "Cats like to sleep all day and play with mice" };

        MemoryIndexChunkScorer chunkScorer = new MemoryIndexChunkScorer();
        List<MemoryIndexChunkScorer.ScoredChunk> ranked = chunkScorer.scoreChunks(chunks, queryText, limit);

        assertEquals(limit, ranked.size());

        for (int rank = 0; rank < expectedOrder.length; rank++) {
            MemoryIndexChunkScorer.ScoredChunk hit = ranked.get(rank);
            assertTrue(hit.content().equalsIgnoreCase(expectedOrder[rank]));
            assertThat(hit.score(), greaterThan(0f));
        }

        // Each result must score at least as high as the one that follows it
        for (int i = 0; i + 1 < ranked.size(); i++) {
            float current = ranked.get(i).score();
            float next = ranked.get(i + 1).score();
            assertTrue(current >= next);
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
import static org.elasticsearch.xpack.inference.queries.SemanticKnnVectorQueryRewriteInterceptor.SEMANTIC_KNN_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED;
import static org.elasticsearch.xpack.inference.queries.SemanticMatchQueryRewriteInterceptor.SEMANTIC_MATCH_QUERY_REWRITE_INTERCEPTION_SUPPORTED;
import static org.elasticsearch.xpack.inference.queries.SemanticSparseVectorQueryRewriteInterceptor.SEMANTIC_SPARSE_VECTOR_QUERY_REWRITE_INTERCEPTION_SUPPORTED;
import static org.elasticsearch.xpack.inference.rank.textsimilarity.TextSimilarityRankRetrieverBuilder.RERANK_SNIPPETS;
import static org.elasticsearch.xpack.inference.rank.textsimilarity.TextSimilarityRankRetrieverBuilder.TEXT_SIMILARITY_RERANKER_SNIPPETS;
import static org.elasticsearch.xpack.inference.rank.textsimilarity.TextSimilarityRankRetrieverBuilder.RERANK_RESCORE_CHUNKS;
import static org.elasticsearch.xpack.inference.rank.textsimilarity.TextSimilarityRankRetrieverBuilder.TEXT_SIMILARITY_RERANKER_RESCORE_CHUNKS;

/**
* Provides inference features.
Expand Down Expand Up @@ -85,8 +85,8 @@ public Set<NodeFeature> getTestFeatures() {
SEMANTIC_TEXT_FIELDS_CHUNKS_FORMAT
)
);
if (RERANK_SNIPPETS.isEnabled()) {
testFeatures.add(TEXT_SIMILARITY_RERANKER_SNIPPETS);
if (RERANK_RESCORE_CHUNKS.isEnabled()) {
testFeatures.add(TEXT_SIMILARITY_RERANKER_RESCORE_CHUNKS);
}
return testFeatures;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,31 @@ public RecursiveChunkingSettings(StreamInput in) throws IOException {
separators = in.readCollectionAsList(StreamInput::readString);
}

/**
 * Validates these recursive chunking settings.
 * <p>
 * Two independent checks are made: the max chunk size must lie between
 * {@code MAX_CHUNK_SIZE_LOWER_LIMIT} and {@code MAX_CHUNK_SIZE_UPPER_LIMIT} (inclusive), and the
 * separators list, when present, must not be empty. Every failed check is recorded so that a single
 * exception reports all problems.
 *
 * @throws ValidationException if at least one check fails
 */
@Override
public void validate() {
    ValidationException validationException = new ValidationException();

    if (maxChunkSize < MAX_CHUNK_SIZE_LOWER_LIMIT || maxChunkSize > MAX_CHUNK_SIZE_UPPER_LIMIT) {
        validationException.addValidationError(
            ChunkingSettingsOptions.MAX_CHUNK_SIZE
                + "["
                + maxChunkSize
                + "] must be between "
                + MAX_CHUNK_SIZE_LOWER_LIMIT
                + " and "
                + MAX_CHUNK_SIZE_UPPER_LIMIT
        );
    }

    // Checked regardless of the chunk size result: an empty separators list is an error on its own.
    if (separators != null && separators.isEmpty()) {
        validationException.addValidationError("Recursive chunking settings can not have an empty list of separators");
    }

    if (validationException.validationErrors().isEmpty() == false) {
        throw validationException;
    }
}

public static RecursiveChunkingSettings fromMap(Map<String, Object> map) {
ValidationException validationException = new ValidationException();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,33 @@ public SentenceBoundaryChunkingSettings(StreamInput in) throws IOException {
}
}

/**
 * Validates these sentence boundary chunking settings.
 * <p>
 * The max chunk size must lie between {@code MAX_CHUNK_SIZE_LOWER_LIMIT} and
 * {@code MAX_CHUNK_SIZE_UPPER_LIMIT} (inclusive), and the sentence overlap must be 0 or 1.
 * All failed checks are collected and reported together.
 *
 * @throws ValidationException if at least one check fails
 */
@Override
public void validate() {
    ValidationException errors = new ValidationException();

    boolean chunkSizeOutOfRange = maxChunkSize < MAX_CHUNK_SIZE_LOWER_LIMIT || maxChunkSize > MAX_CHUNK_SIZE_UPPER_LIMIT;
    if (chunkSizeOutOfRange) {
        String chunkSizeMessage = ChunkingSettingsOptions.MAX_CHUNK_SIZE
            + "["
            + maxChunkSize
            + "] must be between "
            + MAX_CHUNK_SIZE_LOWER_LIMIT
            + " and "
            + MAX_CHUNK_SIZE_UPPER_LIMIT;
        errors.addValidationError(chunkSizeMessage);
    }

    boolean overlapOutOfRange = sentenceOverlap < 0 || sentenceOverlap > 1;
    if (overlapOutOfRange) {
        String overlapMessage = ChunkingSettingsOptions.SENTENCE_OVERLAP + "[" + sentenceOverlap + "] must be either 0 or 1";
        errors.addValidationError(overlapMessage);
    }

    // Nothing recorded means the settings are valid.
    if (errors.validationErrors().isEmpty()) {
        return;
    }
    throw errors;
}

@Override
public Map<String, Object> asMap() {
return Map.of(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,33 @@ public WordBoundaryChunkingSettings(StreamInput in) throws IOException {
overlap = in.readInt();
}

/**
 * Validates these word boundary chunking settings.
 * <p>
 * The max chunk size must lie between {@code MAX_CHUNK_SIZE_LOWER_LIMIT} and
 * {@code MAX_CHUNK_SIZE_UPPER_LIMIT} (inclusive), and the overlap may not exceed
 * {@code maxChunkSize / 2}. All failed checks are collected and reported together.
 *
 * @throws ValidationException if at least one check fails
 */
@Override
public void validate() {
    ValidationException problems = new ValidationException();

    if (maxChunkSize < MAX_CHUNK_SIZE_LOWER_LIMIT || maxChunkSize > MAX_CHUNK_SIZE_UPPER_LIMIT) {
        String sizeProblem = ChunkingSettingsOptions.MAX_CHUNK_SIZE
            + "["
            + maxChunkSize
            + "] must be between "
            + MAX_CHUNK_SIZE_LOWER_LIMIT
            + " and "
            + MAX_CHUNK_SIZE_UPPER_LIMIT;
        problems.addValidationError(sizeProblem);
    }

    // Same division as the check has always used: the limit is maxChunkSize / 2.
    if (overlap > maxChunkSize / 2) {
        String overlapProblem = ChunkingSettingsOptions.OVERLAP + "[" + overlap + "] must be less than or equal to half of max chunk size";
        problems.addValidationError(overlapProblem);
    }

    boolean valid = problems.validationErrors().isEmpty();
    if (valid == false) {
        throw problems;
    }
}

@Override
public Map<String, Object> asMap() {
return Map.of(
Expand Down
Loading