elastic
diff --git a/‎docs/changelog/133576.yaml‎
Lines changed: 5 additions & 0 deletions b/‎docs/changelog/133576.yaml‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎server/src/main/java/org/elasticsearch/inference/ChunkingSettings.java‎
Lines changed: 2 additions & 0 deletions b/‎server/src/main/java/org/elasticsearch/inference/ChunkingSettings.java‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎x-pack/plugin/core/src/main/java/module-info.java‎
Lines changed: 1 addition & 0 deletions b/‎x-pack/plugin/core/src/main/java/module-info.java‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/common/chunks/MemoryIndexChunkScorer.java‎
Lines changed: 98 additions & 0 deletions b/‎x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/common/chunks/MemoryIndexChunkScorer.java‎
Lines changed: 98 additions & 0 deletions
diff --git a/‎x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/common/chunks/MemoryIndexChunkScorerTests.java‎
Lines changed: 95 additions & 0 deletions b/‎x-pack/plugin/core/src/test/java/org/elasticsearch/xpack/core/common/chunks/MemoryIndexChunkScorerTests.java‎
Lines changed: 95 additions & 0 deletions
diff --git a/‎x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/RecursiveChunkingSettings.java‎
Lines changed: 19 additions & 0 deletions b/‎x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/RecursiveChunkingSettings.java‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/SentenceBoundaryChunkingSettings.java‎
Lines changed: 21 additions & 0 deletions b/‎x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/SentenceBoundaryChunkingSettings.java‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/WordBoundaryChunkingSettings.java‎
Lines changed: 21 additions & 0 deletions b/‎x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/chunking/WordBoundaryChunkingSettings.java‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/rank/textsimilarity/ChunkScorerConfig.java‎
Lines changed: 100 additions & 0 deletions b/‎x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/rank/textsimilarity/ChunkScorerConfig.java‎
Lines changed: 100 additions & 0 deletions
@@ -0,0 +1,5 @@
+pr: 133576
+summary: Text similarity reranker chunks and scores snippets
+area: Relevance
+type: enhancement
+issues: []
@@ -24,4 +24,6 @@ public interface ChunkingSettings extends ToXContentObject, VersionedNamedWritea
      * @return The max chunk size specified, or null if not specified
      */
     Integer maxChunkSize();
+
+    /**
+     * Validates these chunking settings.
+     * This default implementation performs no checks; implementations may override it to reject invalid values.
+     */
+    default void validate() {}
 }
@@ -234,6 +234,7 @@
     exports org.elasticsearch.xpack.core.watcher.watch;
     exports org.elasticsearch.xpack.core.watcher;
     exports org.elasticsearch.xpack.core.security.authc.apikey;
+    exports org.elasticsearch.xpack.core.common.chunks;
 
     provides org.elasticsearch.action.admin.cluster.node.info.ComponentVersionNumber
         with
 
@@ -0,0 +1,98 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.core.common.chunks;
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.TextField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.store.ByteBuffersDirectory;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.QueryBuilder;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Utility class for scoring pre-determined chunks using an in-memory Lucene index.
+ */
+public class MemoryIndexChunkScorer {
+
+    private static final String CONTENT_FIELD = "content";
+
+    private final StandardAnalyzer analyzer;
+
+    public MemoryIndexChunkScorer() {
+        // TODO: Allow analyzer to be customizable and/or read from the field mapping
+        this.analyzer = new StandardAnalyzer();
+    }
+
+    /**
+     * Indexes the given chunks into an in-memory index, scores them against the inference text,
+     * and returns an ordered, scored list.
+     * <p>
+     * If no chunk matches the inference text lexically, or the inference text analyzes to no terms at all,
+     * the first {@code maxResults} chunks are returned in their original order with a score of 0.
+     *
+     * @param chunks the list of text chunks to score
+     * @param inferenceText the query text to compare against
+     * @param maxResults maximum number of results to return
+     * @return list of scored chunks ordered by relevance; empty if {@code chunks} is null or empty,
+     *         {@code inferenceText} is null or blank, or {@code maxResults} is not positive
+     * @throws IOException on failure scoring chunks
+     */
+    public List<ScoredChunk> scoreChunks(List<String> chunks, String inferenceText, int maxResults) throws IOException {
+        if (chunks == null || chunks.isEmpty() || inferenceText == null || inferenceText.trim().isEmpty()) {
+            return new ArrayList<>();
+        }
+        // Lucene rejects a search for zero (or fewer) hits, so a non-positive limit simply yields no results.
+        if (maxResults <= 0) {
+            return new ArrayList<>();
+        }
+
+        int numResults = Math.min(maxResults, chunks.size());
+
+        QueryBuilder qb = new QueryBuilder(analyzer);
+        Query query = qb.createBooleanQuery(CONTENT_FIELD, inferenceText, BooleanClause.Occur.SHOULD);
+        if (query == null) {
+            // The analyzer produced no terms (for example, punctuation-only text), so nothing can be scored lexically.
+            return unscoredLeadingChunks(chunks, numResults);
+        }
+
+        try (Directory directory = new ByteBuffersDirectory()) {
+            IndexWriterConfig config = new IndexWriterConfig(analyzer);
+            try (IndexWriter writer = new IndexWriter(directory, config)) {
+                for (String chunk : chunks) {
+                    Document doc = new Document();
+                    doc.add(new TextField(CONTENT_FIELD, chunk, Field.Store.YES));
+                    writer.addDocument(doc);
+                }
+                writer.commit();
+            }
+
+            try (DirectoryReader reader = DirectoryReader.open(directory)) {
+                IndexSearcher searcher = new IndexSearcher(reader);
+                TopDocs topDocs = searcher.search(query, numResults);
+
+                List<ScoredChunk> scoredChunks = new ArrayList<>();
+                for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
+                    Document doc = reader.storedFields().document(scoreDoc.doc);
+                    String content = doc.get(CONTENT_FIELD);
+                    scoredChunks.add(new ScoredChunk(content, scoreDoc.score));
+                }
+
+                // It's possible that no chunks were scorable (for example, a semantic match that does not have a lexical match).
+                // In this case, we'll return the first N chunks with a score of 0.
+                // TODO: consider parameterizing this
+                return scoredChunks.isEmpty() == false ? scoredChunks : unscoredLeadingChunks(chunks, numResults);
+            }
+        }
+    }
+
+    /**
+     * Returns the first {@code numResults} chunks, in their original order, each with a score of 0.
+     */
+    private static List<ScoredChunk> unscoredLeadingChunks(List<String> chunks, int numResults) {
+        return chunks.subList(0, numResults).stream().map(c -> new ScoredChunk(c, 0.0f)).toList();
+    }
+
+    /**
+     * Represents a chunk with its relevance score.
+     */
+    public record ScoredChunk(String content, float score) {}
+}
@@ -0,0 +1,95 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.core.common.chunks;
+
+import org.elasticsearch.test.ESTestCase;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+
+import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThan;
+
+/**
+ * Tests for {@link MemoryIndexChunkScorer}: ranked results, the unscored fallback, and empty or null input.
+ */
+public class MemoryIndexChunkScorerTests extends ESTestCase {
+
+    private static final List<String> CHUNKS = Arrays.asList(
+        "Cats like to sleep all day and play with mice",
+        "Dogs are loyal companions and great pets",
+        "The weather today is very sunny and warm",
+        "Dogs love to play with toys and go for walks",
+        "Elasticsearch is a great search engine"
+    );
+
+    public void testScoreChunks() throws IOException {
+        final int maxResults = 3;
+        final MemoryIndexChunkScorer scorer = new MemoryIndexChunkScorer();
+
+        List<MemoryIndexChunkScorer.ScoredChunk> ranked = scorer.scoreChunks(CHUNKS, "dogs play walk", maxResults);
+        assertEquals(maxResults, ranked.size());
+
+        // The chunks about dogs should score highest, followed by the chunk about cats
+        String[] expectedOrder = {
+            "Dogs love to play with toys and go for walks",
+            "Dogs are loyal companions and great pets",
+            "Cats like to sleep all day and play with mice" };
+        for (int position = 0; position < expectedOrder.length; position++) {
+            MemoryIndexChunkScorer.ScoredChunk hit = ranked.get(position);
+            assertTrue(hit.content().equalsIgnoreCase(expectedOrder[position]));
+            assertThat(hit.score(), greaterThan(0f));
+        }
+
+        // Each result must score at least as high as the one after it
+        for (int position = 0; position + 1 < ranked.size(); position++) {
+            assertTrue(ranked.get(position).score() >= ranked.get(position + 1).score());
+        }
+    }
+
+    public void testEmptyChunks() throws IOException {
+        final int maxResults = 3;
+        final MemoryIndexChunkScorer scorer = new MemoryIndexChunkScorer();
+
+        // A query with no lexical match falls back to the first N chunks, in input order, each scored 0
+        List<MemoryIndexChunkScorer.ScoredChunk> fallback = scorer.scoreChunks(CHUNKS, "puggles", maxResults);
+        assertEquals(maxResults, fallback.size());
+        for (int position = 0; position < maxResults; position++) {
+            MemoryIndexChunkScorer.ScoredChunk hit = fallback.get(position);
+            assertTrue(hit.content().equalsIgnoreCase(CHUNKS.get(position)));
+            assertThat(hit.score(), equalTo(0f));
+        }
+
+        // Empty or null chunks, and empty or null inference text, all produce an empty result
+        assertTrue(scorer.scoreChunks(List.of(), "puggles", maxResults).isEmpty());
+        assertTrue(scorer.scoreChunks(CHUNKS, "", maxResults).isEmpty());
+        assertTrue(scorer.scoreChunks(null, "puggles", maxResults).isEmpty());
+        assertTrue(scorer.scoreChunks(CHUNKS, null, maxResults).isEmpty());
+    }
+
+}
@@ -52,6 +52,25 @@ public RecursiveChunkingSettings(StreamInput in) throws IOException {
         separators = in.readCollectionAsList(StreamInput::readString);
     }
 
+    /**
+     * Validates these recursive chunking settings, collecting every violation before failing.
+     * The max chunk size must not be below {@code MAX_CHUNK_SIZE_LOWER_LIMIT}, and the separators list,
+     * when present, must not be empty. Both checks run independently of each other.
+     *
+     * @throws ValidationException if one or more settings are invalid
+     */
+    @Override
+    public void validate() {
+        ValidationException validationException = new ValidationException();
+
+        if (maxChunkSize < MAX_CHUNK_SIZE_LOWER_LIMIT) {
+            validationException.addValidationError(
+                ChunkingSettingsOptions.MAX_CHUNK_SIZE + "[" + maxChunkSize + "] must be above " + MAX_CHUNK_SIZE_LOWER_LIMIT
+            );
+        }
+
+        // Checked regardless of the chunk size result, so an empty separators list is always reported.
+        if (separators != null && separators.isEmpty()) {
+            validationException.addValidationError("Recursive chunking settings can not have an empty list of separators");
+        }
+
+        if (validationException.validationErrors().isEmpty() == false) {
+            throw validationException;
+        }
+    }
+
     public static RecursiveChunkingSettings fromMap(Map<String, Object> map) {
         ValidationException validationException = new ValidationException();
 
 
@@ -59,6 +59,27 @@ public Integer maxChunkSize() {
         return maxChunkSize;
     }
 
+    /**
+     * Checks the max chunk size and sentence overlap of these settings, reporting all violations together.
+     *
+     * @throws ValidationException if the max chunk size is below {@code MAX_CHUNK_SIZE_LOWER_LIMIT}
+     *                             or the sentence overlap is neither 0 nor 1
+     */
+    @Override
+    public void validate() {
+        final ValidationException errors = new ValidationException();
+
+        final boolean chunkSizeTooSmall = maxChunkSize < MAX_CHUNK_SIZE_LOWER_LIMIT;
+        if (chunkSizeTooSmall) {
+            final String message = ChunkingSettingsOptions.MAX_CHUNK_SIZE
+                + "["
+                + maxChunkSize
+                + "] must be above "
+                + MAX_CHUNK_SIZE_LOWER_LIMIT;
+            errors.addValidationError(message);
+        }
+
+        final boolean overlapOutOfRange = sentenceOverlap < 0 || sentenceOverlap > 1;
+        if (overlapOutOfRange) {
+            final String message = ChunkingSettingsOptions.SENTENCE_OVERLAP + "[" + sentenceOverlap + "] must be either 0 or 1";
+            errors.addValidationError(message);
+        }
+
+        if (errors.validationErrors().isEmpty()) {
+            return;
+        }
+        throw errors;
+    }
+
     @Override
     public Map<String, Object> asMap() {
         return Map.of(
 
@@ -48,6 +48,27 @@ public WordBoundaryChunkingSettings(StreamInput in) throws IOException {
         overlap = in.readInt();
     }
 
+    /**
+     * Checks the max chunk size and overlap of these settings, reporting all violations together.
+     *
+     * @throws ValidationException if the max chunk size is below {@code MAX_CHUNK_SIZE_LOWER_LIMIT}
+     *                             or the overlap exceeds half of the max chunk size
+     */
+    @Override
+    public void validate() {
+        final ValidationException errors = new ValidationException();
+
+        final boolean chunkSizeTooSmall = maxChunkSize < MAX_CHUNK_SIZE_LOWER_LIMIT;
+        if (chunkSizeTooSmall) {
+            final String message = ChunkingSettingsOptions.MAX_CHUNK_SIZE
+                + "["
+                + maxChunkSize
+                + "] must be above "
+                + MAX_CHUNK_SIZE_LOWER_LIMIT;
+            errors.addValidationError(message);
+        }
+
+        final boolean overlapTooLarge = overlap > maxChunkSize / 2;
+        if (overlapTooLarge) {
+            final String message = ChunkingSettingsOptions.OVERLAP
+                + "["
+                + overlap
+                + "] must be less than or equal to half of max chunk size";
+            errors.addValidationError(message);
+        }
+
+        if (errors.validationErrors().isEmpty()) {
+            return;
+        }
+        throw errors;
+    }
+
     @Override
     public Map<String, Object> asMap() {
         return Map.of(
 
@@ -0,0 +1,100 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0; you may not use this file except in compliance with the Elastic License
+ * 2.0.
+ */
+
+package org.elasticsearch.xpack.inference.rank.textsimilarity;
+
+import org.elasticsearch.common.io.stream.StreamInput;
+import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.io.stream.Writeable;
+import org.elasticsearch.inference.ChunkingSettings;
+import org.elasticsearch.xpack.inference.chunking.ChunkingSettingsBuilder;
+import org.elasticsearch.xpack.inference.chunking.SentenceBoundaryChunkingSettings;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Objects;
+
+public class ChunkScorerConfig implements Writeable {
+
+    public final Integer size;
+    private final String inferenceText;
+    private final ChunkingSettings chunkingSettings;
+
+    public static final int DEFAULT_CHUNK_SIZE = 300;
+    public static final int DEFAULT_SIZE = 1;
+
+    public static ChunkingSettings createChunkingSettings(Integer chunkSize) {
+        int chunkSizeOrDefault = chunkSize != null ? chunkSize : DEFAULT_CHUNK_SIZE;
+        ChunkingSettings chunkingSettings = new SentenceBoundaryChunkingSettings(chunkSizeOrDefault, 0);
+        chunkingSettings.validate();
+        return chunkingSettings;
+    }
+
+    public static ChunkingSettings chunkingSettingsFromMap(Map<String, Object> map) {
+
+        if (map == null || map.isEmpty()) {
+            return createChunkingSettings(DEFAULT_CHUNK_SIZE);
+        }
+
+        if (map.size() == 1 && map.containsKey("max_chunk_size")) {
+            return createChunkingSettings((Integer) map.get("max_chunk_size"));
+        }
+
+        return ChunkingSettingsBuilder.fromMap(map);
+    }
+
+    public ChunkScorerConfig(StreamInput in) throws IOException {
+        this.size = in.readOptionalVInt();
+        this.inferenceText = in.readString();
+        Map<String, Object> chunkingSettingsMap = in.readGenericMap();
+        this.chunkingSettings = ChunkingSettingsBuilder.fromMap(chunkingSettingsMap);
+    }
+
+    public ChunkScorerConfig(Integer size, ChunkingSettings chunkingSettings) {
+        this(size, null, chunkingSettings);
+    }
+
+    public ChunkScorerConfig(Integer size, String inferenceText, ChunkingSettings chunkingSettings) {
+        this.size = size;
+        this.inferenceText = inferenceText;
+        this.chunkingSettings = chunkingSettings;
+    }
+
+    @Override
+    public void writeTo(StreamOutput out) throws IOException {
+        out.writeOptionalVInt(size);
+        out.writeString(inferenceText);
+        out.writeGenericMap(chunkingSettings.asMap());
+    }
+
+    public Integer size() {
+        return size;
+    }
+
+    public String inferenceText() {
+        return inferenceText;
+    }
+
+    public ChunkingSettings chunkingSettings() {
+        return chunkingSettings;
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+        ChunkScorerConfig that = (ChunkScorerConfig) o;
+        return Objects.equals(size, that.size)
+            && Objects.equals(inferenceText, that.inferenceText)
+            && Objects.equals(chunkingSettings, that.chunkingSettings);
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(size, inferenceText, chunkingSettings);
+    }
+}
Original file line number	Diff line number	Diff line change
`@@ -24,4 +24,6 @@ public interface ChunkingSettings extends ToXContentObject, VersionedNamedWritea`
`24`	`24`	`* @return The max chunk size specified, or null if not specified`
`25`	`25`	`*/`
`26`	`26`	`Integer maxChunkSize();`
	`27`	`+`
	`28`	`+ default void validate() {}`
`27`	`29`	`}`