
Commit a807ab5

Fix chunking in test inference service and yaml tests
1 parent: 0b2ebf6

4 files changed: 146 additions, 58 deletions

x-pack/plugin/inference/qa/test-service-plugin/src/main/java/org/elasticsearch/xpack/inference/mock/AbstractTestInferenceService.java

Lines changed: 12 additions & 12 deletions
```diff
@@ -29,6 +29,7 @@
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -111,25 +112,24 @@ public void start(Model model, TimeValue timeout, ActionListener<Boolean> listen
     @Override
     public void close() throws IOException {}
 
-    protected List<String> chunkInputs(List<String> input, ChunkingSettings chunkingSettings) {
+    protected List<String> chunkInputs(String input, ChunkingSettings chunkingSettings) {
         if (chunkingSettings == null) {
-            return input;
+            return Collections.singletonList(input);
         }
         List<String> chunkedInputs = new ArrayList<>();
         ChunkingStrategy chunkingStrategy = chunkingSettings.getChunkingStrategy();
         if (chunkingStrategy == ChunkingStrategy.WORD) {
             WordBoundaryChunker chunker = new WordBoundaryChunker();
-            for (String inputString : input) {
-                WordBoundaryChunkingSettings wordBoundaryChunkingSettings = (WordBoundaryChunkingSettings) chunkingSettings;
-                List<WordBoundaryChunker.ChunkOffset> offsets = chunker.chunk(
-                    inputString,
-                    wordBoundaryChunkingSettings.maxChunkSize(),
-                    wordBoundaryChunkingSettings.overlap()
-                );
-                for (WordBoundaryChunker.ChunkOffset offset : offsets) {
-                    chunkedInputs.add(inputString.substring(offset.start(), offset.end()));
-                }
+            WordBoundaryChunkingSettings wordBoundaryChunkingSettings = (WordBoundaryChunkingSettings) chunkingSettings;
+            List<WordBoundaryChunker.ChunkOffset> offsets = chunker.chunk(
+                input,
+                wordBoundaryChunkingSettings.maxChunkSize(),
+                wordBoundaryChunkingSettings.overlap()
+            );
+            for (WordBoundaryChunker.ChunkOffset offset : offsets) {
+                chunkedInputs.add(input.substring(offset.start(), offset.end()));
             }
+
         } else {
            // Won't implement till we need it
            throw new UnsupportedOperationException("Test inference service only supports word chunking strategies");
```
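For reference, the word chunking strategy exercised here can be sketched in isolation. The class below is a hypothetical stand-in for WordBoundaryChunker, whose real implementation is not shown in this diff: it splits a single input into chunks of at most maxChunkSize words, with `overlap` words shared between consecutive chunks, and returns offsets into the original string, mirroring the ChunkOffset(start, end) pairs the test service consumes.

```java
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

// Hypothetical sketch of word-boundary chunking; assumes overlap < maxChunkSize.
// Not the actual WordBoundaryChunker implementation.
public final class WordChunkSketch {

    public record Offset(int start, int end) {}

    public static List<Offset> chunk(String input, int maxChunkSize, int overlap) {
        // Record the start/end offsets of every whitespace-delimited word.
        List<int[]> words = new ArrayList<>();
        Matcher m = Pattern.compile("\\S+").matcher(input);
        while (m.find()) {
            words.add(new int[] { m.start(), m.end() });
        }
        List<Offset> chunks = new ArrayList<>();
        int step = maxChunkSize - overlap; // words to advance between chunk starts
        for (int i = 0; i < words.size(); i += step) {
            int last = Math.min(i + maxChunkSize, words.size()) - 1;
            chunks.add(new Offset(words.get(i)[0], words.get(last)[1]));
            if (last == words.size() - 1) {
                break; // final chunk covers the tail of the input
            }
        }
        return chunks;
    }

    public static void main(String[] args) {
        String text = "Elasticsearch is an open source, distributed, RESTful, search engine";
        for (Offset o : chunk(text, 5, 1)) {
            System.out.println(text.substring(o.start(), o.end()));
        }
    }
}
```

With max_chunk_size 5 and overlap 1, each new chunk starts 4 words after the previous one, which is the shape the YAML tests below rely on.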

x-pack/plugin/inference/qa/test-service-plugin/src/main/java/org/elasticsearch/xpack/inference/mock/TestDenseInferenceServiceExtension.java

Lines changed: 23 additions & 20 deletions
```diff
@@ -36,7 +36,9 @@
 import org.elasticsearch.xcontent.ToXContentObject;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xpack.core.inference.results.ChunkedInferenceEmbedding;
+import org.elasticsearch.xpack.core.inference.results.SparseEmbeddingResults;
 import org.elasticsearch.xpack.core.inference.results.TextEmbeddingFloatResults;
+import org.elasticsearch.xpack.core.ml.search.WeightedToken;
 
 import java.io.IOException;
 import java.nio.charset.StandardCharsets;
@@ -175,27 +177,28 @@ private TextEmbeddingFloatResults makeResults(List<String> input, int dimensions
         return new TextEmbeddingFloatResults(embeddings);
     }
 
-    private List<ChunkedInference> makeChunkedResults(List<String> input, int dimensions, ChunkingSettings chunkingSettings) {
-        List<String> chunkedInputs = chunkInputs(input, chunkingSettings);
-        return makeChunkedResults(chunkedInputs, dimensions);
-    }
-
-    private List<ChunkedInference> makeChunkedResults(List<String> input, int dimensions) {
-        TextEmbeddingFloatResults nonChunkedResults = makeResults(input, dimensions);
-
-        var results = new ArrayList<ChunkedInference>();
-        for (int i = 0; i < input.size(); i++) {
-            results.add(
-                new ChunkedInferenceEmbedding(
-                    List.of(
-                        new TextEmbeddingFloatResults.Chunk(
-                            nonChunkedResults.embeddings().get(i).values(),
-                            input.get(i),
-                            new ChunkedInference.TextOffset(0, input.get(i).length())
-                        )
+    private List<ChunkedInference> makeChunkedResults(List<String> inputs, int dimensions, ChunkingSettings chunkingSettings) {
+
+        List<ChunkedInference> results = new ArrayList<>();
+        for (int i = 0; i < inputs.size(); i++) {
+            String input = inputs.get(i);
+            TextEmbeddingFloatResults nonChunkedResults = makeResults(inputs, dimensions);
+            List<String> chunkedInput = chunkInputs(input, chunkingSettings);
+            List<TextEmbeddingFloatResults.Chunk> chunks = new ArrayList<>();
+            int offset = 0;
+            for (String c : chunkedInput) {
+                offset = input.indexOf(c, offset);
+                int endOffset = offset + c.length();
+                chunks.add(
+                    new TextEmbeddingFloatResults.Chunk(
+                        nonChunkedResults.embeddings().get(i).values(),
+                        c,
+                        new ChunkedInference.TextOffset(offset, endOffset)
                     )
-                )
-            );
+                );
+            }
+            ChunkedInferenceEmbedding chunkedInferenceEmbedding = new ChunkedInferenceEmbedding(chunks);
+            results.add(chunkedInferenceEmbedding);
         }
         return results;
     }
```
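The offset bookkeeping introduced above deserves a note: each chunk's TextOffset is recovered by searching for the chunk text in the original input, starting from the previous match position. A minimal standalone illustration of that pattern (the strings here are hypothetical, not taken from the test service):

```java
import java.util.List;

// Standalone illustration of the indexOf-based offset recovery used above.
// Searching from the previous chunk's start keeps overlapping chunks in order,
// since each successive chunk begins at or after the start of the one before it.
public final class ChunkOffsetDemo {
    public static void main(String[] args) {
        String input = "alpha beta gamma beta delta";
        List<String> chunkedInput = List.of("alpha beta gamma", "gamma beta delta");
        int offset = 0;
        for (String c : chunkedInput) {
            offset = input.indexOf(c, offset);
            int endOffset = offset + c.length();
            System.out.printf("[%d, %d) %s%n", offset, endOffset, c); // prints [0, 16) then [11, 27)
        }
    }
}
```

Note the second chunk's "beta" is found at its correct position even though "beta" also occurs earlier, because the search resumes from the previous chunk's start rather than from zero.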

x-pack/plugin/inference/qa/test-service-plugin/src/main/java/org/elasticsearch/xpack/inference/mock/TestSparseInferenceServiceExtension.java

Lines changed: 15 additions & 19 deletions
```diff
@@ -34,6 +34,7 @@
 import org.elasticsearch.xcontent.ToXContentObject;
 import org.elasticsearch.xcontent.XContentBuilder;
 import org.elasticsearch.xpack.core.inference.results.ChunkedInferenceEmbedding;
+import org.elasticsearch.xpack.core.inference.results.EmbeddingResults;
 import org.elasticsearch.xpack.core.inference.results.SparseEmbeddingResults;
 import org.elasticsearch.xpack.core.ml.search.WeightedToken;
 
@@ -166,29 +167,24 @@ private SparseEmbeddingResults makeResults(List<String> input) {
         return new SparseEmbeddingResults(embeddings);
     }
 
-    private List<ChunkedInference> makeChunkedResults(List<String> input, ChunkingSettings chunkingSettings) {
-        List<String> chunkedInputs = chunkInputs(input, chunkingSettings);
-        return makeChunkedResults(chunkedInputs);
-    }
-
-    private List<ChunkedInference> makeChunkedResults(List<String> input) {
+    private List<ChunkedInference> makeChunkedResults(List<String> inputs, ChunkingSettings chunkingSettings) {
         List<ChunkedInference> results = new ArrayList<>();
-        for (int i = 0; i < input.size(); i++) {
+        for (int i = 0; i < inputs.size(); i++) {
+            String input = inputs.get(i);
             var tokens = new ArrayList<WeightedToken>();
             for (int j = 0; j < 5; j++) {
-                tokens.add(new WeightedToken("feature_" + j, generateEmbedding(input.get(i), j)));
+                tokens.add(new WeightedToken("feature_" + j, generateEmbedding(input, j)));
             }
-            results.add(
-                new ChunkedInferenceEmbedding(
-                    List.of(
-                        new SparseEmbeddingResults.Chunk(
-                            tokens,
-                            input.get(i),
-                            new ChunkedInference.TextOffset(0, input.get(i).length())
-                        )
-                    )
-                )
-            );
+            List<String> chunkedInput = chunkInputs(input, chunkingSettings);
+            List<SparseEmbeddingResults.Chunk> chunks = new ArrayList<>();
+            int offset = 0;
+            for (String c : chunkedInput) {
+                offset = input.indexOf(c, offset);
+                int endOffset = offset + c.length();
+                chunks.add(new SparseEmbeddingResults.Chunk(tokens, c, new ChunkedInference.TextOffset(offset, endOffset)));
+            }
+            ChunkedInferenceEmbedding chunkedInferenceEmbedding = new ChunkedInferenceEmbedding(chunks);
+            results.add(chunkedInferenceEmbedding);
        }
        return results;
     }
```
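One behavioral detail of the sparse path: the WeightedToken list is built from the full input before chunking, so every chunk of a document carries the same five token weights. A tiny illustration of that sharing follows; mockWeight stands in for the test plugin's generateEmbedding, whose definition does not appear in this diff.

```java
import java.util.ArrayList;
import java.util.List;

// Illustration: tokens derived from the whole input are reused for each chunk,
// so all chunks of one document carry identical weights.
public final class SharedTokensDemo {

    record Token(String feature, float weight) {}

    // Hypothetical stand-in for generateEmbedding(input, j).
    static float mockWeight(String input, int j) {
        return (input.length() % 10) + j * 0.1f;
    }

    public static void main(String[] args) {
        String input = "alpha beta gamma";
        List<String> chunks = List.of("alpha beta", "beta gamma");
        List<Token> tokens = new ArrayList<>();
        for (int j = 0; j < 5; j++) {
            tokens.add(new Token("feature_" + j, mockWeight(input, j)));
        }
        // Each chunk references the same token list:
        for (String c : chunks) {
            System.out.println(c + " -> " + tokens);
        }
    }
}
```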

x-pack/plugin/inference/src/yamlRestTest/resources/rest-api-spec/test/inference/25_semantic_text_field_mapping_chunking.yml

Lines changed: 96 additions & 7 deletions
```diff
@@ -75,6 +75,22 @@ setup:
                   max_chunk_size: 10
                   overlap: 1
 
+  - do:
+      indices.create:
+        index: custom-chunking-dense
+        body:
+          mappings:
+            properties:
+              keyword_field:
+                type: keyword
+              inference_field:
+                type: semantic_text
+                inference_id: dense-inference-id
+                chunking_settings:
+                  strategy: word
+                  max_chunk_size: 10
+                  overlap: 1
+
   - do:
       index:
         index: default-chunking-sparse
@@ -93,25 +109,57 @@ setup:
           inference_field: "Elasticsearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides."
        refresh: true
 
+  - do:
+      index:
+        index: default-chunking-dense
+        id: doc_3
+        body:
+          keyword_field: "default sentence chunking"
+          inference_field: "Elasticsearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides."
+        refresh: true
+
+  - do:
+      index:
+        index: custom-chunking-dense
+        id: doc_4
+        body:
+          keyword_field: "custom word chunking"
+          inference_field: "Elasticsearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides."
+        refresh: true
+
 ---
 "We return chunking configurations with mappings":
 
   - do:
       indices.get_mapping:
         index: default-chunking-sparse
 
-  - is_false: default-chunking.mappings.properties.inference_field.chunking_settings
+  - is_false: default-chunking-sparse.mappings.properties.inference_field.chunking_settings
 
   - do:
       indices.get_mapping:
         index: custom-chunking-sparse
 
-  - match: { "custom-chunking.mappings.properties.inference_field.chunking_settings.strategy": "word" }
-  - match: { "custom-chunking.mappings.properties.inference_field.chunking_settings.max_chunk_size": 10 }
-  - match: { "custom-chunking.mappings.properties.inference_field.chunking_settings.overlap": 5 }
+  - match: { "custom-chunking-sparse.mappings.properties.inference_field.chunking_settings.strategy": "word" }
+  - match: { "custom-chunking-sparse.mappings.properties.inference_field.chunking_settings.max_chunk_size": 10 }
+  - match: { "custom-chunking-sparse.mappings.properties.inference_field.chunking_settings.overlap": 1 }
+
+  - do:
+      indices.get_mapping:
+        index: default-chunking-dense
+
+  - is_false: default-chunking-dense.mappings.properties.inference_field.chunking_settings
+
+  - do:
+      indices.get_mapping:
+        index: custom-chunking-dense
+
+  - match: { "custom-chunking-dense.mappings.properties.inference_field.chunking_settings.strategy": "word" }
+  - match: { "custom-chunking-dense.mappings.properties.inference_field.chunking_settings.max_chunk_size": 10 }
+  - match: { "custom-chunking-dense.mappings.properties.inference_field.chunking_settings.overlap": 1 }
 
 ---
-"We return different chunks based on configured chunking overrides or model defaults":
+"We return different chunks based on configured chunking overrides or model defaults for sparse embeddings":
 
   - do:
       search:
@@ -149,6 +197,47 @@ setup:
   - match: { hits.total.value: 1 }
   - match: { hits.hits.0._id: "doc_2" }
   - length: { hits.hits.0.highlight.inference_field: 2 }
-  - match: { hits.hits.0.highlight.inference_field.0: "Elasticsearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all" }
-  - match: { hits.hits.0.highlight.inference_field.1: " the features it provides." }
+  - match: { hits.hits.0.highlight.inference_field.0: "Elasticsearch is an open source, distributed, RESTful, search engine which" }
+  - match: { hits.hits.0.highlight.inference_field.1: " which is built on top of Lucene internally and enjoys" }
 
+---
+"We return different chunks based on configured chunking overrides or model defaults for dense embeddings":
+
+  - do:
+      search:
+        index: default-chunking-dense
+        body:
+          query:
+            semantic:
+              field: "inference_field"
+              query: "What is Elasticsearch?"
+          highlight:
+            fields:
+              inference_field:
+                type: "semantic"
+                number_of_fragments: 2
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_3" }
+  - length: { hits.hits.0.highlight.inference_field: 1 }
+  - match: { hits.hits.0.highlight.inference_field.0: "Elasticsearch is an open source, distributed, RESTful, search engine which is built on top of Lucene internally and enjoys all the features it provides." }
+
+  - do:
+      search:
+        index: custom-chunking-dense
+        body:
+          query:
+            semantic:
+              field: "inference_field"
+              query: "What is Elasticsearch?"
+          highlight:
+            fields:
+              inference_field:
+                type: "semantic"
+                number_of_fragments: 2
+
+  - match: { hits.total.value: 1 }
+  - match: { hits.hits.0._id: "doc_4" }
+  - length: { hits.hits.0.highlight.inference_field: 2 }
+  - match: { hits.hits.0.highlight.inference_field.0: "Elasticsearch is an open source, distributed, RESTful, search engine which" }
+  - match: { hits.hits.0.highlight.inference_field.1: " which is built on top of Lucene internally and enjoys" }
```
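As a sanity check on the expected fragments (an editorial illustration, not part of the commit): with strategy: word, max_chunk_size: 10, and overlap: 1, consecutive chunks advance by 9 words, so the 24-word test sentence should split into words 1-10, 10-19, and 19-24; number_of_fragments: 2 then keeps only the first two, which is exactly what the assertions above expect.

```java
import java.util.Arrays;

// Reproduces the expected chunk boundaries by hand: chunks of at most 10 words,
// each starting 9 words (10 minus 1 word of overlap) after the previous one.
public final class ExpectedChunks {
    public static void main(String[] args) {
        String sentence = "Elasticsearch is an open source, distributed, RESTful, search engine "
            + "which is built on top of Lucene internally and enjoys all the features it provides.";
        String[] words = sentence.split(" ");
        int maxChunkSize = 10, overlap = 1;
        for (int i = 0; i < words.length; i += maxChunkSize - overlap) {
            int end = Math.min(i + maxChunkSize, words.length);
            System.out.println(String.join(" ", Arrays.copyOfRange(words, i, end)));
            if (end == words.length) break;
        }
    }
}
```

The first two printed chunks match the expected fragments; the leading space in the second expected fragment reflects how the real chunker assigns character offsets, a detail this word-level sketch does not model.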
