Skip to content

Commit 5b6b51e

Browse files
Adding ChunkingProcessor
1 parent 825b5de commit 5b6b51e

File tree

2 files changed

+102
-1
lines changed

2 files changed

+102
-1
lines changed

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,13 @@
2929
import org.elasticsearch.indices.SystemIndexDescriptor;
3030
import org.elasticsearch.inference.InferenceServiceExtension;
3131
import org.elasticsearch.inference.InferenceServiceRegistry;
32+
import org.elasticsearch.ingest.Processor;
3233
import org.elasticsearch.license.License;
3334
import org.elasticsearch.license.LicensedFeature;
3435
import org.elasticsearch.node.PluginComponentBinding;
3536
import org.elasticsearch.plugins.ActionPlugin;
3637
import org.elasticsearch.plugins.ExtensiblePlugin;
38+
import org.elasticsearch.plugins.IngestPlugin;
3739
import org.elasticsearch.plugins.MapperPlugin;
3840
import org.elasticsearch.plugins.Plugin;
3941
import org.elasticsearch.plugins.SearchPlugin;
@@ -76,6 +78,7 @@
7678
import org.elasticsearch.xpack.inference.external.http.sender.HttpRequestSender;
7779
import org.elasticsearch.xpack.inference.external.http.sender.RequestExecutorServiceSettings;
7880
import org.elasticsearch.xpack.inference.highlight.SemanticTextHighlighter;
81+
import org.elasticsearch.xpack.inference.ingest.ChunkingProcessor;
7982
import org.elasticsearch.xpack.inference.logging.ThrottlerManager;
8083
import org.elasticsearch.xpack.inference.mapper.OffsetSourceFieldMapper;
8184
import org.elasticsearch.xpack.inference.mapper.SemanticInferenceMetadataFieldsMapper;
@@ -134,7 +137,14 @@
134137
import static org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceServiceFeature.DEPRECATED_ELASTIC_INFERENCE_SERVICE_FEATURE_FLAG;
135138
import static org.elasticsearch.xpack.inference.services.elastic.ElasticInferenceServiceFeature.ELASTIC_INFERENCE_SERVICE_FEATURE_FLAG;
136139

137-
public class InferencePlugin extends Plugin implements ActionPlugin, ExtensiblePlugin, SystemIndexPlugin, MapperPlugin, SearchPlugin {
140+
public class InferencePlugin extends Plugin
141+
implements
142+
ActionPlugin,
143+
ExtensiblePlugin,
144+
SystemIndexPlugin,
145+
MapperPlugin,
146+
SearchPlugin,
147+
IngestPlugin {
138148

139149
/**
140150
* When this setting is true the verification check that
@@ -463,6 +473,13 @@ public List<QueryRewriteInterceptor> getQueryRewriteInterceptors() {
463473
);
464474
}
465475

476+
@Override
477+
public Map<String, Processor.Factory> getProcessors(Processor.Parameters parameters) {
478+
ChunkingProcessor.Factory chunkingProcessorFactory = new ChunkingProcessor.Factory();
479+
480+
return Map.of(ChunkingProcessor.TYPE, chunkingProcessorFactory);
481+
}
482+
466483
@Override
467484
public List<RetrieverSpec<?>> getRetrievers() {
468485
return List.of(
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
package org.elasticsearch.xpack.inference.ingest;
9+
10+
import org.elasticsearch.inference.ChunkingSettings;
11+
import org.elasticsearch.ingest.AbstractProcessor;
12+
import org.elasticsearch.ingest.ConfigurationUtils;
13+
import org.elasticsearch.ingest.IngestDocument;
14+
import org.elasticsearch.ingest.Processor;
15+
import org.elasticsearch.xpack.inference.chunking.Chunker;
16+
import org.elasticsearch.xpack.inference.chunking.ChunkerBuilder;
17+
import org.elasticsearch.xpack.inference.chunking.ChunkingSettingsBuilder;
18+
19+
import java.util.ArrayList;
20+
import java.util.List;
21+
import java.util.Map;
22+
import java.util.stream.Collectors;
23+
24+
public class ChunkingProcessor extends AbstractProcessor {
25+
26+
public static final String TYPE = "chunking";
27+
28+
private final List<Factory.InputConfig> inputConfigs;
29+
private final ChunkingSettings chunkingSettings;
30+
31+
public ChunkingProcessor(String tag, String description, List<Factory.InputConfig> inputConfigs, ChunkingSettings chunkingSettings) {
32+
super(tag, description);
33+
this.inputConfigs = inputConfigs;
34+
this.chunkingSettings = chunkingSettings;
35+
}
36+
37+
@Override
38+
public String getType() {
39+
return TYPE;
40+
}
41+
42+
@Override
43+
public IngestDocument execute(IngestDocument document) {
44+
for (var inputConfig : inputConfigs) {
45+
var text = document.getFieldValue(inputConfig.inputField, String.class);
46+
var chunks = ChunkerBuilder.fromChunkingStrategy(chunkingSettings.getChunkingStrategy()).chunk(text, chunkingSettings);
47+
document.setFieldValue(inputConfig.outputField, toChunkText(chunks, text));
48+
}
49+
50+
return document;
51+
}
52+
53+
private List<String> toChunkText(List<Chunker.ChunkOffset> offsets, String text) {
54+
return offsets.stream().map(o -> text.substring(o.start(), o.end())).collect(Collectors.toList());
55+
}
56+
57+
public static final class Factory implements Processor.Factory {
58+
@Override
59+
public Processor create(
60+
Map<String, Processor.Factory> processorFactories,
61+
String tag,
62+
String description,
63+
Map<String, Object> config
64+
) {
65+
List<InputConfig> inputConfigs = parseInputConfigs(tag, ConfigurationUtils.readList(TYPE, tag, config, "input_output"));
66+
ChunkingSettings chunkingSettings = ChunkingSettingsBuilder.fromMap(
67+
ConfigurationUtils.readMap(TYPE, tag, config, "chunking_settings")
68+
);
69+
return new ChunkingProcessor(tag, description, inputConfigs, chunkingSettings);
70+
}
71+
72+
private List<InputConfig> parseInputConfigs(String tag, List<Map<String, Object>> inputConfigMaps) {
73+
List<InputConfig> inputConfigs = new ArrayList<>();
74+
for (var inputConfigMap : inputConfigMaps) {
75+
String inputField = ConfigurationUtils.readStringProperty(TYPE, tag, inputConfigMap, "input_field");
76+
String outputField = ConfigurationUtils.readStringProperty(TYPE, tag, inputConfigMap, "output_field");
77+
inputConfigs.add(new InputConfig(inputField, outputField));
78+
}
79+
return inputConfigs;
80+
}
81+
82+
public record InputConfig(String inputField, String outputField) {}
83+
}
84+
}

0 commit comments

Comments
 (0)