Skip to content

Commit f0b2a1f

Browse files
authored
Add a new offset_source field to store offsets referencing substrings of another field. (#118017) (#118088)
This field is primarily designed for use with the `semantic_text` field, where it enables storing offsets that point to substrings of the field used to generate its underlying chunks. To prevent external usage, the field is intentionally undocumented, with detailed javadocs explaining its specific purpose and limitations. I couldn’t find a way to fully block external usage, but skipping the docs should keep it mostly out of sight for now.
1 parent 379a3a8 commit f0b2a1f

File tree

6 files changed

+737
-1
lines changed

6 files changed

+737
-1
lines changed

x-pack/plugin/inference/src/main/java/org/elasticsearch/xpack/inference/InferencePlugin.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
import org.elasticsearch.xpack.inference.external.http.sender.HttpRequestSender;
6767
import org.elasticsearch.xpack.inference.external.http.sender.RequestExecutorServiceSettings;
6868
import org.elasticsearch.xpack.inference.logging.ThrottlerManager;
69+
import org.elasticsearch.xpack.inference.mapper.OffsetSourceFieldMapper;
6970
import org.elasticsearch.xpack.inference.mapper.SemanticTextFieldMapper;
7071
import org.elasticsearch.xpack.inference.queries.SemanticQueryBuilder;
7172
import org.elasticsearch.xpack.inference.rank.random.RandomRankBuilder;
@@ -365,7 +366,12 @@ public void close() {
365366

366367
@Override
367368
public Map<String, Mapper.TypeParser> getMappers() {
368-
return Map.of(SemanticTextFieldMapper.CONTENT_TYPE, SemanticTextFieldMapper.PARSER);
369+
return Map.of(
370+
SemanticTextFieldMapper.CONTENT_TYPE,
371+
SemanticTextFieldMapper.PARSER,
372+
OffsetSourceFieldMapper.CONTENT_TYPE,
373+
OffsetSourceFieldMapper.PARSER
374+
);
369375
}
370376

371377
@Override
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
/*
2+
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3+
* or more contributor license agreements. Licensed under the Elastic License
4+
* 2.0; you may not use this file except in compliance with the Elastic License
5+
* 2.0.
6+
*/
7+
8+
package org.elasticsearch.xpack.inference.mapper;
9+
10+
import org.apache.lucene.analysis.Analyzer;
11+
import org.apache.lucene.analysis.TokenStream;
12+
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
13+
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
14+
import org.apache.lucene.document.Field;
15+
import org.apache.lucene.document.FieldType;
16+
import org.apache.lucene.index.IndexOptions;
17+
import org.apache.lucene.index.PostingsEnum;
18+
import org.apache.lucene.index.Term;
19+
import org.apache.lucene.index.Terms;
20+
import org.apache.lucene.search.DocIdSetIterator;
21+
22+
import java.io.IOException;
23+
import java.nio.charset.Charset;
24+
import java.util.LinkedHashMap;
25+
import java.util.Map;
26+
27+
/**
28+
* Represents a {@link Field} that stores a {@link Term} along with its start and end offsets.
29+
* Note: The {@link Charset} used to calculate these offsets is not associated with this field.
30+
* It is the responsibility of the consumer to handle the appropriate {@link Charset}.
31+
*/
32+
public final class OffsetSourceField extends Field {
33+
private static final FieldType FIELD_TYPE = new FieldType();
34+
35+
static {
36+
FIELD_TYPE.setTokenized(false);
37+
FIELD_TYPE.setOmitNorms(true);
38+
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
39+
}
40+
41+
private int startOffset;
42+
private int endOffset;
43+
44+
public OffsetSourceField(String fieldName, String sourceFieldName, int startOffset, int endOffset) {
45+
super(fieldName, sourceFieldName, FIELD_TYPE);
46+
this.startOffset = startOffset;
47+
this.endOffset = endOffset;
48+
}
49+
50+
public void setValues(String fieldName, int startOffset, int endOffset) {
51+
this.fieldsData = fieldName;
52+
this.startOffset = startOffset;
53+
this.endOffset = endOffset;
54+
}
55+
56+
@Override
57+
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
58+
OffsetTokenStream stream;
59+
if (reuse instanceof OffsetTokenStream) {
60+
stream = (OffsetTokenStream) reuse;
61+
} else {
62+
stream = new OffsetTokenStream();
63+
}
64+
65+
stream.setValues((String) fieldsData, startOffset, endOffset);
66+
return stream;
67+
}
68+
69+
public static OffsetSourceLoader loader(Terms terms) throws IOException {
70+
return new OffsetSourceLoader(terms);
71+
}
72+
73+
private static final class OffsetTokenStream extends TokenStream {
74+
private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
75+
private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
76+
private boolean used = true;
77+
private String term = null;
78+
private int startOffset = 0;
79+
private int endOffset = 0;
80+
81+
private OffsetTokenStream() {}
82+
83+
/** Sets the values */
84+
void setValues(String term, int startOffset, int endOffset) {
85+
this.term = term;
86+
this.startOffset = startOffset;
87+
this.endOffset = endOffset;
88+
}
89+
90+
@Override
91+
public boolean incrementToken() {
92+
if (used) {
93+
return false;
94+
}
95+
clearAttributes();
96+
termAttribute.append(term);
97+
offsetAttribute.setOffset(startOffset, endOffset);
98+
used = true;
99+
return true;
100+
}
101+
102+
@Override
103+
public void reset() {
104+
used = false;
105+
}
106+
107+
@Override
108+
public void close() {
109+
term = null;
110+
}
111+
}
112+
113+
public static class OffsetSourceLoader {
114+
private final Map<String, PostingsEnum> postingsEnums = new LinkedHashMap<>();
115+
116+
private OffsetSourceLoader(Terms terms) throws IOException {
117+
var termsEnum = terms.iterator();
118+
while (termsEnum.next() != null) {
119+
var postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
120+
if (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
121+
postingsEnums.put(termsEnum.term().utf8ToString(), postings);
122+
}
123+
}
124+
}
125+
126+
public OffsetSourceFieldMapper.OffsetSource advanceTo(int doc) throws IOException {
127+
for (var it = postingsEnums.entrySet().iterator(); it.hasNext();) {
128+
var entry = it.next();
129+
var postings = entry.getValue();
130+
if (postings.docID() < doc) {
131+
if (postings.advance(doc) == DocIdSetIterator.NO_MORE_DOCS) {
132+
it.remove();
133+
continue;
134+
}
135+
}
136+
if (postings.docID() == doc) {
137+
assert postings.freq() == 1;
138+
postings.nextPosition();
139+
return new OffsetSourceFieldMapper.OffsetSource(entry.getKey(), postings.startOffset(), postings.endOffset());
140+
}
141+
}
142+
return null;
143+
}
144+
}
145+
}

0 commit comments

Comments (0)