|
| 1 | +/* |
| 2 | + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one |
| 3 | + * or more contributor license agreements. Licensed under the Elastic License |
| 4 | + * 2.0; you may not use this file except in compliance with the Elastic License |
| 5 | + * 2.0. |
| 6 | + */ |
| 7 | + |
| 8 | +package org.elasticsearch.xpack.inference.mapper; |
| 9 | + |
| 10 | +import org.apache.lucene.analysis.Analyzer; |
| 11 | +import org.apache.lucene.analysis.TokenStream; |
| 12 | +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| 13 | +import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| 14 | +import org.apache.lucene.document.Field; |
| 15 | +import org.apache.lucene.document.FieldType; |
| 16 | +import org.apache.lucene.index.IndexOptions; |
| 17 | +import org.apache.lucene.index.PostingsEnum; |
| 18 | +import org.apache.lucene.index.Term; |
| 19 | +import org.apache.lucene.index.Terms; |
| 20 | +import org.apache.lucene.search.DocIdSetIterator; |
| 21 | + |
| 22 | +import java.io.IOException; |
| 23 | +import java.nio.charset.Charset; |
| 24 | +import java.util.LinkedHashMap; |
| 25 | +import java.util.Map; |
| 26 | + |
| 27 | +/** |
| 28 | + * Represents a {@link Field} that stores a {@link Term} along with its start and end offsets. |
| 29 | + * Note: The {@link Charset} used to calculate these offsets is not associated with this field. |
| 30 | + * It is the responsibility of the consumer to handle the appropriate {@link Charset}. |
| 31 | + */ |
public final class OffsetSourceField extends Field {
    private static final FieldType FIELD_TYPE = new FieldType();

    static {
        // The value (a source field name) is indexed as a single, untokenized term.
        // Offsets must be enabled on the postings because the start/end offsets are
        // the actual payload this field exists to record.
        FIELD_TYPE.setTokenized(false);
        FIELD_TYPE.setOmitNorms(true);
        FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    }

    // Character offsets attached to the single indexed term. The charset used to
    // compute them is not recorded here (see class javadoc).
    private int startOffset;
    private int endOffset;

    /**
     * Creates a new {@link OffsetSourceField}.
     *
     * @param fieldName       name of this Lucene field
     * @param sourceFieldName name of the source field the offsets refer to; indexed as the term value
     * @param startOffset     start offset recorded on the term's postings
     * @param endOffset       end offset recorded on the term's postings
     */
    public OffsetSourceField(String fieldName, String sourceFieldName, int startOffset, int endOffset) {
        super(fieldName, sourceFieldName, FIELD_TYPE);
        this.startOffset = startOffset;
        this.endOffset = endOffset;
    }

    /**
     * Updates the source field name and offsets so that this field instance can be
     * reused across documents.
     *
     * @param fieldName   new source field name (becomes the indexed term value)
     * @param startOffset new start offset
     * @param endOffset   new end offset
     */
    public void setValues(String fieldName, int startOffset, int endOffset) {
        // fieldsData holds the field's *value* (the source field name), not this field's name.
        this.fieldsData = fieldName;
        this.startOffset = startOffset;
        this.endOffset = endOffset;
    }

    @Override
    public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
        // Reuse the previous stream instance when possible (same pattern as Lucene's
        // Field.StringTokenStream); the analyzer is intentionally ignored since the
        // value is emitted as a single pre-built token.
        OffsetTokenStream stream;
        if (reuse instanceof OffsetTokenStream) {
            stream = (OffsetTokenStream) reuse;
        } else {
            stream = new OffsetTokenStream();
        }

        stream.setValues((String) fieldsData, startOffset, endOffset);
        return stream;
    }

    /**
     * Creates a loader that reads {@link OffsetSourceFieldMapper.OffsetSource} values
     * back from the postings of the given {@link Terms}.
     *
     * @param terms the terms of an offset source field for one segment
     * @throws IOException if reading the terms or their postings fails
     */
    public static OffsetSourceLoader loader(Terms terms) throws IOException {
        return new OffsetSourceLoader(terms);
    }

    /**
     * A single-token stream that emits the source field name as its only term,
     * carrying the configured start/end offsets in its {@link OffsetAttribute}.
     */
    private static final class OffsetTokenStream extends TokenStream {
        private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
        private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
        // Starts out "used" so the single token is only emitted after reset() has been
        // called, per the TokenStream workflow contract.
        private boolean used = true;
        private String term = null;
        private int startOffset = 0;
        private int endOffset = 0;

        private OffsetTokenStream() {}

        /** Sets the values */
        void setValues(String term, int startOffset, int endOffset) {
            this.term = term;
            this.startOffset = startOffset;
            this.endOffset = endOffset;
        }

        @Override
        public boolean incrementToken() {
            // Emit exactly one token per reset() cycle.
            if (used) {
                return false;
            }
            clearAttributes();
            termAttribute.append(term);
            offsetAttribute.setOffset(startOffset, endOffset);
            used = true;
            return true;
        }

        @Override
        public void reset() {
            used = false;
        }

        @Override
        public void close() {
            // Drop the reference to the term; setValues() must be called again before
            // the stream can emit another token.
            term = null;
        }
    }

    /**
     * Reads {@link OffsetSourceFieldMapper.OffsetSource} values back from one segment
     * by walking the postings of every term of the offset field in parallel.
     */
    public static class OffsetSourceLoader {
        // Term (source field name) -> postings positioned on that term's first live doc.
        // LinkedHashMap preserves the order in which terms were enumerated.
        private final Map<String, PostingsEnum> postingsEnums = new LinkedHashMap<>();

        private OffsetSourceLoader(Terms terms) throws IOException {
            var termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
                var postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
                // Only keep terms that occur in at least one document.
                if (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    postingsEnums.put(termsEnum.term().utf8ToString(), postings);
                }
            }
        }

        /**
         * Returns the offset source recorded for {@code doc}, or {@code null} if the
         * document has none.
         * <p>
         * Postings only move forward ({@code advance}), so callers must visit documents
         * in non-decreasing doc id order; asking for an earlier doc returns {@code null}.
         *
         * @param doc segment-local document id to advance to
         * @throws IOException if advancing the postings fails
         */
        public OffsetSourceFieldMapper.OffsetSource advanceTo(int doc) throws IOException {
            for (var it = postingsEnums.entrySet().iterator(); it.hasNext();) {
                var entry = it.next();
                var postings = entry.getValue();
                if (postings.docID() < doc) {
                    if (postings.advance(doc) == DocIdSetIterator.NO_MORE_DOCS) {
                        // This term has no documents at or beyond doc; drop it for good.
                        it.remove();
                        continue;
                    }
                }
                if (postings.docID() == doc) {
                    // Each document stores at most one offset source per field, so the
                    // term occurs exactly once and carries a single position.
                    assert postings.freq() == 1;
                    postings.nextPosition();
                    return new OffsetSourceFieldMapper.OffsetSource(entry.getKey(), postings.startOffset(), postings.endOffset());
                }
            }
            return null;
        }
    }
}
0 commit comments