diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java index a9733e83c8be3..bb212c81f7477 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapper.java @@ -369,9 +369,9 @@ public void parse(DocumentParserContext context) throws IOException { // based on recommendations from this paper: https://arxiv.org/pdf/2305.18494.pdf IndexableField currentField = context.doc().getByKey(key); if (currentField == null) { - context.doc().addWithKey(key, new XFeatureField(fullPath(), feature, value, fieldType().isStored())); - } else if (currentField instanceof XFeatureField && ((XFeatureField) currentField).getFeatureValue() < value) { - ((XFeatureField) currentField).setFeatureValue(value); + context.doc().addWithKey(key, new FeatureField(fullPath(), feature, value, fieldType().isStored())); + } else if (currentField instanceof FeatureField ff && ff.getFeatureValue() < value) { + ff.setFeatureValue(value); } } else { throw new IllegalArgumentException( diff --git a/server/src/main/java/org/elasticsearch/index/mapper/vectors/XFeatureField.java b/server/src/main/java/org/elasticsearch/index/mapper/vectors/XFeatureField.java index 5f4afb4a86acc..14e8a762eda5d 100644 --- a/server/src/main/java/org/elasticsearch/index/mapper/vectors/XFeatureField.java +++ b/server/src/main/java/org/elasticsearch/index/mapper/vectors/XFeatureField.java @@ -18,149 +18,13 @@ package org.elasticsearch.index.mapper.vectors; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; import org.apache.lucene.document.FeatureField; -import 
org.apache.lucene.document.Field; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.index.IndexOptions; /** - * This class is forked from the Lucene {@link FeatureField} implementation to enable support for storing term vectors. - * It should be removed once apache/lucene#14034 becomes available. + * This class was originally forked from the Lucene {@link FeatureField} implementation to enable support for storing term vectors. + * Indexing now uses the Lucene class directly; only the helpers that decode the feature value from the term frequency remain. */ -public final class XFeatureField extends Field { - private static final FieldType FIELD_TYPE = new FieldType(); - private static final FieldType FIELD_TYPE_STORE_TERM_VECTORS = new FieldType(); - - static { - FIELD_TYPE.setTokenized(false); - FIELD_TYPE.setOmitNorms(true); - FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS); - - FIELD_TYPE_STORE_TERM_VECTORS.setTokenized(false); - FIELD_TYPE_STORE_TERM_VECTORS.setOmitNorms(true); - FIELD_TYPE_STORE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS); - FIELD_TYPE_STORE_TERM_VECTORS.setStoreTermVectors(true); - } - - private float featureValue; - - /** - * Create a feature. - * - * @param fieldName The name of the field to store the information into. All features may be - * stored in the same field. - * @param featureName The name of the feature, eg. 'pagerank`. It will be indexed as a term. - * @param featureValue The value of the feature, must be a positive, finite, normal float. - */ - public XFeatureField(String fieldName, String featureName, float featureValue) { - this(fieldName, featureName, featureValue, false); - } - - /** - * Create a feature. - * - * @param fieldName The name of the field to store the information into. All features may be - * stored in the same field. - * @param featureName The name of the feature, eg. 'pagerank`. It will be indexed as a term. - * @param featureValue The value of the feature, must be a positive, finite, normal float.
- */ - public XFeatureField(String fieldName, String featureName, float featureValue, boolean storeTermVectors) { - super(fieldName, featureName, storeTermVectors ? FIELD_TYPE_STORE_TERM_VECTORS : FIELD_TYPE); - setFeatureValue(featureValue); - } - - /** - * Update the feature value of this field. - */ - public void setFeatureValue(float featureValue) { - if (Float.isFinite(featureValue) == false) { - throw new IllegalArgumentException( - "featureValue must be finite, got: " + featureValue + " for feature " + fieldsData + " on field " + name - ); - } - if (featureValue < Float.MIN_NORMAL) { - throw new IllegalArgumentException( - "featureValue must be a positive normal float, got: " - + featureValue - + " for feature " - + fieldsData - + " on field " - + name - + " which is less than the minimum positive normal float: " - + Float.MIN_NORMAL - ); - } - this.featureValue = featureValue; - } - - @Override - public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) { - FeatureTokenStream stream; - if (reuse instanceof FeatureTokenStream) { - stream = (FeatureTokenStream) reuse; - } else { - stream = new FeatureTokenStream(); - } - - int freqBits = Float.floatToIntBits(featureValue); - stream.setValues((String) fieldsData, freqBits >>> 15); - return stream; - } - - /** - * This is useful if you have multiple features sharing a name and you want to take action to - * deduplicate them. - * - * @return the feature value of this field. 
- */ - public float getFeatureValue() { - return featureValue; - } - - private static final class FeatureTokenStream extends TokenStream { - private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class); - private final TermFrequencyAttribute freqAttribute = addAttribute(TermFrequencyAttribute.class); - private boolean used = true; - private String value = null; - private int freq = 0; - - private FeatureTokenStream() {} - - /** - * Sets the values - */ - void setValues(String value, int freq) { - this.value = value; - this.freq = freq; - } - - @Override - public boolean incrementToken() { - if (used) { - return false; - } - clearAttributes(); - termAttribute.append(value); - freqAttribute.setTermFrequency(freq); - used = true; - return true; - } - - @Override - public void reset() { - used = false; - } - - @Override - public void close() { - value = null; - } - } - +public final class XFeatureField { static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15; static float decodeFeatureValue(float freq) { diff --git a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java index 3ceab2cf204c7..2099a4b138264 100644 --- a/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java +++ b/server/src/test/java/org/elasticsearch/index/mapper/vectors/SparseVectorFieldMapperTests.java @@ -11,6 +11,7 @@ import org.apache.lucene.analysis.TokenStream; import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute; +import org.apache.lucene.document.FeatureField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.LeafReader; @@ -212,14 +213,14 @@ public void testDefaults() throws Exception { List fields = doc1.rootDoc().getFields("field"); assertEquals(2, fields.size()); - 
assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class)); - XFeatureField featureField1 = null; - XFeatureField featureField2 = null; + assertThat(fields.get(0), Matchers.instanceOf(FeatureField.class)); + FeatureField featureField1 = null; + FeatureField featureField2 = null; for (IndexableField field : fields) { if (field.stringValue().equals("ten")) { - featureField1 = (XFeatureField) field; + featureField1 = (FeatureField) field; } else if (field.stringValue().equals("twenty")) { - featureField2 = (XFeatureField) field; + featureField2 = (FeatureField) field; } else { throw new UnsupportedOperationException(); } @@ -314,14 +315,14 @@ public void testDotInFieldName() throws Exception { List fields = parsedDocument.rootDoc().getFields("field"); assertEquals(2, fields.size()); - assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class)); - XFeatureField featureField1 = null; - XFeatureField featureField2 = null; + assertThat(fields.get(0), Matchers.instanceOf(FeatureField.class)); + FeatureField featureField1 = null; + FeatureField featureField2 = null; for (IndexableField field : fields) { if (field.stringValue().equals("foo.bar")) { - featureField1 = (XFeatureField) field; + featureField1 = (FeatureField) field; } else if (field.stringValue().equals("foobar")) { - featureField2 = (XFeatureField) field; + featureField2 = (FeatureField) field; } else { throw new UnsupportedOperationException(); } @@ -369,13 +370,13 @@ public void testHandlesMultiValuedFields() throws MapperParsingException, IOExce })); // then validate that the generate document stored both values appropriately and we have only the max value stored - XFeatureField barField = ((XFeatureField) doc1.rootDoc().getByKey("foo.field\\.bar")); + FeatureField barField = ((FeatureField) doc1.rootDoc().getByKey("foo.field\\.bar")); assertEquals(20, barField.getFeatureValue(), 1); - XFeatureField storedBarField = ((XFeatureField) doc1.rootDoc().getFields("foo.field").get(1)); + 
FeatureField storedBarField = ((FeatureField) doc1.rootDoc().getFields("foo.field").get(1)); assertEquals(20, storedBarField.getFeatureValue(), 1); - assertEquals(3, doc1.rootDoc().getFields().stream().filter((f) -> f instanceof XFeatureField).count()); + assertEquals(3, doc1.rootDoc().getFields().stream().filter((f) -> f instanceof FeatureField).count()); } public void testCannotBeUsedInMultiFields() { diff --git a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java index 0fab22d45d08c..cc87edf59e9d3 100644 --- a/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java +++ b/x-pack/plugin/inference/src/test/java/org/elasticsearch/xpack/inference/mapper/SemanticTextFieldMapperTests.java @@ -10,6 +10,7 @@ import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.document.FeatureField; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.IndexableField; @@ -55,7 +56,6 @@ import org.elasticsearch.index.mapper.vectors.DenseVectorFieldMapper; import org.elasticsearch.index.mapper.vectors.DenseVectorFieldTypeTests; import org.elasticsearch.index.mapper.vectors.SparseVectorFieldMapper; -import org.elasticsearch.index.mapper.vectors.XFeatureField; import org.elasticsearch.index.query.SearchExecutionContext; import org.elasticsearch.index.search.ESToParentBlockJoinQuery; import org.elasticsearch.inference.ChunkingSettings; @@ -1561,7 +1561,7 @@ private static void assertChildLeafNestedDocument( private static void assertSparseFeatures(LuceneDocument doc, String fieldName, int expectedCount) { int count = 0; for (IndexableField field : doc.getFields()) { - if (field instanceof 
XFeatureField featureField) { + if (field instanceof FeatureField featureField) { assertThat(featureField.name(), equalTo(fieldName)); ++count; }