Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -369,9 +369,9 @@ public void parse(DocumentParserContext context) throws IOException {
// based on recommendations from this paper: https://arxiv.org/pdf/2305.18494.pdf
IndexableField currentField = context.doc().getByKey(key);
if (currentField == null) {
context.doc().addWithKey(key, new XFeatureField(fullPath(), feature, value, fieldType().isStored()));
} else if (currentField instanceof XFeatureField && ((XFeatureField) currentField).getFeatureValue() < value) {
((XFeatureField) currentField).setFeatureValue(value);
context.doc().addWithKey(key, new FeatureField(fullPath(), feature, value, fieldType().isStored()));
} else if (currentField instanceof FeatureField ff && ff.getFeatureValue() < value) {
ff.setFeatureValue(value);
}
} else {
throw new IllegalArgumentException(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,149 +18,13 @@

package org.elasticsearch.index.mapper.vectors;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.document.FeatureField;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;

/**
* This class is forked from the Lucene {@link FeatureField} implementation to enable support for storing term vectors.
* It should be removed once apache/lucene#14034 becomes available.
* Its purpose is to allow decoding the feature value from the term frequency
*/
public final class XFeatureField extends Field {
private static final FieldType FIELD_TYPE = new FieldType();
private static final FieldType FIELD_TYPE_STORE_TERM_VECTORS = new FieldType();

static {
FIELD_TYPE.setTokenized(false);
FIELD_TYPE.setOmitNorms(true);
FIELD_TYPE.setIndexOptions(IndexOptions.DOCS_AND_FREQS);

FIELD_TYPE_STORE_TERM_VECTORS.setTokenized(false);
FIELD_TYPE_STORE_TERM_VECTORS.setOmitNorms(true);
FIELD_TYPE_STORE_TERM_VECTORS.setIndexOptions(IndexOptions.DOCS_AND_FREQS);
FIELD_TYPE_STORE_TERM_VECTORS.setStoreTermVectors(true);
}

private float featureValue;

/**
 * Create a feature without term vectors. Delegates to the four-argument constructor with
 * {@code storeTermVectors} set to {@code false}.
 *
 * @param fieldName The name of the field to store the information into. All features may be
 * stored in the same field.
 * @param featureName The name of the feature, e.g. {@code pagerank}. It will be indexed as a term.
 * @param featureValue The value of the feature, must be a positive, finite, normal float.
 * @throws IllegalArgumentException if {@code featureValue} is rejected by {@link #setFeatureValue(float)}
 */
public XFeatureField(String fieldName, String featureName, float featureValue) {
this(fieldName, featureName, featureValue, false);
}

/**
 * Create a feature, optionally using the field type that stores term vectors.
 *
 * @param fieldName The name of the field to store the information into. All features may be
 * stored in the same field.
 * @param featureName The name of the feature, e.g. {@code pagerank}. It will be indexed as a term.
 * @param featureValue The value of the feature, must be a positive, finite, normal float.
 * @param storeTermVectors If {@code true}, the field is created with the field type that has term
 * vector storage enabled; otherwise the plain field type is used.
 * @throws IllegalArgumentException if {@code featureValue} is rejected by {@link #setFeatureValue(float)}
 */
public XFeatureField(String fieldName, String featureName, float featureValue, boolean storeTermVectors) {
super(fieldName, featureName, storeTermVectors ? FIELD_TYPE_STORE_TERM_VECTORS : FIELD_TYPE);
// Validation of the value lives in setFeatureValue, so an invalid value fails construction.
setFeatureValue(featureValue);
}

/**
 * Replaces the feature value held by this field after validating it.
 *
 * @param featureValue the new value; must be finite and not smaller than {@link Float#MIN_NORMAL}
 * @throws IllegalArgumentException if {@code featureValue} is not finite, or is smaller than
 * {@link Float#MIN_NORMAL}
 */
public void setFeatureValue(float featureValue) {
    // Rejects NaN and both infinities.
    if (Float.isFinite(featureValue) == false) {
        final String notFinite = "featureValue must be finite, got: "
            + featureValue
            + " for feature "
            + fieldsData
            + " on field "
            + name;
        throw new IllegalArgumentException(notFinite);
    }

    // Rejects zero, negative values and anything below the smallest positive normal float.
    if (featureValue < Float.MIN_NORMAL) {
        final String tooSmall = "featureValue must be a positive normal float, got: "
            + featureValue
            + " for feature "
            + fieldsData
            + " on field "
            + name
            + " which is less than the minimum positive normal float: "
            + Float.MIN_NORMAL;
        throw new IllegalArgumentException(tooSmall);
    }

    this.featureValue = featureValue;
}

/**
 * Returns a single-token stream carrying the feature name as the term and the encoded feature
 * value as the term frequency.
 *
 * @param analyzer not used by this implementation
 * @param reuse a previously returned stream; it is reused when it is a {@code FeatureTokenStream},
 * otherwise a new stream is created
 * @return the stream configured with this field's feature name and encoded value
 */
@Override
public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
    final FeatureTokenStream featureStream = reuse instanceof FeatureTokenStream
        ? (FeatureTokenStream) reuse
        : new FeatureTokenStream();

    // The frequency is the raw float bit pattern with its 15 lowest-order bits discarded.
    final int encodedFreq = Float.floatToIntBits(featureValue) >>> 15;
    featureStream.setValues((String) fieldsData, encodedFreq);
    return featureStream;
}

/**
 * Returns the feature value most recently accepted by {@link #setFeatureValue(float)}.
 *
 * <p>This is useful if you have multiple features sharing a name and you want to take action to
 * deduplicate them.
 *
 * @return the feature value of this field.
 */
public float getFeatureValue() {
return featureValue;
}

/**
 * Token stream that yields at most one token after each {@link #reset()}: the configured term
 * text, with the configured frequency set as its term frequency.
 */
private static final class FeatureTokenStream extends TokenStream {
    private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
    private final TermFrequencyAttribute freqAttribute = addAttribute(TermFrequencyAttribute.class);

    // Starts out exhausted, so no token is produced until reset() has been called.
    private boolean exhausted = true;
    private String term;
    private int termFrequency;

    private FeatureTokenStream() {}

    /**
     * Sets the term text and the term frequency that the next emitted token will carry.
     */
    void setValues(String value, int freq) {
        term = value;
        termFrequency = freq;
    }

    /**
     * Emits the single token if it has not been emitted since the last reset.
     *
     * @return {@code true} when the token was produced, {@code false} once the stream is exhausted
     */
    @Override
    public boolean incrementToken() {
        if (exhausted == false) {
            clearAttributes();
            termAttribute.append(term);
            freqAttribute.setTermFrequency(termFrequency);
            exhausted = true;
            return true;
        }
        return false;
    }

    /** Makes the single token available again. */
    @Override
    public void reset() {
        exhausted = false;
    }

    /** Drops the reference to the term text. */
    @Override
    public void close() {
        term = null;
    }
}

public final class XFeatureField {
static final int MAX_FREQ = Float.floatToIntBits(Float.MAX_VALUE) >>> 15;

static float decodeFeatureValue(float freq) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermFrequencyAttribute;
import org.apache.lucene.document.FeatureField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
Expand Down Expand Up @@ -212,14 +213,14 @@ public void testDefaults() throws Exception {

List<IndexableField> fields = doc1.rootDoc().getFields("field");
assertEquals(2, fields.size());
assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class));
XFeatureField featureField1 = null;
XFeatureField featureField2 = null;
assertThat(fields.get(0), Matchers.instanceOf(FeatureField.class));
FeatureField featureField1 = null;
FeatureField featureField2 = null;
for (IndexableField field : fields) {
if (field.stringValue().equals("ten")) {
featureField1 = (XFeatureField) field;
featureField1 = (FeatureField) field;
} else if (field.stringValue().equals("twenty")) {
featureField2 = (XFeatureField) field;
featureField2 = (FeatureField) field;
} else {
throw new UnsupportedOperationException();
}
Expand Down Expand Up @@ -314,14 +315,14 @@ public void testDotInFieldName() throws Exception {

List<IndexableField> fields = parsedDocument.rootDoc().getFields("field");
assertEquals(2, fields.size());
assertThat(fields.get(0), Matchers.instanceOf(XFeatureField.class));
XFeatureField featureField1 = null;
XFeatureField featureField2 = null;
assertThat(fields.get(0), Matchers.instanceOf(FeatureField.class));
FeatureField featureField1 = null;
FeatureField featureField2 = null;
for (IndexableField field : fields) {
if (field.stringValue().equals("foo.bar")) {
featureField1 = (XFeatureField) field;
featureField1 = (FeatureField) field;
} else if (field.stringValue().equals("foobar")) {
featureField2 = (XFeatureField) field;
featureField2 = (FeatureField) field;
} else {
throw new UnsupportedOperationException();
}
Expand Down Expand Up @@ -369,13 +370,13 @@ public void testHandlesMultiValuedFields() throws MapperParsingException, IOExce
}));

// then validate that the generated document stored both values appropriately and that only the max value is stored
XFeatureField barField = ((XFeatureField) doc1.rootDoc().getByKey("foo.field\\.bar"));
FeatureField barField = ((FeatureField) doc1.rootDoc().getByKey("foo.field\\.bar"));
assertEquals(20, barField.getFeatureValue(), 1);

XFeatureField storedBarField = ((XFeatureField) doc1.rootDoc().getFields("foo.field").get(1));
FeatureField storedBarField = ((FeatureField) doc1.rootDoc().getFields("foo.field").get(1));
assertEquals(20, storedBarField.getFeatureValue(), 1);

assertEquals(3, doc1.rootDoc().getFields().stream().filter((f) -> f instanceof XFeatureField).count());
assertEquals(3, doc1.rootDoc().getFields().stream().filter((f) -> f instanceof FeatureField).count());
}

public void testCannotBeUsedInMultiFields() {
Expand Down