@@ -22,7 +22,7 @@
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.internal.hppc.IntIntHashMap;
import org.apache.lucene.internal.hppc.IntHashSet;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.util.FeatureFlag;

@@ -63,22 +63,22 @@ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException
static final class TrackingLengthFieldsConsumer extends FieldsConsumer {
final SegmentWriteState state;
final FieldsConsumer in;
final IntIntHashMap termsBytesPerField;
final IntHashSet seenFields;
final long[] totalBytes;

TrackingLengthFieldsConsumer(SegmentWriteState state, FieldsConsumer in) {
this.state = state;
this.in = in;
this.termsBytesPerField = new IntIntHashMap(state.fieldInfos.size());
this.totalBytes = new long[1];
// Alternatively, we can consider using a FixedBitSet here, sized to max(fieldNumber).
// This should be faster without worrying too much about memory usage.
this.seenFields = new IntHashSet(state.fieldInfos.size());
Contributor
I agree it's probably better to prioritize speed over memory efficiency here.
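
For reference, here is a minimal sketch of the FixedBitSet alternative floated above. The SeenFields wrapper name and its sizing are hypothetical, not part of this change; it assumes field numbers are dense, non-negative ints with a known maximum.

import org.apache.lucene.util.FixedBitSet;

// Hypothetical alternative to IntHashSet: one bit per possible field number.
// FixedBitSet#getAndSet is a plain array read/write, no hashing or probing.
final class SeenFields {
    private final FixedBitSet seen;

    SeenFields(int maxFieldNumber) {
        this.seen = new FixedBitSet(maxFieldNumber + 1);
    }

    /** Returns true the first time a field number is seen, mirroring IntHashSet#add. */
    boolean add(int fieldNum) {
        return seen.getAndSet(fieldNum) == false;
    }
}

The trade-off: the bitset costs max(fieldNumber) bits even when few fields are written, while IntHashSet sizes to the fields actually present but pays for hashing on every add.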

}

@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
in.write(new TrackingLengthFields(fields, termsBytesPerField, state.fieldInfos), norms);
long totalBytes = 0;
for (int bytes : termsBytesPerField.values) {
totalBytes += bytes;
}
state.segmentInfo.putAttribute(IN_MEMORY_POSTINGS_BYTES_KEY, Long.toString(totalBytes));
in.write(new TrackingLengthFields(fields, state.fieldInfos, seenFields, totalBytes), norms);
state.segmentInfo.putAttribute(IN_MEMORY_POSTINGS_BYTES_KEY, Long.toString(totalBytes[0]));
}
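
A side note on the long[] totalBytes holder: the one-element array is a common Java idiom for a shared mutable cell. The lambda handed to TrackingLengthTerms increments totalBytes[0], and TrackingLengthFieldsConsumer reads it back after in.write(...) returns; a plain long field could not be shared across the two objects this way, and a captured local would have to be effectively final. A standalone illustration (all names here are made up):

import java.util.function.IntConsumer;

class AccumulatorDemo {
    public static void main(String[] args) {
        // The array reference stays effectively final; its single slot is
        // freely mutable from inside the lambda.
        long[] total = new long[1];
        IntConsumer onBytes = bytes -> total[0] += bytes;
        onBytes.accept(128);
        onBytes.accept(512);
        System.out.println(total[0]); // prints 640
    }
}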

@Override
@@ -88,13 +88,15 @@ public void close() throws IOException {
}

static final class TrackingLengthFields extends FilterLeafReader.FilterFields {
final IntIntHashMap termsBytesPerField;
final FieldInfos fieldInfos;
final IntHashSet seenFields;
final long[] totalBytes;

TrackingLengthFields(Fields in, IntIntHashMap termsBytesPerField, FieldInfos fieldInfos) {
TrackingLengthFields(Fields in, FieldInfos fieldInfos, IntHashSet seenFields, long[] totalBytes) {
super(in);
this.termsBytesPerField = termsBytesPerField;
this.seenFields = seenFields;
this.fieldInfos = fieldInfos;
this.totalBytes = totalBytes;
}

@Override
@@ -104,10 +106,13 @@ public Terms terms(String field) throws IOException {
return null;
}
int fieldNum = fieldInfos.fieldInfo(field).number;
return new TrackingLengthTerms(
terms,
bytes -> termsBytesPerField.put(fieldNum, Math.max(termsBytesPerField.getOrDefault(fieldNum, 0), bytes))
);
if (seenFields.add(fieldNum)) {
return new TrackingLengthTerms(terms, bytes -> totalBytes[0] += bytes);
} else {
// As far as I know, this method only gets invoked twice for the same field when the bloom filter for the _id field gets written.
// So maybe we can get rid of seenFields here and just track whether the _id field has been seen?
return terms;
Contributor
I think it's difficult to guarantee that this method will only ever be invoked twice for the _id field, and checking only the _id field could open us up to subtle bugs in the future.

}
}
}
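
TrackingLengthTerms itself is outside this hunk. Purely as a reading aid, here is a sketch of what it plausibly does, under the assumption that it wraps the TermsEnum and reports byte lengths to the callback as terms stream by; the real class in this PR may measure size differently.

import java.io.IOException;
import java.util.function.IntConsumer;
import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.util.BytesRef;

// Illustrative only: delegates all Terms calls and reports each term's length.
final class TrackingLengthTermsSketch extends FilterLeafReader.FilterTerms {
    private final IntConsumer onBytes;

    TrackingLengthTermsSketch(Terms in, IntConsumer onBytes) {
        super(in);
        this.onBytes = onBytes;
    }

    @Override
    public TermsEnum iterator() throws IOException {
        return new FilterLeafReader.FilterTermsEnum(super.iterator()) {
            @Override
            public BytesRef next() throws IOException {
                BytesRef term = in.next(); // 'in' here is the delegate TermsEnum
                if (term != null) {
                    onBytes.accept(term.length);
                }
                return term;
            }
        };
    }
}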
