Skip to content

Commit 6bf0a1b

Browse files
authored
Slightly improve TrackingPostingsInMemoryBytesCodec (#132905)
Replace the int hash map with a single counter in TrackingPostingsInMemoryBytesCodec, and use an int hash set to keep track of the fields that have already been seen.
1 parent 433dbf0 commit 6bf0a1b

File tree

1 file changed

+22
-16
lines changed

1 file changed

+22
-16
lines changed

server/src/main/java/org/elasticsearch/index/codec/TrackingPostingsInMemoryBytesCodec.java

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
import org.apache.lucene.index.SegmentWriteState;
2323
import org.apache.lucene.index.Terms;
2424
import org.apache.lucene.index.TermsEnum;
25-
import org.apache.lucene.internal.hppc.IntIntHashMap;
25+
import org.apache.lucene.internal.hppc.IntHashSet;
2626
import org.apache.lucene.util.BytesRef;
2727
import org.elasticsearch.common.util.FeatureFlag;
2828

@@ -63,22 +63,22 @@ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException
6363
static final class TrackingLengthFieldsConsumer extends FieldsConsumer {
6464
final SegmentWriteState state;
6565
final FieldsConsumer in;
66-
final IntIntHashMap termsBytesPerField;
66+
final IntHashSet seenFields;
67+
final long[] totalBytes;
6768

6869
TrackingLengthFieldsConsumer(SegmentWriteState state, FieldsConsumer in) {
6970
this.state = state;
7071
this.in = in;
71-
this.termsBytesPerField = new IntIntHashMap(state.fieldInfos.size());
72+
this.totalBytes = new long[1];
73+
// Alternatively, we could consider using a FixedBitSet here, sized to max(fieldNumber).
74+
// This should be faster without worrying too much about memory usage.
75+
this.seenFields = new IntHashSet(state.fieldInfos.size());
7276
}
7377

7478
@Override
7579
public void write(Fields fields, NormsProducer norms) throws IOException {
76-
in.write(new TrackingLengthFields(fields, termsBytesPerField, state.fieldInfos), norms);
77-
long totalBytes = 0;
78-
for (int bytes : termsBytesPerField.values) {
79-
totalBytes += bytes;
80-
}
81-
state.segmentInfo.putAttribute(IN_MEMORY_POSTINGS_BYTES_KEY, Long.toString(totalBytes));
80+
in.write(new TrackingLengthFields(fields, state.fieldInfos, seenFields, totalBytes), norms);
81+
state.segmentInfo.putAttribute(IN_MEMORY_POSTINGS_BYTES_KEY, Long.toString(totalBytes[0]));
8282
}
8383

8484
@Override
@@ -88,13 +88,15 @@ public void close() throws IOException {
8888
}
8989

9090
static final class TrackingLengthFields extends FilterLeafReader.FilterFields {
91-
final IntIntHashMap termsBytesPerField;
9291
final FieldInfos fieldInfos;
92+
final IntHashSet seenFields;
93+
final long[] totalBytes;
9394

94-
TrackingLengthFields(Fields in, IntIntHashMap termsBytesPerField, FieldInfos fieldInfos) {
95+
TrackingLengthFields(Fields in, FieldInfos fieldInfos, IntHashSet seenFields, long[] totalBytes) {
9596
super(in);
96-
this.termsBytesPerField = termsBytesPerField;
97+
this.seenFields = seenFields;
9798
this.fieldInfos = fieldInfos;
99+
this.totalBytes = totalBytes;
98100
}
99101

100102
@Override
@@ -104,10 +106,14 @@ public Terms terms(String field) throws IOException {
104106
return null;
105107
}
106108
int fieldNum = fieldInfos.fieldInfo(field).number;
107-
return new TrackingLengthTerms(
108-
terms,
109-
bytes -> termsBytesPerField.put(fieldNum, Math.max(termsBytesPerField.getOrDefault(fieldNum, 0), bytes))
110-
);
109+
if (seenFields.add(fieldNum)) {
110+
return new TrackingLengthTerms(terms, bytes -> totalBytes[0] += bytes);
111+
} else {
112+
// As far as I know, this method is only invoked twice for the same field when the bloom filter for the _id field gets written.
113+
// So maybe we could get rid of seenFields here and just keep track of whether the _id field has been seen? However, this
114+
// is fragile and could make us vulnerable to tricky bugs in the future if this is no longer the case.
115+
return terms;
116+
}
111117
}
112118
}
113119

0 commit comments

Comments
 (0)