 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.internal.hppc.IntIntHashMap;
+import org.apache.lucene.internal.hppc.IntHashSet;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.util.FeatureFlag;
@@ -63,22 +63,22 @@ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException
     static final class TrackingLengthFieldsConsumer extends FieldsConsumer {
         final SegmentWriteState state;
         final FieldsConsumer in;
-        final IntIntHashMap termsBytesPerField;
+        final IntHashSet seenFields;
+        final long[] totalBytes;

         TrackingLengthFieldsConsumer(SegmentWriteState state, FieldsConsumer in) {
             this.state = state;
             this.in = in;
-            this.termsBytesPerField = new IntIntHashMap(state.fieldInfos.size());
+            this.totalBytes = new long[1];
+            // Alternatively, we could use a FixedBitSet here, sized to max(fieldNumber).
+            // That should be faster without worrying too much about memory usage.
+            this.seenFields = new IntHashSet(state.fieldInfos.size());
         }

         @Override
         public void write(Fields fields, NormsProducer norms) throws IOException {
-            in.write(new TrackingLengthFields(fields, termsBytesPerField, state.fieldInfos), norms);
-            long totalBytes = 0;
-            for (int bytes : termsBytesPerField.values) {
-                totalBytes += bytes;
-            }
-            state.segmentInfo.putAttribute(IN_MEMORY_POSTINGS_BYTES_KEY, Long.toString(totalBytes));
+            in.write(new TrackingLengthFields(fields, state.fieldInfos, seenFields, totalBytes), norms);
+            state.segmentInfo.putAttribute(IN_MEMORY_POSTINGS_BYTES_KEY, Long.toString(totalBytes[0]));
         }

         @Override
@@ -88,13 +88,15 @@ public void close() throws IOException {
     }

     static final class TrackingLengthFields extends FilterLeafReader.FilterFields {
-        final IntIntHashMap termsBytesPerField;
         final FieldInfos fieldInfos;
+        final IntHashSet seenFields;
+        final long[] totalBytes;

-        TrackingLengthFields(Fields in, IntIntHashMap termsBytesPerField, FieldInfos fieldInfos) {
+        TrackingLengthFields(Fields in, FieldInfos fieldInfos, IntHashSet seenFields, long[] totalBytes) {
             super(in);
-            this.termsBytesPerField = termsBytesPerField;
+            this.seenFields = seenFields;
             this.fieldInfos = fieldInfos;
+            this.totalBytes = totalBytes;
         }

         @Override
@@ -104,10 +106,14 @@ public Terms terms(String field) throws IOException {
                 return null;
             }
             int fieldNum = fieldInfos.fieldInfo(field).number;
-            return new TrackingLengthTerms(
-                terms,
-                bytes -> termsBytesPerField.put(fieldNum, Math.max(termsBytesPerField.getOrDefault(fieldNum, 0), bytes))
-            );
+            if (seenFields.add(fieldNum)) {
+                return new TrackingLengthTerms(terms, bytes -> totalBytes[0] += bytes);
+            } else {
+                // As far as I know, this method is only invoked twice for the same field when the bloom filter for the
+                // _id field gets written. So maybe we could drop seenFields and just track whether the _id field has
+                // been seen? However, that would be fragile and could expose us to tricky bugs if this ever changes.
+                return terms;
+            }
         }
     }

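
For readers following the diff: instead of recording a per-field maximum in an IntIntHashMap and summing at the end, the new code keeps one running total. A single-element long[] serves as a mutable accumulator shared between the consumer and the Fields wrapper, and an IntHashSet ensures a field whose terms(...) is visited twice is only counted once. Below is a minimal, dependency-free sketch of that pattern; the class and method names are hypothetical, and java.util.HashSet stands in for Lucene's internal IntHashSet.

import java.util.HashSet;
import java.util.Set;

// Hypothetical stand-in for the accounting pattern in the diff above.
class PostingsBytesTracker {
    // Single-element array: a cheap mutable long that helper objects can share.
    final long[] totalBytes = new long[1];
    // Field numbers already counted, so a field visited twice is counted once.
    final Set<Integer> seenFields = new HashSet<>();

    // Called once per terms(field) visit with the bytes attributed to that field.
    void onTermsWritten(int fieldNumber, long bytes) {
        if (seenFields.add(fieldNumber)) {
            totalBytes[0] += bytes; // first visit: count it
        }
        // subsequent visits: already counted, do nothing
    }

    long total() {
        return totalBytes[0];
    }
}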
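
The inline comment in the constructor floats a FixedBitSet as an alternative to the IntHashSet. A hedged sketch of what that could look like, assuming the maximum field number is known when the tracker is created (names are hypothetical):

import org.apache.lucene.util.FixedBitSet;

// Sketch of the FixedBitSet alternative: field numbers index bits directly,
// so the set must be sized to max(fieldNumber) + 1.
class BitSetPostingsBytesTracker {
    final FixedBitSet seenFields;
    final long[] totalBytes = new long[1];

    BitSetPostingsBytesTracker(int maxFieldNumber) {
        this.seenFields = new FixedBitSet(maxFieldNumber + 1);
    }

    void onTermsWritten(int fieldNumber, long bytes) {
        // getAndSet returns the previous bit; false means this is the field's first visit.
        if (seenFields.getAndSet(fieldNumber) == false) {
            totalBytes[0] += bytes;
        }
    }
}

The trade-off the comment hints at: constant-time bit checks with no hashing, at the cost of memory proportional to the largest field number rather than the number of fields actually seen.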