 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.internal.hppc.IntIntHashMap;
+import org.apache.lucene.internal.hppc.IntHashSet;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.util.FeatureFlag;

@@ -63,22 +63,22 @@ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException
     static final class TrackingLengthFieldsConsumer extends FieldsConsumer {
         final SegmentWriteState state;
         final FieldsConsumer in;
-        final IntIntHashMap termsBytesPerField;
+        final IntHashSet seenFields;
+        final long[] totalBytes;

         TrackingLengthFieldsConsumer(SegmentWriteState state, FieldsConsumer in) {
             this.state = state;
             this.in = in;
-            this.termsBytesPerField = new IntIntHashMap(state.fieldInfos.size());
+            this.totalBytes = new long[1];
+            // Alternatively, we could use a FixedBitSet here, sized to max(fieldNumber); see the sketch after the diff.
+            // That should be faster without worrying too much about memory usage.
+            this.seenFields = new IntHashSet(state.fieldInfos.size());
         }

         @Override
         public void write(Fields fields, NormsProducer norms) throws IOException {
-            in.write(new TrackingLengthFields(fields, termsBytesPerField, state.fieldInfos), norms);
-            long totalBytes = 0;
-            for (int bytes : termsBytesPerField.values) {
-                totalBytes += bytes;
-            }
-            state.segmentInfo.putAttribute(IN_MEMORY_POSTINGS_BYTES_KEY, Long.toString(totalBytes));
+            in.write(new TrackingLengthFields(fields, state.fieldInfos, seenFields, totalBytes), norms);
+            state.segmentInfo.putAttribute(IN_MEMORY_POSTINGS_BYTES_KEY, Long.toString(totalBytes[0]));
         }

         @Override
@@ -88,13 +88,15 @@ public void close() throws IOException {
     }

     static final class TrackingLengthFields extends FilterLeafReader.FilterFields {
-        final IntIntHashMap termsBytesPerField;
         final FieldInfos fieldInfos;
+        final IntHashSet seenFields;
+        final long[] totalBytes;

-        TrackingLengthFields(Fields in, IntIntHashMap termsBytesPerField, FieldInfos fieldInfos) {
+        TrackingLengthFields(Fields in, FieldInfos fieldInfos, IntHashSet seenFields, long[] totalBytes) {
             super(in);
-            this.termsBytesPerField = termsBytesPerField;
+            this.seenFields = seenFields;
             this.fieldInfos = fieldInfos;
+            this.totalBytes = totalBytes;
         }

         @Override
@@ -104,10 +106,14 @@ public Terms terms(String field) throws IOException {
                 return null;
             }
             int fieldNum = fieldInfos.fieldInfo(field).number;
-            return new TrackingLengthTerms(
-                terms,
-                bytes -> termsBytesPerField.put(fieldNum, Math.max(termsBytesPerField.getOrDefault(fieldNum, 0), bytes))
-            );
+            if (seenFields.add(fieldNum)) {
+                return new TrackingLengthTerms(terms, bytes -> totalBytes[0] += bytes);
+            } else {
+                // As far as I know, this method only gets invoked twice for the same field when the bloom filter for
+                // the _id field is written. So maybe we could get rid of seenFields and just track whether the _id
+                // field has been seen? However, that would be fragile and could make us vulnerable to tricky bugs in
+                // the future if this assumption no longer holds.
+                return terms;
+            }
         }
     }

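A minimal sketch of the FixedBitSet alternative floated in the constructor comment, assuming field numbers are dense enough that sizing the bitset to max(fieldNumber) + 1 is acceptable; `SeenFieldsBitSet` is a hypothetical helper name, not part of the patch:

```java
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.util.FixedBitSet;

final class SeenFieldsBitSet {
    private final FixedBitSet seen;

    SeenFieldsBitSet(FieldInfos fieldInfos) {
        // Size to the highest field number so every field fits without resizing.
        int maxFieldNumber = 0;
        for (FieldInfo fi : fieldInfos) {
            maxFieldNumber = Math.max(maxFieldNumber, fi.number);
        }
        this.seen = new FixedBitSet(maxFieldNumber + 1);
    }

    /** Returns true the first time a field number is seen, false on every later call. */
    boolean add(int fieldNum) {
        return seen.getAndSet(fieldNum) == false;
    }
}
```

This mirrors the `IntHashSet#add` contract used in `terms(String)`, so it could be swapped in without changing the dedup logic.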
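The `long[1]` in the consumer is the usual Java idiom for a mutable counter shared between objects and lambdas: the reference itself stays effectively final while its single slot remains writable. A standalone illustration, with `LongConsumer` standing in for whatever callback type `TrackingLengthTerms` actually accepts:

```java
import java.util.function.LongConsumer;

final class SharedCounterDemo {
    public static void main(String[] args) {
        // A captured local must be effectively final, but the array's contents stay mutable.
        long[] totalBytes = new long[1];
        LongConsumer onFieldBytes = bytes -> totalBytes[0] += bytes;
        onFieldBytes.accept(128);
        onFieldBytes.accept(256);
        System.out.println(totalBytes[0]); // prints 384
    }
}
```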