 import org.apache.lucene.index.SegmentWriteState;
 import org.apache.lucene.index.Terms;
 import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.internal.hppc.IntIntHashMap;
+import org.apache.lucene.internal.hppc.IntHashSet;
 import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.util.FeatureFlag;
@@ -63,22 +63,22 @@ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException
     static final class TrackingLengthFieldsConsumer extends FieldsConsumer {
         final SegmentWriteState state;
         final FieldsConsumer in;
-        final IntIntHashMap termsBytesPerField;
+        final IntHashSet seenFields;
+        final long[] totalBytes;

         TrackingLengthFieldsConsumer(SegmentWriteState state, FieldsConsumer in) {
             this.state = state;
             this.in = in;
-            this.termsBytesPerField = new IntIntHashMap(state.fieldInfos.size());
+            this.totalBytes = new long[1];
+            // Alternatively, we could use a FixedBitSet here, sized to max(fieldNumber).
+            // That should be faster without worrying too much about memory usage.
+            this.seenFields = new IntHashSet(state.fieldInfos.size());
         }

         @Override
         public void write(Fields fields, NormsProducer norms) throws IOException {
-            in.write(new TrackingLengthFields(fields, termsBytesPerField, state.fieldInfos), norms);
-            long totalBytes = 0;
-            for (int bytes : termsBytesPerField.values) {
-                totalBytes += bytes;
-            }
-            state.segmentInfo.putAttribute(IN_MEMORY_POSTINGS_BYTES_KEY, Long.toString(totalBytes));
+            in.write(new TrackingLengthFields(fields, state.fieldInfos, seenFields, totalBytes), norms);
+            state.segmentInfo.putAttribute(IN_MEMORY_POSTINGS_BYTES_KEY, Long.toString(totalBytes[0]));
         }

         @Override
@@ -88,13 +88,15 @@ public void close() throws IOException {
     }

     static final class TrackingLengthFields extends FilterLeafReader.FilterFields {
-        final IntIntHashMap termsBytesPerField;
         final FieldInfos fieldInfos;
+        final IntHashSet seenFields;
+        final long[] totalBytes;

-        TrackingLengthFields(Fields in, IntIntHashMap termsBytesPerField, FieldInfos fieldInfos) {
+        TrackingLengthFields(Fields in, FieldInfos fieldInfos, IntHashSet seenFields, long[] totalBytes) {
             super(in);
-            this.termsBytesPerField = termsBytesPerField;
+            this.seenFields = seenFields;
             this.fieldInfos = fieldInfos;
+            this.totalBytes = totalBytes;
         }

         @Override
@@ -104,10 +106,14 @@ public Terms terms(String field) throws IOException {
                 return null;
             }
             int fieldNum = fieldInfos.fieldInfo(field).number;
-            return new TrackingLengthTerms(
-                terms,
-                bytes -> termsBytesPerField.put(fieldNum, Math.max(termsBytesPerField.getOrDefault(fieldNum, 0), bytes))
-            );
+            if (seenFields.add(fieldNum)) {
+                return new TrackingLengthTerms(terms, bytes -> totalBytes[0] += bytes);
+            } else {
+                // As far as I know, this method is only invoked twice for the same field when the bloom filter for the
+                // _id field gets written. So maybe we could drop seenFields and just track whether the _id field has
+                // been seen? However, that would be fragile and could expose us to tricky bugs if this ever changes.
+                return terms;
+            }
         }
     }

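
For readers following the diff: instead of recording a per-field maximum in an IntIntHashMap and summing at the end, the new code keeps one running total. A single-element long[] serves as a mutable accumulator shared between the consumer and the Fields wrapper, and an IntHashSet ensures a field whose terms(...) is visited twice is only counted once. Below is a minimal, dependency-free sketch of that pattern; the class and method names are hypothetical, and java.util.HashSet stands in for Lucene's internal IntHashSet.

import java.util.HashSet;
import java.util.Set;

// Hypothetical stand-in for the accounting pattern in the diff above.
class PostingsBytesTracker {
    // Single-element array: a cheap mutable long that helper objects can share.
    final long[] totalBytes = new long[1];
    // Field numbers already counted, so a field visited twice is counted once.
    final Set<Integer> seenFields = new HashSet<>();

    // Called once per terms(field) visit with the bytes attributed to that field.
    void onTermsWritten(int fieldNumber, long bytes) {
        if (seenFields.add(fieldNumber)) {
            totalBytes[0] += bytes; // first visit: count it
        }
        // subsequent visits: already counted, do nothing
    }

    long total() {
        return totalBytes[0];
    }
}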
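
The inline comment in the constructor floats a FixedBitSet as an alternative to the IntHashSet. A hedged sketch of what that could look like, assuming the maximum field number is known when the tracker is created (names are hypothetical):

import org.apache.lucene.util.FixedBitSet;

// Sketch of the FixedBitSet alternative: field numbers index bits directly,
// so the set must be sized to max(fieldNumber) + 1.
class BitSetPostingsBytesTracker {
    final FixedBitSet seenFields;
    final long[] totalBytes = new long[1];

    BitSetPostingsBytesTracker(int maxFieldNumber) {
        this.seenFields = new FixedBitSet(maxFieldNumber + 1);
    }

    void onTermsWritten(int fieldNumber, long bytes) {
        // getAndSet returns the previous bit; false means this is the field's first visit.
        if (seenFields.getAndSet(fieldNumber) == false) {
            totalBytes[0] += bytes;
        }
    }
}

The trade-off the comment hints at: constant-time bit checks with no hashing, at the cost of memory proportional to the largest field number rather than the number of fields actually seen.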