@@ -39,6 +39,15 @@ public class RAMStringIndexer
 {
     @VisibleForTesting
     public static int MAX_BLOCK_BYTE_POOL_SIZE = Integer.MAX_VALUE;
+
+    /**
+     * The Int2IntHashMap "docLengths" resizes once its size reaches 348_966_081 (capacity * loadFactor), at which point its capacity is 536_870_912.
+     * Doubling that capacity would push the backing array (two ints per entry) past Integer.MAX_VALUE in length.
+     *
+     * Pick 300_000_000 for simplicity to trigger a segment flush before that happens.
+     */
+    private static final int MAX_DOCS_SIZE = 300_000_000;
+
     private final BytesRefHash termsHash;
     private final RAMPostingSlices slices;
     // counters need to be separate so that we can trigger flushes if either ByteBlockPool hits maximum size
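For reference, the arithmetic behind MAX_DOCS_SIZE can be checked in isolation. The sketch below assumes Agrona's Int2IntHashMap layout (a single int[] holding two ints per entry, capacity doubled on resize) and its default 0.65 load factor; it recomputes the figures in long arithmetic rather than reading them from the map:

// OverflowMath.java - standalone illustration of why docLengths must not grow unbounded
public class OverflowMath
{
    public static void main(String[] args)
    {
        long capacity = 536_870_912L;                 // 2^29, the capacity just before the fatal resize
        double loadFactor = 0.65;                     // assumed default load factor
        long resizeThreshold = (long) (capacity * loadFactor);
        System.out.println("resize triggers around size " + resizeThreshold);      // ~349M entries

        long newCapacity = capacity * 2;              // capacity doubles on resize
        long newArrayLength = newCapacity * 2;        // two ints (key + value) per slot
        System.out.println("new backing array length: " + newArrayLength);         // 2_147_483_648
        System.out.println("exceeds Integer.MAX_VALUE: " + (newArrayLength > Integer.MAX_VALUE)); // true
    }
}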
@@ -48,11 +57,19 @@ public class RAMStringIndexer
     private int[] lastSegmentRowID = new int[RAMPostingSlices.DEFAULT_TERM_DICT_SIZE];
 
     private final boolean writeFrequencies;
+    private final int maxDocSize;
     private final Int2IntHashMap docLengths = new Int2IntHashMap(Integer.MIN_VALUE);
 
     public RAMStringIndexer(boolean writeFrequencies)
+    {
+        this(writeFrequencies, MAX_DOCS_SIZE);
+    }
+
+    @VisibleForTesting
+    RAMStringIndexer(boolean writeFrequencies, int maxDocSize)
     {
         this.writeFrequencies = writeFrequencies;
+        this.maxDocSize = maxDocSize;
         termsBytesUsed = Counter.newCounter();
         slicesBytesUsed = Counter.newCounter();
 
@@ -65,7 +82,11 @@ public RAMStringIndexer(boolean writeFrequencies)
 
     public long estimatedBytesUsed()
     {
-        return termsBytesUsed.get() + slicesBytesUsed.get();
+        // record the array memory usage from Int2IntHashMap docLengths:
+        // * array size is capacity * 2
+        // * 4 bytes per int
+        long docLengthsMemoryUsage = docLengths.capacity() * 2 * 4L;
+        return docLengthsMemoryUsage + termsBytesUsed.get() + slicesBytesUsed.get() + slices.arrayMemoryUsage();
     }
 
     public boolean requiresFlush()
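The overhead this adds to the estimate is easy to bound. Assuming power-of-two capacities and the 0.65 load factor, the smallest capacity that can hold the 300_000_000-document cap is 2^29, so the docLengths term tops out around 4 GiB just before a flush:

// DocLengthsFootprint.java - upper bound of the docLengths contribution to estimatedBytesUsed()
public class DocLengthsFootprint
{
    public static void main(String[] args)
    {
        long capacityAtFlush = 536_870_912L;          // smallest power of two with 300M / 0.65 <= capacity (assumption)
        long bytes = capacityAtFlush * 2 * 4;         // same formula as the patch: two ints per entry, 4 bytes per int
        System.out.println(bytes + " bytes, roughly " + (bytes >> 30) + " GiB");   // 4_294_967_296 bytes, 4 GiB
    }
}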
@@ -75,7 +96,9 @@ public boolean requiresFlush()
         // be triggered by an addition, and the rest of the space in the final chunk will be wasted, as the bytesUsed
         // counters track block allocation, not the size of additions. This means that we can't pass this check and then
         // fail to add a term.
-        return termsBytesUsed.get() >= MAX_BLOCK_BYTE_POOL_SIZE || slicesBytesUsed.get() >= MAX_BLOCK_BYTE_POOL_SIZE;
+        return termsBytesUsed.get() >= MAX_BLOCK_BYTE_POOL_SIZE || slicesBytesUsed.get() >= MAX_BLOCK_BYTE_POOL_SIZE
+               // to avoid Int2IntHashMap new capacity overflow
+               || docLengths.size() >= maxDocSize;
     }
 
     public boolean isEmpty()
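A hypothetical test sketch of how the package-private constructor could drive the new flush condition; the indexing call that would populate docLengths is elided because its signature is not part of this diff, and the test would need to live in the same package as RAMStringIndexer to reach the constructor:

// RAMStringIndexerFlushTest.java - hypothetical sketch, not part of this change
import org.junit.Assert;
import org.junit.Test;

public class RAMStringIndexerFlushTest
{
    @Test
    public void flushRequestedWhenDocCountReachesCap()
    {
        // the @VisibleForTesting constructor added above lets a test shrink the cap from 300M to 4
        RAMStringIndexer indexer = new RAMStringIndexer(false, 4);
        Assert.assertFalse(indexer.requiresFlush());

        // ...index terms for 4 distinct rows here (the add method is outside this diff)...
        // once docLengths.size() reaches the cap, requiresFlush() should report true:
        // Assert.assertTrue(indexer.requiresFlush());
    }
}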