Commit b38dadc

jasonstackdriftx authored and committed
CNDB-14773: avoid Int2IntHashMap overflow in RAMStringIndexer and improve memory tracking to include array memory usage (#1885)
### What is the issue
#14773: Int2IntHashMap overflow when the number of docs reaches 348_966_081.

### What does this PR fix and why was it fixed
Trigger a segment flush before the overflow can happen, and include array memory usage in the estimate to avoid undercounting memory usage.
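To make the threshold concrete, here is a minimal sketch of the arithmetic (the class name is made up; the 0.65 load factor is an assumption based on Agrona's documented default, and the capacity figure comes from this commit's own comments):

// Minimal sketch of the overflow arithmetic behind the 300_000_000 flush threshold.
public class Int2IntOverflowSketch
{
    public static void main(String[] args)
    {
        long capacity = 536_870_912L;                      // 2^29, the capacity cited in the commit
        long resizeThreshold = (long) (capacity * 0.65f);  // resize fires once size exceeds ~348_966_080
        long grownCapacity = capacity * 4;                  // the commit notes the next capacity would be quadrupled

        System.out.println("resize threshold = " + resizeThreshold);
        System.out.println("grown capacity   = " + grownCapacity
                           + ", exceeds Integer.MAX_VALUE: " + (grownCapacity > Integer.MAX_VALUE));
    }
}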
1 parent 1d6ebd5 commit b38dadc

File tree

3 files changed, +50 -2 lines changed

src/java/org/apache/cassandra/index/sai/disk/RAMPostingSlices.java

Lines changed: 5 additions & 0 deletions
@@ -49,6 +49,11 @@ class RAMPostingSlices
         this.includeFrequencies = includeFrequencies;
     }
 
+    long arrayMemoryUsage()
+    {
+        return postingStarts.length * 4L + postingUptos.length * 4L + sizes.length * 4L;
+    }
+
     /**
      * Creates and returns a PostingList for the given term ID.
      */
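For context, the new helper simply charges 4 bytes per slot across the three int[] arrays backing the slices. A standalone illustration of that accounting, with made-up array sizes:

// Illustration of the arrayMemoryUsage() accounting: 4 bytes per int slot.
public class SliceArrayAccountingSketch
{
    public static void main(String[] args)
    {
        int[] postingStarts = new int[1024]; // hypothetical sizes, for the example only
        int[] postingUptos = new int[1024];
        int[] sizes = new int[1024];

        long bytes = postingStarts.length * 4L + postingUptos.length * 4L + sizes.length * 4L;
        System.out.println(bytes + " bytes"); // 3 * 1024 slots * 4 bytes = 12288
    }
}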

src/java/org/apache/cassandra/index/sai/disk/RAMStringIndexer.java

Lines changed: 25 additions & 2 deletions
@@ -39,6 +39,15 @@ public class RAMStringIndexer
 {
     @VisibleForTesting
     public static int MAX_BLOCK_BYTE_POOL_SIZE = Integer.MAX_VALUE;
+
+    /**
+     * Int2IntHashMap "docLengths" needs to resize when size reaches 348_966_081 (capacity * loadFactor). At that point, its capacity is 536870912.
+     * Its new capacity will be quadrupled and exceed Integer.MAX_VALUE.
+     *
+     * Pick 300_000_000 for simplicity to trigger segment flush.
+     */
+    private static final int MAX_DOCS_SIZE = 300_000_000;
+
     private final BytesRefHash termsHash;
     private final RAMPostingSlices slices;
     // counters need to be separate so that we can trigger flushes if either ByteBlockPool hits maximum size
@@ -48,11 +57,19 @@ public class RAMStringIndexer
     private int[] lastSegmentRowID = new int[RAMPostingSlices.DEFAULT_TERM_DICT_SIZE];
 
     private final boolean writeFrequencies;
+    private final int maxDocSize;
     private final Int2IntHashMap docLengths = new Int2IntHashMap(Integer.MIN_VALUE);
 
     public RAMStringIndexer(boolean writeFrequencies)
+    {
+        this(writeFrequencies, MAX_DOCS_SIZE);
+    }
+
+    @VisibleForTesting
+    RAMStringIndexer(boolean writeFrequencies, int maxDocSize)
     {
         this.writeFrequencies = writeFrequencies;
+        this.maxDocSize = maxDocSize;
         termsBytesUsed = Counter.newCounter();
         slicesBytesUsed = Counter.newCounter();
 
@@ -65,7 +82,11 @@ public RAMStringIndexer(boolean writeFrequencies)
 
     public long estimatedBytesUsed()
     {
-        return termsBytesUsed.get() + slicesBytesUsed.get();
+        // record the array memory usage from Int2IntHashMap docLengths:
+        // * array size is capacity * 2
+        // * 4 bytes per int
+        long docLengthsMemoryUsage = docLengths.capacity() * 2 * 4L;
+        return docLengthsMemoryUsage + termsBytesUsed.get() + slicesBytesUsed.get() + slices.arrayMemoryUsage();
     }
 
     public boolean requiresFlush()
@@ -75,7 +96,9 @@ public boolean requiresFlush()
         // be triggered by an addition, and the rest of the space in the final chunk will be wasted, as the bytesUsed
         // counters track block allocation, not the size of additions. This means that we can't pass this check and then
         // fail to add a term.
-        return termsBytesUsed.get() >= MAX_BLOCK_BYTE_POOL_SIZE || slicesBytesUsed.get() >= MAX_BLOCK_BYTE_POOL_SIZE;
+        return termsBytesUsed.get() >= MAX_BLOCK_BYTE_POOL_SIZE || slicesBytesUsed.get() >= MAX_BLOCK_BYTE_POOL_SIZE
+               // to avoid Int2IntHashMap new capacity overflow
+               || docLengths.size() >= maxDocSize;
     }
 
     public boolean isEmpty()
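The docLengths term added to estimatedBytesUsed() can be sanity-checked in isolation. A small sketch, assuming Agrona's Int2IntHashMap (the collection the field appears to use) and a made-up document count:

import org.agrona.collections.Int2IntHashMap;

// Sanity check of the docLengths accounting: the map's backing array holds
// capacity * 2 ints at 4 bytes each, the same expression used in the diff.
public class DocLengthsAccountingSketch
{
    public static void main(String[] args)
    {
        Int2IntHashMap docLengths = new Int2IntHashMap(Integer.MIN_VALUE); // mirrors the field in the diff

        for (int rowId = 0; rowId < 1_000_000; rowId++)
            docLengths.put(rowId, 1); // pretend every doc has length 1

        long docLengthsMemoryUsage = docLengths.capacity() * 2 * 4L;
        System.out.println(docLengths.size() + " docs -> capacity " + docLengths.capacity()
                           + " -> " + docLengthsMemoryUsage + " bytes for the backing array");
    }
}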

test/unit/org/apache/cassandra/index/sai/disk/RAMStringIndexerTest.java

Lines changed: 20 additions & 0 deletions
@@ -77,6 +77,26 @@ public void test() throws Exception
         }
     }
 
+    @Test
+    public void testLargeNumberOfDocs()
+    {
+        int maxDocsSize = 1000;
+        RAMStringIndexer indexer = new RAMStringIndexer(false, maxDocsSize);
+
+        int startingRowId = 0;
+        int i = 0;
+        while (i++ < maxDocsSize)
+        {
+            int rowId = startingRowId + i;
+            indexer.addAll(List.of(new BytesRef("0")), rowId);
+
+            if (i < maxDocsSize)
+                assertFalse(indexer.requiresFlush());
+        }
+
+        assertTrue(indexer.requiresFlush());
+    }
+
     @Test
     public void testWithFrequencies() throws Exception
     {
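Not part of this commit, but a natural companion check given the new accounting: estimatedBytesUsed() should now grow as documents are added, even before either block pool fills. A sketch reusing the test's scaffolding (the test name is hypothetical):

    @Test
    public void testEstimatedBytesUsedGrowsWithDocs()
    {
        RAMStringIndexer indexer = new RAMStringIndexer(false, 1000);
        long before = indexer.estimatedBytesUsed();

        for (int rowId = 0; rowId < 1000; rowId++)
            indexer.addAll(List.of(new BytesRef("0")), rowId);

        // with docLengths and the slice arrays now counted, the estimate should have grown
        assertTrue(indexer.estimatedBytesUsed() > before);
    }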
