Skip to content

Commit fef8871

Browse files
authored
CNDB-13997 maintain total terms count in memindex (#1776)
Track the terms count when updating the memtable index and use it together with the indexed rows count in BM25 queries. The terms count is approximate, since the memtable index doesn't account for deletes. This is needed for the next step: calculating the average document length across all documents of a table partition. Fixes confusion in tests between ID column values and positions in the data array. Changes the values of the ID column for BM25 to start from 0, so that they correspond to positions in the original data array.
1 parent 1db6138 commit fef8871

File tree

3 files changed

+121
-72
lines changed

3 files changed

+121
-72
lines changed

src/java/org/apache/cassandra/index/sai/memory/TrieMemoryIndex.java

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,8 @@ public class TrieMemoryIndex extends MemoryIndex
9292
private final PrimaryKeysRemover primaryKeysRemover;
9393
private final boolean analyzerTransformsValue;
9494
private final Map<PrimaryKey, Integer> docLengths = new HashMap<>();
95-
private final AtomicInteger indexedRows = new AtomicInteger(0);
95+
private volatile int indexedRows = 0;
96+
private volatile long totalTermCount = 0;
9697

9798
private final Memtable memtable;
9899
private AbstractBounds<PartitionPosition> keyBounds;
@@ -134,7 +135,18 @@ public synchronized Map<PrimaryKey, Integer> getDocLengths()
134135
@Override
135136
public int indexedRows()
136137
{
137-
return indexedRows.get();
138+
return indexedRows;
139+
}
140+
141+
/**
142+
* The count of terms for indexed rows is maintained during insertions and updates.
143+
* Deletes are not accounted for, so the count is approximate.
144+
*
145+
* @return the total number of terms in the indexed rows
146+
*/
147+
public long approximateTotalTermCount()
148+
{
149+
return totalTermCount;
138150
}
139151

140152
public synchronized void add(DecoratedKey key,
@@ -263,6 +275,12 @@ private void applyTransformer(PrimaryKey primaryKey,
263275
Object prev = docLengths.put(primaryKey, tokenCount);
264276
if (prev != null)
265277
{
278+
// An update first transforms with Accumulator to the new value,
279+
// then transforms with Remover from the old value.
280+
if (transformer instanceof PrimaryKeysAccumulator)
281+
totalTermCount += tokenCount;
282+
if (transformer instanceof PrimaryKeysRemover)
283+
totalTermCount -= tokenCount;
266284
// heap used for doc lengths
267285
long heapUsed = RamUsageEstimator.HASHTABLE_RAM_BYTES_PER_ENTRY
268286
+ primaryKey.ramBytesUsed() // TODO do we count these bytes?
@@ -271,7 +289,8 @@ private void applyTransformer(PrimaryKey primaryKey,
271289
}
272290
else
273291
{
274-
indexedRows.incrementAndGet();
292+
indexedRows++;
293+
totalTermCount += tokenCount;
275294
}
276295

277296
// memory used by the trie

src/java/org/apache/cassandra/index/sai/memory/TrieMemtableIndex.java

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,23 @@ public int indexedRows()
128128
return size;
129129
}
130130

131+
/**
132+
* Approximate total count of terms in the memory index.
133+
* The count is approximate because deletions are not accounted for.
134+
*
135+
* @return total count of terms for indexed rows.
136+
*/
137+
public long approximateTotalTermCount()
138+
{
139+
long count = 0;
140+
for (MemoryIndex memoryIndex : rangeIndexes)
141+
{
142+
assert memoryIndex instanceof TrieMemoryIndex;
143+
count += ((TrieMemoryIndex) memoryIndex).approximateTotalTermCount();
144+
}
145+
return count;
146+
}
147+
131148
@VisibleForTesting
132149
public int shardCount()
133150
{
@@ -411,8 +428,6 @@ private CloseableIterator<PrimaryKeyWithSortKey> orderByBM25(Stream<PrimaryKey>
411428
private BM25Utils.DocStats computeDocumentFrequencies(List<ByteBuffer> queryTerms, AbstractAnalyzer docAnalyzer)
412429
{
413430
var documentFrequencies = new HashMap<ByteBuffer, Long>();
414-
long docCount = 0;
415-
long totalTermCount = 0;
416431

417432
// count all documents in the queried column
418433
try (var it = memtable.makePartitionIterator(ColumnFilter.selection(RegularAndStaticColumns.of(indexContext.getDefinition())),
@@ -438,7 +453,6 @@ private BM25Utils.DocStats computeDocumentFrequencies(List<ByteBuffer> queryTerm
438453
while (docAnalyzer.hasNext())
439454
{
440455
ByteBuffer term = docAnalyzer.next();
441-
totalTermCount++;
442456
if (queryTerms.contains(term))
443457
queryTermsPerDoc.add(term);
444458
}
@@ -450,11 +464,10 @@ private BM25Utils.DocStats computeDocumentFrequencies(List<ByteBuffer> queryTerm
450464
for (ByteBuffer term : queryTermsPerDoc)
451465
documentFrequencies.merge(term, 1L, Long::sum);
452466

453-
docCount++;
454467
}
455468
}
456469
}
457-
return new BM25Utils.DocStats(documentFrequencies, docCount, totalTermCount);
470+
return new BM25Utils.DocStats(documentFrequencies, indexedRows(), approximateTotalTermCount());
458471
}
459472

460473
@Nullable

0 commit comments

Comments
 (0)