Skip to content

Commit 0076327

Browse files
committed
Add Branch.elementSize()
Capture the type dispatch in CNode.elementSize() into a Branch.elementSize(), which is then implemented by the two types. As a consequence, the type safety of MainNode.size() is improved as well. Signed-off-by: Robert Varga <robert.varga@pantheon.tech>
1 parent 7ff546a commit 0076327

File tree

8 files changed

+63
-34
lines changed

8 files changed

+63
-34
lines changed

triemap/src/main/java/tech/pantheon/triemap/Branch.java

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,11 @@
1919
* A Branch: either an {@link INode} or an {@link SNode}.
2020
*/
2121
sealed interface Branch<K, V> permits INode, SNode {
22-
// Nothing else
22+
/**
23+
* Return the number of entries for the purposes of {@link CNode#size(ImmutableTrieMap)}.
24+
*
25+
* @param ct TrieMap reference
26+
* @return The actual number of entries
27+
*/
28+
int elementSize(ImmutableTrieMap<K, V> ct);
2329
}

triemap/src/main/java/tech/pantheon/triemap/CNode.java

Lines changed: 44 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -284,43 +284,61 @@ int trySize() {
284284
}
285285

286286
@Override
287-
int size(final ImmutableTrieMap<?, ?> ct) {
287+
int size(final ImmutableTrieMap<K, V> ct) {
288288
int sz;
289289
return (sz = csize) != NO_SIZE ? sz : (csize = computeSize(ct));
290290
}
291291

292-
// lends itself towards being parallelizable by choosing
293-
// a random starting offset in the array
294-
// => if there are concurrent size computations, they start
295-
// at different positions, so they are more likely to
296-
// to be independent
297-
private int computeSize(final ImmutableTrieMap<?, ?> ct) {
292+
private int computeSize(final ImmutableTrieMap<K, V> ct) {
298293
final int len = array.length;
299294
return switch (len) {
300295
case 0 -> 0;
301-
case 1 -> elementSize(ct, array[0]);
302-
default -> {
303-
final int offset = ThreadLocalRandom.current().nextInt(len);
304-
int sz = 0;
305-
for (int i = offset; i < len; ++i) {
306-
sz += elementSize(ct, array[i]);
307-
}
308-
for (int i = 0; i < offset; ++i) {
309-
sz += elementSize(ct, array[i]);
310-
}
311-
yield sz;
312-
}
296+
case 1 -> array[0].elementSize(ct);
297+
default -> computeSize(ct, array, len);
313298
};
314299
}
315300

316-
private static int elementSize(final ImmutableTrieMap<?, ?> ct, final Branch<?, ?> elem) {
317-
if (elem instanceof SNode) {
318-
return 1;
319-
} else if (elem instanceof INode<?, ?> inode) {
320-
return inode.readSize(ct);
321-
} else {
322-
throw invalidElement(elem);
301+
// Lends itself towards being parallelizable by choosing a random starting offset in the array: if there are
302+
// concurrent size computations, they start at different positions, so they are more likely to be independent
303+
private static <K, V> int computeSize(final ImmutableTrieMap<K, V> ct, final Branch<K, V>[] array, final int len) {
304+
// TODO: The other side of this argument is that array is 2-32 items long, i.e. on OpenJDK 21 on x64 the array
305+
// ends up being 16 + (2-32) * (4/8) == 24-144 / 32-272 bytes each.
306+
//
307+
// When traversing we do not dereference SNodes, but each INode either returns a cached value or goes off
308+
// and branches (via a 16-byte object) branch to (eventually) this code in some other CNode. We also know
309+
// we have at least 2 entries to traverse.
310+
//
311+
// Taking into consideration a modern CPU, with:
312+
// - 12 physical cores: 4 P-cores (2 threads each), 8 E-cores (1 thread each)
313+
// - 64 byte cache line size
314+
// - L1d
315+
// - 48KiB L1d per P-core
316+
// - 32KiB L1d per E-core
317+
// - L2 unified
318+
// - 1.25MiB per P-core
319+
// - 2MiB for each 4 E-cores
320+
// - L3 unified 12MiB
321+
// it would seam that all things being optimal, each thread is using 24-32KiB L1d, 512-1024KiB L2 and
322+
// about 769KiB of L3.
323+
//
324+
// So three things:
325+
// 0) We really would like to prevent L1d bounces, so threads on different cores should be touching
326+
// different cachelines. We are looking at traversing 3-5 linear cache lines.
327+
// 1) Would it make sense to inline the loops below, for example by counting odds and evens into
328+
// separate variables, striding by 2 and then combining the two counters?
329+
// 2) On the other hand, doesn't JIT already take care of this? Is there something we can do better,
330+
// like making sure the starting offset is aligned just by taking less random entropy?
331+
//
332+
// Note: len >= 2 is enforced by the sole caller
333+
final int offset = ThreadLocalRandom.current().nextInt(len);
334+
int sz = 0;
335+
for (int i = offset; i < len; ++i) {
336+
sz += array[i].elementSize(ct);
337+
}
338+
for (int i = 0; i < offset; ++i) {
339+
sz += array[i].elementSize(ct);
323340
}
341+
return sz;
324342
}
325343

326344
private CNode<K, V> updatedAt(final int pos, final Branch<K, V> nn, final Gen ngen) {

triemap/src/main/java/tech/pantheon/triemap/INode.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,8 @@ INode<K, V> copyToGen(final TrieMap<K, V> ct, final Gen ngen) {
215215
return new INode<>(ngen, gcasRead(ct));
216216
}
217217

218-
int readSize(final ImmutableTrieMap<?, ?> ct) {
218+
@Override
219+
public int elementSize(final ImmutableTrieMap<K, V> ct) {
219220
return gcasReadNonNull(ct).size(ct);
220221
}
221222

triemap/src/main/java/tech/pantheon/triemap/ImmutableTrieMap.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ public V replace(final K key, final V value) {
104104

105105
@Override
106106
public int size() {
107-
return root.readSize(this);
107+
return root.elementSize(this);
108108
}
109109

110110
@Override

triemap/src/main/java/tech/pantheon/triemap/LNode.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ int trySize() {
3737
}
3838

3939
@Override
40-
int size(final ImmutableTrieMap<?, ?> ct) {
40+
int size(final ImmutableTrieMap<K, V> ct) {
4141
return size;
4242
}
4343
}

triemap/src/main/java/tech/pantheon/triemap/MainNode.java

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,10 @@ abstract sealed class MainNode<K, V> extends INode.TryGcas<K, V> permits CNode,
5050
abstract int trySize();
5151

5252
/**
53-
* Return the number of entries in this node, traversing it if need be. This method should be invoked only
54-
* on immutable snapshots.
53+
* Return the number of entries in this node, traversing it if need be.
5554
*
5655
* @param ct TrieMap reference
5756
* @return The actual number of entries.
5857
*/
59-
abstract int size(ImmutableTrieMap<?, ?> ct);
58+
abstract int size(ImmutableTrieMap<K, V> ct);
6059
}

triemap/src/main/java/tech/pantheon/triemap/SNode.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,11 @@ boolean matches(final int otherHc, final Object otherKey) {
3535
return new Result<>(value);
3636
}
3737

38+
@Override
39+
public int elementSize(final ImmutableTrieMap<K, V> ct) {
40+
return 1;
41+
}
42+
3843
@Override
3944
public int hashCode() {
4045
return AbstractEntry.hashCode(key, value);

triemap/src/main/java/tech/pantheon/triemap/TNode.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ int trySize() {
5656
}
5757

5858
@Override
59-
int size(final ImmutableTrieMap<?, ?> ct) {
59+
int size(final ImmutableTrieMap<K, V> ct) {
6060
return 1;
6161
}
6262

0 commit comments

Comments
 (0)