@@ -284,43 +284,61 @@ int trySize() {
     }
 
     @Override
-    int size(final ImmutableTrieMap<?, ?> ct) {
+    int size(final ImmutableTrieMap<K, V> ct) {
         int sz;
         return (sz = csize) != NO_SIZE ? sz : (csize = computeSize(ct));
     }
 
-    // lends itself towards being parallelizable by choosing
-    // a random starting offset in the array
-    // => if there are concurrent size computations, they start
-    //    at different positions, so they are more likely to
-    //    to be independent
-    private int computeSize(final ImmutableTrieMap<?, ?> ct) {
+    private int computeSize(final ImmutableTrieMap<K, V> ct) {
         final int len = array.length;
         return switch (len) {
             case 0 -> 0;
-            case 1 -> elementSize(ct, array[0]);
-            default -> {
-                final int offset = ThreadLocalRandom.current().nextInt(len);
-                int sz = 0;
-                for (int i = offset; i < len; ++i) {
-                    sz += elementSize(ct, array[i]);
-                }
-                for (int i = 0; i < offset; ++i) {
-                    sz += elementSize(ct, array[i]);
-                }
-                yield sz;
-            }
+            case 1 -> array[0].elementSize(ct);
+            default -> computeSize(ct, array, len);
         };
     }
 
-    private static int elementSize(final ImmutableTrieMap<?, ?> ct, final Branch<?, ?> elem) {
-        if (elem instanceof SNode) {
-            return 1;
-        } else if (elem instanceof INode<?, ?> inode) {
-            return inode.readSize(ct);
-        } else {
-            throw invalidElement(elem);
+    // Lends itself towards being parallelizable by choosing a random starting offset in the array: if there are
+    // concurrent size computations, they start at different positions, so they are more likely to be independent
+    private static <K, V> int computeSize(final ImmutableTrieMap<K, V> ct, final Branch<K, V>[] array, final int len) {
+        // TODO: The other side of this argument is that array is 2-32 items long, i.e. on OpenJDK 21 on x64 the array
+        //       ends up being 16 + (2-32) * (4/8) == 24-144 / 32-272 bytes each.
+        //
+        //       When traversing we do not dereference SNodes, but each INode either returns a cached value or goes off
+        //       and branches (via a 16-byte object) to (eventually) this code in some other CNode. We also know
+        //       we have at least 2 entries to traverse.
+        //
+        //       Taking into consideration a modern CPU, with:
+        //       - 12 physical cores: 4 P-cores (2 threads each), 8 E-cores (1 thread each)
+        //       - 64 byte cache line size
+        //       - L1d
+        //         - 48KiB L1d per P-core
+        //         - 32KiB L1d per E-core
+        //       - L2 unified
+        //         - 1.25MiB per P-core
+        //         - 2MiB for each 4 E-cores
+        //       - L3 unified 12MiB
+        //       it would seem that, all things being optimal, each thread is using 24-32KiB L1d, 512-1024KiB L2 and
+        //       about 769KiB of L3.
+        //
+        //       So three things:
+        //       0) We really would like to prevent L1d bounces, so threads on different cores should be touching
+        //          different cachelines. We are looking at traversing 3-5 linear cache lines.
+        //       1) Would it make sense to inline the loops below, for example by counting odds and evens into
+        //          separate variables, striding by 2 and then combining the two counters?
+        //       2) On the other hand, doesn't JIT already take care of this? Is there something we can do better,
+        //          like making sure the starting offset is aligned just by taking less random entropy?
+        //
+        // Note: len >= 2 is enforced by the sole caller
+        final int offset = ThreadLocalRandom.current().nextInt(len);
+        int sz = 0;
+        for (int i = offset; i < len; ++i) {
+            sz += array[i].elementSize(ct);
+        }
+        for (int i = 0; i < offset; ++i) {
+            sz += array[i].elementSize(ct);
         }
+        return sz;
     }
 
     private CNode<K, V> updatedAt(final int pos, final Branch<K, V> nn, final Gen ngen) {
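
For reference, here is a minimal sketch of the inlining floated in item 1 of the TODO: accumulate alternating elements into two independent counters with a stride of 2, then combine them at the end. It is not part of this commit; it assumes the same Branch.elementSize(ct) accessor and the len >= 2 precondition, and whether it actually beats what the JIT already emits is precisely the open question the TODO raises.

    // Hypothetical variant, not in this commit: two accumulators shorten the dependency chain on a single
    // counter, at the cost of a modulo per access to keep the ring traversal starting at 'offset'.
    private static <K, V> int computeSizeStrided(final ImmutableTrieMap<K, V> ct, final Branch<K, V>[] array,
            final int len) {
        final int offset = ThreadLocalRandom.current().nextInt(len);
        int even = 0;
        int odd = 0;
        for (int step = 0; step < len; step += 2) {
            even += array[(offset + step) % len].elementSize(ct);
            if (step + 1 < len) {
                odd += array[(offset + step + 1) % len].elementSize(ct);
            }
        }
        return even + odd;
    }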
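
Similarly, a sketch of the "less random entropy" thought in item 2: pick the starting offset only at cache-line granularity so concurrent traversals tend to begin on distinct cache lines. The 64-byte line and 4-byte compressed references are assumptions lifted from the TODO's own numbers, and since the 16-byte array header means element 0 is not necessarily line-aligned, this only spreads starting points apart rather than guaranteeing alignment.

    // Hypothetical helper, not in this commit: REFS_PER_LINE assumes a 64-byte cache line holding
    // 4-byte compressed references.
    private static final int REFS_PER_LINE = 16;

    private static int alignedOffset(final int len) {
        // Cache lines spanned by the element payload, rounded up; len >= 2 as in computeSize().
        final int lines = (len + REFS_PER_LINE - 1) / REFS_PER_LINE;
        // (lines - 1) * REFS_PER_LINE is always < len, so the result is a valid starting index.
        return ThreadLocalRandom.current().nextInt(lines) * REFS_PER_LINE;
    }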