@@ -78,15 +78,13 @@ public String toString() {
7878 /**
7979 * Loads low cardinality singleton ordinals using a cache of code point counts.
8080 * <p>
81- * If we haven't cached the counts for all ordinals then the process looks like:
81+ * It's very important to look up ordinals in ascending order. So, if we haven't
82+ * cached the counts for all ordinals, then the process looks like:
8283 * </p>
8384 * <ol>
84- * <li>Build an int[] containing the ordinals</li>
85- * <li>
86- * Sort a copy of the int[] and load the cache for each ordinal. The sorting
87- * is important here because ordinals are faster to resolved in ascending order.
88- * </li>
89- * <li>Walk the int[] in order, reading from the cache to build the page</li>
85+ * <li>Build an {@code int[]} containing the ordinals</li>
86+ * <li>Sort a copy of the {@code int[]} and load the count into the cache for each ordinal.</li>
87+ * <li>Walk the unsorted {@code int[]} reading from the cache to build the page</li>
9088 * </ol>
9189 * <p>
9290 * If we <strong>have</strong> cached the counts for all ordinals we load the
@@ -178,6 +176,9 @@ private int[] readOrds(BlockFactory factory, Docs docs, int offset) throws IOExc
178176 }
179177 }
180178
179+ /**
180+ * Fill the cache for all ords. We skip values {@code -1} which represent "no data".
181+ */
181182 private void fillCache (BlockFactory factory , int [] ords ) throws IOException {
182183 factory .adjustBreaker (RamUsageEstimator .sizeOf (ords ));
183184 try {
@@ -196,6 +197,10 @@ private void fillCache(BlockFactory factory, int[] ords) throws IOException {
196197 }
197198 }
198199
200+ /**
201+ * Build the results for a list of documents directly from the cache. We use this
202+ * if we're sure that all ordinals we're going to load are already cached.
203+ */
199204 private Block buildFromFilledCache (BlockFactory factory , Docs docs , int offset ) throws IOException {
200205 int count = docs .count () - offset ;
201206 try (IntBuilder builder = factory .ints (count )) {
@@ -211,6 +216,10 @@ private Block buildFromFilledCache(BlockFactory factory, Docs docs, int offset)
211216 }
212217 }
213218
219+ /**
220+ * Get the count of code points at the ord, reading from the cache if possible.
221+ * The {@code ord} must be {@code >= 0} or this will fail.
222+ */
214223 private int codePointsAtOrd (int ord ) throws IOException {
215224 if (cache [ord ] >= 0 ) {
216225 return cache [ord ];
@@ -253,7 +262,7 @@ public Block read(BlockFactory factory, Docs docs, int offset, boolean nullsFilt
253262 return buildFromFilledCache (factory , docs , offset );
254263 }
255264
256- int [] ords = readOrds (factory , docs , offset );
265+ int [] ords = readOrds (ordinals , warnings , factory , docs , offset );
257266 try {
258267 fillCache (factory , ords );
259268 return buildFromCache (factory , cache , ords );
@@ -297,36 +306,9 @@ private Block blockForSingleDoc(BlockFactory factory, int docId) throws IOExcept
297306 return factory .constantNulls (1 );
298307 }
299308
300- private int [] readOrds (BlockFactory factory , Docs docs , int offset ) throws IOException {
301- int count = docs .count () - offset ;
302- long size = sizeOfArray (count );
303- factory .adjustBreaker (size );
304- int [] ords = null ;
305- try {
306- ords = new int [docs .count () - offset ];
307- for (int i = offset ; i < docs .count (); i ++) {
308- int doc = docs .get (i );
309- if (ordinals .advanceExact (doc ) == false ) {
310- ords [i ] = -1 ;
311- continue ;
312- }
313- if (ordinals .docValueCount () != 1 ) {
314- registerSingleValueWarning (warnings );
315- ords [i ] = -1 ;
316- continue ;
317- }
318- ords [i ] = Math .toIntExact (ordinals .nextOrd ());
319- }
320- int [] result = ords ;
321- ords = null ;
322- return result ;
323- } finally {
324- if (ords != null ) {
325- factory .adjustBreaker (-size );
326- }
327- }
328- }
329-
309+ /**
310+ * Fill the cache for all ords. We skip values {@code -1} which represent "no data".
311+ */
330312 private void fillCache (BlockFactory factory , int [] ords ) throws IOException {
331313 factory .adjustBreaker (RamUsageEstimator .sizeOf (ords ));
332314 try {
@@ -345,6 +327,10 @@ private void fillCache(BlockFactory factory, int[] ords) throws IOException {
345327 }
346328 }
347329
330+ /**
331+ * Build the results for a list of documents directly from the cache. We use this
332+ * if we're sure that all ordinals we're going to load are already cached.
333+ */
348334 private Block buildFromFilledCache (BlockFactory factory , Docs docs , int offset ) throws IOException {
349335 int count = docs .count () - offset ;
350336 try (IntBuilder builder = factory .ints (count )) {
@@ -365,6 +351,10 @@ private Block buildFromFilledCache(BlockFactory factory, Docs docs, int offset)
365351 }
366352 }
367353
354+ /**
355+ * Get the count of code points at the ord, reading from the cache if possible.
356+ * The {@code ord} must be {@code >= 0} or this will fail.
357+ */
368358 private int codePointsAtOrd (int ord ) throws IOException {
369359 if (cache [ord ] >= 0 ) {
370360 return cache [ord ];
@@ -378,8 +368,21 @@ private int codePointsAtOrd(int ord) throws IOException {
378368 }
379369
380370 /**
381- * Loads a count of utf-8 code points for each ordinal doc by doc, without a cache. We use this when there
371+ * Loads a count of utf-8 code points for each ordinal without a cache. We use this when there
382372 * are many unique doc values and the cache hit rate is unlikely to be high.
373+ * <p>
374+ * It's very important to read values in sorted order so we:
375+ * </p>
376+ * <ul>
377+ * <li>Load the ordinals into an {@code int[]}, using -1 for "empty" values</li>
378+ * <li>Create a sorted copy of the {@code int[]}</li>
379+ * <li>Compact the sorted {@code int[]}s into a sorted list of unique, non "empty" ordinals</li>
380+ * <li>Count the code points for each of the sorted, compacted ordinals</li>
381+ * <li>
382+ * Walk the original ordinals {@code int[]} which are in doc order, building a {@link Block}
383+ * of counts.
384+ * </li>
385+ * </ul>
383386 */
384387 private static class ImmediateOrdinals extends BlockDocValuesReader {
385388 private final Warnings warnings ;
@@ -395,11 +398,32 @@ public Block read(BlockFactory factory, Docs docs, int offset, boolean nullsFilt
395398 if (docs .count () - offset == 1 ) {
396399 return blockForSingleDoc (factory , docs .get (offset ));
397400 }
398- try (IntBuilder builder = factory .ints (docs .count () - offset )) {
399- for (int i = offset ; i < docs .count (); i ++) {
400- read (docs .get (i ), builder );
401+
402+ int [] ords = readOrds (ordinals , warnings , factory , docs , offset );
403+ int [] sortedOrds = null ;
404+ int [] counts = null ;
405+ try {
406+ sortedOrds = sortedOrds (factory , ords );
407+ int compactedLength = compactSorted (sortedOrds );
408+ counts = counts (factory , sortedOrds , compactedLength );
409+ try (IntBuilder builder = factory .ints (ords .length )) {
410+ for (int ord : ords ) {
411+ if (ord >= 0 ) {
412+ builder .appendInt (counts [Arrays .binarySearch (sortedOrds , 0 , compactedLength , ord )]);
413+ } else {
414+ builder .appendNull ();
415+ }
416+ }
417+ return builder .build ();
418+ }
419+ } finally {
420+ factory .adjustBreaker (-RamUsageEstimator .shallowSizeOf (ords ));
421+ if (sortedOrds != null ) {
422+ factory .adjustBreaker (-RamUsageEstimator .shallowSizeOf (sortedOrds ));
423+ }
424+ if (counts != null ) {
425+ factory .adjustBreaker (-RamUsageEstimator .shallowSizeOf (counts ));
401426 }
402- return builder .build ();
403427 }
404428 }
405429
@@ -442,11 +466,93 @@ private Block blockForSingleDoc(BlockFactory factory, int docId) throws IOExcept
442466 return factory .constantNulls (1 );
443467 }
444468
469+ /**
470+ * Builds a sorted copy of the loaded ordinals.
471+ */
472+ private int [] sortedOrds (BlockFactory factory , int [] ords ) {
473+ factory .adjustBreaker (RamUsageEstimator .sizeOf (ords ));
474+ int [] sortedOrds = ords .clone ();
475+ Arrays .sort (sortedOrds );
476+ return sortedOrds ;
477+ }
478+
479+ /**
480+ * Compacts the array of sorted ordinals into an array of populated ({@code >= 0}), unique ordinals.
481+ * @return the length of the unique array
482+ */
483+ private int compactSorted (int [] sortedOrds ) {
484+ int c = 0 ;
485+ int i = 0 ;
486+ while (i < sortedOrds .length && sortedOrds [i ] < 0 ) {
487+ i ++;
488+ }
489+ while (i < sortedOrds .length ) {
490+ if (false == (i > 0 && sortedOrds [i - 1 ] == sortedOrds [i ])) {
491+ sortedOrds [c ++] = sortedOrds [i ];
492+ }
493+ i ++;
494+ }
495+ return c ;
496+ }
497+
498+ private int [] counts (BlockFactory factory , int [] compactedSortedOrds , int compactedLength ) throws IOException {
499+ long size = sizeOfArray (compactedLength );
500+ factory .adjustBreaker (size );
501+ int [] counts = new int [compactedLength ];
502+ for (int i = 0 ; i < counts .length ; i ++) {
503+ counts [i ] = codePointsAtOrd (compactedSortedOrds [i ]);
504+ }
505+ return counts ;
506+ }
507+
508+ /**
509+ * Get the count of code points at the ord.
510+ * The {@code ord} must be {@code >= 0} or this will fail.
511+ */
445512 private int codePointsAtOrd (long ord ) throws IOException {
446513 return UnicodeUtil .codePointCount (ordinals .lookupOrd (ord ));
447514 }
448515 }
449516
517+ /**
518+ * Load an ordinal for each position. Three cases:
519+ * <ul>
520+ * <li>There is a single ordinal at this position - load the ordinals value in to the array</li>
521+ * <li>There are no values at this position - load a -1 - we'll skip loading that later</li>
522+ * <li>There are <strong>many</strong> values at this position - load a -1 which we'll skip like above - and emit a warning</li>
523+ * </ul>
524+ */
525+ private static int [] readOrds (SortedSetDocValues ordinals , Warnings warnings , BlockFactory factory , Docs docs , int offset )
526+ throws IOException {
527+ int count = docs .count () - offset ;
528+ long size = sizeOfArray (count );
529+ factory .adjustBreaker (size );
530+ int [] ords = null ;
531+ try {
532+ ords = new int [docs .count () - offset ];
533+ for (int i = offset ; i < docs .count (); i ++) {
534+ int doc = docs .get (i );
535+ if (ordinals .advanceExact (doc ) == false ) {
536+ ords [i ] = -1 ;
537+ continue ;
538+ }
539+ if (ordinals .docValueCount () != 1 ) {
540+ registerSingleValueWarning (warnings );
541+ ords [i ] = -1 ;
542+ continue ;
543+ }
544+ ords [i ] = Math .toIntExact (ordinals .nextOrd ());
545+ }
546+ int [] result = ords ;
547+ ords = null ;
548+ return result ;
549+ } finally {
550+ if (ords != null ) {
551+ factory .adjustBreaker (-size );
552+ }
553+ }
554+ }
555+
450556 private static Block buildFromCache (BlockFactory factory , int [] cache , int [] ords ) {
451557 try (IntBuilder builder = factory .ints (ords .length )) {
452558 for (int ord : ords ) {
0 commit comments