1818
1919import java .io .IOException ;
2020import java .util .ArrayList ;
21+ import java .util .Arrays ;
2122import java .util .Collection ;
2223import java .util .Collections ;
2324import java .util .List ;
2425import java .util .Map ;
26+ import java .util .function .IntUnaryOperator ;
2527import java .util .logging .Level ;
2628import java .util .logging .Logger ;
2729import org .apache .lucene .document .Document ;
3537import org .apache .lucene .index .DirectoryReader ;
3638import org .apache .lucene .index .IndexWriter ;
3739import org .apache .lucene .index .LeafReader ;
40+ import org .apache .lucene .index .LeafReaderContext ;
3841import org .apache .lucene .index .MultiTerms ;
3942import org .apache .lucene .index .PostingsEnum ;
4043import org .apache .lucene .index .ReaderUtil ;
4447import org .apache .lucene .util .Accountables ;
4548import org .apache .lucene .util .BytesRef ;
4649import org .apache .lucene .util .IOUtils ;
50+ import org .apache .lucene .util .InPlaceMergeSorter ;
4751import org .apache .lucene .util .RamUsageEstimator ;
4852
4953/**
@@ -318,23 +322,16 @@ public FacetLabel getPath(int ordinal) throws IOException {
318322 // doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
319323 // instance recognizes. Therefore we do this check up front, before we hit
320324 // the cache.
321- if (ordinal < 0 || ordinal >= indexReader .maxDoc ()) {
322- return null ;
323- }
325+ checkOrdinalBounds (ordinal );
324326
325- // TODO: can we use an int-based hash impl, such as IntToObjectMap,
326- // wrapped as LRU?
327- Integer catIDInteger = Integer .valueOf (ordinal );
328- synchronized (categoryCache ) {
329- FacetLabel res = categoryCache .get (catIDInteger );
330- if (res != null ) {
331- return res ;
332- }
327+ FacetLabel [] ordinalPath = getPathFromCache (ordinal );
328+
329+ if (ordinalPath [0 ] != null ) {
330+ return ordinalPath [0 ];
333331 }
334332
335333 int readerIndex = ReaderUtil .subIndex (ordinal , indexReader .leaves ());
336334 LeafReader leafReader = indexReader .leaves ().get (readerIndex ).reader ();
337- // TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
338335 BinaryDocValues values = leafReader .getBinaryDocValues (Consts .FULL );
339336
340337 FacetLabel ret ;
@@ -351,12 +348,142 @@ public FacetLabel getPath(int ordinal) throws IOException {
351348 }
352349
353350 synchronized (categoryCache ) {
354- categoryCache .put (catIDInteger , ret );
351+ categoryCache .put (ordinal , ret );
355352 }
356353
357354 return ret ;
358355 }
359356
357+ private FacetLabel [] getPathFromCache (int ... ordinals ) {
358+ FacetLabel [] facetLabels = new FacetLabel [ordinals .length ];
359+ // TODO LUCENE-10068: can we use an int-based hash impl, such as IntToObjectMap,
360+ // wrapped as LRU?
361+ synchronized (categoryCache ) {
362+ for (int i = 0 ; i < ordinals .length ; i ++) {
363+ facetLabels [i ] = categoryCache .get (ordinals [i ]);
364+ }
365+ }
366+ return facetLabels ;
367+ }
368+
369+ /**
370+ * Checks if the ordinals in the array are >=0 and < {@code
371+ * DirectoryTaxonomyReader#indexReader.maxDoc()}
372+ *
373+ * @param ordinals Integer array of ordinals
374+ * @throws IllegalArgumentException Throw an IllegalArgumentException if one of the ordinals is
375+ * out of bounds
376+ */
377+ private void checkOrdinalBounds (int ... ordinals ) throws IllegalArgumentException {
378+ for (int ordinal : ordinals ) {
379+ if (ordinal < 0 || ordinal >= indexReader .maxDoc ()) {
380+ throw new IllegalArgumentException (
381+ "ordinal "
382+ + ordinal
383+ + " is out of the range of the indexReader "
384+ + indexReader .toString ()
385+ + ". The maximum possible ordinal number is "
386+ + (indexReader .maxDoc () - 1 ));
387+ }
388+ }
389+ }
390+
391+ /**
392+ * Returns an array of FacetLabels for a given array of ordinals.
393+ *
394+ * <p>This API is generally faster than iteratively calling {@link #getPath(int)} over an array of
395+ * ordinals. It uses the {@link #getPath(int)} method iteratively when it detects that the index
396+ * was created using StoredFields (with no performance gains) and uses DocValues based iteration
397+ * when the index is based on BinaryDocValues. Lucene switched to BinaryDocValues in version 9.0
398+ *
399+ * @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
400+ * index
401+ */
402+ @ Override
403+ public FacetLabel [] getBulkPath (int ... ordinals ) throws IOException {
404+ ensureOpen ();
405+ checkOrdinalBounds (ordinals );
406+
407+ int ordinalsLength = ordinals .length ;
408+ FacetLabel [] bulkPath = new FacetLabel [ordinalsLength ];
409+ // remember the original positions of ordinals before they are sorted
410+ int [] originalPosition = new int [ordinalsLength ];
411+ Arrays .setAll (originalPosition , IntUnaryOperator .identity ());
412+
413+ getPathFromCache (ordinals );
414+
415+ /* parallel sort the ordinals and originalPosition array based on the values in the ordinals array */
416+ new InPlaceMergeSorter () {
417+ @ Override
418+ protected void swap (int i , int j ) {
419+ int x = ordinals [i ];
420+ ordinals [i ] = ordinals [j ];
421+ ordinals [j ] = x ;
422+
423+ x = originalPosition [i ];
424+ originalPosition [i ] = originalPosition [j ];
425+ originalPosition [j ] = x ;
426+ }
427+
428+ @ Override
429+ public int compare (int i , int j ) {
430+ return Integer .compare (ordinals [i ], ordinals [j ]);
431+ }
432+ }.sort (0 , ordinalsLength );
433+
434+ int readerIndex ;
435+ int leafReaderMaxDoc = 0 ;
436+ int leafReaderDocBase = 0 ;
437+ LeafReader leafReader ;
438+ LeafReaderContext leafReaderContext ;
439+ BinaryDocValues values = null ;
440+ List <Integer > uncachedOrdinalPositions = new ArrayList <>();
441+
442+ for (int i = 0 ; i < ordinalsLength ; i ++) {
443+ if (bulkPath [originalPosition [i ]] == null ) {
444+ /*
445+ If ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc then we find the next leaf that contains our ordinal.
446+ Remember: ordinals[i] operates in the global ordinal space and hence we add leafReaderDocBase to the leafReaderMaxDoc
447+ (which is the maxDoc of the specific leaf)
448+ */
449+ if (values == null || ordinals [i ] >= leafReaderDocBase + leafReaderMaxDoc ) {
450+
451+ readerIndex = ReaderUtil .subIndex (ordinals [i ], indexReader .leaves ());
452+ leafReaderContext = indexReader .leaves ().get (readerIndex );
453+ leafReader = leafReaderContext .reader ();
454+ leafReaderMaxDoc = leafReader .maxDoc ();
455+ leafReaderDocBase = leafReaderContext .docBase ;
456+ values = leafReader .getBinaryDocValues (Consts .FULL );
457+
458+ /*
459+ If the index is constructed with the older StoredFields it will not have any BinaryDocValues field and will return null
460+ */
461+ if (values == null ) {
462+ return super .getBulkPath (ordinals );
463+ }
464+ }
465+ // values is leaf specific so you only advance till you reach the target within the leaf
466+ boolean success = values .advanceExact (ordinals [i ] - leafReaderDocBase );
467+ assert success ;
468+ bulkPath [originalPosition [i ]] =
469+ new FacetLabel (FacetsConfig .stringToPath (values .binaryValue ().utf8ToString ()));
470+
471+ uncachedOrdinalPositions .add (i );
472+ }
473+ }
474+
475+ if (uncachedOrdinalPositions .isEmpty () == false ) {
476+ synchronized (categoryCache ) {
477+ for (int i : uncachedOrdinalPositions ) {
478+ // add the value to the categoryCache after computation
479+ categoryCache .put (ordinals [i ], bulkPath [originalPosition [i ]]);
480+ }
481+ }
482+ }
483+
484+ return bulkPath ;
485+ }
486+
360487 @ Override
361488 public int getSize () {
362489 ensureOpen ();
0 commit comments