18
18
19
19
import java .io .IOException ;
20
20
import java .util .ArrayList ;
21
+ import java .util .Arrays ;
21
22
import java .util .Collection ;
22
23
import java .util .Collections ;
23
24
import java .util .List ;
24
25
import java .util .Map ;
26
+ import java .util .function .IntUnaryOperator ;
25
27
import java .util .logging .Level ;
26
28
import java .util .logging .Logger ;
27
29
import org .apache .lucene .document .Document ;
35
37
import org .apache .lucene .index .DirectoryReader ;
36
38
import org .apache .lucene .index .IndexWriter ;
37
39
import org .apache .lucene .index .LeafReader ;
40
+ import org .apache .lucene .index .LeafReaderContext ;
38
41
import org .apache .lucene .index .MultiTerms ;
39
42
import org .apache .lucene .index .PostingsEnum ;
40
43
import org .apache .lucene .index .ReaderUtil ;
44
47
import org .apache .lucene .util .Accountables ;
45
48
import org .apache .lucene .util .BytesRef ;
46
49
import org .apache .lucene .util .IOUtils ;
50
+ import org .apache .lucene .util .InPlaceMergeSorter ;
47
51
import org .apache .lucene .util .RamUsageEstimator ;
48
52
49
53
/**
@@ -318,23 +322,16 @@ public FacetLabel getPath(int ordinal) throws IOException {
318
322
// doOpenIfChanged, we need to ensure that the ordinal is one that this DTR
319
323
// instance recognizes. Therefore we do this check up front, before we hit
320
324
// the cache.
321
- if (ordinal < 0 || ordinal >= indexReader .maxDoc ()) {
322
- return null ;
323
- }
325
+ checkOrdinalBounds (ordinal );
324
326
325
- // TODO: can we use an int-based hash impl, such as IntToObjectMap,
326
- // wrapped as LRU?
327
- Integer catIDInteger = Integer .valueOf (ordinal );
328
- synchronized (categoryCache ) {
329
- FacetLabel res = categoryCache .get (catIDInteger );
330
- if (res != null ) {
331
- return res ;
332
- }
327
+ FacetLabel [] ordinalPath = getPathFromCache (ordinal );
328
+
329
+ if (ordinalPath [0 ] != null ) {
330
+ return ordinalPath [0 ];
333
331
}
334
332
335
333
int readerIndex = ReaderUtil .subIndex (ordinal , indexReader .leaves ());
336
334
LeafReader leafReader = indexReader .leaves ().get (readerIndex ).reader ();
337
- // TODO: Use LUCENE-9476 to get the bulk lookup API for extracting BinaryDocValues
338
335
BinaryDocValues values = leafReader .getBinaryDocValues (Consts .FULL );
339
336
340
337
FacetLabel ret ;
@@ -351,12 +348,142 @@ public FacetLabel getPath(int ordinal) throws IOException {
351
348
}
352
349
353
350
synchronized (categoryCache ) {
354
- categoryCache .put (catIDInteger , ret );
351
+ categoryCache .put (ordinal , ret );
355
352
}
356
353
357
354
return ret ;
358
355
}
359
356
357
+ private FacetLabel [] getPathFromCache (int ... ordinals ) {
358
+ FacetLabel [] facetLabels = new FacetLabel [ordinals .length ];
359
+ // TODO LUCENE-10068: can we use an int-based hash impl, such as IntToObjectMap,
360
+ // wrapped as LRU?
361
+ synchronized (categoryCache ) {
362
+ for (int i = 0 ; i < ordinals .length ; i ++) {
363
+ facetLabels [i ] = categoryCache .get (ordinals [i ]);
364
+ }
365
+ }
366
+ return facetLabels ;
367
+ }
368
+
369
+ /**
370
+ * Checks if the ordinals in the array are >=0 and < {@code
371
+ * DirectoryTaxonomyReader#indexReader.maxDoc()}
372
+ *
373
+ * @param ordinals Integer array of ordinals
374
+ * @throws IllegalArgumentException Throw an IllegalArgumentException if one of the ordinals is
375
+ * out of bounds
376
+ */
377
+ private void checkOrdinalBounds (int ... ordinals ) throws IllegalArgumentException {
378
+ for (int ordinal : ordinals ) {
379
+ if (ordinal < 0 || ordinal >= indexReader .maxDoc ()) {
380
+ throw new IllegalArgumentException (
381
+ "ordinal "
382
+ + ordinal
383
+ + " is out of the range of the indexReader "
384
+ + indexReader .toString ()
385
+ + ". The maximum possible ordinal number is "
386
+ + (indexReader .maxDoc () - 1 ));
387
+ }
388
+ }
389
+ }
390
+
391
+ /**
392
+ * Returns an array of FacetLabels for a given array of ordinals.
393
+ *
394
+ * <p>This API is generally faster than iteratively calling {@link #getPath(int)} over an array of
395
+ * ordinals. It uses the {@link #getPath(int)} method iteratively when it detects that the index
396
+ * was created using StoredFields (with no performance gains) and uses DocValues based iteration
397
+ * when the index is based on BinaryDocValues. Lucene switched to BinaryDocValues in version 9.0
398
+ *
399
+ * @param ordinals Array of ordinals that are assigned to categories inserted into the taxonomy
400
+ * index
401
+ */
402
+ @ Override
403
+ public FacetLabel [] getBulkPath (int ... ordinals ) throws IOException {
404
+ ensureOpen ();
405
+ checkOrdinalBounds (ordinals );
406
+
407
+ int ordinalsLength = ordinals .length ;
408
+ FacetLabel [] bulkPath = new FacetLabel [ordinalsLength ];
409
+ // remember the original positions of ordinals before they are sorted
410
+ int [] originalPosition = new int [ordinalsLength ];
411
+ Arrays .setAll (originalPosition , IntUnaryOperator .identity ());
412
+
413
+ getPathFromCache (ordinals );
414
+
415
+ /* parallel sort the ordinals and originalPosition array based on the values in the ordinals array */
416
+ new InPlaceMergeSorter () {
417
+ @ Override
418
+ protected void swap (int i , int j ) {
419
+ int x = ordinals [i ];
420
+ ordinals [i ] = ordinals [j ];
421
+ ordinals [j ] = x ;
422
+
423
+ x = originalPosition [i ];
424
+ originalPosition [i ] = originalPosition [j ];
425
+ originalPosition [j ] = x ;
426
+ }
427
+
428
+ @ Override
429
+ public int compare (int i , int j ) {
430
+ return Integer .compare (ordinals [i ], ordinals [j ]);
431
+ }
432
+ }.sort (0 , ordinalsLength );
433
+
434
+ int readerIndex ;
435
+ int leafReaderMaxDoc = 0 ;
436
+ int leafReaderDocBase = 0 ;
437
+ LeafReader leafReader ;
438
+ LeafReaderContext leafReaderContext ;
439
+ BinaryDocValues values = null ;
440
+ List <Integer > uncachedOrdinalPositions = new ArrayList <>();
441
+
442
+ for (int i = 0 ; i < ordinalsLength ; i ++) {
443
+ if (bulkPath [originalPosition [i ]] == null ) {
444
+ /*
445
+ If ordinals[i] >= leafReaderDocBase + leafReaderMaxDoc then we find the next leaf that contains our ordinal.
446
+ Remember: ordinals[i] operates in the global ordinal space and hence we add leafReaderDocBase to the leafReaderMaxDoc
447
+ (which is the maxDoc of the specific leaf)
448
+ */
449
+ if (values == null || ordinals [i ] >= leafReaderDocBase + leafReaderMaxDoc ) {
450
+
451
+ readerIndex = ReaderUtil .subIndex (ordinals [i ], indexReader .leaves ());
452
+ leafReaderContext = indexReader .leaves ().get (readerIndex );
453
+ leafReader = leafReaderContext .reader ();
454
+ leafReaderMaxDoc = leafReader .maxDoc ();
455
+ leafReaderDocBase = leafReaderContext .docBase ;
456
+ values = leafReader .getBinaryDocValues (Consts .FULL );
457
+
458
+ /*
459
+ If the index is constructed with the older StoredFields it will not have any BinaryDocValues field and will return null
460
+ */
461
+ if (values == null ) {
462
+ return super .getBulkPath (ordinals );
463
+ }
464
+ }
465
+ // values is leaf specific so you only advance till you reach the target within the leaf
466
+ boolean success = values .advanceExact (ordinals [i ] - leafReaderDocBase );
467
+ assert success ;
468
+ bulkPath [originalPosition [i ]] =
469
+ new FacetLabel (FacetsConfig .stringToPath (values .binaryValue ().utf8ToString ()));
470
+
471
+ uncachedOrdinalPositions .add (i );
472
+ }
473
+ }
474
+
475
+ if (uncachedOrdinalPositions .isEmpty () == false ) {
476
+ synchronized (categoryCache ) {
477
+ for (int i : uncachedOrdinalPositions ) {
478
+ // add the value to the categoryCache after computation
479
+ categoryCache .put (ordinals [i ], bulkPath [originalPosition [i ]]);
480
+ }
481
+ }
482
+ }
483
+
484
+ return bulkPath ;
485
+ }
486
+
360
487
@ Override
361
488
public int getSize () {
362
489
ensureOpen ();
0 commit comments