Skip to content

Commit bb3167e

Browse files
committed
Impl intoBitset for IndexedDISI and Docvalues (apache#14529)
1 parent 9920b1e commit bb3167e

File tree

5 files changed

+293
-6
lines changed

5 files changed

+293
-6
lines changed

lucene/core/src/java/org/apache/lucene/codecs/lucene90/IndexedDISI.java

Lines changed: 86 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -163,9 +163,9 @@ private static byte[] createRank(FixedBitSet buffer, byte denseRankPower) {
163163
*
164164
* @param it the document IDs.
165165
* @param out destination for the blocks.
166-
* @throws IOException if there was an error writing to out.
167166
* @return the number of jump-table entries following the blocks, -1 for no entries. This should
168167
* be stored in meta and used when creating an instance of IndexedDISI.
168+
* @throws IOException if there was an error writing to out.
169169
*/
170170
static short writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOException {
171171
return writeBitSet(it, out, DEFAULT_DENSE_RANK_POWER);
@@ -184,9 +184,9 @@ static short writeBitSet(DocIdSetIterator it, IndexOutput out) throws IOExceptio
184184
* disables DENSE rank. Recommended values are 8-12: Every 256-4096 docIDs or 4-64 longs.
185185
* {@link #DEFAULT_DENSE_RANK_POWER} is 9: Every 512 docIDs. This should be stored in meta and
186186
* used when creating an instance of IndexedDISI.
187-
* @throws IOException if there was an error writing to out.
188187
* @return the number of jump-table entries following the blocks, -1 for no entries. This should
189188
* be stored in meta and used when creating an instance of IndexedDISI.
189+
* @throws IOException if there was an error writing to out.
190190
*/
191191
public static short writeBitSet(DocIdSetIterator it, IndexOutput out, byte denseRankPower)
192192
throws IOException {
@@ -436,6 +436,7 @@ public static RandomAccessInput createJumpTable(
436436
int numberOfOnes;
437437
// Used with rank for jumps inside of DENSE as they are absolute instead of relative
438438
int denseOrigoIndex;
439+
FixedBitSet bitSet;
439440

440441
// ALL variables
441442
int gap;
@@ -491,6 +492,16 @@ public int advance(int target) throws IOException {
491492
return doc;
492493
}
493494

495+
@Override
496+
public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException {
497+
assert doc >= offset;
498+
while (doc < upTo && method.intoBitSetWithinBlock(this, upTo, bitSet, offset) == false) {
499+
readBlockHeader();
500+
boolean found = method.advanceWithinBlock(this, block);
501+
assert found;
502+
}
503+
}
504+
494505
public boolean advanceExact(int target) throws IOException {
495506
final int targetBlock = target & 0xFFFF0000;
496507
if (block < targetBlock) {
@@ -625,6 +636,25 @@ boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException
625636
disi.exists = false;
626637
return false;
627638
}
639+
640+
@Override
641+
boolean intoBitSetWithinBlock(IndexedDISI disi, int upTo, FixedBitSet bitSet, int offset)
642+
throws IOException {
643+
bitSet.set(disi.doc - offset);
644+
for (; disi.index < disi.nextBlockIndex; ) {
645+
int docInBlock = disi.slice.readShort() & 0xFFFF;
646+
int doc = disi.block | docInBlock;
647+
disi.index++;
648+
if (doc >= upTo) {
649+
disi.doc = doc;
650+
disi.exists = true;
651+
disi.nextExistDocInBlock = docInBlock;
652+
return true;
653+
}
654+
bitSet.set(doc - offset);
655+
}
656+
return false;
657+
}
628658
},
629659
DENSE {
630660
@Override
@@ -693,6 +723,34 @@ boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException
693723
disi.index = disi.numberOfOnes - Long.bitCount(leftBits);
694724
return (leftBits & 1L) != 0;
695725
}
726+
727+
@Override
728+
boolean intoBitSetWithinBlock(IndexedDISI disi, int upTo, FixedBitSet bitSet, int offset)
729+
throws IOException {
730+
if (disi.bitSet == null) {
731+
disi.bitSet = new FixedBitSet(BLOCK_SIZE);
732+
}
733+
734+
int sourceFrom = disi.doc & 0xFFFF;
735+
int sourceTo = Math.min(upTo - disi.block, BLOCK_SIZE);
736+
int destFrom = disi.doc - offset;
737+
738+
long fp = disi.slice.getFilePointer();
739+
disi.slice.seek(fp - Long.BYTES); // seek back a long to include current word (disi.word).
740+
int numWords = FixedBitSet.bits2words(sourceTo) - disi.wordIndex;
741+
disi.slice.readLongs(disi.bitSet.getBits(), disi.wordIndex, numWords);
742+
FixedBitSet.orRange(disi.bitSet, sourceFrom, bitSet, destFrom, sourceTo - sourceFrom);
743+
744+
int blockEnd = disi.block | 0xFFFF;
745+
if (upTo > blockEnd) {
746+
disi.slice.seek(disi.blockEnd);
747+
disi.index += disi.bitSet.cardinality(sourceFrom, sourceTo);
748+
return false;
749+
} else {
750+
disi.slice.seek(fp);
751+
return advanceWithinBlock(disi, upTo);
752+
}
753+
}
696754
},
697755
ALL {
698756
@Override
@@ -707,6 +765,19 @@ boolean advanceExactWithinBlock(IndexedDISI disi, int target) {
707765
disi.index = target - disi.gap;
708766
return true;
709767
}
768+
769+
@Override
770+
boolean intoBitSetWithinBlock(IndexedDISI disi, int upTo, FixedBitSet bitSet, int offset) {
771+
final int blockEnd = disi.block | 0xFFFF;
772+
if (upTo <= blockEnd) {
773+
bitSet.set(disi.doc - offset, upTo - offset);
774+
advanceWithinBlock(disi, upTo);
775+
return true;
776+
} else {
777+
bitSet.set(disi.doc - offset, blockEnd - offset + 1);
778+
return false;
779+
}
780+
}
710781
};
711782

712783
/**
@@ -720,6 +791,19 @@ boolean advanceExactWithinBlock(IndexedDISI disi, int target) {
720791
* return whether this document exists.
721792
*/
722793
abstract boolean advanceExactWithinBlock(IndexedDISI disi, int target) throws IOException;
794+
795+
/**
796+
* Similar to {@link DocIdSetIterator#intoBitSet}, load docs in this block into a bitset. This
797+
* method returns true if docs &ge; {@code upTo} remain in the block, otherwise false.
798+
* On a false return, fp of {@link IndexedDISI#slice} is at {@link IndexedDISI#blockEnd} and
799+
* {@link IndexedDISI#index} is correct but other status vars are undefined. The caller should
800+
* decode the header of the next block with {@link #readBlockHeader()}.
801+
*
802+
* <p>Callers need to make sure {@link IndexedDISI#doc} is greater than or equal to {@link
803+
* IndexedDISI#block} and less than {@code upTo} when calling this.
804+
*/
805+
abstract boolean intoBitSetWithinBlock(
806+
IndexedDISI disi, int upTo, FixedBitSet bitSet, int offset) throws IOException;
723807
}
724808

725809
/**

lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesProducer.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import org.apache.lucene.store.RandomAccessInput;
5151
import org.apache.lucene.store.ReadAdvice;
5252
import org.apache.lucene.util.BytesRef;
53+
import org.apache.lucene.util.FixedBitSet;
5354
import org.apache.lucene.util.IOUtils;
5455
import org.apache.lucene.util.LongValues;
5556
import org.apache.lucene.util.compress.LZ4;
@@ -500,6 +501,11 @@ public int docID() {
500501
return disi.docID();
501502
}
502503

504+
@Override
505+
public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException {
506+
disi.intoBitSet(upTo, bitSet, offset);
507+
}
508+
503509
@Override
504510
public long cost() {
505511
return disi.cost();
@@ -781,6 +787,11 @@ public int advance(int target) throws IOException {
781787
public boolean advanceExact(int target) throws IOException {
782788
return disi.advanceExact(target);
783789
}
790+
791+
@Override
792+
public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException {
793+
disi.intoBitSet(upTo, bitSet, offset);
794+
}
784795
}
785796

786797
@Override
@@ -987,6 +998,11 @@ public int advance(int target) throws IOException {
987998
return disi.advance(target);
988999
}
9891000

1001+
@Override
1002+
public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException {
1003+
disi.intoBitSet(upTo, bitSet, offset);
1004+
}
1005+
9901006
@Override
9911007
public long cost() {
9921008
return disi.cost();
@@ -1491,6 +1507,12 @@ public int docValueCount() {
14911507
return count;
14921508
}
14931509

1510+
@Override
1511+
public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException {
1512+
set = false;
1513+
disi.intoBitSet(upTo, bitSet, offset);
1514+
}
1515+
14941516
private void set() {
14951517
if (set == false) {
14961518
final int index = disi.index();
@@ -1641,6 +1663,12 @@ public int advance(int target) throws IOException {
16411663
return disi.advance(target);
16421664
}
16431665

1666+
@Override
1667+
public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException {
1668+
set = false;
1669+
disi.intoBitSet(upTo, bitSet, offset);
1670+
}
1671+
16441672
@Override
16451673
public long cost() {
16461674
return disi.cost();

lucene/core/src/java/org/apache/lucene/index/DocValuesIterator.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ abstract class DocValuesIterator extends DocIdSetIterator {
2626
* {@code target} must be greater than or equal to the current {@link #docID() doc ID} and must be
2727
* a valid doc ID, ie. &ge; 0 and &lt; {@code maxDoc}. After this method returns, {@link #docID()}
2828
* returns {@code target}.
29+
*
30+
* <p>Note: it is illegal to call {@link DocIdSetIterator#intoBitSet} or {@link
31+
* DocIdSetIterator#docIDRunEnd()} when this method returns false.
2932
*/
3033
public abstract boolean advanceExact(int target) throws IOException;
3134
}

lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestIndexedDISI.java

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,16 @@ private void doTest(BitSet set, Directory dir) throws IOException {
526526
}
527527
}
528528

529+
for (int step : new int[] {100, 1000, 10000, 100000}) {
530+
try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) {
531+
IndexedDISI disi =
532+
new IndexedDISI(in, 0L, length, jumpTableentryCount, denseRankPower, cardinality);
533+
BitSetIterator disi2 = new BitSetIterator(set, cardinality);
534+
int disi2length = set.length();
535+
assertIntoBitsetRandomized(disi, disi2, disi2length, step);
536+
}
537+
}
538+
529539
dir.deleteFile("foo");
530540
}
531541

@@ -555,6 +565,55 @@ private void assertAdvanceExactRandomized(
555565
}
556566
}
557567

568+
private void assertIntoBitsetRandomized(
569+
IndexedDISI disi, BitSetIterator disi2, int disi2length, int step) throws IOException {
570+
int index = -1;
571+
FixedBitSet set1 = new FixedBitSet(step);
572+
FixedBitSet set2 = new FixedBitSet(step);
573+
574+
for (int upTo = 0; upTo < disi2length; ) {
575+
int lastUpTo = upTo;
576+
upTo += TestUtil.nextInt(random(), 0, step);
577+
int offset = TestUtil.nextInt(random(), lastUpTo, upTo);
578+
579+
if (disi.docID() < offset) {
580+
disi.advance(offset);
581+
}
582+
int doc = disi2.docID();
583+
while (doc < offset) {
584+
index++;
585+
doc = disi2.nextDoc();
586+
}
587+
while (doc < upTo) {
588+
set2.set(doc - offset);
589+
index++;
590+
doc = disi2.nextDoc();
591+
}
592+
593+
disi.intoBitSet(upTo, set1, offset);
594+
assertEquals(index, disi.index());
595+
assertEquals(disi2.docID(), disi.docID());
596+
597+
BitSetIterator expected = new BitSetIterator(set2, set2.cardinality());
598+
BitSetIterator actual = new BitSetIterator(set1, set1.cardinality());
599+
for (int expectedDoc = expected.nextDoc();
600+
expectedDoc != DocIdSetIterator.NO_MORE_DOCS;
601+
expectedDoc = expected.nextDoc()) {
602+
int actualDoc = actual.nextDoc();
603+
assertEquals(expectedDoc + offset, actualDoc + offset); // plus offset for better message.
604+
}
605+
assertEquals(DocIdSetIterator.NO_MORE_DOCS, actual.nextDoc());
606+
607+
if (disi2.docID() != DocIdSetIterator.NO_MORE_DOCS) {
608+
assertEquals(disi2.nextDoc(), disi.nextDoc());
609+
assertEquals(++index, disi.index());
610+
}
611+
612+
set1.clear();
613+
set2.clear();
614+
}
615+
}
616+
558617
private void assertSingleStepEquality(IndexedDISI disi, BitSetIterator disi2) throws IOException {
559618
int i = 0;
560619
for (int doc = disi2.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = disi2.nextDoc()) {

0 commit comments

Comments
 (0)