Skip to content

Commit 1d5d458

Browse files
authored
LUCENE-9613: Encode ordinals like numerics. (#186)
This simplifies the code and also gives ordinals some optimizations, such as better compression for long runs of equal values and for fields that are used in index sorts.
1 parent 495bf67 commit 1d5d458

File tree

3 files changed

+203
-354
lines changed

3 files changed

+203
-354
lines changed

lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesConsumer.java

Lines changed: 106 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import org.apache.lucene.index.EmptyDocValuesProducer;
3535
import org.apache.lucene.index.FieldInfo;
3636
import org.apache.lucene.index.IndexFileNames;
37+
import org.apache.lucene.index.NumericDocValues;
3738
import org.apache.lucene.index.SegmentWriteState;
3839
import org.apache.lucene.index.SortedDocValues;
3940
import org.apache.lucene.index.SortedNumericDocValues;
@@ -49,6 +50,7 @@
4950
import org.apache.lucene.util.BytesRef;
5051
import org.apache.lucene.util.BytesRefBuilder;
5152
import org.apache.lucene.util.IOUtils;
53+
import org.apache.lucene.util.LongsRef;
5254
import org.apache.lucene.util.MathUtil;
5355
import org.apache.lucene.util.StringHelper;
5456
import org.apache.lucene.util.compress.LZ4;
@@ -466,54 +468,47 @@ public void addSortedField(FieldInfo field, DocValuesProducer valuesProducer) th
466468

467469
private void doAddSortedField(FieldInfo field, DocValuesProducer valuesProducer)
468470
throws IOException {
469-
SortedDocValues values = valuesProducer.getSorted(field);
470-
int numDocsWithField = 0;
471-
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
472-
numDocsWithField++;
473-
}
474-
475-
if (numDocsWithField == 0) {
476-
meta.writeLong(-2); // docsWithFieldOffset
477-
meta.writeLong(0L); // docsWithFieldLength
478-
meta.writeShort((short) -1); // jumpTableEntryCount
479-
meta.writeByte((byte) -1); // denseRankPower
480-
} else if (numDocsWithField == maxDoc) {
481-
meta.writeLong(-1); // docsWithFieldOffset
482-
meta.writeLong(0L); // docsWithFieldLength
483-
meta.writeShort((short) -1); // jumpTableEntryCount
484-
meta.writeByte((byte) -1); // denseRankPower
485-
} else {
486-
long offset = data.getFilePointer();
487-
meta.writeLong(offset); // docsWithFieldOffset
488-
values = valuesProducer.getSorted(field);
489-
final short jumpTableentryCount =
490-
IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
491-
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
492-
meta.writeShort(jumpTableentryCount);
493-
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
494-
}
495-
496-
meta.writeInt(numDocsWithField);
497-
if (values.getValueCount() <= 1) {
498-
meta.writeByte((byte) 0); // bitsPerValue
499-
meta.writeLong(0L); // ordsOffset
500-
meta.writeLong(0L); // ordsLength
501-
} else {
502-
int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1);
503-
meta.writeByte((byte) numberOfBitsPerOrd); // bitsPerValue
504-
long start = data.getFilePointer();
505-
meta.writeLong(start); // ordsOffset
506-
DirectWriter writer = DirectWriter.getInstance(data, numDocsWithField, numberOfBitsPerOrd);
507-
values = valuesProducer.getSorted(field);
508-
for (int doc = values.nextDoc();
509-
doc != DocIdSetIterator.NO_MORE_DOCS;
510-
doc = values.nextDoc()) {
511-
writer.add(values.ordValue());
512-
}
513-
writer.finish();
514-
meta.writeLong(data.getFilePointer() - start); // ordsLength
515-
}
516-
471+
writeValues(
472+
field,
473+
new EmptyDocValuesProducer() {
474+
@Override
475+
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
476+
SortedDocValues sorted = valuesProducer.getSorted(field);
477+
NumericDocValues sortedOrds =
478+
new NumericDocValues() {
479+
@Override
480+
public long longValue() throws IOException {
481+
return sorted.ordValue();
482+
}
483+
484+
@Override
485+
public boolean advanceExact(int target) throws IOException {
486+
return sorted.advanceExact(target);
487+
}
488+
489+
@Override
490+
public int docID() {
491+
return sorted.docID();
492+
}
493+
494+
@Override
495+
public int nextDoc() throws IOException {
496+
return sorted.nextDoc();
497+
}
498+
499+
@Override
500+
public int advance(int target) throws IOException {
501+
return sorted.advance(target);
502+
}
503+
504+
@Override
505+
public long cost() {
506+
return sorted.cost();
507+
}
508+
};
509+
return DocValues.singleton(sortedOrds);
510+
}
511+
});
517512
addTermsDict(DocValues.singleton(valuesProducer.getSorted(field)));
518513
}
519514

@@ -669,7 +664,11 @@ public void addSortedNumericField(FieldInfo field, DocValuesProducer valuesProdu
669664
throws IOException {
670665
meta.writeInt(field.number);
671666
meta.writeByte(Lucene90DocValuesFormat.SORTED_NUMERIC);
667+
doAddSortedNumericField(field, valuesProducer);
668+
}
672669

670+
private void doAddSortedNumericField(FieldInfo field, DocValuesProducer valuesProducer)
671+
throws IOException {
673672
long[] stats = writeValues(field, valuesProducer);
674673
int numDocsWithField = Math.toIntExact(stats[0]);
675674
long numValues = stats[1];
@@ -731,60 +730,65 @@ public SortedDocValues getSorted(FieldInfo field) throws IOException {
731730
}
732731
meta.writeByte((byte) 1); // multiValued (1 = multiValued)
733732

734-
assert numDocsWithField != 0;
735-
if (numDocsWithField == maxDoc) {
736-
meta.writeLong(-1); // docsWithFieldOffset
737-
meta.writeLong(0L); // docsWithFieldLength
738-
meta.writeShort((short) -1); // jumpTableEntryCount
739-
meta.writeByte((byte) -1); // denseRankPower
740-
} else {
741-
long offset = data.getFilePointer();
742-
meta.writeLong(offset); // docsWithFieldOffset
743-
values = valuesProducer.getSortedSet(field);
744-
final short jumpTableEntryCount =
745-
IndexedDISI.writeBitSet(values, data, IndexedDISI.DEFAULT_DENSE_RANK_POWER);
746-
meta.writeLong(data.getFilePointer() - offset); // docsWithFieldLength
747-
meta.writeShort(jumpTableEntryCount);
748-
meta.writeByte(IndexedDISI.DEFAULT_DENSE_RANK_POWER);
749-
}
750-
751-
int numberOfBitsPerOrd = DirectWriter.unsignedBitsRequired(values.getValueCount() - 1);
752-
meta.writeByte((byte) numberOfBitsPerOrd); // bitsPerValue
753-
long start = data.getFilePointer();
754-
meta.writeLong(start); // ordsOffset
755-
DirectWriter writer = DirectWriter.getInstance(data, numOrds, numberOfBitsPerOrd);
756-
values = valuesProducer.getSortedSet(field);
757-
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
758-
for (long ord = values.nextOrd();
759-
ord != SortedSetDocValues.NO_MORE_ORDS;
760-
ord = values.nextOrd()) {
761-
writer.add(ord);
762-
}
763-
}
764-
writer.finish();
765-
meta.writeLong(data.getFilePointer() - start); // ordsLength
766-
767-
meta.writeInt(numDocsWithField);
768-
start = data.getFilePointer();
769-
meta.writeLong(start); // addressesOffset
770-
meta.writeVInt(DIRECT_MONOTONIC_BLOCK_SHIFT);
771-
772-
final DirectMonotonicWriter addressesWriter =
773-
DirectMonotonicWriter.getInstance(
774-
meta, data, numDocsWithField + 1, DIRECT_MONOTONIC_BLOCK_SHIFT);
775-
long addr = 0;
776-
addressesWriter.add(addr);
777-
values = valuesProducer.getSortedSet(field);
778-
for (int doc = values.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = values.nextDoc()) {
779-
values.nextOrd();
780-
addr++;
781-
while (values.nextOrd() != SortedSetDocValues.NO_MORE_ORDS) {
782-
addr++;
783-
}
784-
addressesWriter.add(addr);
785-
}
786-
addressesWriter.finish();
787-
meta.writeLong(data.getFilePointer() - start); // addressesLength
733+
doAddSortedNumericField(
734+
field,
735+
new EmptyDocValuesProducer() {
736+
@Override
737+
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
738+
SortedSetDocValues values = valuesProducer.getSortedSet(field);
739+
return new SortedNumericDocValues() {
740+
741+
long[] ords = LongsRef.EMPTY_LONGS;
742+
int i, docValueCount;
743+
744+
@Override
745+
public long nextValue() throws IOException {
746+
return ords[i++];
747+
}
748+
749+
@Override
750+
public int docValueCount() {
751+
return docValueCount;
752+
}
753+
754+
@Override
755+
public boolean advanceExact(int target) throws IOException {
756+
throw new UnsupportedOperationException();
757+
}
758+
759+
@Override
760+
public int docID() {
761+
return values.docID();
762+
}
763+
764+
@Override
765+
public int nextDoc() throws IOException {
766+
int doc = values.nextDoc();
767+
if (doc != NO_MORE_DOCS) {
768+
docValueCount = 0;
769+
for (long ord = values.nextOrd();
770+
ord != SortedSetDocValues.NO_MORE_ORDS;
771+
ord = values.nextOrd()) {
772+
ords = ArrayUtil.grow(ords, docValueCount + 1);
773+
ords[docValueCount++] = ord;
774+
}
775+
i = 0;
776+
}
777+
return doc;
778+
}
779+
780+
@Override
781+
public int advance(int target) throws IOException {
782+
throw new UnsupportedOperationException();
783+
}
784+
785+
@Override
786+
public long cost() {
787+
return values.cost();
788+
}
789+
};
790+
}
791+
});
788792

789793
addTermsDict(values);
790794
}

lucene/core/src/java/org/apache/lucene/codecs/lucene90/Lucene90DocValuesFormat.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
* by accumulating the {@link Long#bitCount(long) bit counts} of the visited longs. Advancing
5353
* &gt;= 512 documents is performed by skipping to the start of the needed 512 document
5454
* sub-block and iterating to the specific document within that block. The index for the
55-
* sub-block that is skipped to is retrieved from a rank-table positioned beforethe bit set.
55+
* sub-block that is skipped to is retrieved from a rank-table positioned before the bit set.
5656
* The rank-table holds the origo index numbers for all 512 documents sub-blocks, represented
5757
* as an unsigned short for each 128 blocks.
5858
* <li>ALL: This strategy is used when a block contains exactly 65536 documents, meaning that the

0 commit comments

Comments
 (0)