Skip to content

Commit bf534ee

Browse files
committed
init
1 parent cdc7d87 commit bf534ee

File tree

8 files changed

+242
-30
lines changed

8 files changed

+242
-30
lines changed

lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,11 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
8686
*/
8787
public static final int VERSION_MSB_VLONG_OUTPUT = 1;
8888

89+
/** Version that stores continuous arc labels as a range in the FST. */
90+
public static final int VERSION_ARCS_CONTINUOUS = 2;
91+
8992
/** Current terms format. */
90-
public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
93+
public static final int VERSION_CURRENT = VERSION_ARCS_CONTINUOUS;
9194

9295
/** Extension of terms index file */
9396
static final String TERMS_INDEX_EXTENSION = "tip";

lucene/core/src/java/org/apache/lucene/util/fst/FST.java

Lines changed: 63 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@ public enum INPUT_TYPE {
9696
*/
9797
static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6;
9898

99+
static final byte ARCS_FOR_CONTINUOUS = ARCS_FOR_DIRECT_ADDRESSING + ARCS_FOR_BINARY_SEARCH;
100+
99101
// Increment version to change it
100102
private static final String FILE_FORMAT_NAME = "FST";
101103
private static final int VERSION_START = 6;
@@ -251,7 +253,10 @@ public String toString() {
251253
.append(numArcs())
252254
.append(")")
253255
.append("(")
254-
.append(nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING ? "da" : "bs")
256+
.append(
257+
nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING
258+
? "da"
259+
: nodeFlags() == ARCS_FOR_CONTINUOUS ? "cs" : "bs")
255260
.append(")");
256261
}
257262
return b.toString();
@@ -293,8 +298,8 @@ public int arcIdx() {
293298

294299
/**
295300
* Node header flags. Only meaningful to check if the value is either {@link
296-
* #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} (other value when bytesPerArc
297-
* == 0).
301+
* #ARCS_FOR_BINARY_SEARCH}, {@link #ARCS_FOR_DIRECT_ADDRESSING}, or {@link
302+
* #ARCS_FOR_CONTINUOUS} (other value when bytesPerArc == 0).
298303
*/
299304
public byte nodeFlags() {
300305
return nodeFlags;
@@ -326,7 +331,7 @@ public int numArcs() {
326331

327332
/**
328333
* First label of a direct addressing node. Only valid if nodeFlags == {@link
329-
* #ARCS_FOR_DIRECT_ADDRESSING}.
334+
* #ARCS_FOR_DIRECT_ADDRESSING} or {@link #ARCS_FOR_CONTINUOUS}.
330335
*/
331336
int firstLabel() {
332337
return firstLabel;
@@ -646,7 +651,9 @@ Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOExc
646651
} else {
647652
in.setPosition(follow.target());
648653
byte flags = arc.nodeFlags = in.readByte();
649-
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
654+
if (flags == ARCS_FOR_BINARY_SEARCH
655+
|| flags == ARCS_FOR_DIRECT_ADDRESSING
656+
|| flags == ARCS_FOR_CONTINUOUS) {
650657
// Special arc which is actually a node header for fixed length arcs.
651658
// Jump straight to end to find the last arc.
652659
arc.numArcs = in.readVInt();
@@ -657,10 +664,14 @@ Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOExc
657664
arc.firstLabel = readLabel(in);
658665
arc.posArcsStart = in.getPosition();
659666
readLastArcByDirectAddressing(arc, in);
660-
} else {
667+
} else if (flags == ARCS_FOR_BINARY_SEARCH) {
661668
arc.arcIdx = arc.numArcs() - 2;
662669
arc.posArcsStart = in.getPosition();
663670
readNextRealArc(arc, in);
671+
} else {
672+
arc.firstLabel = readLabel(in);
673+
arc.posArcsStart = in.getPosition();
674+
readLastArcByContinuous(arc, in);
664675
}
665676
} else {
666677
arc.flags = flags;
@@ -733,7 +744,9 @@ private void readFirstArcInfo(long nodeAddress, Arc<T> arc, final BytesReader in
733744
in.setPosition(nodeAddress);
734745

735746
byte flags = arc.nodeFlags = in.readByte();
736-
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
747+
if (flags == ARCS_FOR_BINARY_SEARCH
748+
|| flags == ARCS_FOR_DIRECT_ADDRESSING
749+
|| flags == ARCS_FOR_CONTINUOUS) {
737750
// Special arc which is actually a node header for fixed length arcs.
738751
arc.numArcs = in.readVInt();
739752
arc.bytesPerArc = in.readVInt();
@@ -742,6 +755,8 @@ private void readFirstArcInfo(long nodeAddress, Arc<T> arc, final BytesReader in
742755
readPresenceBytes(arc, in);
743756
arc.firstLabel = readLabel(in);
744757
arc.presenceIndex = -1;
758+
} else if (flags == ARCS_FOR_CONTINUOUS) {
759+
arc.firstLabel = readLabel(in);
745760
}
746761
arc.posArcsStart = in.getPosition();
747762
} else {
@@ -766,7 +781,9 @@ boolean isExpandedTarget(Arc<T> follow, BytesReader in) throws IOException {
766781
} else {
767782
in.setPosition(follow.target());
768783
byte flags = in.readByte();
769-
return flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING;
784+
return flags == ARCS_FOR_BINARY_SEARCH
785+
|| flags == ARCS_FOR_DIRECT_ADDRESSING
786+
|| flags == ARCS_FOR_CONTINUOUS;
770787
}
771788
}
772789

@@ -794,16 +811,18 @@ int readNextArcLabel(Arc<T> arc, BytesReader in) throws IOException {
794811

795812
in.setPosition(arc.nextArc());
796813
byte flags = in.readByte();
797-
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
814+
if (flags == ARCS_FOR_BINARY_SEARCH
815+
|| flags == ARCS_FOR_DIRECT_ADDRESSING
816+
|| flags == ARCS_FOR_CONTINUOUS) {
798817
// System.out.println(" nextArc fixed length arc");
799818
// Special arc which is actually a node header for fixed length arcs.
800819
int numArcs = in.readVInt();
801820
in.readVInt(); // Skip bytesPerArc.
802821
if (flags == ARCS_FOR_BINARY_SEARCH) {
803822
in.readByte(); // Skip arc flags.
804-
} else {
823+
} else if (flags == ARCS_FOR_DIRECT_ADDRESSING) {
805824
in.skipBytes(getNumPresenceBytes(numArcs));
806-
}
825+
} // Nothing to do for ARCS_FOR_CONTINUOUS
807826
}
808827
} else {
809828
switch (arc.nodeFlags()) {
@@ -819,6 +838,8 @@ int readNextArcLabel(Arc<T> arc, BytesReader in) throws IOException {
819838
int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in);
820839
assert nextIndex != -1;
821840
return arc.firstLabel() + nextIndex;
841+
case ARCS_FOR_CONTINUOUS:
842+
return arc.firstLabel() + arc.arcIdx() + 1;
822843
default:
823844
// Variable length arcs - linear search.
824845
assert arc.bytesPerArc() == 0;
@@ -842,6 +863,20 @@ public Arc<T> readArcByIndex(Arc<T> arc, final BytesReader in, int idx) throws I
842863
return readArc(arc, in);
843864
}
844865

866+
/**
867+
* Reads a Continuous node arc, with the provided index in the label range.
868+
*
869+
* @param rangeIndex The index of the arc in the label range. It must be within the label range.
870+
*/
871+
public Arc<T> readArcByContinuous(Arc<T> arc, final BytesReader in, int rangeIndex)
872+
throws IOException {
873+
assert rangeIndex >= 0 && rangeIndex < arc.numArcs();
874+
in.setPosition(arc.posArcsStart() - rangeIndex * (long) arc.bytesPerArc());
875+
arc.arcIdx = rangeIndex;
876+
arc.flags = in.readByte();
877+
return readArc(arc, in);
878+
}
879+
845880
/**
846881
* Reads a present direct addressing node arc, with the provided index in the label range.
847882
*
@@ -881,6 +916,10 @@ public Arc<T> readLastArcByDirectAddressing(Arc<T> arc, final BytesReader in) th
881916
return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex);
882917
}
883918

919+
public Arc<T> readLastArcByContinuous(Arc<T> arc, final BytesReader in) throws IOException {
920+
return readArcByContinuous(arc, in, arc.numArcs() - 1);
921+
}
922+
884923
/** Never returns null, but you should never call this if arc.isLast() is true. */
885924
public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
886925

@@ -889,6 +928,7 @@ public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOExcepti
889928

890929
switch (arc.nodeFlags()) {
891930
case ARCS_FOR_BINARY_SEARCH:
931+
case ARCS_FOR_CONTINUOUS:
892932
assert arc.bytesPerArc() > 0;
893933
arc.arcIdx++;
894934
assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs();
@@ -917,7 +957,7 @@ public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOExcepti
917957
* positioned just after the arc flags byte.
918958
*/
919959
private Arc<T> readArc(Arc<T> arc, BytesReader in) throws IOException {
920-
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING) {
960+
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING || arc.nodeFlags() == ARCS_FOR_CONTINUOUS) {
921961
arc.label = arc.firstLabel() + arc.arcIdx();
922962
} else {
923963
arc.label = readLabel(in);
@@ -1060,6 +1100,17 @@ public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesRe
10601100
}
10611101
}
10621102
return null;
1103+
} else if (flags == ARCS_FOR_CONTINUOUS) {
1104+
arc.numArcs = in.readVInt();
1105+
arc.bytesPerArc = in.readVInt();
1106+
arc.firstLabel = readLabel(in);
1107+
arc.posArcsStart = in.getPosition();
1108+
int arcIndex = labelToMatch - arc.firstLabel();
1109+
if (arcIndex < 0 || arcIndex >= arc.numArcs()) {
1110+
return null; // Before or after label range.
1111+
}
1112+
arc.arcIdx = arcIndex - 1;
1113+
return readNextRealArc(arc, in);
10631114
}
10641115

10651116
// Linear scan

lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package org.apache.lucene.util.fst;
1818

1919
import static org.apache.lucene.util.fst.FST.ARCS_FOR_BINARY_SEARCH;
20+
import static org.apache.lucene.util.fst.FST.ARCS_FOR_CONTINUOUS;
2021
import static org.apache.lucene.util.fst.FST.ARCS_FOR_DIRECT_ADDRESSING;
2122
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_FINAL_OUTPUT;
2223
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_OUTPUT;
@@ -112,6 +113,7 @@ public class FSTCompiler<T> {
112113
long nodeCount;
113114
long binarySearchNodeCount;
114115
long directAddressingNodeCount;
116+
long continuousNodeCount;
115117

116118
final boolean allowFixedLengthArcs;
117119
final float directAddressingMaxOversizingFactor;
@@ -444,9 +446,15 @@ long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
444446

445447
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
446448
assert labelRange > 0;
447-
if (shouldExpandNodeWithDirectAddressing(
449+
boolean continuousLable = labelRange == nodeIn.numArcs;
450+
if (continuousLable) {
451+
writeNodeForDirectAddressingOrContinuous(
452+
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, true);
453+
continuousNodeCount++;
454+
} else if (shouldExpandNodeWithDirectAddressing(
448455
nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
449-
writeNodeForDirectAddressing(nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
456+
writeNodeForDirectAddressingOrContinuous(
457+
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, false);
450458
directAddressingNodeCount++;
451459
} else {
452460
writeNodeForBinarySearch(nodeIn, startAddress, maxBytesPerArc);
@@ -577,18 +585,19 @@ private void writeNodeForBinarySearch(
577585
bytes.writeBytes(startAddress, fixedLengthArcsBuffer.getBytes(), 0, headerLen);
578586
}
579587

580-
private void writeNodeForDirectAddressing(
588+
private void writeNodeForDirectAddressingOrContinuous(
581589
FSTCompiler.UnCompiledNode<T> nodeIn,
582590
long startAddress,
583591
int maxBytesPerArcWithoutLabel,
584-
int labelRange) {
592+
int labelRange,
593+
boolean continuous) {
585594
// Expand the arcs backwards in a buffer because we remove the labels.
586595
// So the obtained arcs might occupy less space. This is the reason why this
587596
// whole method is more complex.
588597
// Drop the label bytes since we can infer the label based on the arc index,
589598
// the presence bits (not written for continuous nodes), and the first label. Keep the first label.
590599
int headerMaxLen = 11;
591-
int numPresenceBytes = getNumPresenceBytes(labelRange);
600+
int numPresenceBytes = continuous ? 0 : getNumPresenceBytes(labelRange);
592601
long srcPos = bytes.getPosition();
593602
int totalArcBytes = numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
594603
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
@@ -619,7 +628,7 @@ private void writeNodeForDirectAddressing(
619628
// metadata.
620629
fixedLengthArcsBuffer
621630
.resetPosition()
622-
.writeByte(ARCS_FOR_DIRECT_ADDRESSING)
631+
.writeByte(continuous ? ARCS_FOR_CONTINUOUS : ARCS_FOR_DIRECT_ADDRESSING)
623632
.writeVInt(labelRange) // labelRange instead of numArcs.
624633
.writeVInt(
625634
maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
@@ -641,8 +650,10 @@ private void writeNodeForDirectAddressing(
641650
writeOffset += headerLen;
642651

643652
// Write the presence bits
644-
writePresenceBits(nodeIn, writeOffset, numPresenceBytes);
645-
writeOffset += numPresenceBytes;
653+
if (continuous == false) {
654+
writePresenceBits(nodeIn, writeOffset, numPresenceBytes);
655+
writeOffset += numPresenceBytes;
656+
}
646657

647658
// Write the first label and the arcs.
648659
bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);

0 commit comments

Comments
 (0)