Skip to content

Commit f09590e

Browse files
committed
init
1 parent 2e12a35 commit f09590e

File tree

8 files changed

+245
-34
lines changed

8 files changed

+245
-34
lines changed

lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,11 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
8686
*/
8787
public static final int VERSION_MSB_VLONG_OUTPUT = 1;
8888

89+
/** Version that stores continuous arc labels as a range in the FST. */
90+
public static final int VERSION_ARCS_CONTINUOUS = 2;
91+
8992
/** Current terms format. */
90-
public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
93+
public static final int VERSION_CURRENT = VERSION_ARCS_CONTINUOUS;
9194

9295
/** Extension of terms index file */
9396
static final String TERMS_INDEX_EXTENSION = "tip";

lucene/core/src/java/org/apache/lucene/util/fst/FST.java

Lines changed: 65 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ public enum INPUT_TYPE {
9393
*/
9494
static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6;
9595

96+
static final byte ARCS_FOR_CONTINUOUS = ARCS_FOR_DIRECT_ADDRESSING + ARCS_FOR_BINARY_SEARCH;
97+
9698
// Increment version to change it
9799
private static final String FILE_FORMAT_NAME = "FST";
98100
private static final int VERSION_START = 6;
@@ -250,7 +252,10 @@ public String toString() {
250252
.append(numArcs())
251253
.append(")")
252254
.append("(")
253-
.append(nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING ? "da" : "bs")
255+
.append(
256+
nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING
257+
? "da"
258+
: nodeFlags() == ARCS_FOR_CONTINUOUS ? "cs" : "bs")
254259
.append(")");
255260
}
256261
return b.toString();
@@ -292,8 +297,8 @@ public int arcIdx() {
292297

293298
/**
294299
* Node header flags. Only meaningful to check if the value is either {@link
295-
* #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} (other value when bytesPerArc
296-
* == 0).
300+
* #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} or {@link
301+
* #ARCS_FOR_CONTINUOUS} (other value when bytesPerArc == 0).
297302
*/
298303
public byte nodeFlags() {
299304
return nodeFlags;
@@ -325,7 +330,7 @@ public int numArcs() {
325330

326331
/**
327332
* First label of a direct addressing or continuous node. Only valid if nodeFlags == {@link
328-
* #ARCS_FOR_DIRECT_ADDRESSING}.
333+
* #ARCS_FOR_DIRECT_ADDRESSING} or {@link #ARCS_FOR_CONTINUOUS}.
329334
*/
330335
int firstLabel() {
331336
return firstLabel;
@@ -664,7 +669,9 @@ Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOExc
664669
} else {
665670
in.setPosition(follow.target());
666671
byte flags = arc.nodeFlags = in.readByte();
667-
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
672+
if (flags == ARCS_FOR_BINARY_SEARCH
673+
|| flags == ARCS_FOR_DIRECT_ADDRESSING
674+
|| flags == ARCS_FOR_CONTINUOUS) {
668675
// Special arc which is actually a node header for fixed length arcs.
669676
// Jump straight to end to find the last arc.
670677
arc.numArcs = in.readVInt();
@@ -675,10 +682,14 @@ Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOExc
675682
arc.firstLabel = readLabel(in);
676683
arc.posArcsStart = in.getPosition();
677684
readLastArcByDirectAddressing(arc, in);
678-
} else {
685+
} else if (flags == ARCS_FOR_BINARY_SEARCH) {
679686
arc.arcIdx = arc.numArcs() - 2;
680687
arc.posArcsStart = in.getPosition();
681688
readNextRealArc(arc, in);
689+
} else {
690+
arc.firstLabel = readLabel(in);
691+
arc.posArcsStart = in.getPosition();
692+
readLastArcByContinuous(arc, in);
682693
}
683694
} else {
684695
arc.flags = flags;
@@ -752,7 +763,9 @@ public Arc<T> readFirstRealTargetArc(long nodeAddress, Arc<T> arc, final BytesRe
752763
// System.out.println(" flags=" + arc.flags);
753764

754765
byte flags = arc.nodeFlags = in.readByte();
755-
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
766+
if (flags == ARCS_FOR_BINARY_SEARCH
767+
|| flags == ARCS_FOR_DIRECT_ADDRESSING
768+
|| flags == ARCS_FOR_CONTINUOUS) {
756769
// System.out.println(" fixed length arc");
757770
// Special arc which is actually a node header for fixed length arcs.
758771
arc.numArcs = in.readVInt();
@@ -762,6 +775,8 @@ public Arc<T> readFirstRealTargetArc(long nodeAddress, Arc<T> arc, final BytesRe
762775
readPresenceBytes(arc, in);
763776
arc.firstLabel = readLabel(in);
764777
arc.presenceIndex = -1;
778+
} else if (flags == ARCS_FOR_CONTINUOUS) {
779+
arc.firstLabel = readLabel(in);
765780
}
766781
arc.posArcsStart = in.getPosition();
767782
// System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + "
@@ -784,7 +799,9 @@ boolean isExpandedTarget(Arc<T> follow, BytesReader in) throws IOException {
784799
} else {
785800
in.setPosition(follow.target());
786801
byte flags = in.readByte();
787-
return flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING;
802+
return flags == ARCS_FOR_BINARY_SEARCH
803+
|| flags == ARCS_FOR_DIRECT_ADDRESSING
804+
|| flags == ARCS_FOR_CONTINUOUS;
788805
}
789806
}
790807

@@ -812,16 +829,18 @@ int readNextArcLabel(Arc<T> arc, BytesReader in) throws IOException {
812829

813830
in.setPosition(arc.nextArc());
814831
byte flags = in.readByte();
815-
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
832+
if (flags == ARCS_FOR_BINARY_SEARCH
833+
|| flags == ARCS_FOR_DIRECT_ADDRESSING
834+
|| flags == ARCS_FOR_CONTINUOUS) {
816835
// System.out.println(" nextArc fixed length arc");
817836
// Special arc which is actually a node header for fixed length arcs.
818837
int numArcs = in.readVInt();
819838
in.readVInt(); // Skip bytesPerArc.
820839
if (flags == ARCS_FOR_BINARY_SEARCH) {
821840
in.readByte(); // Skip arc flags.
822-
} else {
841+
} else if (flags == ARCS_FOR_DIRECT_ADDRESSING) {
823842
in.skipBytes(getNumPresenceBytes(numArcs));
824-
}
843+
} // Nothing to do for ARCS_FOR_CONTINUOUS
825844
}
826845
} else {
827846
if (arc.bytesPerArc() != 0) {
@@ -830,15 +849,17 @@ int readNextArcLabel(Arc<T> arc, BytesReader in) throws IOException {
830849
if (arc.nodeFlags() == ARCS_FOR_BINARY_SEARCH) {
831850
// Point to next arc, -1 to skip arc flags.
832851
in.setPosition(arc.posArcsStart() - (1 + arc.arcIdx()) * (long) arc.bytesPerArc() - 1);
833-
} else {
834-
assert arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING;
852+
} else if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING) {
835853
// Direct addressing node. The label is not stored but rather inferred
836854
// based on first label and arc index in the range.
837855
assert BitTable.assertIsValid(arc, in);
838856
assert BitTable.isBitSet(arc.arcIdx(), arc, in);
839857
int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in);
840858
assert nextIndex != -1;
841859
return arc.firstLabel() + nextIndex;
860+
} else {
861+
assert arc.nodeFlags() == ARCS_FOR_CONTINUOUS;
862+
return arc.firstLabel() + arc.arcIdx() + 1;
842863
}
843864
} else {
844865
// Arcs have variable length.
@@ -860,6 +881,20 @@ public Arc<T> readArcByIndex(Arc<T> arc, final BytesReader in, int idx) throws I
860881
return readArc(arc, in);
861882
}
862883

884+
/**
885+
* Reads the arc of a continuous node at the provided index in the label range.
886+
*
887+
* @param rangeIndex The index of the arc in the label range. It must be within the label range.
888+
*/
889+
public Arc<T> readArcByContinuous(Arc<T> arc, final BytesReader in, int rangeIndex)
890+
throws IOException {
891+
assert rangeIndex >= 0 && rangeIndex < arc.numArcs();
892+
in.setPosition(arc.posArcsStart() - rangeIndex * (long) arc.bytesPerArc());
893+
arc.arcIdx = rangeIndex;
894+
arc.flags = in.readByte();
895+
return readArc(arc, in);
896+
}
897+
863898
/**
864899
* Reads a present direct addressing node arc, with the provided index in the label range.
865900
*
@@ -899,6 +934,10 @@ public Arc<T> readLastArcByDirectAddressing(Arc<T> arc, final BytesReader in) th
899934
return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex);
900935
}
901936

937+
public Arc<T> readLastArcByContinuous(Arc<T> arc, final BytesReader in) throws IOException {
938+
return readArcByContinuous(arc, in, arc.numArcs() - 1);
939+
}
940+
902941
/** Never returns null, but you should never call this if arc.isLast() is true. */
903942
public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
904943

@@ -907,6 +946,7 @@ public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOExcepti
907946

908947
switch (arc.nodeFlags()) {
909948
case ARCS_FOR_BINARY_SEARCH:
949+
case ARCS_FOR_CONTINUOUS:
910950
assert arc.bytesPerArc() > 0;
911951
arc.arcIdx++;
912952
assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs();
@@ -935,7 +975,7 @@ public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOExcepti
935975
* positioned just after the arc flags byte.
936976
*/
937977
private Arc<T> readArc(Arc<T> arc, BytesReader in) throws IOException {
938-
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING) {
978+
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING || arc.nodeFlags() == ARCS_FOR_CONTINUOUS) {
939979
arc.label = arc.firstLabel() + arc.arcIdx();
940980
} else {
941981
arc.label = readLabel(in);
@@ -1078,6 +1118,17 @@ public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesRe
10781118
}
10791119
}
10801120
return null;
1121+
} else if (flags == ARCS_FOR_CONTINUOUS) {
1122+
arc.numArcs = in.readVInt();
1123+
arc.bytesPerArc = in.readVInt();
1124+
arc.firstLabel = readLabel(in);
1125+
arc.posArcsStart = in.getPosition();
1126+
int arcIndex = labelToMatch - arc.firstLabel();
1127+
if (arcIndex < 0 || arcIndex >= arc.numArcs()) {
1128+
return null; // Before or after label range.
1129+
}
1130+
arc.arcIdx = arcIndex - 1;
1131+
return readNextRealArc(arc, in);
10811132
}
10821133

10831134
// Linear scan

lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java

Lines changed: 19 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
package org.apache.lucene.util.fst;
1818

1919
import static org.apache.lucene.util.fst.FST.ARCS_FOR_BINARY_SEARCH;
20+
import static org.apache.lucene.util.fst.FST.ARCS_FOR_CONTINUOUS;
2021
import static org.apache.lucene.util.fst.FST.ARCS_FOR_DIRECT_ADDRESSING;
2122
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_FINAL_OUTPUT;
2223
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_OUTPUT;
@@ -124,6 +125,7 @@ public class FSTCompiler<T> {
124125
long nodeCount;
125126
long binarySearchNodeCount;
126127
long directAddressingNodeCount;
128+
long continuousNodeCount;
127129

128130
final boolean allowFixedLengthArcs;
129131
final float directAddressingMaxOversizingFactor;
@@ -508,9 +510,15 @@ long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
508510

509511
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
510512
assert labelRange > 0;
511-
if (shouldExpandNodeWithDirectAddressing(
513+
boolean continuousLable = labelRange == nodeIn.numArcs;
514+
if (continuousLable) {
515+
writeNodeForDirectAddressingOrContinuous(
516+
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, true);
517+
continuousNodeCount++;
518+
} else if (shouldExpandNodeWithDirectAddressing(
512519
nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
513-
writeNodeForDirectAddressing(nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
520+
writeNodeForDirectAddressingOrContinuous(
521+
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, false);
514522
directAddressingNodeCount++;
515523
} else {
516524
writeNodeForBinarySearch(nodeIn, startAddress, maxBytesPerArc);
@@ -641,18 +649,19 @@ private void writeNodeForBinarySearch(
641649
bytes.writeBytes(startAddress, fixedLengthArcsBuffer.getBytes(), 0, headerLen);
642650
}
643651

644-
private void writeNodeForDirectAddressing(
652+
private void writeNodeForDirectAddressingOrContinuous(
645653
FSTCompiler.UnCompiledNode<T> nodeIn,
646654
long startAddress,
647655
int maxBytesPerArcWithoutLabel,
648-
int labelRange) {
656+
int labelRange,
657+
boolean continuous) {
649658
// Expand the arcs backwards in a buffer because we remove the labels.
650659
// So the obtained arcs might occupy less space. This is the reason why this
651660
// whole method is more complex.
652661
// Drop the label bytes since we can infer the label based on the arc index,
653662
// the presence bits (direct addressing only; continuous nodes have none), and the first label.
// Keep the first label.
654663
int headerMaxLen = 11;
655-
int numPresenceBytes = getNumPresenceBytes(labelRange);
664+
int numPresenceBytes = continuous ? 0 : getNumPresenceBytes(labelRange);
656665
long srcPos = bytes.getPosition();
657666
int totalArcBytes = numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
658667
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
@@ -683,7 +692,7 @@ private void writeNodeForDirectAddressing(
683692
// metadata.
684693
fixedLengthArcsBuffer
685694
.resetPosition()
686-
.writeByte(ARCS_FOR_DIRECT_ADDRESSING)
695+
.writeByte(continuous ? ARCS_FOR_CONTINUOUS : ARCS_FOR_DIRECT_ADDRESSING)
687696
.writeVInt(labelRange) // labelRange instead of numArcs.
688697
.writeVInt(
689698
maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
@@ -705,8 +714,10 @@ private void writeNodeForDirectAddressing(
705714
writeOffset += headerLen;
706715

707716
// Write the presence bits (skipped for continuous nodes, which have no presence bytes).
708-
writePresenceBits(nodeIn, writeOffset, numPresenceBytes);
709-
writeOffset += numPresenceBytes;
717+
if (continuous == false) {
718+
writePresenceBits(nodeIn, writeOffset, numPresenceBytes);
719+
writeOffset += numPresenceBytes;
720+
}
710721

711722
// Write the first label and the arcs.
712723
bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);

0 commit comments

Comments
 (0)