Skip to content

Commit b2d6736

Browse files
easyice and mikemccand
authored and committed
Specialize arc store for continuous label in FST (#12748)
* init * review fix and reuse duplicate code * rebase * tidy * CHANGES.txt * bump version * rebase * CHANGES.txt
1 parent 7f63410 commit b2d6736

File tree

9 files changed

+261
-51
lines changed

9 files changed

+261
-51
lines changed

lucene/CHANGES.txt

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -156,14 +156,14 @@ Optimizations
156156

157157
* GITHUB#12381: Skip docs with DocValues in NumericLeafComparator. (Lu Xugang, Adrien Grand)
158158

159-
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Zhang Chao)
160-
161159
* GITHUB#12784: Cache buckets to speed up BytesRefHash#sort. (Guo Feng)
162160

163161
* GITHUB#12806: Utilize exact kNN search when gathering k >= numVectors in a segment (Ben Trent)
164162

165163
* GITHUB#12782: Use group-varint encoding for the tail of postings. (Adrien Grand, Zhang Chao)
166164

165+
* GITHUB#12748: Specialize arc store for continuous label in FST. (Guo Feng, Chao Zhang)
166+
167167
Changes in runtime behavior
168168
---------------------
169169

lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -86,8 +86,11 @@ public final class Lucene90BlockTreeTermsReader extends FieldsProducer {
8686
*/
8787
public static final int VERSION_MSB_VLONG_OUTPUT = 1;
8888

89+
/** The version that specializes arc storage for continuous labels in the FST. */
90+
public static final int VERSION_FST_CONTINUOUS_ARCS = 2;
91+
8992
/** Current terms format. */
90-
public static final int VERSION_CURRENT = VERSION_MSB_VLONG_OUTPUT;
93+
public static final int VERSION_CURRENT = VERSION_FST_CONTINUOUS_ARCS;
9194

9295
/** Extension of terms index file */
9396
static final String TERMS_INDEX_EXTENSION = "tip";

lucene/core/src/java/org/apache/lucene/util/fst/FST.java

Lines changed: 71 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -98,11 +98,19 @@ public enum INPUT_TYPE {
9898
*/
9999
static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6;
100100

101+
/**
102+
* Value of the arc flags to declare a node with continuous arcs, designed to locate an arc
102+
* directly at index (label - firstLabel). Like {@link #ARCS_FOR_BINARY_SEARCH}, we use a flag
103+
* combination that will not occur at the same time.
105+
*/
106+
static final byte ARCS_FOR_CONTINUOUS = ARCS_FOR_DIRECT_ADDRESSING + ARCS_FOR_BINARY_SEARCH;
107+
101108
// Increment version to change it
102109
private static final String FILE_FORMAT_NAME = "FST";
103110
private static final int VERSION_START = 6;
104111
private static final int VERSION_LITTLE_ENDIAN = 8;
105-
static final int VERSION_CURRENT = VERSION_LITTLE_ENDIAN;
112+
private static final int VERSION_CONTINUOUS_ARCS = 9;
113+
static final int VERSION_CURRENT = VERSION_CONTINUOUS_ARCS;
106114

107115
// Never serialized; just used to represent the virtual
108116
// final node w/ no arcs:
@@ -243,7 +251,10 @@ public String toString() {
243251
.append(numArcs())
244252
.append(")")
245253
.append("(")
246-
.append(nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING ? "da" : "bs")
254+
.append(
255+
nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING
256+
? "da"
257+
: nodeFlags() == ARCS_FOR_CONTINUOUS ? "cs" : "bs")
247258
.append(")");
248259
}
249260
return b.toString();
@@ -285,8 +296,8 @@ public int arcIdx() {
285296

286297
/**
287298
* Node header flags. Only meaningful to check if the value is either {@link
288-
* #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} (other value when bytesPerArc
289-
* == 0).
299+
* #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} or {@link
300+
* #ARCS_FOR_CONTINUOUS} (other value when bytesPerArc == 0).
290301
*/
291302
public byte nodeFlags() {
292303
return nodeFlags;
@@ -318,7 +329,7 @@ public int numArcs() {
318329

319330
/**
320331
* First label of a direct addressing node. Only valid if nodeFlags == {@link
321-
* #ARCS_FOR_DIRECT_ADDRESSING}.
332+
* #ARCS_FOR_DIRECT_ADDRESSING} or {@link #ARCS_FOR_CONTINUOUS}.
322333
*/
323334
int firstLabel() {
324335
return firstLabel;
@@ -653,7 +664,9 @@ Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOExc
653664
} else {
654665
in.setPosition(follow.target());
655666
byte flags = arc.nodeFlags = in.readByte();
656-
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
667+
if (flags == ARCS_FOR_BINARY_SEARCH
668+
|| flags == ARCS_FOR_DIRECT_ADDRESSING
669+
|| flags == ARCS_FOR_CONTINUOUS) {
657670
// Special arc which is actually a node header for fixed length arcs.
658671
// Jump straight to end to find the last arc.
659672
arc.numArcs = in.readVInt();
@@ -664,10 +677,14 @@ Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOExc
664677
arc.firstLabel = readLabel(in);
665678
arc.posArcsStart = in.getPosition();
666679
readLastArcByDirectAddressing(arc, in);
667-
} else {
680+
} else if (flags == ARCS_FOR_BINARY_SEARCH) {
668681
arc.arcIdx = arc.numArcs() - 2;
669682
arc.posArcsStart = in.getPosition();
670683
readNextRealArc(arc, in);
684+
} else {
685+
arc.firstLabel = readLabel(in);
686+
arc.posArcsStart = in.getPosition();
687+
readLastArcByContinuous(arc, in);
671688
}
672689
} else {
673690
arc.flags = flags;
@@ -740,7 +757,9 @@ private void readFirstArcInfo(long nodeAddress, Arc<T> arc, final BytesReader in
740757
in.setPosition(nodeAddress);
741758

742759
byte flags = arc.nodeFlags = in.readByte();
743-
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
760+
if (flags == ARCS_FOR_BINARY_SEARCH
761+
|| flags == ARCS_FOR_DIRECT_ADDRESSING
762+
|| flags == ARCS_FOR_CONTINUOUS) {
744763
// Special arc which is actually a node header for fixed length arcs.
745764
arc.numArcs = in.readVInt();
746765
arc.bytesPerArc = in.readVInt();
@@ -749,6 +768,8 @@ private void readFirstArcInfo(long nodeAddress, Arc<T> arc, final BytesReader in
749768
readPresenceBytes(arc, in);
750769
arc.firstLabel = readLabel(in);
751770
arc.presenceIndex = -1;
771+
} else if (flags == ARCS_FOR_CONTINUOUS) {
772+
arc.firstLabel = readLabel(in);
752773
}
753774
arc.posArcsStart = in.getPosition();
754775
} else {
@@ -773,7 +794,9 @@ boolean isExpandedTarget(Arc<T> follow, BytesReader in) throws IOException {
773794
} else {
774795
in.setPosition(follow.target());
775796
byte flags = in.readByte();
776-
return flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING;
797+
return flags == ARCS_FOR_BINARY_SEARCH
798+
|| flags == ARCS_FOR_DIRECT_ADDRESSING
799+
|| flags == ARCS_FOR_CONTINUOUS;
777800
}
778801
}
779802

@@ -801,16 +824,18 @@ int readNextArcLabel(Arc<T> arc, BytesReader in) throws IOException {
801824

802825
in.setPosition(arc.nextArc());
803826
byte flags = in.readByte();
804-
if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) {
827+
if (flags == ARCS_FOR_BINARY_SEARCH
828+
|| flags == ARCS_FOR_DIRECT_ADDRESSING
829+
|| flags == ARCS_FOR_CONTINUOUS) {
805830
// System.out.println(" nextArc fixed length arc");
806831
// Special arc which is actually a node header for fixed length arcs.
807832
int numArcs = in.readVInt();
808833
in.readVInt(); // Skip bytesPerArc.
809834
if (flags == ARCS_FOR_BINARY_SEARCH) {
810835
in.readByte(); // Skip arc flags.
811-
} else {
836+
} else if (flags == ARCS_FOR_DIRECT_ADDRESSING) {
812837
in.skipBytes(getNumPresenceBytes(numArcs));
813-
}
838+
} // Nothing to do for ARCS_FOR_CONTINUOUS
814839
}
815840
} else {
816841
switch (arc.nodeFlags()) {
@@ -826,6 +851,8 @@ int readNextArcLabel(Arc<T> arc, BytesReader in) throws IOException {
826851
int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in);
827852
assert nextIndex != -1;
828853
return arc.firstLabel() + nextIndex;
854+
case ARCS_FOR_CONTINUOUS:
855+
return arc.firstLabel() + arc.arcIdx() + 1;
829856
default:
830857
// Variable length arcs - linear search.
831858
assert arc.bytesPerArc() == 0;
@@ -849,6 +876,20 @@ public Arc<T> readArcByIndex(Arc<T> arc, final BytesReader in, int idx) throws I
849876
return readArc(arc, in);
850877
}
851878

879+
/**
880+
* Reads an arc of a continuous node, at the provided index in the label range.
881+
*
882+
* @param rangeIndex The index of the arc in the label range. It must be within the label range.
883+
*/
884+
public Arc<T> readArcByContinuous(Arc<T> arc, final BytesReader in, int rangeIndex)
885+
throws IOException {
886+
assert rangeIndex >= 0 && rangeIndex < arc.numArcs();
887+
in.setPosition(arc.posArcsStart() - rangeIndex * (long) arc.bytesPerArc());
888+
arc.arcIdx = rangeIndex;
889+
arc.flags = in.readByte();
890+
return readArc(arc, in);
891+
}
892+
852893
/**
853894
* Reads a present direct addressing node arc, with the provided index in the label range.
854895
*
@@ -888,6 +929,11 @@ public Arc<T> readLastArcByDirectAddressing(Arc<T> arc, final BytesReader in) th
888929
return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex);
889930
}
890931

932+
/** Reads the last arc of a continuous node. */
933+
public Arc<T> readLastArcByContinuous(Arc<T> arc, final BytesReader in) throws IOException {
934+
return readArcByContinuous(arc, in, arc.numArcs() - 1);
935+
}
936+
891937
/** Never returns null, but you should never call this if arc.isLast() is true. */
892938
public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
893939

@@ -896,6 +942,7 @@ public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOExcepti
896942

897943
switch (arc.nodeFlags()) {
898944
case ARCS_FOR_BINARY_SEARCH:
945+
case ARCS_FOR_CONTINUOUS:
899946
assert arc.bytesPerArc() > 0;
900947
arc.arcIdx++;
901948
assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs();
@@ -924,7 +971,7 @@ public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOExcepti
924971
* positioned just after the arc flags byte.
925972
*/
926973
private Arc<T> readArc(Arc<T> arc, BytesReader in) throws IOException {
927-
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING) {
974+
if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING || arc.nodeFlags() == ARCS_FOR_CONTINUOUS) {
928975
arc.label = arc.firstLabel() + arc.arcIdx();
929976
} else {
930977
arc.label = readLabel(in);
@@ -1067,6 +1114,17 @@ public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesRe
10671114
}
10681115
}
10691116
return null;
1117+
} else if (flags == ARCS_FOR_CONTINUOUS) {
1118+
arc.numArcs = in.readVInt();
1119+
arc.bytesPerArc = in.readVInt();
1120+
arc.firstLabel = readLabel(in);
1121+
arc.posArcsStart = in.getPosition();
1122+
int arcIndex = labelToMatch - arc.firstLabel();
1123+
if (arcIndex < 0 || arcIndex >= arc.numArcs()) {
1124+
return null; // Before or after label range.
1125+
}
1126+
arc.arcIdx = arcIndex - 1;
1127+
return readNextRealArc(arc, in);
10701128
}
10711129

10721130
// Linear scan

lucene/core/src/java/org/apache/lucene/util/fst/FSTCompiler.java

Lines changed: 19 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
package org.apache.lucene.util.fst;
1818

1919
import static org.apache.lucene.util.fst.FST.ARCS_FOR_BINARY_SEARCH;
20+
import static org.apache.lucene.util.fst.FST.ARCS_FOR_CONTINUOUS;
2021
import static org.apache.lucene.util.fst.FST.ARCS_FOR_DIRECT_ADDRESSING;
2122
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_FINAL_OUTPUT;
2223
import static org.apache.lucene.util.fst.FST.BIT_ARC_HAS_OUTPUT;
@@ -113,6 +114,7 @@ public class FSTCompiler<T> {
113114
long nodeCount;
114115
long binarySearchNodeCount;
115116
long directAddressingNodeCount;
117+
long continuousNodeCount;
116118

117119
final boolean allowFixedLengthArcs;
118120
final float directAddressingMaxOversizingFactor;
@@ -445,9 +447,15 @@ long addNode(FSTCompiler.UnCompiledNode<T> nodeIn) throws IOException {
445447

446448
int labelRange = nodeIn.arcs[nodeIn.numArcs - 1].label - nodeIn.arcs[0].label + 1;
447449
assert labelRange > 0;
448-
if (shouldExpandNodeWithDirectAddressing(
450+
boolean continuousLabel = labelRange == nodeIn.numArcs;
451+
if (continuousLabel) {
452+
writeNodeForDirectAddressingOrContinuous(
453+
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, true);
454+
continuousNodeCount++;
455+
} else if (shouldExpandNodeWithDirectAddressing(
449456
nodeIn, maxBytesPerArc, maxBytesPerArcWithoutLabel, labelRange)) {
450-
writeNodeForDirectAddressing(nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange);
457+
writeNodeForDirectAddressingOrContinuous(
458+
nodeIn, startAddress, maxBytesPerArcWithoutLabel, labelRange, false);
451459
directAddressingNodeCount++;
452460
} else {
453461
writeNodeForBinarySearch(nodeIn, startAddress, maxBytesPerArc);
@@ -578,18 +586,19 @@ private void writeNodeForBinarySearch(
578586
bytes.writeBytes(startAddress, fixedLengthArcsBuffer.getBytes(), 0, headerLen);
579587
}
580588

581-
private void writeNodeForDirectAddressing(
589+
private void writeNodeForDirectAddressingOrContinuous(
582590
FSTCompiler.UnCompiledNode<T> nodeIn,
583591
long startAddress,
584592
int maxBytesPerArcWithoutLabel,
585-
int labelRange) {
593+
int labelRange,
594+
boolean continuous) {
586595
// Expand the arcs backwards in a buffer because we remove the labels.
587596
// So the obtained arcs might occupy less space. This is the reason why this
588597
// whole method is more complex.
589598
// Drop the label bytes since we can infer the label based on the arc index,
590599
// the presence bits, and the first label. Keep the first label.
591600
int headerMaxLen = 11;
592-
int numPresenceBytes = getNumPresenceBytes(labelRange);
601+
int numPresenceBytes = continuous ? 0 : getNumPresenceBytes(labelRange);
593602
long srcPos = bytes.getPosition();
594603
int totalArcBytes = numLabelBytesPerArc[0] + nodeIn.numArcs * maxBytesPerArcWithoutLabel;
595604
int bufferOffset = headerMaxLen + numPresenceBytes + totalArcBytes;
@@ -620,7 +629,7 @@ private void writeNodeForDirectAddressing(
620629
// metadata.
621630
fixedLengthArcsBuffer
622631
.resetPosition()
623-
.writeByte(ARCS_FOR_DIRECT_ADDRESSING)
632+
.writeByte(continuous ? ARCS_FOR_CONTINUOUS : ARCS_FOR_DIRECT_ADDRESSING)
624633
.writeVInt(labelRange) // labelRange instead of numArcs.
625634
.writeVInt(
626635
maxBytesPerArcWithoutLabel); // maxBytesPerArcWithoutLabel instead of maxBytesPerArc.
@@ -642,8 +651,10 @@ private void writeNodeForDirectAddressing(
642651
writeOffset += headerLen;
643652

644653
// Write the presence bits
645-
writePresenceBits(nodeIn, writeOffset, numPresenceBytes);
646-
writeOffset += numPresenceBytes;
654+
if (continuous == false) {
655+
writePresenceBits(nodeIn, writeOffset, numPresenceBytes);
656+
writeOffset += numPresenceBytes;
657+
}
647658

648659
// Write the first label and the arcs.
649660
bytes.writeBytes(writeOffset, fixedLengthArcsBuffer.getBytes(), bufferOffset, totalArcBytes);

0 commit comments

Comments
 (0)