-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Specialize arc store for continuous label in FST #12748
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d7528d8
3854010
71a2d5a
8948453
0a6261b
1ebb658
61e89ed
e3b0657
178151f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -98,11 +98,19 @@ public enum INPUT_TYPE { | |
| */ | ||
| static final byte ARCS_FOR_DIRECT_ADDRESSING = 1 << 6; | ||
|
|
||
| /** | ||
| * Value of the arc flags to declare a node with continuous arcs, designed so that an arc can be | ||
| * located directly at index (label - firstLabel). Like {@link #ARCS_FOR_BINARY_SEARCH}, we use a | ||
| * combination of flags that will not occur at the same time. | ||
| */ | ||
| static final byte ARCS_FOR_CONTINUOUS = ARCS_FOR_DIRECT_ADDRESSING + ARCS_FOR_BINARY_SEARCH; | ||
|
|
||
| // Increment version to change it | ||
| private static final String FILE_FORMAT_NAME = "FST"; | ||
| private static final int VERSION_START = 6; | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm shouldn't we bump the
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think so, we expect to throw
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am not sure, if the change is backward compatible, maybe we can also keep the
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hmm I'd like to protect against an older version of Lucene (w/o this change) trying to read an FST written with a newer version (with this change). If we bump the version, that older version would throw an understandable error, but if we don't, it'd be some strange assertion error or so? I realize it'd be hard to even reach such a situation (you'd have to be using FSTs directly or so), but still when we make such changes to our format I think it's good practice to bump the version.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
+1 to also bump the version in
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you very much for your guidance! It's very helpful! |
||
| private static final int VERSION_LITTLE_ENDIAN = 8; | ||
| static final int VERSION_CURRENT = VERSION_LITTLE_ENDIAN; | ||
| private static final int VERSION_CONTINUOUS_ARCS = 9; | ||
| static final int VERSION_CURRENT = VERSION_CONTINUOUS_ARCS; | ||
|
|
||
| // Never serialized; just used to represent the virtual | ||
| // final node w/ no arcs: | ||
|
|
@@ -243,7 +251,10 @@ public String toString() { | |
| .append(numArcs()) | ||
| .append(")") | ||
| .append("(") | ||
| .append(nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING ? "da" : "bs") | ||
| .append( | ||
| nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING | ||
| ? "da" | ||
| : nodeFlags() == ARCS_FOR_CONTINUOUS ? "cs" : "bs") | ||
| .append(")"); | ||
| } | ||
| return b.toString(); | ||
|
|
@@ -285,8 +296,8 @@ public int arcIdx() { | |
|
|
||
| /** | ||
| * Node header flags. Only meaningful to check if the value is either {@link | ||
| * #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} (other value when bytesPerArc | ||
| * == 0). | ||
| * #ARCS_FOR_BINARY_SEARCH} or {@link #ARCS_FOR_DIRECT_ADDRESSING} or {@link | ||
| * #ARCS_FOR_CONTINUOUS} (other value when bytesPerArc == 0). | ||
| */ | ||
| public byte nodeFlags() { | ||
| return nodeFlags; | ||
|
|
@@ -318,7 +329,7 @@ public int numArcs() { | |
|
|
||
| /** | ||
| * First label of a direct addressing node. Only valid if nodeFlags == {@link | ||
| * #ARCS_FOR_DIRECT_ADDRESSING}. | ||
| * #ARCS_FOR_DIRECT_ADDRESSING} or {@link #ARCS_FOR_CONTINUOUS}. | ||
| */ | ||
| int firstLabel() { | ||
| return firstLabel; | ||
|
|
@@ -653,7 +664,9 @@ Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOExc | |
| } else { | ||
| in.setPosition(follow.target()); | ||
| byte flags = arc.nodeFlags = in.readByte(); | ||
| if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { | ||
| if (flags == ARCS_FOR_BINARY_SEARCH | ||
| || flags == ARCS_FOR_DIRECT_ADDRESSING | ||
| || flags == ARCS_FOR_CONTINUOUS) { | ||
| // Special arc which is actually a node header for fixed length arcs. | ||
| // Jump straight to end to find the last arc. | ||
| arc.numArcs = in.readVInt(); | ||
|
|
@@ -664,10 +677,14 @@ Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc, BytesReader in) throws IOExc | |
| arc.firstLabel = readLabel(in); | ||
| arc.posArcsStart = in.getPosition(); | ||
| readLastArcByDirectAddressing(arc, in); | ||
| } else { | ||
| } else if (flags == ARCS_FOR_BINARY_SEARCH) { | ||
| arc.arcIdx = arc.numArcs() - 2; | ||
| arc.posArcsStart = in.getPosition(); | ||
| readNextRealArc(arc, in); | ||
| } else { | ||
| arc.firstLabel = readLabel(in); | ||
| arc.posArcsStart = in.getPosition(); | ||
| readLastArcByContinuous(arc, in); | ||
| } | ||
| } else { | ||
| arc.flags = flags; | ||
|
|
@@ -740,7 +757,9 @@ private void readFirstArcInfo(long nodeAddress, Arc<T> arc, final BytesReader in | |
| in.setPosition(nodeAddress); | ||
|
|
||
| byte flags = arc.nodeFlags = in.readByte(); | ||
| if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { | ||
| if (flags == ARCS_FOR_BINARY_SEARCH | ||
| || flags == ARCS_FOR_DIRECT_ADDRESSING | ||
| || flags == ARCS_FOR_CONTINUOUS) { | ||
| // Special arc which is actually a node header for fixed length arcs. | ||
| arc.numArcs = in.readVInt(); | ||
| arc.bytesPerArc = in.readVInt(); | ||
|
|
@@ -749,6 +768,8 @@ private void readFirstArcInfo(long nodeAddress, Arc<T> arc, final BytesReader in | |
| readPresenceBytes(arc, in); | ||
| arc.firstLabel = readLabel(in); | ||
| arc.presenceIndex = -1; | ||
| } else if (flags == ARCS_FOR_CONTINUOUS) { | ||
| arc.firstLabel = readLabel(in); | ||
| } | ||
| arc.posArcsStart = in.getPosition(); | ||
| } else { | ||
|
|
@@ -773,7 +794,9 @@ boolean isExpandedTarget(Arc<T> follow, BytesReader in) throws IOException { | |
| } else { | ||
| in.setPosition(follow.target()); | ||
| byte flags = in.readByte(); | ||
| return flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING; | ||
| return flags == ARCS_FOR_BINARY_SEARCH | ||
| || flags == ARCS_FOR_DIRECT_ADDRESSING | ||
| || flags == ARCS_FOR_CONTINUOUS; | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -801,16 +824,18 @@ int readNextArcLabel(Arc<T> arc, BytesReader in) throws IOException { | |
|
|
||
| in.setPosition(arc.nextArc()); | ||
| byte flags = in.readByte(); | ||
| if (flags == ARCS_FOR_BINARY_SEARCH || flags == ARCS_FOR_DIRECT_ADDRESSING) { | ||
| if (flags == ARCS_FOR_BINARY_SEARCH | ||
| || flags == ARCS_FOR_DIRECT_ADDRESSING | ||
| || flags == ARCS_FOR_CONTINUOUS) { | ||
| // System.out.println(" nextArc fixed length arc"); | ||
| // Special arc which is actually a node header for fixed length arcs. | ||
| int numArcs = in.readVInt(); | ||
| in.readVInt(); // Skip bytesPerArc. | ||
| if (flags == ARCS_FOR_BINARY_SEARCH) { | ||
| in.readByte(); // Skip arc flags. | ||
| } else { | ||
| } else if (flags == ARCS_FOR_DIRECT_ADDRESSING) { | ||
| in.skipBytes(getNumPresenceBytes(numArcs)); | ||
| } | ||
| } // Nothing to do for ARCS_FOR_CONTINUOUS | ||
| } | ||
| } else { | ||
| switch (arc.nodeFlags()) { | ||
|
|
@@ -826,6 +851,8 @@ int readNextArcLabel(Arc<T> arc, BytesReader in) throws IOException { | |
| int nextIndex = BitTable.nextBitSet(arc.arcIdx(), arc, in); | ||
| assert nextIndex != -1; | ||
| return arc.firstLabel() + nextIndex; | ||
| case ARCS_FOR_CONTINUOUS: | ||
| return arc.firstLabel() + arc.arcIdx() + 1; | ||
| default: | ||
| // Variable length arcs - linear search. | ||
| assert arc.bytesPerArc() == 0; | ||
|
|
@@ -849,6 +876,20 @@ public Arc<T> readArcByIndex(Arc<T> arc, final BytesReader in, int idx) throws I | |
| return readArc(arc, in); | ||
| } | ||
|
|
||
| /** | ||
| * Reads a Continuous node arc, with the provided index in the label range. | ||
| * | ||
| * <p>The reader is positioned at {@code arc.posArcsStart() - rangeIndex * arc.bytesPerArc()} | ||
| * (the offset is subtracted from the start position), {@code arc.arcIdx} is set to {@code | ||
| * rangeIndex}, the arc flags byte is read, and the rest of the arc is decoded by {@code readArc}. | ||
| * | ||
| * @param arc The arc to fill in; its numArcs, bytesPerArc and posArcsStart are read here, so they | ||
| * must already be set for the node. | ||
| * @param in The reader used to access the arc bytes. | ||
| * @param rangeIndex The index of the arc in the label range. It must be within the label range | ||
| * ({@code 0 <= rangeIndex < arc.numArcs()}, checked by assertion only). | ||
| * @return The value returned by {@code readArc} for the given arc. | ||
| */ | ||
| public Arc<T> readArcByContinuous(Arc<T> arc, final BytesReader in, int rangeIndex) | ||
| throws IOException { | ||
| assert rangeIndex >= 0 && rangeIndex < arc.numArcs(); | ||
| in.setPosition(arc.posArcsStart() - rangeIndex * (long) arc.bytesPerArc()); | ||
| arc.arcIdx = rangeIndex; | ||
| arc.flags = in.readByte(); | ||
| return readArc(arc, in); | ||
| } | ||
|
|
||
| /** | ||
| * Reads a present direct addressing node arc, with the provided index in the label range. | ||
| * | ||
|
|
@@ -888,6 +929,11 @@ public Arc<T> readLastArcByDirectAddressing(Arc<T> arc, final BytesReader in) th | |
| return readArcByDirectAddressing(arc, in, arc.numArcs() - 1, presenceIndex); | ||
| } | ||
|
|
||
| /** | ||
| * Reads the last arc of a continuous node, i.e. the arc at index {@code arc.numArcs() - 1}, by | ||
| * delegating to {@code readArcByContinuous}. | ||
| * | ||
| * @param arc The arc to fill in; its numArcs is read to determine the last index. | ||
| * @param in The reader used to access the arc bytes. | ||
| * @return The value returned by {@code readArcByContinuous} for the last index. | ||
| */ | ||
| public Arc<T> readLastArcByContinuous(Arc<T> arc, final BytesReader in) throws IOException { | ||
| return readArcByContinuous(arc, in, arc.numArcs() - 1); | ||
| } | ||
|
|
||
| /** Never returns null, but you should never call this if arc.isLast() is true. */ | ||
| public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException { | ||
|
|
||
|
|
@@ -896,6 +942,7 @@ public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOExcepti | |
|
|
||
| switch (arc.nodeFlags()) { | ||
| case ARCS_FOR_BINARY_SEARCH: | ||
| case ARCS_FOR_CONTINUOUS: | ||
| assert arc.bytesPerArc() > 0; | ||
| arc.arcIdx++; | ||
| assert arc.arcIdx() >= 0 && arc.arcIdx() < arc.numArcs(); | ||
|
|
@@ -924,7 +971,7 @@ public Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOExcepti | |
| * positioned just after the arc flags byte. | ||
| */ | ||
| private Arc<T> readArc(Arc<T> arc, BytesReader in) throws IOException { | ||
| if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING) { | ||
| if (arc.nodeFlags() == ARCS_FOR_DIRECT_ADDRESSING || arc.nodeFlags() == ARCS_FOR_CONTINUOUS) { | ||
| arc.label = arc.firstLabel() + arc.arcIdx(); | ||
| } else { | ||
| arc.label = readLabel(in); | ||
|
|
@@ -1067,6 +1114,17 @@ public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc, BytesRe | |
| } | ||
| } | ||
| return null; | ||
| } else if (flags == ARCS_FOR_CONTINUOUS) { | ||
| arc.numArcs = in.readVInt(); | ||
| arc.bytesPerArc = in.readVInt(); | ||
| arc.firstLabel = readLabel(in); | ||
| arc.posArcsStart = in.getPosition(); | ||
| int arcIndex = labelToMatch - arc.firstLabel(); | ||
| if (arcIndex < 0 || arcIndex >= arc.numArcs()) { | ||
| return null; // Before or after label range. | ||
| } | ||
| arc.arcIdx = arcIndex - 1; | ||
| return readNextRealArc(arc, in); | ||
| } | ||
|
|
||
| // Linear scan | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you add a comment explaining this arc optimization case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
+1, it is important.