diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 67767f23d839..0fa027ab7f4d 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -85,6 +85,8 @@ Improvements Optimizations --------------------- +* GITHUB#13782: Replace handwritten comparison loops with Arrays.compareUnsigned in TermsEnum and TermsEnumFrame classes. (Zhou Hui) + * GITHUB#14011: Reduce allocation rate in HNSW concurrent merge. (Viliam Durina) * GITHUB#14022: Optimize DFS marking of connected components in HNSW by reducing stack depth, improving performance and reducing allocations. (Viswanath Kuchibhotla) diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java index e6c297befc0c..39e35cc83608 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnum.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.PrintStream; +import java.util.Arrays; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output; import org.apache.lucene.index.BaseTermsEnum; @@ -183,7 +184,7 @@ OrdsSegmentTermsEnumFrame pushFrame(FST.Arc arc, long fp, int length, lo // " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + // f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + // term.length + " vs prefix=" + f.prefix); - if (f.prefix > targetBeforeCurrentLength) { + if (f.prefixLength > targetBeforeCurrentLength) { // System.out.println(" do rewind!"); f.rewind(); } else { @@ -191,11 +192,11 @@ OrdsSegmentTermsEnumFrame pushFrame(FST.Arc arc, long fp, int length, lo // System.out.println(" skip rewind!"); // } } - assert length == f.prefix; + assert length == f.prefixLength; assert termOrd == f.termOrdOrig; } else { 
f.nextEnt = -1; - f.prefix = length; + f.prefixLength = length; f.state.termBlockOrd = 0; f.termOrdOrig = termOrd; // System.out.println("set termOrdOrig=" + termOrd); @@ -308,31 +309,18 @@ public boolean seekExact(final BytesRef target) throws IOException { } if (cmp == 0) { - final int targetUptoMid = targetUpto; - // Second compare the rest of the term, but // don't save arc/output/frame; we only do this // to find out if the target term is before, // equal or after the current term - final int targetLimit2 = Math.min(target.length, term.length()); - while (targetUpto < targetLimit2) { - cmp = - (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); - // if (DEBUG) { - // System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + - // targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + - // targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); - // } - if (cmp != 0) { - break; - } - targetUpto++; - } - - if (cmp == 0) { - cmp = term.length() - target.length; - } - targetUpto = targetUptoMid; + cmp = + Arrays.compareUnsigned( + term.bytes(), + targetUpto, + term.length(), + target.bytes, + target.offset + targetUpto, + target.offset + target.length); } if (cmp < 0) { @@ -424,7 +412,7 @@ public boolean seekExact(final BytesRef target) throws IOException { // toHex(targetLabel)); // } - validIndexPrefix = currentFrame.prefix; + validIndexPrefix = currentFrame.prefixLength; // validIndexPrefix = targetUpto; currentFrame.scanToFloorFrame(target); @@ -484,7 +472,7 @@ public boolean seekExact(final BytesRef target) throws IOException { } // validIndexPrefix = targetUpto; - validIndexPrefix = currentFrame.prefix; + validIndexPrefix = currentFrame.prefixLength; currentFrame.scanToFloorFrame(target); @@ -600,28 +588,16 @@ public SeekStatus seekCeil(final BytesRef target) throws IOException { } if (cmp == 0) { - final int targetUptoMid = targetUpto; // Second compare the rest 
of the term, but // don't save arc/output/frame: - final int targetLimit2 = Math.min(target.length, term.length()); - while (targetUpto < targetLimit2) { - cmp = - (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); - // if (DEBUG) { - // System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit - // + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) - // + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); - // } - if (cmp != 0) { - break; - } - targetUpto++; - } - - if (cmp == 0) { - cmp = term.length() - target.length; - } - targetUpto = targetUptoMid; + cmp = + Arrays.compareUnsigned( + term.bytes(), + targetUpto, + term.length(), + target.bytes, + target.offset + targetUpto, + target.offset + target.length); } if (cmp < 0) { @@ -710,7 +686,7 @@ public SeekStatus seekCeil(final BytesRef target) throws IOException { // toHex(targetLabel)); // } - validIndexPrefix = currentFrame.prefix; + validIndexPrefix = currentFrame.prefixLength; // validIndexPrefix = targetUpto; currentFrame.scanToFloorFrame(target); @@ -771,7 +747,7 @@ public SeekStatus seekCeil(final BytesRef target) throws IOException { } // validIndexPrefix = targetUpto; - validIndexPrefix = currentFrame.prefix; + validIndexPrefix = currentFrame.prefixLength; currentFrame.scanToFloorFrame(target); @@ -809,7 +785,7 @@ private void printSeekState(PrintStream out) throws IOException { while (true) { OrdsSegmentTermsEnumFrame f = getFrame(ord); assert f != null; - final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix); + final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefixLength); if (f.nextEnt == -1) { out.println( " frame " @@ -820,7 +796,7 @@ private void printSeekState(PrintStream out) throws IOException { + f.fp + (f.isFloor ? 
(" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" - + f.prefix + + f.prefixLength + " prefix=" + ToStringUtils.bytesRefToString(prefix) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) @@ -850,7 +826,7 @@ private void printSeekState(PrintStream out) throws IOException { + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" - + f.prefix + + f.prefixLength + " prefix=" + ToStringUtils.bytesRefToString(prefix) + " nextEnt=" @@ -877,12 +853,14 @@ private void printSeekState(PrintStream out) throws IOException { } if (fr.index != null) { assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc; - if (f.prefix > 0 && isSeekFrame && f.arc.label() != (term.byteAt(f.prefix - 1) & 0xFF)) { + if (f.prefixLength > 0 + && isSeekFrame + && f.arc.label() != (term.byteAt(f.prefixLength - 1) & 0xFF)) { out.println( " broken seek state: arc.label=" + (char) f.arc.label() + " vs term byte=" - + (char) (term.byteAt(f.prefix - 1) & 0xFF)); + + (char) (term.byteAt(f.prefixLength - 1) & 0xFF)); throw new RuntimeException("seek state is broken"); } Output output = Util.get(fr.index, prefix); @@ -911,7 +889,7 @@ private void printSeekState(PrintStream out) throws IOException { if (f == currentFrame) { break; } - if (f.prefix == validIndexPrefix) { + if (f.prefixLength == validIndexPrefix) { isSeekFrame = false; } ord++; @@ -993,7 +971,7 @@ public BytesRef next() throws IOException { // Note that the seek state (last seek) has been // invalidated beyond this depth - validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix); + validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefixLength); // if (DEBUG) { // System.out.println(" reset validIndexPrefix=" + validIndexPrefix); // } diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java index cd98a3e0f8cf..aeab06a94401 
100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsSegmentTermsEnumFrame.java @@ -17,6 +17,7 @@ package org.apache.lucene.codecs.blocktreeords; import java.io.IOException; +import java.util.Arrays; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output; import org.apache.lucene.index.IndexOptions; @@ -54,7 +55,7 @@ final class OrdsSegmentTermsEnumFrame { final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); // Length of prefix shared by all terms in this block - int prefix; + int prefixLength; // Number of entries (term or sub-block) in this block int entCount; @@ -295,11 +296,11 @@ public boolean nextLeaf() { : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp + " termOrd=" + termOrd; nextEnt++; termOrd++; - suffix = suffixesReader.readVInt(); + suffixLength = suffixesReader.readVInt(); startBytePos = suffixesReader.getPosition(); - ste.term.setLength(prefix + suffix); + ste.term.setLength(prefixLength + suffixLength); ste.term.grow(ste.term.length()); - suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); + suffixesReader.readBytes(ste.term.bytes(), prefixLength, suffixLength); // A normal term ste.termExists = true; return false; @@ -312,11 +313,11 @@ public boolean nextNonLeaf() { : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; nextEnt++; final int code = suffixesReader.readVInt(); - suffix = code >>> 1; + suffixLength = code >>> 1; startBytePos = suffixesReader.getPosition(); - ste.term.setLength(prefix + suffix); + ste.term.setLength(prefixLength + suffixLength); ste.term.grow(ste.term.length()); - suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); + suffixesReader.readBytes(ste.term.bytes(), prefixLength, suffixLength); if ((code & 1) == 0) { // A normal term ste.termExists = true; @@ -342,7 +343,7 @@ public boolean 
nextNonLeaf() { // floor blocks we "typically" get public void scanToFloorFrame(BytesRef target) { - if (!isFloor || target.length <= prefix) { + if (!isFloor || target.length <= prefixLength) { // if (DEBUG) { // System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + // target.length + " vs prefix=" + prefix); @@ -350,7 +351,7 @@ public void scanToFloorFrame(BytesRef target) { return; } - final int targetLabel = target.bytes[target.offset + prefix] & 0xFF; + final int targetLabel = target.bytes[target.offset + prefixLength] & 0xFF; // if (DEBUG) { // System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + ((char) @@ -532,7 +533,7 @@ public void decodeMetaData() throws IOException { // Used only by assert private boolean prefixMatches(BytesRef target) { - for (int bytePos = 0; bytePos < prefix; bytePos++) { + for (int bytePos = 0; bytePos < prefixLength; bytePos++) { if (target.bytes[target.offset + bytePos] != ste.term.byteAt(bytePos)) { return false; } @@ -586,7 +587,7 @@ public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOExcept } private int startBytePos; - private int suffix; + private int suffixLength; private long subCode; // Target's prefix matches this block's prefix; we @@ -613,13 +614,11 @@ public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOEx assert prefixMatches(target); // Loop over each entry (term or sub-block) in this block: - // nextTerm: while(nextEnt < entCount) { - nextTerm: - while (true) { + do { nextEnt++; termOrd++; - suffix = suffixesReader.readVInt(); + suffixLength = suffixesReader.readVInt(); // if (DEBUG) { // BytesRef suffixBytesRef = new BytesRef(); @@ -630,63 +629,41 @@ public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOEx // + ToStringUtils.bytesRefToString(suffixBytesRef)); // } - final int termLen = prefix + suffix; startBytePos = suffixesReader.getPosition(); - suffixesReader.skipBytes(suffix); - - final 
int targetLimit = target.offset + (target.length < termLen ? target.length : termLen); - int targetPos = target.offset + prefix; - - // Loop over bytes in the suffix, comparing to - // the target - int bytePos = startBytePos; - while (true) { - final int cmp; - final boolean stop; - if (targetPos < targetLimit) { - cmp = (suffixBytes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF); - stop = false; - } else { - assert targetPos == targetLimit; - cmp = termLen - target.length; - stop = true; - } - - if (cmp < 0) { - // Current entry is still before the target; - // keep scanning - - if (nextEnt == entCount) { - if (exactOnly) { - fillTerm(); - } - // We are done scanning this block - break nextTerm; - } else { - continue nextTerm; - } - } else if (cmp > 0) { - - // Done! Current entry is after target -- - // return NOT_FOUND: - fillTerm(); + suffixesReader.skipBytes(suffixLength); + + // Compare suffix and target. + final int cmp = + Arrays.compareUnsigned( + suffixBytes, + startBytePos, + startBytePos + suffixLength, + target.bytes, + target.offset + prefixLength, + target.offset + target.length); + + if (cmp < 0) { + // Current entry is still before the target; + // keep scanning + } else if (cmp > 0) { + // Done! Current entry is after target -- + // return NOT_FOUND: + fillTerm(); - // if (DEBUG) System.out.println(" not found"); - return SeekStatus.NOT_FOUND; - } else if (stop) { - // Exact match! + // if (DEBUG) System.out.println(" not found"); + return SeekStatus.NOT_FOUND; + } else { + // Exact match! 
- // This cannot be a sub-block because we - // would have followed the index to this - // sub-block from the start: + // This cannot be a sub-block because we + // would have followed the index to this + // sub-block from the start: - assert ste.termExists; - fillTerm(); - // if (DEBUG) System.out.println(" found!"); - return SeekStatus.FOUND; - } + fillTerm(); + // if (DEBUG) System.out.println(" found!"); + return SeekStatus.FOUND; } - } + } while (nextEnt < entCount); // It is possible (and OK) that terms index pointed us // at this block, but, we scanned the entire block and @@ -730,13 +707,11 @@ public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws I assert prefixMatches(target); // Loop over each entry (term or sub-block) in this block: - // nextTerm: while(nextEnt < entCount) { - nextTerm: - while (true) { + while (nextEnt < entCount) { nextEnt++; final int code = suffixesReader.readVInt(); - suffix = code >>> 1; + suffixLength = code >>> 1; // if (DEBUG) { // BytesRef suffixBytesRef = new BytesRef(); // suffixBytesRef.bytes = suffixBytes; @@ -748,9 +723,8 @@ public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws I // } ste.termExists = (code & 1) == 0; - final int termLen = prefix + suffix; startBytePos = suffixesReader.getPosition(); - suffixesReader.skipBytes(suffix); + suffixesReader.skipBytes(suffixLength); // Must save ord before we skip over a sub-block in case we push, below: long prevTermOrd = termOrd; if (ste.termExists) { @@ -763,73 +737,53 @@ public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws I lastSubFP = fp - subCode; } - final int targetLimit = target.offset + (target.length < termLen ? 
target.length : termLen); - int targetPos = target.offset + prefix; - - // Loop over bytes in the suffix, comparing to - // the target - int bytePos = startBytePos; - while (true) { - final int cmp; - final boolean stop; - if (targetPos < targetLimit) { - cmp = (suffixBytes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF); - stop = false; - } else { - assert targetPos == targetLimit; - cmp = termLen - target.length; - stop = true; - } - - if (cmp < 0) { - // Current entry is still before the target; - // keep scanning - - if (nextEnt == entCount) { - if (exactOnly) { - fillTerm(); - // termExists = true; - } - // We are done scanning this block - break nextTerm; - } else { - continue nextTerm; - } - } else if (cmp > 0) { - - // Done! Current entry is after target -- - // return NOT_FOUND: - fillTerm(); + // Compare suffix and target. + final int cmp = + Arrays.compareUnsigned( + suffixBytes, + startBytePos, + startBytePos + suffixLength, + target.bytes, + target.offset + prefixLength, + target.offset + target.length); + + if (cmp < 0) { + // Current entry is still before the target; + // keep scanning + } else if (cmp > 0) { + // Done! 
Current entry is after target -- + // return NOT_FOUND: + fillTerm(); - if (!exactOnly && !ste.termExists) { - // We are on a sub-block, and caller wants - // us to position to the next term after - // the target, so we must recurse into the - // sub-frame(s): + if (!exactOnly && !ste.termExists) { + // We are on a sub-block, and caller wants + // us to position to the next term after + // the target, so we must recurse into the + // sub-frame(s): + ste.currentFrame = + ste.pushFrame( + null, ste.currentFrame.lastSubFP, prefixLength + suffixLength, prevTermOrd); + ste.currentFrame.loadBlock(); + while (ste.currentFrame.next()) { ste.currentFrame = - ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen, prevTermOrd); + ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length(), prevTermOrd); ste.currentFrame.loadBlock(); - while (ste.currentFrame.next()) { - ste.currentFrame = - ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length(), prevTermOrd); - ste.currentFrame.loadBlock(); - } } + } - // if (DEBUG) System.out.println(" not found"); - return SeekStatus.NOT_FOUND; - } else if (stop) { - // Exact match! + // if (DEBUG) System.out.println(" not found"); + return SeekStatus.NOT_FOUND; + } else { + // Exact match! 
- // This cannot be a sub-block because we - // would have followed the index to this - // sub-block from the start: + // This cannot be a sub-block because we + // would have followed the index to this + // sub-block from the start: - assert ste.termExists; - fillTerm(); - // if (DEBUG) System.out.println(" found!"); - return SeekStatus.FOUND; - } + assert ste.termExists; + fillTerm(); + // if (DEBUG) System.out.println(" found!"); + return SeekStatus.FOUND; } } @@ -854,9 +808,9 @@ public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws I } private void fillTerm() { - final int termLength = prefix + suffix; - ste.term.setLength(prefix + suffix); + final int termLength = prefixLength + suffixLength; + ste.term.setLength(prefixLength + suffixLength); ste.term.grow(termLength); - System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefix, suffix); + System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefixLength, suffixLength); } } diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionSegmentTermsEnum.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionSegmentTermsEnum.java index ea2cc686b6b0..e380196b98f8 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionSegmentTermsEnum.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionSegmentTermsEnum.java @@ -18,6 +18,7 @@ import java.io.IOException; import java.io.PrintStream; +import java.util.Arrays; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.ImpactsEnum; @@ -180,17 +181,17 @@ IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc> arc, long // " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + // f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " term.length=" + // term.length + " vs prefix=" + f.prefix); - if 
(f.prefix > targetBeforeCurrentLength) { + if (f.prefixLength > targetBeforeCurrentLength) { f.rewind(); } else { // if (DEBUG) { // System.out.println(" skip rewind!"); // } } - assert length == f.prefix; + assert length == f.prefixLength; } else { f.nextEnt = -1; - f.prefix = length; + f.prefixLength = length; f.state.termBlockOrd = 0; f.fpOrig = f.fp = fp; f.lastSubFP = -1; @@ -331,31 +332,18 @@ public boolean seekExact(final BytesRef target, long minIDVersion) throws IOExce } if (cmp == 0) { - final int targetUptoMid = targetUpto; - // Second compare the rest of the term, but // don't save arc/output/frame; we only do this // to find out if the target term is before, // equal or after the current term - final int targetLimit2 = Math.min(target.length, term.length()); - while (targetUpto < targetLimit2) { - cmp = - (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); - // if (DEBUG) { - // System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + - // targetLimit + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + - // targetUpto]) + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); - // } - if (cmp != 0) { - break; - } - targetUpto++; - } - - if (cmp == 0) { - cmp = term.length() - target.length; - } - targetUpto = targetUptoMid; + cmp = + Arrays.compareUnsigned( + term.bytes(), + targetUpto, + term.length(), + target.bytes, + target.offset + targetUpto, + target.offset + target.length); } if (cmp < 0) { @@ -471,7 +459,7 @@ public boolean seekExact(final BytesRef target, long minIDVersion) throws IOExce // Integer.toHexString(targetLabel) + " termExists=" + termExists); // } - validIndexPrefix = currentFrame.prefix; + validIndexPrefix = currentFrame.prefixLength; // validIndexPrefix = targetUpto; currentFrame.scanToFloorFrame(target); @@ -585,7 +573,7 @@ public boolean seekExact(final BytesRef target, long minIDVersion) throws IOExce } // validIndexPrefix = targetUpto; - 
validIndexPrefix = currentFrame.prefix; + validIndexPrefix = currentFrame.prefixLength; currentFrame.scanToFloorFrame(target); @@ -718,28 +706,16 @@ public SeekStatus seekCeil(final BytesRef target) throws IOException { } if (cmp == 0) { - final int targetUptoMid = targetUpto; // Second compare the rest of the term, but // don't save arc/output/frame: - final int targetLimit2 = Math.min(target.length, term.length()); - while (targetUpto < targetLimit2) { - cmp = - (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); - // if (DEBUG) { - // System.out.println(" cycle2 targetUpto=" + targetUpto + " (vs limit=" + targetLimit - // + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) - // + " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")"); - // } - if (cmp != 0) { - break; - } - targetUpto++; - } - - if (cmp == 0) { - cmp = term.length() - target.length; - } - targetUpto = targetUptoMid; + cmp = + Arrays.compareUnsigned( + term.bytes(), + targetUpto, + term.length(), + target.bytes, + target.offset + targetUpto, + target.offset + target.length); } if (cmp < 0) { @@ -826,7 +802,7 @@ public SeekStatus seekCeil(final BytesRef target) throws IOException { // toHex(targetLabel)); // } - validIndexPrefix = currentFrame.prefix; + validIndexPrefix = currentFrame.prefixLength; // validIndexPrefix = targetUpto; currentFrame.scanToFloorFrame(target); @@ -887,7 +863,7 @@ public SeekStatus seekCeil(final BytesRef target) throws IOException { } // validIndexPrefix = targetUpto; - validIndexPrefix = currentFrame.prefix; + validIndexPrefix = currentFrame.prefixLength; currentFrame.scanToFloorFrame(target); @@ -925,7 +901,7 @@ private void printSeekState(PrintStream out) throws IOException { while (true) { IDVersionSegmentTermsEnumFrame f = getFrame(ord); assert f != null; - final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefix); + final BytesRef prefix = new BytesRef(term.bytes(), 0, f.prefixLength); 
if (f.nextEnt == -1) { out.println( " frame " @@ -936,7 +912,7 @@ private void printSeekState(PrintStream out) throws IOException { + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" - + f.prefix + + f.prefixLength + " prefix=" + ToStringUtils.bytesRefToString(prefix) + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) @@ -964,7 +940,7 @@ private void printSeekState(PrintStream out) throws IOException { + f.fp + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + " prefixLen=" - + f.prefix + + f.prefixLength + " prefix=" + ToStringUtils.bytesRefToString(prefix) + " nextEnt=" @@ -989,12 +965,14 @@ private void printSeekState(PrintStream out) throws IOException { } if (fr.index != null) { assert !isSeekFrame || f.arc != null : "isSeekFrame=" + isSeekFrame + " f.arc=" + f.arc; - if (f.prefix > 0 && isSeekFrame && f.arc.label() != (term.byteAt(f.prefix - 1) & 0xFF)) { + if (f.prefixLength > 0 + && isSeekFrame + && f.arc.label() != (term.byteAt(f.prefixLength - 1) & 0xFF)) { out.println( " broken seek state: arc.label=" + (char) f.arc.label() + " vs term byte=" - + (char) (term.byteAt(f.prefix - 1) & 0xFF)); + + (char) (term.byteAt(f.prefixLength - 1) & 0xFF)); throw new RuntimeException("seek state is broken"); } Pair output = Util.get(fr.index, prefix); @@ -1023,7 +1001,7 @@ private void printSeekState(PrintStream out) throws IOException { if (f == currentFrame) { break; } - if (f.prefix == validIndexPrefix) { + if (f.prefixLength == validIndexPrefix) { isSeekFrame = false; } ord++; @@ -1103,7 +1081,7 @@ public BytesRef next() throws IOException { // Note that the seek state (last seek) has been // invalidated beyond this depth - validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefix); + validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefixLength); // if (DEBUG) { // System.out.println(" reset validIndexPrefix=" + validIndexPrefix); // } diff --git 
a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionSegmentTermsEnumFrame.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionSegmentTermsEnumFrame.java index 4ecac0a93adc..45f5aba6fa71 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionSegmentTermsEnumFrame.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionSegmentTermsEnumFrame.java @@ -17,6 +17,7 @@ package org.apache.lucene.sandbox.codecs.idversion; import java.io.IOException; +import java.util.Arrays; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.index.TermsEnum.SeekStatus; import org.apache.lucene.store.ByteArrayDataInput; @@ -52,7 +53,7 @@ final class IDVersionSegmentTermsEnumFrame { final ByteArrayDataInput floorDataReader = new ByteArrayDataInput(); // Length of prefix shared by all terms in this block - int prefix; + int prefixLength; // Number of entries (term or sub-block) in this block int entCount; @@ -262,11 +263,11 @@ public boolean nextLeaf() { assert nextEnt != -1 && nextEnt < entCount : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; nextEnt++; - suffix = suffixesReader.readVInt(); + suffixLength = suffixesReader.readVInt(); startBytePos = suffixesReader.getPosition(); - ste.term.setLength(prefix + suffix); + ste.term.setLength(prefixLength + suffixLength); ste.term.grow(ste.term.length()); - suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); + suffixesReader.readBytes(ste.term.bytes(), prefixLength, suffixLength); // A normal term ste.termExists = true; return false; @@ -279,11 +280,11 @@ public boolean nextNonLeaf() { : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; nextEnt++; final int code = suffixesReader.readVInt(); - suffix = code >>> 1; + suffixLength = code >>> 1; startBytePos = suffixesReader.getPosition(); - ste.term.setLength(prefix + suffix); + ste.term.setLength(prefixLength + 
suffixLength); ste.term.grow(ste.term.length()); - suffixesReader.readBytes(ste.term.bytes(), prefix, suffix); + suffixesReader.readBytes(ste.term.bytes(), prefixLength, suffixLength); if ((code & 1) == 0) { // A normal term ste.termExists = true; @@ -307,7 +308,7 @@ public boolean nextNonLeaf() { // floor blocks we "typically" get public void scanToFloorFrame(BytesRef target) { - if (!isFloor || target.length <= prefix) { + if (!isFloor || target.length <= prefixLength) { // if (DEBUG) { // System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + // target.length + " vs prefix=" + prefix); @@ -315,7 +316,7 @@ public void scanToFloorFrame(BytesRef target) { return; } - final int targetLabel = target.bytes[target.offset + prefix] & 0xFF; + final int targetLabel = target.bytes[target.offset + prefixLength] & 0xFF; // if (DEBUG) { // System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + ((char) @@ -415,7 +416,7 @@ public void decodeMetaData() throws IOException { // Used only by assert private boolean prefixMatches(BytesRef target) { - for (int bytePos = 0; bytePos < prefix; bytePos++) { + for (int bytePos = 0; bytePos < prefixLength; bytePos++) { if (target.bytes[target.offset + bytePos] != ste.term.byteAt(bytePos)) { return false; } @@ -466,7 +467,7 @@ public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOExcept } private int startBytePos; - private int suffix; + private int suffixLength; private long subCode; // Target's prefix matches this block's prefix; we @@ -493,12 +494,10 @@ public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOEx assert prefixMatches(target); // Loop over each entry (term or sub-block) in this block: - // nextTerm: while(nextEnt < entCount) { - nextTerm: - while (true) { + do { nextEnt++; - suffix = suffixesReader.readVInt(); + suffixLength = suffixesReader.readVInt(); // if (DEBUG) { // BytesRef suffixBytesRef = new BytesRef(); @@ -509,76 +508,41 
@@ public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOEx // + ToStringUtils.bytesRefToString(suffixBytesRef)); // } - final int termLen = prefix + suffix; startBytePos = suffixesReader.getPosition(); - suffixesReader.skipBytes(suffix); - - final int targetLimit = target.offset + (target.length < termLen ? target.length : termLen); - int targetPos = target.offset + prefix; - - // Loop over bytes in the suffix, comparing to - // the target - int bytePos = startBytePos; - while (true) { - final int cmp; - final boolean stop; - if (targetPos < targetLimit) { - cmp = (suffixBytes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF); - stop = false; - } else { - assert targetPos == targetLimit; - cmp = termLen - target.length; - stop = true; - } - - if (cmp < 0) { - // Current entry is still before the target; - // keep scanning - - if (nextEnt == entCount) { - if (exactOnly) { - fillTerm(); - } - // We are done scanning this block - break nextTerm; - } else { - continue nextTerm; - } - } else if (cmp > 0) { - - // Done! Current entry is after target -- - // return NOT_FOUND: - fillTerm(); - - if (!exactOnly && !ste.termExists) { - // We are on a sub-block, and caller wants - // us to position to the next term after - // the target, so we must recurse into the - // sub-frame(s): - ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen); - ste.currentFrame.loadBlock(); - while (ste.currentFrame.next()) { - ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length()); - ste.currentFrame.loadBlock(); - } - } + suffixesReader.skipBytes(suffixLength); + + // Compare suffix and target. + final int cmp = + Arrays.compareUnsigned( + suffixBytes, + startBytePos, + startBytePos + suffixLength, + target.bytes, + target.offset + prefixLength, + target.offset + target.length); + + if (cmp < 0) { + // Current entry is still before the target; + // keep scanning + } else if (cmp > 0) { + // Done! 
Current entry is after target -- + // return NOT_FOUND: + fillTerm(); - // if (DEBUG) System.out.println(" not found"); - return SeekStatus.NOT_FOUND; - } else if (stop) { - // Exact match! + // if (DEBUG) System.out.println(" not found"); + return SeekStatus.NOT_FOUND; + } else { + // Exact match! - // This cannot be a sub-block because we - // would have followed the index to this - // sub-block from the start: + // This cannot be a sub-block because we + // would have followed the index to this + // sub-block from the start: - assert ste.termExists; - fillTerm(); - // if (DEBUG) System.out.println(" found!"); - return SeekStatus.FOUND; - } + fillTerm(); + // if (DEBUG) System.out.println(" found!"); + return SeekStatus.FOUND; } - } + } while (nextEnt < entCount); // It is possible (and OK) that terms index pointed us // at this block, but, we scanned the entire block and @@ -622,13 +586,11 @@ public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws I assert prefixMatches(target); // Loop over each entry (term or sub-block) in this block: - // nextTerm: while(nextEnt < entCount) { - nextTerm: - while (true) { + while (nextEnt < entCount) { nextEnt++; final int code = suffixesReader.readVInt(); - suffix = code >>> 1; + suffixLength = code >>> 1; // if (DEBUG) { // BytesRef suffixBytesRef = new BytesRef(); // suffixBytesRef.bytes = suffixBytes; @@ -640,9 +602,8 @@ public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws I // } ste.termExists = (code & 1) == 0; - final int termLen = prefix + suffix; startBytePos = suffixesReader.getPosition(); - suffixesReader.skipBytes(suffix); + suffixesReader.skipBytes(suffixLength); if (ste.termExists) { state.termBlockOrd++; subCode = 0; @@ -651,71 +612,51 @@ public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws I lastSubFP = fp - subCode; } - final int targetLimit = target.offset + (target.length < termLen ? 
target.length : termLen); - int targetPos = target.offset + prefix; - - // Loop over bytes in the suffix, comparing to - // the target - int bytePos = startBytePos; - while (true) { - final int cmp; - final boolean stop; - if (targetPos < targetLimit) { - cmp = (suffixBytes[bytePos++] & 0xFF) - (target.bytes[targetPos++] & 0xFF); - stop = false; - } else { - assert targetPos == targetLimit; - cmp = termLen - target.length; - stop = true; - } + // Compare suffix and target. + final int cmp = + Arrays.compareUnsigned( + suffixBytes, + startBytePos, + startBytePos + suffixLength, + target.bytes, + target.offset + prefixLength, + target.offset + target.length); + + if (cmp < 0) { + // Current entry is still before the target; + // keep scanning + } else if (cmp > 0) { + // Done! Current entry is after target -- + // return NOT_FOUND: + fillTerm(); - if (cmp < 0) { - // Current entry is still before the target; - // keep scanning - - if (nextEnt == entCount) { - if (exactOnly) { - fillTerm(); - // termExists = true; - } - // We are done scanning this block - break nextTerm; - } else { - continue nextTerm; - } - } else if (cmp > 0) { - - // Done! 
Current entry is after target -- - // return NOT_FOUND: - fillTerm(); - - if (!exactOnly && !ste.termExists) { - // We are on a sub-block, and caller wants - // us to position to the next term after - // the target, so we must recurse into the - // sub-frame(s): - ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, termLen); + if (!exactOnly && !ste.termExists) { + // We are on a sub-block, and caller wants + // us to position to the next term after + // the target, so we must recurse into the + // sub-frame(s): + ste.currentFrame = + ste.pushFrame(null, ste.currentFrame.lastSubFP, prefixLength + suffixLength); + ste.currentFrame.loadBlock(); + while (ste.currentFrame.next()) { + ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length()); ste.currentFrame.loadBlock(); - while (ste.currentFrame.next()) { - ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length()); - ste.currentFrame.loadBlock(); - } } + } - // if (DEBUG) System.out.println(" not found"); - return SeekStatus.NOT_FOUND; - } else if (stop) { - // Exact match! + // if (DEBUG) System.out.println(" not found"); + return SeekStatus.NOT_FOUND; + } else { + // Exact match! 
- // This cannot be a sub-block because we - // would have followed the index to this - // sub-block from the start: + // This cannot be a sub-block because we + // would have followed the index to this + // sub-block from the start: - assert ste.termExists; - fillTerm(); - // if (DEBUG) System.out.println(" found!"); - return SeekStatus.FOUND; - } + assert ste.termExists; + fillTerm(); + // if (DEBUG) System.out.println(" found!"); + return SeekStatus.FOUND; } } @@ -740,9 +681,9 @@ public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws I } private void fillTerm() { - final int termLength = prefix + suffix; - ste.term.setLength(prefix + suffix); + final int termLength = prefixLength + suffixLength; + ste.term.setLength(prefixLength + suffixLength); ste.term.grow(termLength); - System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefix, suffix); + System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefixLength, suffixLength); } }