diff --git a/gradle/generation/forUtil.gradle b/gradle/generation/forUtil.gradle index b55fd0204fd3..b73c69b6f12d 100644 --- a/gradle/generation/forUtil.gradle +++ b/gradle/generation/forUtil.gradle @@ -23,7 +23,7 @@ configure(project(":lucene:core")) { description "Regenerate gen_ForUtil.py" group "generation" - def genDir = file("src/java/org/apache/lucene/codecs/lucene101") + def genDir = file("src/java/org/apache/lucene/codecs/lucene103") def genScript = file("${genDir}/gen_ForUtil.py") def genOutput = file("${genDir}/ForUtil.java") @@ -48,7 +48,7 @@ configure(project(":lucene:core")) { description "Regenerate gen_ForDeltaUtil.py" group "generation" - def genDir = file("src/java/org/apache/lucene/codecs/lucene101") + def genDir = file("src/java/org/apache/lucene/codecs/lucene103") def genScript = file("${genDir}/gen_ForDeltaUtil.py") def genOutput = file("${genDir}/ForDeltaUtil.java") @@ -197,5 +197,55 @@ configure(project(":lucene:backward-codecs")) { andThenTasks: ["spotlessJava", "spotlessJavaApply"], mustRunBefore: [ "compileJava" ] ]) + + task generateForUtil101Internal() { + description "Regenerate gen_ForUtil.py" + group "generation" + + def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene101") + def genScript = file("${genDir}/gen_ForUtil.py") + def genOutput = file("${genDir}/ForUtil.java") + + inputs.file genScript + outputs.file genOutput + + doLast { + quietExec { + workingDir genDir + executable project.externalTool("python3") + args = [ '-B', genScript ] + } + } + } + + regenerate.dependsOn wrapWithPersistentChecksums(generateForUtil101Internal, [ + andThenTasks: ["spotlessJava", "spotlessJavaApply"], + mustRunBefore: [ "compileJava" ] + ]) + + task generateForDeltaUtil101Internal() { + description "Regenerate gen_ForDeltaUtil.py" + group "generation" + + def genDir = file("src/java/org/apache/lucene/backward_codecs/lucene101") + def genScript = file("${genDir}/gen_ForDeltaUtil.py") + def genOutput = file("${genDir}/ForDeltaUtil.java") + + inputs.file genScript + outputs.file genOutput + + doLast { + quietExec { + workingDir genDir + executable project.externalTool("python3") + args = [ '-B', genScript ] + } + } + } + + regenerate.dependsOn wrapWithPersistentChecksums(generateForDeltaUtil101Internal, [ + andThenTasks: ["spotlessJava", "spotlessJavaApply"], + mustRunBefore: [ "compileJava" ] + ]) } diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 645506f4c642..aca1abb0e0f6 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -77,6 +77,8 @@ Optimizations * GITHUB#14425: KeywordField.newSetQuery() reuses prefixed terms (Mikhail Khludnev) +* GITHUB#14333: Introduce a specialized trie for block tree index, instead of FST. 
(Guo Feng) + Bug Fixes --------------------- (No changes) diff --git a/lucene/backward-codecs/src/generated/checksums/generateForDeltaUtil101.json b/lucene/backward-codecs/src/generated/checksums/generateForDeltaUtil101.json new file mode 100644 index 000000000000..2cd3169551d3 --- /dev/null +++ b/lucene/backward-codecs/src/generated/checksums/generateForDeltaUtil101.json @@ -0,0 +1,4 @@ +{ + "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/ForDeltaUtil.java": "f4cff08d9a5dd99f5332c2f9f6d386f0d7f58677", + "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/gen_ForDeltaUtil.py": "ea46cd6b2384fc1cddb8c1dc5e30bf5f76054d91" +} \ No newline at end of file diff --git a/lucene/backward-codecs/src/generated/checksums/generateForUtil101.json b/lucene/backward-codecs/src/generated/checksums/generateForUtil101.json new file mode 100644 index 000000000000..99aee2265c90 --- /dev/null +++ b/lucene/backward-codecs/src/generated/checksums/generateForUtil101.json @@ -0,0 +1,4 @@ +{ + "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/ForUtil.java": "5a7c2e1e09780a2ccd31c22a1e1fa47443cf2a32", + "lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/gen_ForUtil.py": "c98cce3be0698048ebda6beaa9d404f25089930d" +} \ No newline at end of file diff --git a/lucene/backward-codecs/src/java/module-info.java b/lucene/backward-codecs/src/java/module-info.java index 41057c95bbf3..88cbd5909512 100644 --- a/lucene/backward-codecs/src/java/module-info.java +++ b/lucene/backward-codecs/src/java/module-info.java @@ -31,6 +31,7 @@ exports org.apache.lucene.backward_codecs.lucene86; exports org.apache.lucene.backward_codecs.lucene87; exports org.apache.lucene.backward_codecs.lucene90; + exports org.apache.lucene.backward_codecs.lucene90.blocktree; exports org.apache.lucene.backward_codecs.lucene91; exports org.apache.lucene.backward_codecs.lucene92; exports org.apache.lucene.backward_codecs.lucene94; @@ -38,6 +39,7 @@ exports org.apache.lucene.backward_codecs.lucene99; exports org.apache.lucene.backward_codecs.lucene912; exports org.apache.lucene.backward_codecs.lucene100; + exports org.apache.lucene.backward_codecs.lucene101; exports org.apache.lucene.backward_codecs.packed; exports org.apache.lucene.backward_codecs.store; @@ -48,7 +50,8 @@ org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat, org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat, org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat, - org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat; + org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat, + org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with org.apache.lucene.backward_codecs.lucene90.Lucene90HnswVectorsFormat, org.apache.lucene.backward_codecs.lucene91.Lucene91HnswVectorsFormat, @@ -67,5 +70,6 @@ org.apache.lucene.backward_codecs.lucene95.Lucene95Codec, org.apache.lucene.backward_codecs.lucene99.Lucene99Codec, org.apache.lucene.backward_codecs.lucene912.Lucene912Codec, - org.apache.lucene.backward_codecs.lucene100.Lucene100Codec; + org.apache.lucene.backward_codecs.lucene100.Lucene100Codec, + org.apache.lucene.backward_codecs.lucene101.Lucene101Codec; } diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/ForDeltaUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/ForDeltaUtil.java new file mode 
100644 index 000000000000..cced2474ec7e --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/ForDeltaUtil.java @@ -0,0 +1,470 @@ +// This file has been automatically generated, DO NOT EDIT + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene101; + +import static org.apache.lucene.backward_codecs.lucene101.ForUtil.*; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in a Java int to + * get SIMD-like speedups. If bitsPerValue <= 4 then we pack 4 ints per Java int else if + * bitsPerValue <= 11 we pack 2 ints per Java int else we use scalar operations. + */ +public final class ForDeltaUtil { + + private static final int HALF_BLOCK_SIZE = BLOCK_SIZE / 2; + private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4; + private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2; + private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4; + + private static void prefixSum8(int[] arr, int base) { + // When the number of bits per value is 4 or less, we can sum up all values in a block without + // risking overflowing an 8-bits integer. This allows computing the prefix sum by summing up 4 + // values at once. + innerPrefixSum8(arr); + expand8(arr); + final int l0 = base; + final int l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1]; + final int l2 = l1 + arr[TWO_BLOCK_SIZE_FOURTHS - 1]; + final int l3 = l2 + arr[THREE_BLOCK_SIZE_FOURTHS - 1]; + + for (int i = 0; i < ONE_BLOCK_SIZE_FOURTH; ++i) { + arr[i] += l0; + arr[ONE_BLOCK_SIZE_FOURTH + i] += l1; + arr[TWO_BLOCK_SIZE_FOURTHS + i] += l2; + arr[THREE_BLOCK_SIZE_FOURTHS + i] += l3; + } + } + + private static void prefixSum16(int[] arr, int base) { + // When the number of bits per value is 11 or less, we can sum up all values in a block without + // risking overflowing an 16-bits integer. This allows computing the prefix sum by summing up 2 + // values at once. 
+ innerPrefixSum16(arr); + expand16(arr); + final int l0 = base; + final int l1 = base + arr[HALF_BLOCK_SIZE - 1]; + for (int i = 0; i < HALF_BLOCK_SIZE; ++i) { + arr[i] += l0; + arr[HALF_BLOCK_SIZE + i] += l1; + } + } + + private static void prefixSum32(int[] arr, int base) { + arr[0] += base; + for (int i = 1; i < BLOCK_SIZE; ++i) { + arr[i] += arr[i - 1]; + } + } + + // For some reason unrolling seems to help + private static void innerPrefixSum8(int[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + arr[16] += arr[15]; + arr[17] += arr[16]; + arr[18] += arr[17]; + arr[19] += arr[18]; + arr[20] += arr[19]; + arr[21] += arr[20]; + arr[22] += arr[21]; + arr[23] += arr[22]; + arr[24] += arr[23]; + arr[25] += arr[24]; + arr[26] += arr[25]; + arr[27] += arr[26]; + arr[28] += arr[27]; + arr[29] += arr[28]; + arr[30] += arr[29]; + arr[31] += arr[30]; + } + + // For some reason unrolling seems to help + private static void innerPrefixSum16(int[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + arr[16] += arr[15]; + arr[17] += arr[16]; + arr[18] += arr[17]; + arr[19] += arr[18]; + arr[20] += arr[19]; + arr[21] += arr[20]; + arr[22] += arr[21]; + arr[23] += arr[22]; + arr[24] += arr[23]; + arr[25] += arr[24]; + arr[26] += arr[25]; + arr[27] += arr[26]; + arr[28] += arr[27]; + arr[29] += arr[28]; + arr[30] += arr[29]; + arr[31] += arr[30]; + arr[32] += arr[31]; + arr[33] += arr[32]; + arr[34] += arr[33]; + arr[35] += arr[34]; + arr[36] += arr[35]; + arr[37] += arr[36]; + arr[38] += arr[37]; + arr[39] += arr[38]; + arr[40] += arr[39]; + arr[41] += arr[40]; + arr[42] += arr[41]; + arr[43] += arr[42]; + arr[44] += arr[43]; + arr[45] += arr[44]; + arr[46] += arr[45]; + arr[47] += arr[46]; + arr[48] += arr[47]; + arr[49] += arr[48]; + arr[50] += arr[49]; + arr[51] += arr[50]; + arr[52] += arr[51]; + arr[53] += arr[52]; + arr[54] += arr[53]; + arr[55] += arr[54]; + arr[56] += arr[55]; + arr[57] += arr[56]; + arr[58] += arr[57]; + arr[59] += arr[58]; + arr[60] += arr[59]; + arr[61] += arr[60]; + arr[62] += arr[61]; + arr[63] += arr[62]; + } + + private final int[] tmp = new int[BLOCK_SIZE]; + + /** + * Return the number of bits per value required to store the given array containing strictly + * positive numbers. + */ + int bitsRequired(int[] ints) { + int or = 0; + for (int l : ints) { + or |= l; + } + // Deltas should be strictly positive since the delta between consecutive doc IDs is at least 1 + assert or != 0; + return PackedInts.bitsRequired(or); + } + + /** + * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code + * ints} are expected to be deltas between consecutive values. 
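As an illustrative aside (not part of this patch): the deltas that encodeDeltas expects and the prefix sum that restores absolute values can be sketched with plain int arrays. The loop below mirrors what prefixSum32 does for a full 128-value block; the class name and doc IDs are made up.

    class PrefixSumDemo {
      public static void main(String[] args) {
        int base = 41;                  // last doc ID of the previous block
        int[] deltas = {1, 3, 5, 1};    // gaps between consecutive doc IDs, all >= 1
        // A running sum seeded with the base restores the absolute doc IDs,
        // which is what prefixSum32 does once a block has been bit-unpacked.
        deltas[0] += base;
        for (int i = 1; i < deltas.length; ++i) {
          deltas[i] += deltas[i - 1];
        }
        System.out.println(java.util.Arrays.toString(deltas)); // [42, 45, 50, 51]
      }
    }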
+ */ + void encodeDeltas(int bitsPerValue, int[] ints, DataOutput out) throws IOException { + final int primitiveSize; + if (bitsPerValue <= 3) { + primitiveSize = 8; + collapse8(ints); + } else if (bitsPerValue <= 10) { + primitiveSize = 16; + collapse16(ints); + } else { + primitiveSize = 32; + } + encode(ints, bitsPerValue, primitiveSize, out, tmp); + } + + /** Delta-decode 128 integers into {@code ints}. */ + void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int[] ints) + throws IOException { + switch (bitsPerValue) { + case 1: + decode1(pdu, ints); + prefixSum8(ints, base); + break; + case 2: + decode2(pdu, ints); + prefixSum8(ints, base); + break; + case 3: + decode3(pdu, tmp, ints); + prefixSum8(ints, base); + break; + case 4: + decode4To16(pdu, ints); + prefixSum16(ints, base); + break; + case 5: + decode5To16(pdu, tmp, ints); + prefixSum16(ints, base); + break; + case 6: + decode6To16(pdu, tmp, ints); + prefixSum16(ints, base); + break; + case 7: + decode7To16(pdu, tmp, ints); + prefixSum16(ints, base); + break; + case 8: + decode8To16(pdu, ints); + prefixSum16(ints, base); + break; + case 9: + decode9(pdu, tmp, ints); + prefixSum16(ints, base); + break; + case 10: + decode10(pdu, tmp, ints); + prefixSum16(ints, base); + break; + case 11: + decode11To32(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 12: + decode12To32(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 13: + decode13To32(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 14: + decode14To32(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 15: + decode15To32(pdu, tmp, ints); + prefixSum32(ints, base); + break; + case 16: + decode16To32(pdu, ints); + prefixSum32(ints, base); + break; + default: + if (bitsPerValue < 1 || bitsPerValue > Integer.SIZE) { + throw new IllegalStateException("Illegal number of bits per value: " + bitsPerValue); + } + decodeSlow(bitsPerValue, pdu, tmp, ints); + prefixSum32(ints, base); + break; + } + } + + private static void decode4To16(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.splitInts(16, ints, 12, 4, MASK16_4, ints, 48, MASK16_4); + } + + private static void decode5To16(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(20, ints, 11, 5, MASK16_5, tmp, 0, MASK16_1); + for (int iter = 0, tmpIdx = 0, intsIdx = 60; iter < 4; ++iter, tmpIdx += 5, intsIdx += 1) { + int l0 = tmp[tmpIdx + 0] << 4; + l0 |= tmp[tmpIdx + 1] << 3; + l0 |= tmp[tmpIdx + 2] << 2; + l0 |= tmp[tmpIdx + 3] << 1; + l0 |= tmp[tmpIdx + 4] << 0; + ints[intsIdx + 0] = l0; + } + } + + private static void decode6To16(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(24, ints, 10, 6, MASK16_6, tmp, 0, MASK16_4); + for (int iter = 0, tmpIdx = 0, intsIdx = 48; iter < 8; ++iter, tmpIdx += 3, intsIdx += 2) { + int l0 = tmp[tmpIdx + 0] << 2; + l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_2; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK16_2) << 4; + l1 |= tmp[tmpIdx + 2] << 0; + ints[intsIdx + 1] = l1; + } + } + + private static void decode7To16(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(28, ints, 9, 7, MASK16_7, tmp, 0, MASK16_2); + for (int iter = 0, tmpIdx = 0, intsIdx = 56; iter < 4; ++iter, tmpIdx += 7, intsIdx += 2) { + int l0 = tmp[tmpIdx + 0] << 5; + l0 |= tmp[tmpIdx + 1] << 3; + l0 |= tmp[tmpIdx + 2] << 1; + l0 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 3] & MASK16_1) 
<< 6; + l1 |= tmp[tmpIdx + 4] << 4; + l1 |= tmp[tmpIdx + 5] << 2; + l1 |= tmp[tmpIdx + 6] << 0; + ints[intsIdx + 1] = l1; + } + } + + private static void decode8To16(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.splitInts(32, ints, 8, 8, MASK16_8, ints, 32, MASK16_8); + } + + private static void decode11To32(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(44, ints, 21, 11, MASK32_11, tmp, 0, MASK32_10); + for (int iter = 0, tmpIdx = 0, intsIdx = 88; iter < 4; ++iter, tmpIdx += 11, intsIdx += 10) { + int l0 = tmp[tmpIdx + 0] << 1; + l0 |= (tmp[tmpIdx + 1] >>> 9) & MASK32_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK32_9) << 2; + l1 |= (tmp[tmpIdx + 2] >>> 8) & MASK32_2; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 2] & MASK32_8) << 3; + l2 |= (tmp[tmpIdx + 3] >>> 7) & MASK32_3; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 3] & MASK32_7) << 4; + l3 |= (tmp[tmpIdx + 4] >>> 6) & MASK32_4; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 4] & MASK32_6) << 5; + l4 |= (tmp[tmpIdx + 5] >>> 5) & MASK32_5; + ints[intsIdx + 4] = l4; + int l5 = (tmp[tmpIdx + 5] & MASK32_5) << 6; + l5 |= (tmp[tmpIdx + 6] >>> 4) & MASK32_6; + ints[intsIdx + 5] = l5; + int l6 = (tmp[tmpIdx + 6] & MASK32_4) << 7; + l6 |= (tmp[tmpIdx + 7] >>> 3) & MASK32_7; + ints[intsIdx + 6] = l6; + int l7 = (tmp[tmpIdx + 7] & MASK32_3) << 8; + l7 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_8; + ints[intsIdx + 7] = l7; + int l8 = (tmp[tmpIdx + 8] & MASK32_2) << 9; + l8 |= (tmp[tmpIdx + 9] >>> 1) & MASK32_9; + ints[intsIdx + 8] = l8; + int l9 = (tmp[tmpIdx + 9] & MASK32_1) << 10; + l9 |= tmp[tmpIdx + 10] << 0; + ints[intsIdx + 9] = l9; + } + } + + private static void decode12To32(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(48, ints, 20, 12, MASK32_12, tmp, 0, MASK32_8); + for (int iter = 0, tmpIdx = 0, intsIdx = 96; iter < 16; ++iter, tmpIdx += 3, intsIdx += 2) { + int l0 = tmp[tmpIdx + 0] << 4; + l0 |= (tmp[tmpIdx + 1] >>> 4) & MASK32_4; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK32_4) << 8; + l1 |= tmp[tmpIdx + 2] << 0; + ints[intsIdx + 1] = l1; + } + } + + private static void decode13To32(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(52, ints, 19, 13, MASK32_13, tmp, 0, MASK32_6); + for (int iter = 0, tmpIdx = 0, intsIdx = 104; iter < 4; ++iter, tmpIdx += 13, intsIdx += 6) { + int l0 = tmp[tmpIdx + 0] << 7; + l0 |= tmp[tmpIdx + 1] << 1; + l0 |= (tmp[tmpIdx + 2] >>> 5) & MASK32_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 2] & MASK32_5) << 8; + l1 |= tmp[tmpIdx + 3] << 2; + l1 |= (tmp[tmpIdx + 4] >>> 4) & MASK32_2; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 4] & MASK32_4) << 9; + l2 |= tmp[tmpIdx + 5] << 3; + l2 |= (tmp[tmpIdx + 6] >>> 3) & MASK32_3; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 6] & MASK32_3) << 10; + l3 |= tmp[tmpIdx + 7] << 4; + l3 |= (tmp[tmpIdx + 8] >>> 2) & MASK32_4; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 8] & MASK32_2) << 11; + l4 |= tmp[tmpIdx + 9] << 5; + l4 |= (tmp[tmpIdx + 10] >>> 1) & MASK32_5; + ints[intsIdx + 4] = l4; + int l5 = (tmp[tmpIdx + 10] & MASK32_1) << 12; + l5 |= tmp[tmpIdx + 11] << 6; + l5 |= tmp[tmpIdx + 12] << 0; + ints[intsIdx + 5] = l5; + } + } + + private static void decode14To32(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(56, ints, 18, 14, MASK32_14, tmp, 0, MASK32_4); + for (int iter = 0, tmpIdx = 0, intsIdx = 112; iter < 8; ++iter, 
tmpIdx += 7, intsIdx += 2) { + int l0 = tmp[tmpIdx + 0] << 10; + l0 |= tmp[tmpIdx + 1] << 6; + l0 |= tmp[tmpIdx + 2] << 2; + l0 |= (tmp[tmpIdx + 3] >>> 2) & MASK32_2; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 3] & MASK32_2) << 12; + l1 |= tmp[tmpIdx + 4] << 8; + l1 |= tmp[tmpIdx + 5] << 4; + l1 |= tmp[tmpIdx + 6] << 0; + ints[intsIdx + 1] = l1; + } + } + + private static void decode15To32(PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + pdu.splitInts(60, ints, 17, 15, MASK32_15, tmp, 0, MASK32_2); + for (int iter = 0, tmpIdx = 0, intsIdx = 120; iter < 4; ++iter, tmpIdx += 15, intsIdx += 2) { + int l0 = tmp[tmpIdx + 0] << 13; + l0 |= tmp[tmpIdx + 1] << 11; + l0 |= tmp[tmpIdx + 2] << 9; + l0 |= tmp[tmpIdx + 3] << 7; + l0 |= tmp[tmpIdx + 4] << 5; + l0 |= tmp[tmpIdx + 5] << 3; + l0 |= tmp[tmpIdx + 6] << 1; + l0 |= (tmp[tmpIdx + 7] >>> 1) & MASK32_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 7] & MASK32_1) << 14; + l1 |= tmp[tmpIdx + 8] << 12; + l1 |= tmp[tmpIdx + 9] << 10; + l1 |= tmp[tmpIdx + 10] << 8; + l1 |= tmp[tmpIdx + 11] << 6; + l1 |= tmp[tmpIdx + 12] << 4; + l1 |= tmp[tmpIdx + 13] << 2; + l1 |= tmp[tmpIdx + 14] << 0; + ints[intsIdx + 1] = l1; + } + } + + private static void decode16To32(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.splitInts(64, ints, 16, 16, MASK32_16, ints, 64, MASK32_16); + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/ForUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/ForUtil.java new file mode 100644 index 000000000000..61a58bf64d29 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/ForUtil.java @@ -0,0 +1,532 @@ +// This file has been automatically generated, DO NOT EDIT + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; + +/** + * Inspired from https://fulmicoton.com/posts/bitpacking/ Encodes multiple integers in one to get + * SIMD-like speedups. If bitsPerValue <= 8 then we pack 4 ints per Java int else if bitsPerValue + * <= 16 we pack 2 ints per Java int else we do scalar operations. 
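A minimal sketch (not from the patch) of the byte-lane packing this comment describes: when every value fits in 8 bits, four values share one Java int, which is what collapse8/expand8 below do across a 128-value block. The class name and values here are made up.

    class ByteLaneDemo {
      public static void main(String[] args) {
        int a = 3, b = 7, c = 1, d = 200;                  // each fits in 8 bits
        int packed = (a << 24) | (b << 16) | (c << 8) | d; // same layout as collapse8
        // expand8 reverses it: one shift-and-mask per byte lane.
        System.out.println(((packed >>> 24) & 0xFF) == a); // true
        System.out.println(((packed >>> 16) & 0xFF) == b); // true
        System.out.println(((packed >>> 8) & 0xFF) == c);  // true
        System.out.println((packed & 0xFF) == d);          // true
      }
    }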
+ */ +public final class ForUtil { + + static final int BLOCK_SIZE = 128; + static final int BLOCK_SIZE_LOG2 = 7; + + static int expandMask16(int mask16) { + return mask16 | (mask16 << 16); + } + + static int expandMask8(int mask8) { + return expandMask16(mask8 | (mask8 << 8)); + } + + static int mask32(int bitsPerValue) { + return (1 << bitsPerValue) - 1; + } + + static int mask16(int bitsPerValue) { + return expandMask16((1 << bitsPerValue) - 1); + } + + static int mask8(int bitsPerValue) { + return expandMask8((1 << bitsPerValue) - 1); + } + + static void expand8(int[] arr) { + for (int i = 0; i < 32; ++i) { + int l = arr[i]; + arr[i] = (l >>> 24) & 0xFF; + arr[32 + i] = (l >>> 16) & 0xFF; + arr[64 + i] = (l >>> 8) & 0xFF; + arr[96 + i] = l & 0xFF; + } + } + + static void collapse8(int[] arr) { + for (int i = 0; i < 32; ++i) { + arr[i] = (arr[i] << 24) | (arr[32 + i] << 16) | (arr[64 + i] << 8) | arr[96 + i]; + } + } + + static void expand16(int[] arr) { + for (int i = 0; i < 64; ++i) { + int l = arr[i]; + arr[i] = (l >>> 16) & 0xFFFF; + arr[64 + i] = l & 0xFFFF; + } + } + + static void collapse16(int[] arr) { + for (int i = 0; i < 64; ++i) { + arr[i] = (arr[i] << 16) | arr[64 + i]; + } + } + + private final int[] tmp = new int[BLOCK_SIZE]; + + /** Encode 128 integers from {@code ints} into {@code out}. */ + void encode(int[] ints, int bitsPerValue, DataOutput out) throws IOException { + final int nextPrimitive; + if (bitsPerValue <= 8) { + nextPrimitive = 8; + collapse8(ints); + } else if (bitsPerValue <= 16) { + nextPrimitive = 16; + collapse16(ints); + } else { + nextPrimitive = 32; + } + encode(ints, bitsPerValue, nextPrimitive, out, tmp); + } + + static void encode(int[] ints, int bitsPerValue, int primitiveSize, DataOutput out, int[] tmp) + throws IOException { + final int numInts = BLOCK_SIZE * primitiveSize / Integer.SIZE; + + final int numIntsPerShift = bitsPerValue * 4; + int idx = 0; + int shift = primitiveSize - bitsPerValue; + for (int i = 0; i < numIntsPerShift; ++i) { + tmp[i] = ints[idx++] << shift; + } + for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) { + for (int i = 0; i < numIntsPerShift; ++i) { + tmp[i] |= ints[idx++] << shift; + } + } + + final int remainingBitsPerInt = shift + bitsPerValue; + final int maskRemainingBitsPerInt; + if (primitiveSize == 8) { + maskRemainingBitsPerInt = MASKS8[remainingBitsPerInt]; + } else if (primitiveSize == 16) { + maskRemainingBitsPerInt = MASKS16[remainingBitsPerInt]; + } else { + maskRemainingBitsPerInt = MASKS32[remainingBitsPerInt]; + } + + int tmpIdx = 0; + int remainingBitsPerValue = bitsPerValue; + while (idx < numInts) { + if (remainingBitsPerValue >= remainingBitsPerInt) { + remainingBitsPerValue -= remainingBitsPerInt; + tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & maskRemainingBitsPerInt; + if (remainingBitsPerValue == 0) { + idx++; + remainingBitsPerValue = bitsPerValue; + } + } else { + final int mask1, mask2; + if (primitiveSize == 8) { + mask1 = MASKS8[remainingBitsPerValue]; + mask2 = MASKS8[remainingBitsPerInt - remainingBitsPerValue]; + } else if (primitiveSize == 16) { + mask1 = MASKS16[remainingBitsPerValue]; + mask2 = MASKS16[remainingBitsPerInt - remainingBitsPerValue]; + } else { + mask1 = MASKS32[remainingBitsPerValue]; + mask2 = MASKS32[remainingBitsPerInt - remainingBitsPerValue]; + } + tmp[tmpIdx] |= (ints[idx++] & mask1) << (remainingBitsPerInt - remainingBitsPerValue); + remainingBitsPerValue = bitsPerValue - remainingBitsPerInt + remainingBitsPerValue; + tmp[tmpIdx++] 
|= (ints[idx] >>> remainingBitsPerValue) & mask2; + } + } + + for (int i = 0; i < numIntsPerShift; ++i) { + out.writeInt(tmp[i]); + } + } + + /** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. */ + static int numBytes(int bitsPerValue) { + return bitsPerValue << (BLOCK_SIZE_LOG2 - 3); + } + + static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + final int numInts = bitsPerValue << 2; + final int mask = MASKS32[bitsPerValue]; + pdu.splitInts(numInts, ints, 32 - bitsPerValue, 32, mask, tmp, 0, -1); + final int remainingBitsPerInt = 32 - bitsPerValue; + final int mask32RemainingBitsPerInt = MASKS32[remainingBitsPerInt]; + int tmpIdx = 0; + int remainingBits = remainingBitsPerInt; + for (int intsIdx = numInts; intsIdx < BLOCK_SIZE; ++intsIdx) { + int b = bitsPerValue - remainingBits; + int l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b; + while (b >= remainingBitsPerInt) { + b -= remainingBitsPerInt; + l |= (tmp[tmpIdx++] & mask32RemainingBitsPerInt) << b; + } + if (b > 0) { + l |= (tmp[tmpIdx] >>> (remainingBitsPerInt - b)) & MASKS32[b]; + remainingBits = remainingBitsPerInt - b; + } else { + remainingBits = remainingBitsPerInt; + } + ints[intsIdx] = l; + } + } + + static final int[] MASKS8 = new int[8]; + static final int[] MASKS16 = new int[16]; + static final int[] MASKS32 = new int[32]; + + static { + for (int i = 0; i < 8; ++i) { + MASKS8[i] = mask8(i); + } + for (int i = 0; i < 16; ++i) { + MASKS16[i] = mask16(i); + } + for (int i = 0; i < 32; ++i) { + MASKS32[i] = mask32(i); + } + } + + // mark values in array as final ints to avoid the cost of reading array, arrays should only be + // used when the idx is a variable + static final int MASK8_1 = MASKS8[1]; + static final int MASK8_2 = MASKS8[2]; + static final int MASK8_3 = MASKS8[3]; + static final int MASK8_4 = MASKS8[4]; + static final int MASK8_5 = MASKS8[5]; + static final int MASK8_6 = MASKS8[6]; + static final int MASK8_7 = MASKS8[7]; + static final int MASK16_1 = MASKS16[1]; + static final int MASK16_2 = MASKS16[2]; + static final int MASK16_3 = MASKS16[3]; + static final int MASK16_4 = MASKS16[4]; + static final int MASK16_5 = MASKS16[5]; + static final int MASK16_6 = MASKS16[6]; + static final int MASK16_7 = MASKS16[7]; + static final int MASK16_8 = MASKS16[8]; + static final int MASK16_9 = MASKS16[9]; + static final int MASK16_10 = MASKS16[10]; + static final int MASK16_11 = MASKS16[11]; + static final int MASK16_12 = MASKS16[12]; + static final int MASK16_13 = MASKS16[13]; + static final int MASK16_14 = MASKS16[14]; + static final int MASK16_15 = MASKS16[15]; + static final int MASK32_1 = MASKS32[1]; + static final int MASK32_2 = MASKS32[2]; + static final int MASK32_3 = MASKS32[3]; + static final int MASK32_4 = MASKS32[4]; + static final int MASK32_5 = MASKS32[5]; + static final int MASK32_6 = MASKS32[6]; + static final int MASK32_7 = MASKS32[7]; + static final int MASK32_8 = MASKS32[8]; + static final int MASK32_9 = MASKS32[9]; + static final int MASK32_10 = MASKS32[10]; + static final int MASK32_11 = MASKS32[11]; + static final int MASK32_12 = MASKS32[12]; + static final int MASK32_13 = MASKS32[13]; + static final int MASK32_14 = MASKS32[14]; + static final int MASK32_15 = MASKS32[15]; + static final int MASK32_16 = MASKS32[16]; + + /** Decode 128 integers into {@code ints}. 
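A quick sanity check of the numBytes arithmetic above (illustrative only, not part of the patch): a block holds 128 values, so bitsPerValue bits per value amounts to 128 * bitsPerValue / 8 = 16 * bitsPerValue bytes, i.e. bitsPerValue << (BLOCK_SIZE_LOG2 - 3).

    class NumBytesDemo {
      public static void main(String[] args) {
        for (int b = 1; b <= 32; ++b) {
          // numBytes(b) == b << (7 - 3) == 16 * b bytes for a 128-value block
          System.out.println((b << (7 - 3)) == 128 * b / 8); // always true
        }
      }
    }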
*/ + void decode(int bitsPerValue, PostingDecodingUtil pdu, int[] ints) throws IOException { + switch (bitsPerValue) { + case 1: + decode1(pdu, ints); + expand8(ints); + break; + case 2: + decode2(pdu, ints); + expand8(ints); + break; + case 3: + decode3(pdu, tmp, ints); + expand8(ints); + break; + case 4: + decode4(pdu, ints); + expand8(ints); + break; + case 5: + decode5(pdu, tmp, ints); + expand8(ints); + break; + case 6: + decode6(pdu, tmp, ints); + expand8(ints); + break; + case 7: + decode7(pdu, tmp, ints); + expand8(ints); + break; + case 8: + decode8(pdu, ints); + expand8(ints); + break; + case 9: + decode9(pdu, tmp, ints); + expand16(ints); + break; + case 10: + decode10(pdu, tmp, ints); + expand16(ints); + break; + case 11: + decode11(pdu, tmp, ints); + expand16(ints); + break; + case 12: + decode12(pdu, tmp, ints); + expand16(ints); + break; + case 13: + decode13(pdu, tmp, ints); + expand16(ints); + break; + case 14: + decode14(pdu, tmp, ints); + expand16(ints); + break; + case 15: + decode15(pdu, tmp, ints); + expand16(ints); + break; + case 16: + decode16(pdu, ints); + expand16(ints); + break; + default: + decodeSlow(bitsPerValue, pdu, tmp, ints); + break; + } + } + + static void decode1(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.splitInts(4, ints, 7, 1, MASK8_1, ints, 28, MASK8_1); + } + + static void decode2(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.splitInts(8, ints, 6, 2, MASK8_2, ints, 24, MASK8_2); + } + + static void decode3(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(12, ints, 5, 3, MASK8_3, tmp, 0, MASK8_2); + for (int iter = 0, tmpIdx = 0, intsIdx = 24; iter < 4; ++iter, tmpIdx += 3, intsIdx += 2) { + int l0 = tmp[tmpIdx + 0] << 1; + l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK8_1) << 2; + l1 |= tmp[tmpIdx + 2] << 0; + ints[intsIdx + 1] = l1; + } + } + + static void decode4(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.splitInts(16, ints, 4, 4, MASK8_4, ints, 16, MASK8_4); + } + + static void decode5(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(20, ints, 3, 5, MASK8_5, tmp, 0, MASK8_3); + for (int iter = 0, tmpIdx = 0, intsIdx = 20; iter < 4; ++iter, tmpIdx += 5, intsIdx += 3) { + int l0 = tmp[tmpIdx + 0] << 2; + l0 |= (tmp[tmpIdx + 1] >>> 1) & MASK8_2; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK8_1) << 4; + l1 |= tmp[tmpIdx + 2] << 1; + l1 |= (tmp[tmpIdx + 3] >>> 2) & MASK8_1; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 3] & MASK8_2) << 3; + l2 |= tmp[tmpIdx + 4] << 0; + ints[intsIdx + 2] = l2; + } + } + + static void decode6(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(24, ints, 2, 6, MASK8_6, tmp, 0, MASK8_2); + for (int iter = 0, tmpIdx = 0, intsIdx = 24; iter < 8; ++iter, tmpIdx += 3, intsIdx += 1) { + int l0 = tmp[tmpIdx + 0] << 4; + l0 |= tmp[tmpIdx + 1] << 2; + l0 |= tmp[tmpIdx + 2] << 0; + ints[intsIdx + 0] = l0; + } + } + + static void decode7(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(28, ints, 1, 7, MASK8_7, tmp, 0, MASK8_1); + for (int iter = 0, tmpIdx = 0, intsIdx = 28; iter < 4; ++iter, tmpIdx += 7, intsIdx += 1) { + int l0 = tmp[tmpIdx + 0] << 6; + l0 |= tmp[tmpIdx + 1] << 5; + l0 |= tmp[tmpIdx + 2] << 4; + l0 |= tmp[tmpIdx + 3] << 3; + l0 |= tmp[tmpIdx + 4] << 2; + l0 |= tmp[tmpIdx + 5] << 1; + l0 |= tmp[tmpIdx + 6] << 0; + ints[intsIdx + 0] = l0; + } 
+ } + + static void decode8(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.in.readInts(ints, 0, 32); + } + + static void decode9(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(36, ints, 7, 9, MASK16_9, tmp, 0, MASK16_7); + for (int iter = 0, tmpIdx = 0, intsIdx = 36; iter < 4; ++iter, tmpIdx += 9, intsIdx += 7) { + int l0 = tmp[tmpIdx + 0] << 2; + l0 |= (tmp[tmpIdx + 1] >>> 5) & MASK16_2; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK16_5) << 4; + l1 |= (tmp[tmpIdx + 2] >>> 3) & MASK16_4; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 2] & MASK16_3) << 6; + l2 |= (tmp[tmpIdx + 3] >>> 1) & MASK16_6; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 3] & MASK16_1) << 8; + l3 |= tmp[tmpIdx + 4] << 1; + l3 |= (tmp[tmpIdx + 5] >>> 6) & MASK16_1; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 5] & MASK16_6) << 3; + l4 |= (tmp[tmpIdx + 6] >>> 4) & MASK16_3; + ints[intsIdx + 4] = l4; + int l5 = (tmp[tmpIdx + 6] & MASK16_4) << 5; + l5 |= (tmp[tmpIdx + 7] >>> 2) & MASK16_5; + ints[intsIdx + 5] = l5; + int l6 = (tmp[tmpIdx + 7] & MASK16_2) << 7; + l6 |= tmp[tmpIdx + 8] << 0; + ints[intsIdx + 6] = l6; + } + } + + static void decode10(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(40, ints, 6, 10, MASK16_10, tmp, 0, MASK16_6); + for (int iter = 0, tmpIdx = 0, intsIdx = 40; iter < 8; ++iter, tmpIdx += 5, intsIdx += 3) { + int l0 = tmp[tmpIdx + 0] << 4; + l0 |= (tmp[tmpIdx + 1] >>> 2) & MASK16_4; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 1] & MASK16_2) << 8; + l1 |= tmp[tmpIdx + 2] << 2; + l1 |= (tmp[tmpIdx + 3] >>> 4) & MASK16_2; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 3] & MASK16_4) << 6; + l2 |= tmp[tmpIdx + 4] << 0; + ints[intsIdx + 2] = l2; + } + } + + static void decode11(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(44, ints, 5, 11, MASK16_11, tmp, 0, MASK16_5); + for (int iter = 0, tmpIdx = 0, intsIdx = 44; iter < 4; ++iter, tmpIdx += 11, intsIdx += 5) { + int l0 = tmp[tmpIdx + 0] << 6; + l0 |= tmp[tmpIdx + 1] << 1; + l0 |= (tmp[tmpIdx + 2] >>> 4) & MASK16_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 2] & MASK16_4) << 7; + l1 |= tmp[tmpIdx + 3] << 2; + l1 |= (tmp[tmpIdx + 4] >>> 3) & MASK16_2; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 4] & MASK16_3) << 8; + l2 |= tmp[tmpIdx + 5] << 3; + l2 |= (tmp[tmpIdx + 6] >>> 2) & MASK16_3; + ints[intsIdx + 2] = l2; + int l3 = (tmp[tmpIdx + 6] & MASK16_2) << 9; + l3 |= tmp[tmpIdx + 7] << 4; + l3 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_4; + ints[intsIdx + 3] = l3; + int l4 = (tmp[tmpIdx + 8] & MASK16_1) << 10; + l4 |= tmp[tmpIdx + 9] << 5; + l4 |= tmp[tmpIdx + 10] << 0; + ints[intsIdx + 4] = l4; + } + } + + static void decode12(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(48, ints, 4, 12, MASK16_12, tmp, 0, MASK16_4); + for (int iter = 0, tmpIdx = 0, intsIdx = 48; iter < 16; ++iter, tmpIdx += 3, intsIdx += 1) { + int l0 = tmp[tmpIdx + 0] << 8; + l0 |= tmp[tmpIdx + 1] << 4; + l0 |= tmp[tmpIdx + 2] << 0; + ints[intsIdx + 0] = l0; + } + } + + static void decode13(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(52, ints, 3, 13, MASK16_13, tmp, 0, MASK16_3); + for (int iter = 0, tmpIdx = 0, intsIdx = 52; iter < 4; ++iter, tmpIdx += 13, intsIdx += 3) { + int l0 = tmp[tmpIdx + 0] << 10; + l0 |= tmp[tmpIdx + 1] << 7; + l0 |= tmp[tmpIdx + 2] << 4; + l0 |= tmp[tmpIdx + 3] << 1; + l0 |= (tmp[tmpIdx + 4] >>> 2) 
& MASK16_1; + ints[intsIdx + 0] = l0; + int l1 = (tmp[tmpIdx + 4] & MASK16_2) << 11; + l1 |= tmp[tmpIdx + 5] << 8; + l1 |= tmp[tmpIdx + 6] << 5; + l1 |= tmp[tmpIdx + 7] << 2; + l1 |= (tmp[tmpIdx + 8] >>> 1) & MASK16_2; + ints[intsIdx + 1] = l1; + int l2 = (tmp[tmpIdx + 8] & MASK16_1) << 12; + l2 |= tmp[tmpIdx + 9] << 9; + l2 |= tmp[tmpIdx + 10] << 6; + l2 |= tmp[tmpIdx + 11] << 3; + l2 |= tmp[tmpIdx + 12] << 0; + ints[intsIdx + 2] = l2; + } + } + + static void decode14(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(56, ints, 2, 14, MASK16_14, tmp, 0, MASK16_2); + for (int iter = 0, tmpIdx = 0, intsIdx = 56; iter < 8; ++iter, tmpIdx += 7, intsIdx += 1) { + int l0 = tmp[tmpIdx + 0] << 12; + l0 |= tmp[tmpIdx + 1] << 10; + l0 |= tmp[tmpIdx + 2] << 8; + l0 |= tmp[tmpIdx + 3] << 6; + l0 |= tmp[tmpIdx + 4] << 4; + l0 |= tmp[tmpIdx + 5] << 2; + l0 |= tmp[tmpIdx + 6] << 0; + ints[intsIdx + 0] = l0; + } + } + + static void decode15(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException { + pdu.splitInts(60, ints, 1, 15, MASK16_15, tmp, 0, MASK16_1); + for (int iter = 0, tmpIdx = 0, intsIdx = 60; iter < 4; ++iter, tmpIdx += 15, intsIdx += 1) { + int l0 = tmp[tmpIdx + 0] << 14; + l0 |= tmp[tmpIdx + 1] << 13; + l0 |= tmp[tmpIdx + 2] << 12; + l0 |= tmp[tmpIdx + 3] << 11; + l0 |= tmp[tmpIdx + 4] << 10; + l0 |= tmp[tmpIdx + 5] << 9; + l0 |= tmp[tmpIdx + 6] << 8; + l0 |= tmp[tmpIdx + 7] << 7; + l0 |= tmp[tmpIdx + 8] << 6; + l0 |= tmp[tmpIdx + 9] << 5; + l0 |= tmp[tmpIdx + 10] << 4; + l0 |= tmp[tmpIdx + 11] << 3; + l0 |= tmp[tmpIdx + 12] << 2; + l0 |= tmp[tmpIdx + 13] << 1; + l0 |= tmp[tmpIdx + 14] << 0; + ints[intsIdx + 0] = l0; + } + } + + static void decode16(PostingDecodingUtil pdu, int[] ints) throws IOException { + pdu.in.readInts(ints, 0, 64); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101Codec.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/Lucene101Codec.java similarity index 98% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101Codec.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/Lucene101Codec.java index 2b764b876856..2fde12e09ae8 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101Codec.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/Lucene101Codec.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.backward_codecs.lucene101; import java.util.Objects; import org.apache.lucene.codecs.Codec; @@ -49,7 +49,7 @@ * *

If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}. * - * @see org.apache.lucene.codecs.lucene101 package documentation for file format details. + * @see org.apache.lucene.backward_codecs.lucene101 package documentation for file format details. * @lucene.experimental */ public class Lucene101Codec extends Codec { diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/Lucene101PostingsFormat.java similarity index 91% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/Lucene101PostingsFormat.java index ae9964c0edc7..5528532229f9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/Lucene101PostingsFormat.java @@ -14,18 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.backward_codecs.lucene101; import java.io.IOException; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentReadState; @@ -94,8 +92,8 @@ *

Term Dictionary *

The .tim file contains the list of terms in each field along with per-term statistics * (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the - * .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsWriter} for more details on - * the format. + * .doc, .pos, and .pay files. See Lucene90BlockTreeTermsWriter for more details on the + * format. *

NOTE: The term dictionary can plug into different postings implementations: the postings * writer/reader are actually responsible for encoding and decoding the PostingsHeader and * TermMetadata sections described here: @@ -146,7 +144,7 @@ *

*
Term Index *

The .tip file contains an index into the term dictionary, so that it can be accessed - * randomly. See {@link Lucene90BlockTreeTermsWriter} for more details on the format. + * randomly. See Lucene90BlockTreeTermsWriter for more details on the format. *

* * @@ -319,7 +317,7 @@ * * @lucene.experimental */ -public final class Lucene101PostingsFormat extends PostingsFormat { +public class Lucene101PostingsFormat extends PostingsFormat { /** Filename extension for some small metadata about how postings are encoded. */ public static final String META_EXTENSION = "psm"; @@ -342,15 +340,13 @@ public final class Lucene101PostingsFormat extends PostingsFormat { /** Size of blocks. */ public static final int BLOCK_SIZE = ForUtil.BLOCK_SIZE; - public static final int BLOCK_MASK = BLOCK_SIZE - 1; - /** We insert skip data on every block and every SKIP_FACTOR=32 blocks. */ public static final int LEVEL1_FACTOR = 32; /** Total number of docs covered by level 1 skip data: 32 * 128 = 4,096 */ public static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE; - public static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1; + static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1; /** * Return the class that implements {@link ImpactsEnum} in this {@link PostingsFormat}. This is @@ -379,55 +375,15 @@ public static Class getImpactsEnumImpl() { static final int VERSION_CURRENT = VERSION_DENSE_BLOCKS_AS_BITSETS; - private final int version; - private final int minTermBlockSize; - private final int maxTermBlockSize; - /** Creates {@code Lucene101PostingsFormat} with default settings. */ public Lucene101PostingsFormat() { - this( - Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, - Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); - } - - /** - * Creates {@code Lucene101PostingsFormat} with custom values for {@code minBlockSize} and {@code - * maxBlockSize} passed to block terms dictionary. - * - * @see - * Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) - */ - public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize) { - this(minTermBlockSize, maxTermBlockSize, VERSION_CURRENT); - } - - /** Expert constructor that allows setting the version. 
*/ - public Lucene101PostingsFormat(int minTermBlockSize, int maxTermBlockSize, int version) { super("Lucene101"); - if (version < VERSION_START || version > VERSION_CURRENT) { - throw new IllegalArgumentException("Version out of range: " + version); - } - this.version = version; - Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize); - this.minTermBlockSize = minTermBlockSize; - this.maxTermBlockSize = maxTermBlockSize; } @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state, version); - boolean success = false; - try { - FieldsConsumer ret = - new Lucene90BlockTreeTermsWriter( - state, postingsWriter, minTermBlockSize, maxTermBlockSize); - success = true; - return ret; - } finally { - if (!success) { - IOUtils.closeWhileHandlingException(postingsWriter); - } - } + throw new UnsupportedOperationException( + "This postings format may not be used for writing, use the current postings format"); } @Override diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/Lucene101PostingsReader.java similarity index 96% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/Lucene101PostingsReader.java index 052e281ca567..f9a599821a74 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/Lucene101PostingsReader.java @@ -14,17 +14,17 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene101; - -import static org.apache.lucene.codecs.lucene101.ForUtil.BLOCK_SIZE; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.DOC_CODEC; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.LEVEL1_NUM_DOCS; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.META_CODEC; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.PAY_CODEC; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.POS_CODEC; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.TERMS_CODEC; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.VERSION_CURRENT; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.VERSION_START; +package org.apache.lucene.backward_codecs.lucene101; + +import static org.apache.lucene.backward_codecs.lucene101.ForUtil.BLOCK_SIZE; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.DOC_CODEC; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.LEVEL1_NUM_DOCS; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.META_CODEC; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.PAY_CODEC; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.POS_CODEC; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.TERMS_CODEC; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.VERSION_CURRENT; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.VERSION_START; import java.io.IOException; import java.util.AbstractList; @@ -32,10 +32,10 @@ import java.util.Collections; import java.util.List; import java.util.RandomAccess; +import org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.Impact; import org.apache.lucene.index.Impacts; @@ -44,8 +44,6 @@ import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.internal.vectorization.PostingDecodingUtil; -import org.apache.lucene.internal.vectorization.VectorizationProvider; import org.apache.lucene.store.ByteArrayDataInput; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.DataInput; @@ -65,7 +63,6 @@ */ public final class Lucene101PostingsReader extends PostingsReaderBase { - static final VectorizationProvider VECTORIZATION_PROVIDER = VectorizationProvider.getInstance(); // Dummy impacts, composed of the maximum possible term frequency and the lowest possible // (unsigned) norm value. 
This is typically used on tail blocks, which don't actually record // impacts as the storage overhead would not be worth any query evaluation speedup, since there's @@ -462,7 +459,7 @@ public BlockPostingsEnum(FieldInfo fieldInfo, int flags, boolean needsImpacts) if (needsPos) { this.posIn = Lucene101PostingsReader.this.posIn.clone(); - posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn); + posInUtil = new PostingDecodingUtil(posIn); posDeltaBuffer = new int[BLOCK_SIZE]; } else { this.posIn = null; @@ -472,7 +469,7 @@ public BlockPostingsEnum(FieldInfo fieldInfo, int flags, boolean needsImpacts) if (needsOffsets || needsPayloads) { this.payIn = Lucene101PostingsReader.this.payIn.clone(); - payInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(payIn); + payInUtil = new PostingDecodingUtil(payIn); } else { this.payIn = null; payInUtil = null; @@ -515,7 +512,7 @@ public BlockPostingsEnum reset(IntBlockTermState termState, int flags) throws IO if (docIn == null) { // lazy init docIn = Lucene101PostingsReader.this.docIn.clone(); - docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn); + docInUtil = new PostingDecodingUtil(docIn); } prefetchPostings(docIn, termState); } @@ -1354,9 +1351,7 @@ public Impacts getImpacts() { } } - /** - * @see Lucene101PostingsWriter#writeVInt15(org.apache.lucene.store.DataOutput, int) - */ + /** see Lucene101PostingsWriter#writeVInt15(org.apache.lucene.store.DataOutput, int) */ static int readVInt15(DataInput in) throws IOException { short s = in.readShort(); if (s >= 0) { @@ -1366,9 +1361,7 @@ static int readVInt15(DataInput in) throws IOException { } } - /** - * @see Lucene101PostingsWriter#writeVLong15(org.apache.lucene.store.DataOutput, long) - */ + /** see Lucene101PostingsWriter#writeVLong15(org.apache.lucene.store.DataOutput, long) */ static long readVLong15(DataInput in) throws IOException { short s = in.readShort(); if (s >= 0) { diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/PForUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/PForUtil.java new file mode 100644 index 000000000000..b3d2685f98be --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/PForUtil.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene101; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.LongHeap; +import org.apache.lucene.util.packed.PackedInts; + +/** Utility class to encode sequences of 128 small positive integers. 
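For orientation (an aside, not part of the patch): the encode/decode methods of PForUtil below use one token byte whose low 5 bits hold bitsPerValue and whose top 3 bits hold the exception count, and each exception re-attaches a high byte above the packed bits. A small sketch of that arithmetic with made-up numbers:

    class PForTokenDemo {
      public static void main(String[] args) {
        int token = (2 << 5) | 13;          // 2 exceptions, 13 bits per packed value
        System.out.println(token & 0x1f);   // 13, read back as bitsPerValue
        System.out.println(token >>> 5);    // 2, read back as numExceptions

        int value = 70_000;                    // needs 17 bits, too big for 13
        int low = value & ((1 << 13) - 1);     // the part that gets bit-packed
        int high = value >>> 13;               // fits in one exception byte
        System.out.println((low | (high << 13)) == value); // true: the patch restores it
      }
    }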
*/ +final class PForUtil { + + private static final int MAX_EXCEPTIONS = 7; + + static boolean allEqual(int[] l) { + for (int i = 1; i < ForUtil.BLOCK_SIZE; ++i) { + if (l[i] != l[0]) { + return false; + } + } + return true; + } + + private final ForUtil forUtil = new ForUtil(); + + static { + assert ForUtil.BLOCK_SIZE <= 256 : "blocksize must fit in one byte. got " + ForUtil.BLOCK_SIZE; + } + + /** Encode 128 integers from {@code ints} into {@code out}. */ + void encode(int[] ints, DataOutput out) throws IOException { + // Determine the top MAX_EXCEPTIONS + 1 values + final LongHeap top = new LongHeap(MAX_EXCEPTIONS + 1); + for (int i = 0; i <= MAX_EXCEPTIONS; ++i) { + top.push(ints[i]); + } + long topValue = top.top(); + for (int i = MAX_EXCEPTIONS + 1; i < ForUtil.BLOCK_SIZE; ++i) { + if (ints[i] > topValue) { + topValue = top.updateTop(ints[i]); + } + } + + long max = 0L; + for (int i = 1; i <= top.size(); ++i) { + max = Math.max(max, top.get(i)); + } + + final int maxBitsRequired = PackedInts.bitsRequired(max); + // We store the patch on a byte, so we can't decrease the number of bits required by more than 8 + final int patchedBitsRequired = + Math.max(PackedInts.bitsRequired(topValue), maxBitsRequired - 8); + int numExceptions = 0; + final long maxUnpatchedValue = (1L << patchedBitsRequired) - 1; + for (int i = 2; i <= top.size(); ++i) { + if (top.get(i) > maxUnpatchedValue) { + numExceptions++; + } + } + final byte[] exceptions = new byte[numExceptions * 2]; + if (numExceptions > 0) { + int exceptionCount = 0; + for (int i = 0; i < ForUtil.BLOCK_SIZE; ++i) { + if (ints[i] > maxUnpatchedValue) { + exceptions[exceptionCount * 2] = (byte) i; + exceptions[exceptionCount * 2 + 1] = (byte) (ints[i] >>> patchedBitsRequired); + ints[i] &= maxUnpatchedValue; + exceptionCount++; + } + } + assert exceptionCount == numExceptions : exceptionCount + " " + numExceptions; + } + + if (allEqual(ints) && maxBitsRequired <= 8) { + for (int i = 0; i < numExceptions; ++i) { + exceptions[2 * i + 1] = + (byte) (Byte.toUnsignedLong(exceptions[2 * i + 1]) << patchedBitsRequired); + } + out.writeByte((byte) (numExceptions << 5)); + out.writeVInt(ints[0]); + } else { + final int token = (numExceptions << 5) | patchedBitsRequired; + out.writeByte((byte) token); + forUtil.encode(ints, patchedBitsRequired, out); + } + out.writeBytes(exceptions, exceptions.length); + } + + /** Decode 128 integers into {@code ints}. */ + void decode(PostingDecodingUtil pdu, int[] ints) throws IOException { + var in = pdu.in; + final int token = Byte.toUnsignedInt(in.readByte()); + final int bitsPerValue = token & 0x1f; + if (bitsPerValue == 0) { + Arrays.fill(ints, 0, ForUtil.BLOCK_SIZE, in.readVInt()); + } else { + forUtil.decode(bitsPerValue, pdu, ints); + } + final int numExceptions = token >>> 5; + for (int i = 0; i < numExceptions; ++i) { + ints[Byte.toUnsignedInt(in.readByte())] |= Byte.toUnsignedLong(in.readByte()) << bitsPerValue; + } + } + + /** Skip 128 integers. 
*/ + static void skip(DataInput in) throws IOException { + final int token = Byte.toUnsignedInt(in.readByte()); + final int bitsPerValue = token & 0x1f; + final int numExceptions = token >>> 5; + if (bitsPerValue == 0) { + in.readVLong(); + in.skipBytes((numExceptions << 1)); + } else { + in.skipBytes(ForUtil.numBytes(bitsPerValue) + (numExceptions << 1)); + } + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/PostingDecodingUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/PostingDecodingUtil.java new file mode 100644 index 000000000000..dcb980c066e2 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/PostingDecodingUtil.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.store.IndexInput; + +/** Utility class to decode postings. */ +class PostingDecodingUtil { + + /** The wrapper {@link IndexInput}. */ + final IndexInput in; + + /** Sole constructor, called by sub-classes. */ + PostingDecodingUtil(IndexInput in) { + this.in = in; + } + + /** + * Core methods for decoding blocks of docs / freqs / positions / offsets. + * + * + */ + void splitInts(int count, int[] b, int bShift, int dec, int bMask, int[] c, int cIndex, int cMask) + throws IOException { + // Default implementation, which takes advantage of the C2 compiler's loop unrolling and + // auto-vectorization. + in.readInts(c, cIndex, count); + int maxIter = (bShift - 1) / dec; + for (int i = 0; i < count; ++i) { + for (int j = 0; j <= maxIter; ++j) { + b[count * j + i] = (c[cIndex + i] >>> (bShift - j * dec)) & bMask; + } + c[cIndex + i] &= cMask; + } + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/PostingsUtil.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/PostingsUtil.java new file mode 100644 index 000000000000..1de3bc68d858 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/PostingsUtil.java @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
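To make the splitInts contract above concrete, here is a standalone sketch (not part of the patch) of the same shift/mask arithmetic with tiny made-up parameters (count = 2, bShift = 4, dec = 4, bMask = cMask = 0xF): each input int is split into a high part stored in b and a low part kept in c.

    class SplitIntsDemo {
      public static void main(String[] args) {
        int[] c = {0xAB, 0xCD};        // as if just read from the IndexInput
        int[] b = new int[2];
        int maxIter = (4 - 1) / 4;     // 0: a single extraction per input int here
        for (int i = 0; i < 2; ++i) {
          for (int j = 0; j <= maxIter; ++j) {
            b[2 * j + i] = (c[i] >>> (4 - j * 4)) & 0xF; // high nibble goes to b
          }
          c[i] &= 0xF;                                   // low nibble stays in c
        }
        // b == {0xA, 0xC}, c == {0xB, 0xD}
        System.out.println(Integer.toHexString(b[0]) + " " + Integer.toHexString(c[0]));
      }
    }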
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.GroupVIntUtil; + +/** Utility class to encode/decode postings block. */ +final class PostingsUtil { + + /** + * Read values that have been written using variable-length encoding and group-varint encoding + * instead of bit-packing. + */ + static void readVIntBlock( + IndexInput docIn, + int[] docBuffer, + int[] freqBuffer, + int num, + boolean indexHasFreq, + boolean decodeFreq) + throws IOException { + GroupVIntUtil.readGroupVInts(docIn, docBuffer, num); + if (indexHasFreq && decodeFreq) { + for (int i = 0; i < num; ++i) { + freqBuffer[i] = docBuffer[i] & 0x01; + docBuffer[i] >>>= 1; + if (freqBuffer[i] == 0) { + freqBuffer[i] = docIn.readVInt(); + } + } + } else if (indexHasFreq) { + for (int i = 0; i < num; ++i) { + docBuffer[i] >>>= 1; + } + } + } + + /** Write freq buffer with variable-length encoding and doc buffer with group-varint encoding. */ + static void writeVIntBlock( + DataOutput docOut, int[] docBuffer, int[] freqBuffer, int num, boolean writeFreqs) + throws IOException { + if (writeFreqs) { + for (int i = 0; i < num; i++) { + docBuffer[i] = (docBuffer[i] << 1) | (freqBuffer[i] == 1 ? 1 : 0); + } + } + docOut.writeGroupVInts(docBuffer, num); + if (writeFreqs) { + for (int i = 0; i < num; i++) { + final int freq = freqBuffer[i]; + if (freq != 1) { + docOut.writeVInt(freq); + } + } + } + } +} diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/gen_ForDeltaUtil.py b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/gen_ForDeltaUtil.py new file mode 100644 index 000000000000..1691a84e87c5 --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/gen_ForDeltaUtil.py @@ -0,0 +1,351 @@ +#! /usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from math import gcd + +"""Code generation for ForDeltaUtil.java""" + +MAX_SPECIALIZED_BITS_PER_VALUE = 16 +OUTPUT_FILE = "ForDeltaUtil.java" +PRIMITIVE_SIZE = [8, 16, 32] +HEADER = """// This file has been automatically generated, DO NOT EDIT + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.packed.PackedInts; + +import static org.apache.lucene.backward_codecs.lucene101.ForUtil.*; + +/** + * Inspired from https://fulmicoton.com/posts/bitpacking/ + * Encodes multiple integers in a Java int to get SIMD-like speedups. + * If bitsPerValue <= 4 then we pack 4 ints per Java int + * else if bitsPerValue <= 11 we pack 2 ints per Java int + * else we use scalar operations. + */ +public final class ForDeltaUtil { + + private static final int HALF_BLOCK_SIZE = BLOCK_SIZE / 2; + private static final int ONE_BLOCK_SIZE_FOURTH = BLOCK_SIZE / 4; + private static final int TWO_BLOCK_SIZE_FOURTHS = BLOCK_SIZE / 2; + private static final int THREE_BLOCK_SIZE_FOURTHS = 3 * BLOCK_SIZE / 4; + + private static void prefixSum8(int[] arr, int base) { + // When the number of bits per value is 4 or less, we can sum up all values in a block without + // risking overflowing an 8-bits integer. This allows computing the prefix sum by summing up 4 + // values at once. + innerPrefixSum8(arr); + expand8(arr); + final int l0 = base; + final int l1 = l0 + arr[ONE_BLOCK_SIZE_FOURTH - 1]; + final int l2 = l1 + arr[TWO_BLOCK_SIZE_FOURTHS - 1]; + final int l3 = l2 + arr[THREE_BLOCK_SIZE_FOURTHS - 1]; + + for (int i = 0; i < ONE_BLOCK_SIZE_FOURTH; ++i) { + arr[i] += l0; + arr[ONE_BLOCK_SIZE_FOURTH + i] += l1; + arr[TWO_BLOCK_SIZE_FOURTHS + i] += l2; + arr[THREE_BLOCK_SIZE_FOURTHS + i] += l3; + } + } + + private static void prefixSum16(int[] arr, int base) { + // When the number of bits per value is 11 or less, we can sum up all values in a block without + // risking overflowing an 16-bits integer. This allows computing the prefix sum by summing up 2 + // values at once. 
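[Editor's note — a quick arithmetic check of the overflow argument above; not part of the patch.] With the int-packed layout, collapse8 leaves four 8-bit lanes per int across 32 ints, so each lane accumulates at most 32 deltas, and collapse16 leaves two 16-bit lanes per int across 64 ints, so each lane accumulates at most 64 deltas. For the thresholds used by encodeDeltas below (at most 3 and 10 bits per value respectively), the worst-case lane sums are 32 × 7 = 224 ≤ 255 and 64 × 1023 = 65472 ≤ 65535, so the packed prefix sums cannot carry into a neighbouring lane. A tiny self-contained check, with a hypothetical class name:

    // Hypothetical sanity check of the worst-case lane sums for the prefix-sum fast paths.
    public class PrefixSumBounds {
      public static void main(String[] args) {
        int byteLaneDeltas = 128 / 4;   // four 8-bit lanes per int -> 32 deltas per lane
        int shortLaneDeltas = 128 / 2;  // two 16-bit lanes per int -> 64 deltas per lane
        long maxByteLaneSum = (long) byteLaneDeltas * ((1 << 3) - 1);    // 32 * 7    = 224
        long maxShortLaneSum = (long) shortLaneDeltas * ((1 << 10) - 1); // 64 * 1023 = 65472
        System.out.println(maxByteLaneSum + " fits in 8 bits: " + (maxByteLaneSum <= 0xFF));
        System.out.println(maxShortLaneSum + " fits in 16 bits: " + (maxShortLaneSum <= 0xFFFF));
      }
    }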
+ innerPrefixSum16(arr); + expand16(arr); + final int l0 = base; + final int l1 = base + arr[HALF_BLOCK_SIZE - 1]; + for (int i = 0; i < HALF_BLOCK_SIZE; ++i) { + arr[i] += l0; + arr[HALF_BLOCK_SIZE + i] += l1; + } + } + + private static void prefixSum32(int[] arr, int base) { + arr[0] += base; + for (int i = 1; i < BLOCK_SIZE; ++i) { + arr[i] += arr[i-1]; + } + } + + // For some reason unrolling seems to help + private static void innerPrefixSum8(int[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + arr[16] += arr[15]; + arr[17] += arr[16]; + arr[18] += arr[17]; + arr[19] += arr[18]; + arr[20] += arr[19]; + arr[21] += arr[20]; + arr[22] += arr[21]; + arr[23] += arr[22]; + arr[24] += arr[23]; + arr[25] += arr[24]; + arr[26] += arr[25]; + arr[27] += arr[26]; + arr[28] += arr[27]; + arr[29] += arr[28]; + arr[30] += arr[29]; + arr[31] += arr[30]; + } + + // For some reason unrolling seems to help + private static void innerPrefixSum16(int[] arr) { + arr[1] += arr[0]; + arr[2] += arr[1]; + arr[3] += arr[2]; + arr[4] += arr[3]; + arr[5] += arr[4]; + arr[6] += arr[5]; + arr[7] += arr[6]; + arr[8] += arr[7]; + arr[9] += arr[8]; + arr[10] += arr[9]; + arr[11] += arr[10]; + arr[12] += arr[11]; + arr[13] += arr[12]; + arr[14] += arr[13]; + arr[15] += arr[14]; + arr[16] += arr[15]; + arr[17] += arr[16]; + arr[18] += arr[17]; + arr[19] += arr[18]; + arr[20] += arr[19]; + arr[21] += arr[20]; + arr[22] += arr[21]; + arr[23] += arr[22]; + arr[24] += arr[23]; + arr[25] += arr[24]; + arr[26] += arr[25]; + arr[27] += arr[26]; + arr[28] += arr[27]; + arr[29] += arr[28]; + arr[30] += arr[29]; + arr[31] += arr[30]; + arr[32] += arr[31]; + arr[33] += arr[32]; + arr[34] += arr[33]; + arr[35] += arr[34]; + arr[36] += arr[35]; + arr[37] += arr[36]; + arr[38] += arr[37]; + arr[39] += arr[38]; + arr[40] += arr[39]; + arr[41] += arr[40]; + arr[42] += arr[41]; + arr[43] += arr[42]; + arr[44] += arr[43]; + arr[45] += arr[44]; + arr[46] += arr[45]; + arr[47] += arr[46]; + arr[48] += arr[47]; + arr[49] += arr[48]; + arr[50] += arr[49]; + arr[51] += arr[50]; + arr[52] += arr[51]; + arr[53] += arr[52]; + arr[54] += arr[53]; + arr[55] += arr[54]; + arr[56] += arr[55]; + arr[57] += arr[56]; + arr[58] += arr[57]; + arr[59] += arr[58]; + arr[60] += arr[59]; + arr[61] += arr[60]; + arr[62] += arr[61]; + arr[63] += arr[62]; + } + + private final int[] tmp = new int[BLOCK_SIZE]; + + /** Return the number of bits per value required to store the given array containing strictly positive numbers. */ + int bitsRequired(int[] ints) { + int or = 0; + for (int l : ints) { + or |= l; + } + // Deltas should be strictly positive since the delta between consecutive doc IDs is at least 1 + assert or != 0; + return PackedInts.bitsRequired(or); + } + + /** + * Encode deltas of a strictly monotonically increasing sequence of integers. The provided {@code + * ints} are expected to be deltas between consecutive values. 
+ */ + void encodeDeltas(int bitsPerValue, int[] ints, DataOutput out) throws IOException { + final int primitiveSize; + if (bitsPerValue <= 3) { + primitiveSize = 8; + collapse8(ints); + } else if (bitsPerValue <= 10) { + primitiveSize = 16; + collapse16(ints); + } else { + primitiveSize = 32; + } + encode(ints, bitsPerValue, primitiveSize, out, tmp); + } + +""" + +def primitive_size_for_bpv(bpv): + if bpv <= 3: + # If we have 4 bits per value or less then we can compute the prefix sum of 32 ints that store 4 8-bit values each without overflowing. + return 8 + elif bpv <= 10: + # If we have 10 bits per value or less then we can compute the prefix sum of 64 ints that store 2 16-bit values each without overflowing. + return 16 + else: + # No risk of overflow with 32 bits per value + return 32 + +def next_primitive(bpv): + if bpv <= 8: + return 8 + elif bpv <= 16: + return 16 + else: + return 32 + +def writeRemainder(bpv, next_primitive, remaining_bits_per_int, o, num_values, f): + iteration = 1 + num_ints = bpv * num_values / remaining_bits_per_int + while num_ints % 2 == 0 and num_values % 2 == 0: + num_ints /= 2 + num_values /= 2 + iteration *= 2 + f.write(' for (int iter = 0, tmpIdx = 0, intsIdx = %d; iter < %d; ++iter, tmpIdx += %d, intsIdx += %d) {\n' %(o, iteration, num_ints, num_values)) + i = 0 + remaining_bits = 0 + tmp_idx = 0 + for i in range(int(num_values)): + b = bpv + if remaining_bits == 0: + b -= remaining_bits_per_int + f.write(' int l%d = tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b)) + else: + b -= remaining_bits + f.write(' int l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b)) + tmp_idx += 1 + while b >= remaining_bits_per_int: + b -= remaining_bits_per_int + f.write(' l%d |= tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b)) + tmp_idx += 1 + if b > 0: + f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_int-b, next_primitive, b)) + remaining_bits = remaining_bits_per_int-b + f.write(' ints[intsIdx + %d] = l%d;\n' %(i, i)) + f.write(' }\n') + +def writeDecode(bpv, f): + next_primitive = primitive_size_for_bpv(bpv) + if next_primitive % bpv == 0: + f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %(bpv, next_primitive)) + else: + f.write(' private static void decode%dTo%d(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {\n' %(bpv, next_primitive)) + if bpv == next_primitive: + f.write(' pdu.in.readInts(ints, 0, %d);\n' %(bpv*4)) + else: + num_values_per_int = 32 / next_primitive + remaining_bits = next_primitive % bpv + num_iters = (next_primitive - 1) // bpv + o = 4 * bpv * num_iters + if remaining_bits == 0: + f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, ints, %d, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv)) + else: + f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv)) + writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_int - o, f) + f.write(' }\n') + +if __name__ == '__main__': + f = open(OUTPUT_FILE, 'w') + f.write(HEADER) + f.write(""" + /** + * Delta-decode 128 integers into {@code ints}. 
+ */ + void decodeAndPrefixSum(int bitsPerValue, PostingDecodingUtil pdu, int base, int[] ints) throws IOException { + switch (bitsPerValue) { +""") + for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): + primitive_size = primitive_size_for_bpv(bpv) + f.write(' case %d:\n' %bpv) + if next_primitive(bpv) == primitive_size: + if primitive_size % bpv == 0: + f.write(' decode%d(pdu, ints);\n' %bpv) + else: + f.write(' decode%d(pdu, tmp, ints);\n' %bpv) + else: + if primitive_size % bpv == 0: + f.write(' decode%dTo%d(pdu, ints);\n' %(bpv, primitive_size)) + else: + f.write(' decode%dTo%d(pdu, tmp, ints);\n' %(bpv, primitive_size)) + f.write(' prefixSum%d(ints, base);\n' %primitive_size) + f.write(' break;\n') + f.write(' default:\n') + f.write(' if (bitsPerValue < 1 || bitsPerValue > Integer.SIZE) {\n') + f.write(' throw new IllegalStateException("Illegal number of bits per value: " + bitsPerValue);\n') + f.write(' }\n') + f.write(' decodeSlow(bitsPerValue, pdu, tmp, ints);\n') + f.write(' prefixSum32(ints, base);\n') + f.write(' break;\n') + f.write(' }\n') + f.write(' }\n') + + f.write('\n') + for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): + if next_primitive(bpv) != primitive_size_for_bpv(bpv): + writeDecode(bpv, f) + if bpv < MAX_SPECIALIZED_BITS_PER_VALUE: + f.write('\n') + + f.write('}\n') diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/gen_ForUtil.py b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/gen_ForUtil.py new file mode 100644 index 000000000000..a326a19f5abf --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/gen_ForUtil.py @@ -0,0 +1,326 @@ +#! /usr/bin/env python + +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from math import gcd + +"""Code generation for ForUtil.java""" + +MAX_SPECIALIZED_BITS_PER_VALUE = 16 +OUTPUT_FILE = "ForUtil.java" +PRIMITIVE_SIZE = [8, 16, 32] +HEADER = """// This file has been automatically generated, DO NOT EDIT + +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.store.DataOutput; + +/** + * Inspired from https://fulmicoton.com/posts/bitpacking/ + * Encodes multiple integers in one to get SIMD-like speedups. + * If bitsPerValue <= 8 then we pack 4 ints per Java int + * else if bitsPerValue <= 16 we pack 2 ints per Java int + * else we do scalar operations. + */ +public final class ForUtil { + + static final int BLOCK_SIZE = 128; + static final int BLOCK_SIZE_LOG2 = 7; + + static int expandMask16(int mask16) { + return mask16 | (mask16 << 16); + } + + static int expandMask8(int mask8) { + return expandMask16(mask8 | (mask8 << 8)); + } + + static int mask32(int bitsPerValue) { + return (1 << bitsPerValue) - 1; + } + + static int mask16(int bitsPerValue) { + return expandMask16((1 << bitsPerValue) - 1); + } + + static int mask8(int bitsPerValue) { + return expandMask8((1 << bitsPerValue) - 1); + } + + static void expand8(int[] arr) { + for (int i = 0; i < 32; ++i) { + int l = arr[i]; + arr[i] = (l >>> 24) & 0xFF; + arr[32 + i] = (l >>> 16) & 0xFF; + arr[64 + i] = (l >>> 8) & 0xFF; + arr[96 + i] = l & 0xFF; + } + } + + static void collapse8(int[] arr) { + for (int i = 0; i < 32; ++i) { + arr[i] = + (arr[i] << 24) + | (arr[32 + i] << 16) + | (arr[64 + i] << 8) + | arr[96 + i]; + } + } + + static void expand16(int[] arr) { + for (int i = 0; i < 64; ++i) { + int l = arr[i]; + arr[i] = (l >>> 16) & 0xFFFF; + arr[64 + i] = l & 0xFFFF; + } + } + + static void collapse16(int[] arr) { + for (int i = 0; i < 64; ++i) { + arr[i] = (arr[i] << 16) | arr[64 + i]; + } + } + + private final int[] tmp = new int[BLOCK_SIZE]; + + /** Encode 128 integers from {@code ints} into {@code out}. 
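[Editor's note — illustrative aside, not part of the patch.] The packing strategy described in the ForUtil class comment above rests on collapse8/expand8 (and their 16-bit counterparts): 128 small values are folded into the first 32 ints, four 8-bit lanes per int, bit-packed, and later expanded back out. A minimal round-trip sketch that reuses the two helpers verbatim; the demo class name is hypothetical and all values are assumed to fit in 8 bits:

    // Hypothetical demo of the collapse8/expand8 round trip used by ForUtil.
    import java.util.Arrays;

    public class Collapse8RoundTrip {
      static void collapse8(int[] arr) {
        for (int i = 0; i < 32; ++i) {
          arr[i] = (arr[i] << 24) | (arr[32 + i] << 16) | (arr[64 + i] << 8) | arr[96 + i];
        }
      }

      static void expand8(int[] arr) {
        for (int i = 0; i < 32; ++i) {
          int l = arr[i];
          arr[i] = (l >>> 24) & 0xFF;
          arr[32 + i] = (l >>> 16) & 0xFF;
          arr[64 + i] = (l >>> 8) & 0xFF;
          arr[96 + i] = l & 0xFF;
        }
      }

      public static void main(String[] args) {
        int[] values = new int[128];
        for (int i = 0; i < 128; ++i) {
          values[i] = i & 0xFF; // keep every value within 8 bits
        }
        int[] copy = values.clone();
        collapse8(copy); // 128 values now live in the first 32 ints, 4 per int
        expand8(copy);   // restore the original layout
        System.out.println(Arrays.equals(values, copy)); // prints true
      }
    }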
*/ + void encode(int[] ints, int bitsPerValue, DataOutput out) throws IOException { + final int nextPrimitive; + if (bitsPerValue <= 8) { + nextPrimitive = 8; + collapse8(ints); + } else if (bitsPerValue <= 16) { + nextPrimitive = 16; + collapse16(ints); + } else { + nextPrimitive = 32; + } + encode(ints, bitsPerValue, nextPrimitive, out, tmp); + } + + static void encode(int[] ints, int bitsPerValue, int primitiveSize, DataOutput out, int[] tmp) throws IOException { + final int numInts = BLOCK_SIZE * primitiveSize / Integer.SIZE; + + final int numIntsPerShift = bitsPerValue * 4; + int idx = 0; + int shift = primitiveSize - bitsPerValue; + for (int i = 0; i < numIntsPerShift; ++i) { + tmp[i] = ints[idx++] << shift; + } + for (shift = shift - bitsPerValue; shift >= 0; shift -= bitsPerValue) { + for (int i = 0; i < numIntsPerShift; ++i) { + tmp[i] |= ints[idx++] << shift; + } + } + + final int remainingBitsPerInt = shift + bitsPerValue; + final int maskRemainingBitsPerInt; + if (primitiveSize == 8) { + maskRemainingBitsPerInt = MASKS8[remainingBitsPerInt]; + } else if (primitiveSize == 16) { + maskRemainingBitsPerInt = MASKS16[remainingBitsPerInt]; + } else { + maskRemainingBitsPerInt = MASKS32[remainingBitsPerInt]; + } + + int tmpIdx = 0; + int remainingBitsPerValue = bitsPerValue; + while (idx < numInts) { + if (remainingBitsPerValue >= remainingBitsPerInt) { + remainingBitsPerValue -= remainingBitsPerInt; + tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & maskRemainingBitsPerInt; + if (remainingBitsPerValue == 0) { + idx++; + remainingBitsPerValue = bitsPerValue; + } + } else { + final int mask1, mask2; + if (primitiveSize == 8) { + mask1 = MASKS8[remainingBitsPerValue]; + mask2 = MASKS8[remainingBitsPerInt - remainingBitsPerValue]; + } else if (primitiveSize == 16) { + mask1 = MASKS16[remainingBitsPerValue]; + mask2 = MASKS16[remainingBitsPerInt - remainingBitsPerValue]; + } else { + mask1 = MASKS32[remainingBitsPerValue]; + mask2 = MASKS32[remainingBitsPerInt - remainingBitsPerValue]; + } + tmp[tmpIdx] |= (ints[idx++] & mask1) << (remainingBitsPerInt - remainingBitsPerValue); + remainingBitsPerValue = bitsPerValue - remainingBitsPerInt + remainingBitsPerValue; + tmp[tmpIdx++] |= (ints[idx] >>> remainingBitsPerValue) & mask2; + } + } + + for (int i = 0; i < numIntsPerShift; ++i) { + out.writeInt(tmp[i]); + } + } + + /** Number of bytes required to encode 128 integers of {@code bitsPerValue} bits per value. 
*/ + static int numBytes(int bitsPerValue) { + return bitsPerValue << (BLOCK_SIZE_LOG2 - 3); + } + + static void decodeSlow(int bitsPerValue, PostingDecodingUtil pdu, int[] tmp, int[] ints) + throws IOException { + final int numInts = bitsPerValue << 2; + final int mask = MASKS32[bitsPerValue]; + pdu.splitInts(numInts, ints, 32 - bitsPerValue, 32, mask, tmp, 0, -1); + final int remainingBitsPerInt = 32 - bitsPerValue; + final int mask32RemainingBitsPerInt = MASKS32[remainingBitsPerInt]; + int tmpIdx = 0; + int remainingBits = remainingBitsPerInt; + for (int intsIdx = numInts; intsIdx < BLOCK_SIZE; ++intsIdx) { + int b = bitsPerValue - remainingBits; + int l = (tmp[tmpIdx++] & MASKS32[remainingBits]) << b; + while (b >= remainingBitsPerInt) { + b -= remainingBitsPerInt; + l |= (tmp[tmpIdx++] & mask32RemainingBitsPerInt) << b; + } + if (b > 0) { + l |= (tmp[tmpIdx] >>> (remainingBitsPerInt - b)) & MASKS32[b]; + remainingBits = remainingBitsPerInt - b; + } else { + remainingBits = remainingBitsPerInt; + } + ints[intsIdx] = l; + } + } + +""" + +def writeRemainder(bpv, next_primitive, remaining_bits_per_int, o, num_values, f): + iteration = 1 + num_ints = bpv * num_values / remaining_bits_per_int + while num_ints % 2 == 0 and num_values % 2 == 0: + num_ints /= 2 + num_values /= 2 + iteration *= 2 + f.write(' for (int iter = 0, tmpIdx = 0, intsIdx = %d; iter < %d; ++iter, tmpIdx += %d, intsIdx += %d) {\n' %(o, iteration, num_ints, num_values)) + i = 0 + remaining_bits = 0 + tmp_idx = 0 + for i in range(int(num_values)): + b = bpv + if remaining_bits == 0: + b -= remaining_bits_per_int + f.write(' int l%d = tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b)) + else: + b -= remaining_bits + f.write(' int l%d = (tmp[tmpIdx + %d] & MASK%d_%d) << %d;\n' %(i, tmp_idx, next_primitive, remaining_bits, b)) + tmp_idx += 1 + while b >= remaining_bits_per_int: + b -= remaining_bits_per_int + f.write(' l%d |= tmp[tmpIdx + %d] << %d;\n' %(i, tmp_idx, b)) + tmp_idx += 1 + if b > 0: + f.write(' l%d |= (tmp[tmpIdx + %d] >>> %d) & MASK%d_%d;\n' %(i, tmp_idx, remaining_bits_per_int-b, next_primitive, b)) + remaining_bits = remaining_bits_per_int-b + f.write(' ints[intsIdx + %d] = l%d;\n' %(i, i)) + f.write(' }\n') + + +def writeDecode(bpv, f): + next_primitive = 32 + if bpv <= 8: + next_primitive = 8 + elif bpv <= 16: + next_primitive = 16 + if bpv == next_primitive: + f.write(' static void decode%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %bpv) + f.write(' pdu.in.readInts(ints, 0, %d);\n' %(bpv*4)) + else: + num_values_per_int = 32 / next_primitive + remaining_bits = next_primitive % bpv + num_iters = (next_primitive - 1) // bpv + o = 4 * bpv * num_iters + if remaining_bits == 0: + f.write(' static void decode%d(PostingDecodingUtil pdu, int[] ints) throws IOException {\n' %bpv) + f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, ints, %d, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, o, next_primitive, next_primitive - num_iters * bpv)) + else: + f.write(' static void decode%d(PostingDecodingUtil pdu, int[] tmp, int[] ints) throws IOException {\n' %bpv) + f.write(' pdu.splitInts(%d, ints, %d, %d, MASK%d_%d, tmp, 0, MASK%d_%d);\n' %(bpv*4, next_primitive - bpv, bpv, next_primitive, bpv, next_primitive, next_primitive - num_iters * bpv)) + writeRemainder(bpv, next_primitive, remaining_bits, o, 128/num_values_per_int - o, f) + f.write(' }\n') + +if __name__ == '__main__': + f = open(OUTPUT_FILE, 'w') + f.write(HEADER) + for primitive_size in PRIMITIVE_SIZE: + f.write(' 
static final int[] MASKS%d = new int[%d];\n' %(primitive_size, primitive_size)) + f.write('\n') + f.write(' static {\n') + for primitive_size in PRIMITIVE_SIZE: + f.write(' for (int i = 0; i < %d; ++i) {\n' %primitive_size) + f.write(' MASKS%d[i] = mask%d(i);\n' %(primitive_size, primitive_size)) + f.write(' }\n') + f.write(' }') + f.write(""" + // mark values in array as final ints to avoid the cost of reading array, arrays should only be + // used when the idx is a variable +""") + for primitive_size in PRIMITIVE_SIZE: + for bpv in range(1, min(MAX_SPECIALIZED_BITS_PER_VALUE + 1, primitive_size)): + f.write(' static final int MASK%d_%d = MASKS%d[%d];\n' %(primitive_size, bpv, primitive_size, bpv)) + + f.write(""" + /** Decode 128 integers into {@code ints}. */ + void decode(int bitsPerValue, PostingDecodingUtil pdu, int[] ints) throws IOException { + switch (bitsPerValue) { +""") + for bpv in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): + next_primitive = 32 + if bpv <= 8: + next_primitive = 8 + elif bpv <= 16: + next_primitive = 16 + f.write(' case %d:\n' %bpv) + if next_primitive % bpv == 0: + f.write(' decode%d(pdu, ints);\n' %bpv) + else: + f.write(' decode%d(pdu, tmp, ints);\n' %bpv) + if next_primitive != 32: + f.write(' expand%d(ints);\n' %next_primitive) + f.write(' break;\n') + f.write(' default:\n') + f.write(' decodeSlow(bitsPerValue, pdu, tmp, ints);\n') + f.write(' break;\n') + f.write(' }\n') + f.write(' }\n') + + for i in range(1, MAX_SPECIALIZED_BITS_PER_VALUE+1): + writeDecode(i, f) + if i < MAX_SPECIALIZED_BITS_PER_VALUE: + f.write('\n') + + f.write('}\n') diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/package-info.java similarity index 94% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene101/package-info.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/package-info.java index 8aa1c3b43a0c..1ff928d08f1e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/package-info.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene101/package-info.java @@ -16,4 +16,4 @@ */ /** Lucene 10.1 file format. 
*/ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.backward_codecs.lucene101; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90PostingsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90PostingsFormat.java index 0fceeb47296a..ce6cfccc659e 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90PostingsFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90PostingsFormat.java @@ -17,6 +17,7 @@ package org.apache.lucene.backward_codecs.lucene90; import java.io.IOException; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; @@ -24,8 +25,6 @@ import org.apache.lucene.codecs.MultiLevelSkipListWriter; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -95,8 +94,8 @@ *
Term Dictionary *

The .tim file contains the list of terms in each field along with per-term statistics * (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the - * .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsWriter} for more details on - * the format. + * .doc, .pos, and .pay files. See Lucene90BlockTreeTermsWriter for more details on the + * format. *

NOTE: The term dictionary can plug into different postings implementations: the postings * writer/reader are actually responsible for encoding and decoding the PostingsHeader and * TermMetadata sections described here: @@ -150,7 +149,7 @@ *

*
Term Index *

The .tip file contains an index into the term dictionary, so that it can be accessed - * randomly. See {@link Lucene90BlockTreeTermsWriter} for more details on the format. + * randomly. See Lucene90BlockTreeTermsWriter for more details on the format. *

* * diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/CompressionAlgorithm.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/CompressionAlgorithm.java new file mode 100644 index 000000000000..aba587cf499c --- /dev/null +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/CompressionAlgorithm.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene90.blocktree; + +import java.io.IOException; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.util.compress.LowercaseAsciiCompression; + +/** Compression algorithm used for suffixes of a block of terms. */ +enum CompressionAlgorithm { + NO_COMPRESSION(0x00) { + + @Override + void read(DataInput in, byte[] out, int len) throws IOException { + in.readBytes(out, 0, len); + } + }, + + LOWERCASE_ASCII(0x01) { + + @Override + void read(DataInput in, byte[] out, int len) throws IOException { + LowercaseAsciiCompression.decompress(in, out, len); + } + }, + + LZ4(0x02) { + + @Override + void read(DataInput in, byte[] out, int len) throws IOException { + org.apache.lucene.util.compress.LZ4.decompress(in, len, out, 0); + } + }; + + private static final CompressionAlgorithm[] BY_CODE = new CompressionAlgorithm[3]; + + static { + for (CompressionAlgorithm alg : CompressionAlgorithm.values()) { + BY_CODE[alg.code] = alg; + } + } + + /** Look up a {@link CompressionAlgorithm} by its {@link CompressionAlgorithm#code}. */ + static final CompressionAlgorithm byCode(int code) { + if (code < 0 || code >= BY_CODE.length) { + throw new IllegalArgumentException("Illegal code for a compression algorithm: " + code); + } + return BY_CODE[code]; + } + + public final int code; + + private CompressionAlgorithm(int code) { + this.code = code; + } + + abstract void read(DataInput in, byte[] out, int len) throws IOException; +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/FieldReader.java similarity index 96% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/FieldReader.java index 259639762ec9..83286bb76954 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/FieldReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/FieldReader.java @@ -14,9 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene90.blocktree; +package org.apache.lucene.backward_codecs.lucene90.blocktree; -import static org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT; +import static org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader.VERSION_MSB_VLONG_OUTPUT; import java.io.IOException; import org.apache.lucene.index.FieldInfo; @@ -118,8 +118,8 @@ long readVLongOutput(DataInput in) throws IOException { } /** - * Decodes a variable length byte[] in MSB order back to long, as written by {@link - * Lucene90BlockTreeTermsWriter#writeMSBVLong}. + * Decodes a variable length byte[] in MSB order back to long, as written by + * Lucene90BlockTreeTermsWriter#writeMSBVLong. * *

Package private for testing. */ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/IntersectTermsEnum.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/IntersectTermsEnum.java index e2f284f780d0..9b6cc8e4362c 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnum.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/IntersectTermsEnum.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90.blocktree; +package org.apache.lucene.backward_codecs.lucene90.blocktree; import java.io.IOException; import org.apache.lucene.index.BaseTermsEnum; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/IntersectTermsEnumFrame.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/IntersectTermsEnumFrame.java index 9f6bb75788e2..ed69c95bfe63 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/IntersectTermsEnumFrame.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/IntersectTermsEnumFrame.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90.blocktree; +package org.apache.lucene.backward_codecs.lucene90.blocktree; import java.io.IOException; import java.util.Arrays; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java index 8be0b1e0f4a9..d12ffcd4eaca 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/Lucene90BlockTreeTermsReader.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90.blocktree; +package org.apache.lucene.backward_codecs.lucene90.blocktree; import java.io.IOException; import java.util.ArrayList; @@ -59,7 +59,7 @@ *

Use {@link org.apache.lucene.index.CheckIndex} with the -verbose option to see * summary statistics on the blocks in the dictionary. * - *

See {@link Lucene90BlockTreeTermsWriter}. + *

See Lucene90BlockTreeTermsWriter. * * @lucene.experimental */ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/SegmentTermsEnum.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/SegmentTermsEnum.java index 45ec4ee06ba7..ba6fe67b0f76 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnum.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/SegmentTermsEnum.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90.blocktree; +package org.apache.lucene.backward_codecs.lucene90.blocktree; import java.io.IOException; import java.io.PrintStream; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/SegmentTermsEnumFrame.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/SegmentTermsEnumFrame.java index 85d23a489fe9..b7affa5c6f5e 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/SegmentTermsEnumFrame.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/SegmentTermsEnumFrame.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90.blocktree; +package org.apache.lucene.backward_codecs.lucene90.blocktree; import java.io.IOException; import java.util.Arrays; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Stats.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/Stats.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Stats.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/Stats.java index ceeef4a8687a..8e0284050da3 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Stats.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/Stats.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene90.blocktree; +package org.apache.lucene.backward_codecs.lucene90.blocktree; import static java.nio.charset.StandardCharsets.UTF_8; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/package-info.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/package-info.java similarity index 88% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/package-info.java rename to lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/package-info.java index 27d57b60371e..c7b578896ebb 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/package-info.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/blocktree/package-info.java @@ -23,7 +23,7 @@ * structure. It allows you to plug in your own {@link org.apache.lucene.codecs.PostingsWriterBase} * to implement the postings. * - *

See {@link org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter} for the + *

See org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter for the * file format. */ -package org.apache.lucene.codecs.lucene90.blocktree; +package org.apache.lucene.backward_codecs.lucene90.blocktree; diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912PostingsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912PostingsFormat.java index bb748f624950..695b6b0828b3 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912PostingsFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene912/Lucene912PostingsFormat.java @@ -17,14 +17,13 @@ package org.apache.lucene.backward_codecs.lucene912; import java.io.IOException; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -92,8 +91,8 @@ *

Term Dictionary *

The .tim file contains the list of terms in each field along with per-term statistics * (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the - * .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsWriter} for more details on - * the format. + * .doc, .pos, and .pay files. See Lucene90BlockTreeTermsWriter for more details on the + * format. *

NOTE: The term dictionary can plug into different postings implementations: the postings * writer/reader are actually responsible for encoding and decoding the PostingsHeader and * TermMetadata sections described here: @@ -144,7 +143,7 @@ *

*
Term Index *

The .tip file contains an index into the term dictionary, so that it can be accessed - * randomly. See {@link Lucene90BlockTreeTermsWriter} for more details on the format. + * randomly. See Lucene90BlockTreeTermsWriter for more details on the format. *

* * diff --git a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/Lucene99PostingsFormat.java b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/Lucene99PostingsFormat.java index 7ff614d684e9..f4fbc49c514a 100644 --- a/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/Lucene99PostingsFormat.java +++ b/lucene/backward-codecs/src/java/org/apache/lucene/backward_codecs/lucene99/Lucene99PostingsFormat.java @@ -17,6 +17,7 @@ package org.apache.lucene.backward_codecs.lucene99; import java.io.IOException; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.FieldsConsumer; @@ -24,8 +25,6 @@ import org.apache.lucene.codecs.MultiLevelSkipListWriter; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -93,8 +92,8 @@ *
Term Dictionary *

The .tim file contains the list of terms in each field along with per-term statistics * (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the - * .doc, .pos, and .pay files. See {@link Lucene90BlockTreeTermsWriter} for more details on - * the format. + * .doc, .pos, and .pay files. See Lucene90BlockTreeTermsWriter for more details on the + * format. *

NOTE: The term dictionary can plug into different postings implementations: the postings * writer/reader are actually responsible for encoding and decoding the PostingsHeader and * TermMetadata sections described here: @@ -148,7 +147,7 @@ *

*
Term Index *

The .tip file contains an index into the term dictionary, so that it can be accessed - * randomly. See {@link Lucene90BlockTreeTermsWriter} for more details on the format. + * randomly. See Lucene90BlockTreeTermsWriter for more details on the format. *

* * diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index ff4d7eeda4e9..9f6cf615a3e0 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -25,3 +25,4 @@ org.apache.lucene.backward_codecs.lucene95.Lucene95Codec org.apache.lucene.backward_codecs.lucene99.Lucene99Codec org.apache.lucene.backward_codecs.lucene912.Lucene912Codec org.apache.lucene.backward_codecs.lucene100.Lucene100Codec +org.apache.lucene.backward_codecs.lucene101.Lucene101Codec diff --git a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 49d917dc4273..84cf41823845 100644 --- a/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/backward-codecs/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -18,3 +18,4 @@ org.apache.lucene.backward_codecs.lucene84.Lucene84PostingsFormat org.apache.lucene.backward_codecs.lucene90.Lucene90PostingsFormat org.apache.lucene.backward_codecs.lucene99.Lucene99PostingsFormat org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsFormat +org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/Lucene101PostingsWriter.java similarity index 97% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/Lucene101PostingsWriter.java index 3d19a69b82d8..5ea23b201b0a 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/Lucene101PostingsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/Lucene101PostingsWriter.java @@ -14,25 +14,25 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.backward_codecs.lucene101; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.*; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.DOC_CODEC; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.LEVEL1_MASK; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.META_CODEC; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.PAY_CODEC; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.POS_CODEC; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.TERMS_CODEC; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.*; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.DOC_CODEC; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.LEVEL1_MASK; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.META_CODEC; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.PAY_CODEC; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.POS_CODEC; +import static org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.TERMS_CODEC; import java.io.IOException; import java.util.Arrays; import java.util.Collection; import java.util.List; +import org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.CodecUtil; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; import org.apache.lucene.codecs.PushPostingsWriterBase; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.Impact; diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/Lucene101RWPostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/Lucene101RWPostingsFormat.java new file mode 100644 index 000000000000..f9a2c6f219fd --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/Lucene101RWPostingsFormat.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.backward_codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.util.IOUtils; + +/** Read-write impersonation of {@link Lucene101PostingsFormat}. */ +public final class Lucene101RWPostingsFormat extends Lucene101PostingsFormat { + + private final int version; + private final int minTermBlockSize; + private final int maxTermBlockSize; + + /** Creates {@code Lucene101PostingsFormat} with default settings. */ + public Lucene101RWPostingsFormat() { + this( + Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, + Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); + } + + /** + * Creates {@code Lucene101PostingsFormat} with custom values for {@code minBlockSize} and {@code + * maxBlockSize} passed to block terms dictionary. + * + * @see + * Lucene90BlockTreeTermsWriter#Lucene90BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) + */ + public Lucene101RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize) { + this(minTermBlockSize, maxTermBlockSize, VERSION_CURRENT); + } + + /** Expert constructor that allows setting the version. */ + public Lucene101RWPostingsFormat(int minTermBlockSize, int maxTermBlockSize, int version) { + super(); + if (version < VERSION_START || version > VERSION_CURRENT) { + throw new IllegalArgumentException("Version out of range: " + version); + } + this.version = version; + Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize); + this.minTermBlockSize = minTermBlockSize; + this.maxTermBlockSize = maxTermBlockSize; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state, version); + boolean success = false; + try { + FieldsConsumer ret = + new Lucene90BlockTreeTermsWriter( + state, postingsWriter, minTermBlockSize, maxTermBlockSize); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsWriter); + } + } + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestForDeltaUtil.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestForDeltaUtil.java new file mode 100644 index 000000000000..4fe5ae4c300b --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestForDeltaUtil.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.backward_codecs.lucene101; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.packed.PackedInts; + +public class TestForDeltaUtil extends LuceneTestCase { + + public void testEncodeDecode() throws IOException { + final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000); + final int[] values = new int[iterations * ForUtil.BLOCK_SIZE]; + + for (int i = 0; i < iterations; ++i) { + final int bpv = TestUtil.nextInt(random(), 1, 31 - 7); + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + values[i * ForUtil.BLOCK_SIZE + j] = + RandomNumbers.randomIntBetween(random(), 1, (int) PackedInts.maxValue(bpv)); + } + } + + final Directory d = new ByteBuffersDirectory(); + final long endPointer; + + { + // encode + IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT); + final ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); + + for (int i = 0; i < iterations; ++i) { + int[] source = new int[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + source[j] = values[i * ForUtil.BLOCK_SIZE + j]; + } + int bitsPerValue = forDeltaUtil.bitsRequired(source); + out.writeByte((byte) bitsPerValue); + forDeltaUtil.encodeDeltas(bitsPerValue, source, out); + } + endPointer = out.getFilePointer(); + out.close(); + } + + { + // decode + IndexInput in = d.openInput("test.bin", IOContext.READONCE); + PostingDecodingUtil pdu = new PostingDecodingUtil(in); + ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); + for (int i = 0; i < iterations; ++i) { + int base = 0; + final int[] restored = new int[ForUtil.BLOCK_SIZE]; + int bitsPerValue = pdu.in.readByte(); + forDeltaUtil.decodeAndPrefixSum(bitsPerValue, pdu, base, restored); + final int[] expected = new int[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + expected[j] = values[i * ForUtil.BLOCK_SIZE + j]; + if (j > 0) { + expected[j] += expected[j - 1]; + } else { + expected[j] += base; + } + } + assertArrayEquals(Arrays.toString(restored), expected, restored); + } + assertEquals(endPointer, in.getFilePointer()); + in.close(); + } + + d.close(); + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestForUtil.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestForUtil.java new file mode 100644 index 000000000000..c2411e052a1b --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestForUtil.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene101; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.packed.PackedInts; + +public class TestForUtil extends LuceneTestCase { + + public void testEncodeDecode() throws IOException { + final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000); + final int[] values = new int[iterations * ForUtil.BLOCK_SIZE]; + + for (int i = 0; i < iterations; ++i) { + final int bpv = TestUtil.nextInt(random(), 1, 31); + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + values[i * ForUtil.BLOCK_SIZE + j] = + RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv)); + } + } + + final Directory d = new ByteBuffersDirectory(); + final long endPointer; + + { + // encode + IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT); + final ForUtil forUtil = new ForUtil(); + + for (int i = 0; i < iterations; ++i) { + int[] source = new int[ForUtil.BLOCK_SIZE]; + long or = 0; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + source[j] = values[i * ForUtil.BLOCK_SIZE + j]; + or |= source[j]; + } + final int bpv = PackedInts.bitsRequired(or); + out.writeByte((byte) bpv); + forUtil.encode(source, bpv, out); + } + endPointer = out.getFilePointer(); + out.close(); + } + + { + // decode + IndexInput in = d.openInput("test.bin", IOContext.READONCE); + PostingDecodingUtil pdu = new PostingDecodingUtil(in); + ForUtil forUtil = new ForUtil(); + for (int i = 0; i < iterations; ++i) { + final int bitsPerValue = in.readByte(); + final long currentFilePointer = in.getFilePointer(); + final int[] restored = new int[ForUtil.BLOCK_SIZE]; + forUtil.decode(bitsPerValue, pdu, restored); + int[] ints = new int[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + ints[j] = Math.toIntExact(restored[j]); + } + assertArrayEquals( + Arrays.toString(ints), + ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE), + ints); + assertEquals(ForUtil.numBytes(bitsPerValue), in.getFilePointer() - currentFilePointer); + } + assertEquals(endPointer, in.getFilePointer()); + in.close(); + } + + d.close(); + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestLucene101PostingsFormat.java similarity index 94% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormat.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestLucene101PostingsFormat.java index 6b0ff1fe5e3b..91b9469a4afe 100644 --- 
a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestLucene101PostingsFormat.java @@ -14,17 +14,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.backward_codecs.lucene101; import java.io.IOException; import java.util.Arrays; import java.util.Collections; import java.util.List; +import org.apache.lucene.backward_codecs.lucene101.Lucene101PostingsReader.MutableImpactList; +import org.apache.lucene.backward_codecs.lucene90.blocktree.FieldReader; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Stats; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader.MutableImpactList; -import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; -import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; @@ -45,7 +45,7 @@ public class TestLucene101PostingsFormat extends BasePostingsFormatTestCase { @Override protected Codec getCodec() { - return TestUtil.alwaysPostingsFormat(new Lucene101PostingsFormat()); + return TestUtil.alwaysPostingsFormat(new Lucene101RWPostingsFormat()); } public void testVInt15() throws IOException { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormatV0.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestLucene101PostingsFormatV0.java similarity index 87% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormatV0.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestLucene101PostingsFormatV0.java index 037527413ea8..7231dd2fa1a0 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestLucene101PostingsFormatV0.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestLucene101PostingsFormatV0.java @@ -14,10 +14,10 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.backward_codecs.lucene101; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.tests.index.BasePostingsFormatTestCase; import org.apache.lucene.tests.util.TestUtil; @@ -26,7 +26,7 @@ public class TestLucene101PostingsFormatV0 extends BasePostingsFormatTestCase { @Override protected Codec getCodec() { return TestUtil.alwaysPostingsFormat( - new Lucene101PostingsFormat( + new Lucene101RWPostingsFormat( Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE, Lucene101PostingsFormat.VERSION_START)); diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestPForUtil.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestPForUtil.java new file mode 100644 index 000000000000..902c384fcc86 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestPForUtil.java @@ -0,0 +1,105 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.backward_codecs.lucene101; + +import com.carrotsearch.randomizedtesting.generators.RandomNumbers; +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.store.ByteBuffersDirectory; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.tests.util.TestUtil; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.packed.PackedInts; + +public class TestPForUtil extends LuceneTestCase { + + public void testEncodeDecode() throws IOException { + final int iterations = RandomNumbers.randomIntBetween(random(), 50, 1000); + final int[] values = createTestData(iterations, 31); + + final Directory d = new ByteBuffersDirectory(); + final long endPointer = encodeTestData(iterations, values, d); + + IndexInput in = d.openInput("test.bin", IOContext.READONCE); + PostingDecodingUtil pdu = new PostingDecodingUtil(in); + final PForUtil pforUtil = new PForUtil(); + for (int i = 0; i < iterations; ++i) { + if (random().nextInt(5) == 0) { + PForUtil.skip(in); + continue; + } + final int[] restored = new int[ForUtil.BLOCK_SIZE]; + pforUtil.decode(pdu, restored); + int[] ints = new int[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + ints[j] = Math.toIntExact(restored[j]); + } + assertArrayEquals( + Arrays.toString(ints), + ArrayUtil.copyOfSubArray(values, i * ForUtil.BLOCK_SIZE, (i + 1) * ForUtil.BLOCK_SIZE), + ints); + } + assertEquals(endPointer, in.getFilePointer()); + in.close(); + + d.close(); + } + + private int[] createTestData(int iterations, int maxBpv) { + final int[] values = new int[iterations * ForUtil.BLOCK_SIZE]; + + for (int i = 0; i < iterations; ++i) { + final int bpv = TestUtil.nextInt(random(), 0, maxBpv); + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + values[i * ForUtil.BLOCK_SIZE + j] = + RandomNumbers.randomIntBetween(random(), 0, (int) PackedInts.maxValue(bpv)); + if (random().nextInt(100) == 0) { + final int exceptionBpv; + if (random().nextInt(10) == 0) { + exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 9, 16), maxBpv); + } else { + exceptionBpv = Math.min(bpv + TestUtil.nextInt(random(), 1, 8), maxBpv); + } + values[i * ForUtil.BLOCK_SIZE + j] |= random().nextInt(1 << (exceptionBpv - bpv)) << bpv; + } + } + } + + return values; + } + + private long encodeTestData(int iterations, int[] values, Directory d) throws IOException { + IndexOutput out = d.createOutput("test.bin", IOContext.DEFAULT); + final PForUtil pforUtil = new PForUtil(); + + for (int i = 0; i < iterations; ++i) { + int[] source = new int[ForUtil.BLOCK_SIZE]; + for (int j = 0; j < ForUtil.BLOCK_SIZE; ++j) { + source[j] = values[i * ForUtil.BLOCK_SIZE + j]; + } + pforUtil.encode(source, out); + } + final long endPointer = out.getFilePointer(); + out.close(); + + return endPointer; + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestPostingsUtil.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestPostingsUtil.java new file mode 100644 index 000000000000..ef6587c16058 --- /dev/null +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene101/TestPostingsUtil.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.backward_codecs.lucene101; + +import java.io.IOException; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; + +public class TestPostingsUtil extends LuceneTestCase { + + // checks for bug described in https://github.com/apache/lucene/issues/13373 + public void testIntegerOverflow() throws IOException { + // Size that writes the first value as a regular vint + int randomSize1 = random().nextInt(1, 3); + // Size that writes the first value as a group vint + int randomSize2 = random().nextInt(4, ForUtil.BLOCK_SIZE); + doTestIntegerOverflow(randomSize1); + doTestIntegerOverflow(randomSize2); + } + + private void doTestIntegerOverflow(int size) throws IOException { + final int[] docDeltaBuffer = new int[size]; + final int[] freqBuffer = new int[size]; + + final int delta = 1 << 30; + docDeltaBuffer[0] = delta; + try (Directory dir = newDirectory()) { + try (IndexOutput out = dir.createOutput("test", IOContext.DEFAULT)) { + // In old implementation, this would cause integer overflow exception. 
+ PostingsUtil.writeVIntBlock(out, docDeltaBuffer, freqBuffer, size, true); + } + int[] restoredDocs = new int[size]; + int[] restoredFreqs = new int[size]; + try (IndexInput in = dir.openInput("test", IOContext.DEFAULT)) { + PostingsUtil.readVIntBlock(in, restoredDocs, restoredFreqs, size, true, true); + } + assertEquals(delta, restoredDocs[0]); + } + } +} diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWPostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWPostingsFormat.java index 4360b90f2370..94f7cd57820c 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWPostingsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/Lucene90RWPostingsFormat.java @@ -17,14 +17,14 @@ package org.apache.lucene.backward_codecs.lucene90; import java.io.IOException; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.codecs.BlockTermState; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.TermState; diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90PostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90PostingsFormat.java index 814285a8895a..002e5bb23330 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90PostingsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/TestLucene90PostingsFormat.java @@ -23,11 +23,11 @@ import java.util.Collections; import java.util.List; import org.apache.lucene.backward_codecs.lucene90.Lucene90ScoreSkipReader.MutableImpactList; +import org.apache.lucene.backward_codecs.lucene90.blocktree.FieldReader; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Stats; import org.apache.lucene.backward_codecs.lucene99.Lucene99SkipWriter; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; -import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; -import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java index 8d0d5aaaa224..0c463dd2cd57 100644 --- 
a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/blocktree/Lucene90BlockTreeTermsWriter.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90.blocktree; +package org.apache.lucene.backward_codecs.lucene90.blocktree; import static org.apache.lucene.util.fst.FSTCompiler.getOnHeapReaderWriter; diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/blocktree/TestMSBVLong.java similarity index 96% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java rename to lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/blocktree/TestMSBVLong.java index 1ebab9262099..c5bd7470290c 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/blocktree/TestMSBVLong.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene90/blocktree/TestMSBVLong.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90.blocktree; +package org.apache.lucene.backward_codecs.lucene90.blocktree; import java.io.IOException; import org.apache.lucene.store.ByteArrayDataInput; diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/Lucene912RWPostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/Lucene912RWPostingsFormat.java index af1037432afa..6728e6e19797 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/Lucene912RWPostingsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/Lucene912RWPostingsFormat.java @@ -17,9 +17,9 @@ package org.apache.lucene.backward_codecs.lucene912; import java.io.IOException; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestLucene912PostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestLucene912PostingsFormat.java index adf8aaf9ec76..58eb4b33a38f 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestLucene912PostingsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene912/TestLucene912PostingsFormat.java @@ -20,11 +20,11 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import org.apache.lucene.backward_codecs.lucene90.blocktree.FieldReader; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Stats; import org.apache.lucene.backward_codecs.lucene912.Lucene912PostingsReader.MutableImpactList; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; -import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; -import org.apache.lucene.codecs.lucene90.blocktree.Stats; 
import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99RWPostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99RWPostingsFormat.java index f513562cb358..a04a1a3c7005 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99RWPostingsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/Lucene99RWPostingsFormat.java @@ -17,9 +17,9 @@ package org.apache.lucene.backward_codecs.lucene99; import java.io.IOException; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/TestLucene99PostingsFormat.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/TestLucene99PostingsFormat.java index e954af88c3b9..0d601d91a4c1 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/TestLucene99PostingsFormat.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_codecs/lucene99/TestLucene99PostingsFormat.java @@ -22,11 +22,11 @@ import java.util.Arrays; import java.util.Collections; import java.util.List; +import org.apache.lucene.backward_codecs.lucene90.blocktree.FieldReader; +import org.apache.lucene.backward_codecs.lucene90.blocktree.Stats; import org.apache.lucene.backward_codecs.lucene99.Lucene99ScoreSkipReader.MutableImpactList; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.CompetitiveImpactAccumulator; -import org.apache.lucene.codecs.lucene90.blocktree.FieldReader; -import org.apache.lucene.codecs.lucene90.blocktree.Stats; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.index.DirectoryReader; diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt7HnswBackwardsCompatibility.java b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt7HnswBackwardsCompatibility.java index 402747a3138c..a2c09f53526f 100644 --- a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt7HnswBackwardsCompatibility.java +++ b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/TestInt7HnswBackwardsCompatibility.java @@ -23,7 +23,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader; @@ -69,7 +69,7 @@ public static Iterable testVersionsFactory() throws IllegalAccessExcep } protected Codec getCodec() { - return new Lucene101Codec() { + return new Lucene103Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return new 
Lucene99HnswScalarQuantizedVectorsFormat( diff --git a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int7_hnsw.10.2.0.zip b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int7_hnsw.10.2.0.zip index 337037b5c3e4..56c6e68c30c6 100644 Binary files a/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int7_hnsw.10.2.0.zip and b/lucene/backward-codecs/src/test/org/apache/lucene/backward_index/int7_hnsw.10.2.0.zip differ diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PostingIndexInputBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PostingIndexInputBenchmark.java index 241b289c5f61..0881d0f6bf95 100644 --- a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PostingIndexInputBenchmark.java +++ b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/PostingIndexInputBenchmark.java @@ -21,9 +21,9 @@ import java.nio.file.Path; import java.util.Random; import java.util.concurrent.TimeUnit; -import org.apache.lucene.codecs.lucene101.ForDeltaUtil; -import org.apache.lucene.codecs.lucene101.ForUtil; -import org.apache.lucene.codecs.lucene101.PostingIndexInput; +import org.apache.lucene.codecs.lucene103.ForDeltaUtil; +import org.apache.lucene.codecs.lucene103.ForUtil; +import org.apache.lucene.codecs.lucene103.PostingIndexInput; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/BlockTreeOrdsPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/BlockTreeOrdsPostingsFormat.java index 29854aa500f9..68c842c72e9e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/BlockTreeOrdsPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/BlockTreeOrdsPostingsFormat.java @@ -22,14 +22,14 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsReader; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsWriter; +import org.apache.lucene.codecs.lucene103.blocktree.Lucene103BlockTreeTermsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; -/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene101PostingsWriter}. */ +/** Uses {@link OrdsBlockTreeTermsWriter} with {@link Lucene103PostingsWriter}. 
*/ public class BlockTreeOrdsPostingsFormat extends PostingsFormat { private final int minTermBlockSize; @@ -57,7 +57,7 @@ public BlockTreeOrdsPostingsFormat(int minTermBlockSize, int maxTermBlockSize) { super("BlockTreeOrds"); this.minTermBlockSize = minTermBlockSize; this.maxTermBlockSize = maxTermBlockSize; - Lucene90BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize); + Lucene103BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize); } @Override @@ -67,7 +67,7 @@ public String toString() { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene103PostingsWriter(state); boolean success = false; try { @@ -84,7 +84,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene101PostingsReader(state); + PostingsReaderBase postingsReader = new Lucene103PostingsReader(state); boolean success = false; try { FieldsProducer ret = new OrdsBlockTreeTermsReader(postingsReader, state); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java index 852c06ae5de6..8518afabab85 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/blocktreeords/OrdsBlockTreeTermsWriter.java @@ -25,7 +25,7 @@ import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PostingsWriterBase; import org.apache.lucene.codecs.blocktreeords.FSTOrdsOutputs.Output; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; // javadocs +import org.apache.lucene.codecs.lucene103.blocktree.Lucene103BlockTreeTermsWriter; // javadocs import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; @@ -80,9 +80,9 @@ order, meaning if you just next() the file pointer will */ /** - * This is just like {@link Lucene90BlockTreeTermsWriter}, except it also stores a version per term, - * and adds a method to its TermsEnum implementation to seekExact only if the version is >= the - * specified version. The version is added to the terms index to avoid seeking if no term in the + * This is just like {@link Lucene103BlockTreeTermsWriter}, except it also stores a version per + * term, and adds a method to its TermsEnum implementation to seekExact only if the version is >= + * the specified version. The version is added to the terms index to avoid seeking if no term in the * block has a high enough version. The term blocks file is .tiv and the terms index extension is * .tipv. 
* @@ -170,7 +170,7 @@ public OrdsBlockTreeTermsWriter( int minItemsInBlock, int maxItemsInBlock) throws IOException { - Lucene90BlockTreeTermsWriter.validateSettings(minItemsInBlock, maxItemsInBlock); + Lucene103BlockTreeTermsWriter.validateSettings(minItemsInBlock, maxItemsInBlock); maxDoc = state.segmentInfo.maxDoc(); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java index 5af23fb49455..5bb8b533d88e 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/DirectPostingsFormat.java @@ -24,7 +24,7 @@ import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.FieldsProducer; import org.apache.lucene.codecs.PostingsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; import org.apache.lucene.index.BaseTermsEnum; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.Fields; @@ -54,7 +54,7 @@ // - or: longer dense skip lists than just next byte? /** - * Wraps {@link Lucene101PostingsFormat} format for on-disk storage, but then at read time loads and + * Wraps {@link Lucene103PostingsFormat} format for on-disk storage, but then at read time loads and * stores all terms and postings directly in RAM as byte[], int[]. * *

WARNING: This is exceptionally RAM intensive: it makes no effort to compress the @@ -97,12 +97,12 @@ public DirectPostingsFormat(int minSkipCount, int lowFreqCutoff) { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - return PostingsFormat.forName("Lucene101").fieldsConsumer(state); + return PostingsFormat.forName("Lucene103").fieldsConsumer(state); } @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - FieldsProducer postings = PostingsFormat.forName("Lucene101").fieldsProducer(state); + FieldsProducer postings = PostingsFormat.forName("Lucene103").fieldsProducer(state); if (state.context.context() != IOContext.Context.MERGE) { FieldsProducer loadedPostings; try { diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java index 4893ee8ad265..224d47e8f91c 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/memory/FSTPostingsFormat.java @@ -22,8 +22,8 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsReader; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; @@ -41,7 +41,7 @@ public String toString() { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene103PostingsWriter(state); boolean success = false; try { @@ -57,7 +57,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene101PostingsReader(state); + PostingsReaderBase postingsReader = new Lucene103PostingsReader(state); boolean success = false; try { FieldsProducer ret = new FSTTermsReader(state, postingsReader); diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java index 140d62b4e967..a4a9048a73e8 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/DeltaBaseTermStateSerializer.java @@ -17,13 +17,13 @@ package org.apache.lucene.codecs.uniformsplit; -import static org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.BLOCK_SIZE; import java.io.IOException; import org.apache.lucene.codecs.BlockTermState; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.IntBlockTermState; +import 
org.apache.lucene.codecs.lucene103.Lucene103PostingsReader; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsWriter; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.TermState; @@ -34,7 +34,7 @@ /** * {@link TermState} serializer which encodes each file pointer as a delta relative to a base file - * pointer. It differs from {@link Lucene101PostingsWriter#encodeTerm} which encodes each file + * pointer. It differs from {@link Lucene103PostingsWriter#encodeTerm} which encodes each file * pointer as a delta relative to the previous file pointer. * *

It automatically sets the base file pointer to the first valid file pointer for doc start FP, @@ -95,7 +95,7 @@ public long getBasePayStartFP() { /** * Writes a {@link BlockTermState} to the provided {@link DataOutput}. * - *

Simpler variant of {@link Lucene101PostingsWriter#encodeTerm(DataOutput, FieldInfo, + *

Simpler variant of {@link Lucene103PostingsWriter#encodeTerm(DataOutput, FieldInfo, * BlockTermState, boolean)}. */ public void writeTermState( @@ -145,7 +145,7 @@ public void writeTermState( /** * Reads a {@link BlockTermState} from the provided {@link DataInput}. * - *

Simpler variant of {@link Lucene101PostingsReader#decodeTerm(DataInput, FieldInfo, + *

Simpler variant of {@link Lucene103PostingsReader#decodeTerm(DataInput, FieldInfo, * BlockTermState, boolean)}. * * @param reuse {@link BlockTermState} to reuse; or null to create a new one. diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java index 690eab214003..28c706332c32 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/UniformSplitPostingsFormat.java @@ -23,8 +23,8 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsReader; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.util.IOUtils; @@ -113,7 +113,7 @@ protected UniformSplitPostingsFormat( @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state); + PostingsWriterBase postingsWriter = new Lucene103PostingsWriter(state); boolean success = false; try { FieldsConsumer termsWriter = @@ -130,7 +130,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postingsReader = new Lucene101PostingsReader(state); + PostingsReaderBase postingsReader = new Lucene103PostingsReader(state); boolean success = false; try { FieldsProducer termsReader = diff --git a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/package-info.java b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/package-info.java index d31b28704ef7..a4ac56039138 100644 --- a/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/package-info.java +++ b/lucene/codecs/src/java/org/apache/lucene/codecs/uniformsplit/package-info.java @@ -28,7 +28,7 @@ * org.apache.lucene.search.PhraseQuery}) *

  • Quite efficient for {@link org.apache.lucene.search.PrefixQuery} *
  • Not efficient for spell-check and {@link org.apache.lucene.search.FuzzyQuery}, in this case - * prefer {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat} + * prefer {@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat} * */ package org.apache.lucene.codecs.uniformsplit; diff --git a/lucene/codecs/src/test/org/apache/lucene/codecs/lucene90/tests/MockTermStateFactory.java b/lucene/codecs/src/test/org/apache/lucene/codecs/lucene90/tests/MockTermStateFactory.java index 0708f3b8050b..2f6dc6fa749c 100644 --- a/lucene/codecs/src/test/org/apache/lucene/codecs/lucene90/tests/MockTermStateFactory.java +++ b/lucene/codecs/src/test/org/apache/lucene/codecs/lucene90/tests/MockTermStateFactory.java @@ -17,7 +17,7 @@ package org.apache.lucene.codecs.lucene90.tests; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat.IntBlockTermState; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.IntBlockTermState; /** Test utility class to create mock {@link IntBlockTermState}. */ public class MockTermStateFactory { diff --git a/lucene/core/src/generated/checksums/generateForDeltaUtil.json b/lucene/core/src/generated/checksums/generateForDeltaUtil.json index 85765bbd7cbc..539b766b84b1 100644 --- a/lucene/core/src/generated/checksums/generateForDeltaUtil.json +++ b/lucene/core/src/generated/checksums/generateForDeltaUtil.json @@ -1,4 +1,4 @@ { - "lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java": "87e4d19b5284fa39adf2c24328cae2076b6f7bb3", - "lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py": "165586f801bef4d2f540521e81bc119880038b6c" + "lucene/core/src/java/org/apache/lucene/codecs/lucene103/ForDeltaUtil.java": "1d463b5044c3498d2372a06e3cd536917786fdd1", + "lucene/core/src/java/org/apache/lucene/codecs/lucene103/gen_ForDeltaUtil.py": "722d50e56c0e805df0059e6ec89b30ca7d5f845a" } \ No newline at end of file diff --git a/lucene/core/src/generated/checksums/generateForUtil.json b/lucene/core/src/generated/checksums/generateForUtil.json index a1d4310d5db7..55cf553dd17c 100644 --- a/lucene/core/src/generated/checksums/generateForUtil.json +++ b/lucene/core/src/generated/checksums/generateForUtil.json @@ -1,4 +1,4 @@ { - "lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForUtil.java": "3004112150943413e0f7fcc3e56b74c4875c1d64", - "lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForUtil.py": "b1041b6b46caab789c04d99483ee016d550eeebc" + "lucene/core/src/java/org/apache/lucene/codecs/lucene103/ForUtil.java": "1ea2dac2a26be521a70cf74c37fe9134f7b42cb7", + "lucene/core/src/java/org/apache/lucene/codecs/lucene103/gen_ForUtil.py": "0eb3ae0815b4b25e4d33074d96546e622e4278be" } \ No newline at end of file diff --git a/lucene/core/src/java/module-info.java b/lucene/core/src/java/module-info.java index 1faa5178e461..b9030b52a470 100644 --- a/lucene/core/src/java/module-info.java +++ b/lucene/core/src/java/module-info.java @@ -25,14 +25,14 @@ exports org.apache.lucene.analysis.tokenattributes; exports org.apache.lucene.analysis; exports org.apache.lucene.codecs.compressing; - exports org.apache.lucene.codecs.lucene90.blocktree; exports org.apache.lucene.codecs.lucene90.compressing; exports org.apache.lucene.codecs.lucene90; exports org.apache.lucene.codecs.lucene94; exports org.apache.lucene.codecs.lucene95; exports org.apache.lucene.codecs.lucene99; - exports org.apache.lucene.codecs.lucene101; exports org.apache.lucene.codecs.lucene102; + exports org.apache.lucene.codecs.lucene103.blocktree; + exports 
org.apache.lucene.codecs.lucene103; exports org.apache.lucene.codecs.perfield; exports org.apache.lucene.codecs; exports org.apache.lucene.document; @@ -79,7 +79,7 @@ provides org.apache.lucene.analysis.TokenizerFactory with org.apache.lucene.analysis.standard.StandardTokenizerFactory; provides org.apache.lucene.codecs.Codec with - org.apache.lucene.codecs.lucene101.Lucene101Codec; + org.apache.lucene.codecs.lucene103.Lucene103Codec; provides org.apache.lucene.codecs.DocValuesFormat with org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; provides org.apache.lucene.codecs.KnnVectorsFormat with @@ -89,7 +89,7 @@ org.apache.lucene.codecs.lucene102.Lucene102HnswBinaryQuantizedVectorsFormat, org.apache.lucene.codecs.lucene102.Lucene102BinaryQuantizedVectorsFormat; provides org.apache.lucene.codecs.PostingsFormat with - org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; + org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; provides org.apache.lucene.index.SortFieldProvider with org.apache.lucene.search.SortField.Provider, org.apache.lucene.search.SortedNumericSortField.Provider, diff --git a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java index 7be508c6fd1c..026c3b7b36cd 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/Codec.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/Codec.java @@ -56,7 +56,7 @@ static NamedSPILoader getLoader() { } @SuppressWarnings("NonFinalStaticField") - static Codec defaultCodec = LOADER.lookup("Lucene101"); + static Codec defaultCodec = LOADER.lookup("Lucene103"); } private final String name; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java b/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java index cb1c874b0eaa..1069c1e094f9 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java @@ -18,7 +18,7 @@ import java.io.Closeable; import java.io.IOException; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; +import org.apache.lucene.codecs.lucene103.blocktree.Lucene103BlockTreeTermsWriter; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.index.TermsEnum; @@ -28,7 +28,7 @@ import org.apache.lucene.util.FixedBitSet; /** - * Class that plugs into term dictionaries, such as {@link Lucene90BlockTreeTermsWriter}, and + * Class that plugs into term dictionaries, such as {@link Lucene103BlockTreeTermsWriter}, and * handles writing postings. * * @see PostingsReaderBase diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene102/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene102/package-info.java index 8f6fcb2ef5bc..05fbb64cd836 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene102/package-info.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene102/package-info.java @@ -151,15 +151,15 @@ * field names. These are used to store auxiliary information about the document, such as its * title, url, or an identifier to access a database. The set of stored fields are what is * returned for each hit when searching. This is keyed by document number. - *
  • {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term dictionary}. A + *
  • {@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Term dictionary}. A * dictionary containing all of the terms used in all of the indexed fields of all of the * documents. The dictionary also contains the number of documents which contain the term, and * pointers to the term's frequency and proximity data. - *
  • {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Frequency data}. For + *
  • {@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Term Frequency data}. For * each term in the dictionary, the numbers of all the documents that contain that term, and * the frequency of the term in that document, unless frequencies are omitted ({@link * org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS}) - *
  • {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Proximity data}. For + *
  • {@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Term Proximity data}. For * each term in the dictionary, the positions that the term occurs in each document. Note that * this will not exist if all fields in all documents omit position data. *
  • {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For @@ -255,27 +255,27 @@ * The stored fields for documents * * - * {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Dictionary} + * {@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Term Dictionary} * .tim * The term dictionary, stores term info * * - * {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Term Index} + * {@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Term Index} * .tip * The index into the Term Dictionary * * - * {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Frequencies} + * {@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Frequencies} * .doc * Contains the list of docs which contain each term along with frequency * * - * {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Positions} + * {@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Positions} * .pos * Stores position information about where a term occurs in the index * * - * {@link org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat Payloads} + * {@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Payloads} * .pay * Stores additional per-position metadata information such as character offsets and user payloads * diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/ForDeltaUtil.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene103/ForDeltaUtil.java index ceec3ce3342a..d66b3e336605 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForDeltaUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/ForDeltaUtil.java @@ -16,9 +16,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.codecs.lucene103; -import static org.apache.lucene.codecs.lucene101.ForUtil.*; +import static org.apache.lucene.codecs.lucene103.ForUtil.*; import java.io.IOException; import org.apache.lucene.internal.vectorization.PostingDecodingUtil; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/ForUtil.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForUtil.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene103/ForUtil.java index 5a466d0f8f66..4ccfbd281254 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/ForUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/ForUtil.java @@ -16,7 +16,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.codecs.lucene103; import java.io.IOException; import org.apache.lucene.internal.vectorization.PostingDecodingUtil; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103Codec.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103Codec.java new file mode 100644 index 000000000000..d2e3eea6ef9d --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103Codec.java @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene103; + +import java.util.Objects; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CompoundFormat; +import org.apache.lucene.codecs.DocValuesFormat; +import org.apache.lucene.codecs.FieldInfosFormat; +import org.apache.lucene.codecs.FilterCodec; +import org.apache.lucene.codecs.KnnVectorsFormat; +import org.apache.lucene.codecs.LiveDocsFormat; +import org.apache.lucene.codecs.NormsFormat; +import org.apache.lucene.codecs.PointsFormat; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.SegmentInfoFormat; +import org.apache.lucene.codecs.StoredFieldsFormat; +import org.apache.lucene.codecs.TermVectorsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat; +import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat; +import org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90NormsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90PointsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat; +import org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat; +import org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat; +import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat; +import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; +import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat; + +/** + * Implements the Lucene 10.3 index format + * + *

    If you want to reuse functionality of this codec in another codec, extend {@link FilterCodec}. + * + * @see org.apache.lucene.codecs.lucene103 package documentation for file format details. + * @lucene.experimental + */ +public class Lucene103Codec extends Codec { + + /** Configuration option for the codec. */ + public enum Mode { + /** Trade compression ratio for retrieval speed. */ + BEST_SPEED(Lucene90StoredFieldsFormat.Mode.BEST_SPEED), + /** Trade retrieval speed for compression ratio. */ + BEST_COMPRESSION(Lucene90StoredFieldsFormat.Mode.BEST_COMPRESSION); + + private final Lucene90StoredFieldsFormat.Mode storedMode; + + private Mode(Lucene90StoredFieldsFormat.Mode storedMode) { + this.storedMode = Objects.requireNonNull(storedMode); + } + } + + private final TermVectorsFormat vectorsFormat = new Lucene90TermVectorsFormat(); + private final FieldInfosFormat fieldInfosFormat = new Lucene94FieldInfosFormat(); + private final SegmentInfoFormat segmentInfosFormat = new Lucene99SegmentInfoFormat(); + private final LiveDocsFormat liveDocsFormat = new Lucene90LiveDocsFormat(); + private final CompoundFormat compoundFormat = new Lucene90CompoundFormat(); + private final NormsFormat normsFormat = new Lucene90NormsFormat(); + + private final PostingsFormat defaultPostingsFormat; + private final PostingsFormat postingsFormat = + new PerFieldPostingsFormat() { + @Override + public PostingsFormat getPostingsFormatForField(String field) { + return Lucene103Codec.this.getPostingsFormatForField(field); + } + }; + + private final DocValuesFormat defaultDVFormat; + private final DocValuesFormat docValuesFormat = + new PerFieldDocValuesFormat() { + @Override + public DocValuesFormat getDocValuesFormatForField(String field) { + return Lucene103Codec.this.getDocValuesFormatForField(field); + } + }; + + private final KnnVectorsFormat defaultKnnVectorsFormat; + private final KnnVectorsFormat knnVectorsFormat = + new PerFieldKnnVectorsFormat() { + @Override + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return Lucene103Codec.this.getKnnVectorsFormatForField(field); + } + }; + + private final StoredFieldsFormat storedFieldsFormat; + + /** Instantiates a new codec. */ + public Lucene103Codec() { + this(Mode.BEST_SPEED); + } + + /** + * Instantiates a new codec, specifying the stored fields compression mode to use. + * + * @param mode stored fields compression mode to use for newly flushed/merged segments. 
+ */ + public Lucene103Codec(Mode mode) { + super("Lucene103"); + this.storedFieldsFormat = + new Lucene90StoredFieldsFormat(Objects.requireNonNull(mode).storedMode); + this.defaultPostingsFormat = new Lucene103PostingsFormat(); + this.defaultDVFormat = new Lucene90DocValuesFormat(); + this.defaultKnnVectorsFormat = new Lucene99HnswVectorsFormat(); + } + + @Override + public final StoredFieldsFormat storedFieldsFormat() { + return storedFieldsFormat; + } + + @Override + public final TermVectorsFormat termVectorsFormat() { + return vectorsFormat; + } + + @Override + public final PostingsFormat postingsFormat() { + return postingsFormat; + } + + @Override + public final FieldInfosFormat fieldInfosFormat() { + return fieldInfosFormat; + } + + @Override + public final SegmentInfoFormat segmentInfoFormat() { + return segmentInfosFormat; + } + + @Override + public final LiveDocsFormat liveDocsFormat() { + return liveDocsFormat; + } + + @Override + public final CompoundFormat compoundFormat() { + return compoundFormat; + } + + @Override + public final PointsFormat pointsFormat() { + return new Lucene90PointsFormat(); + } + + @Override + public final KnnVectorsFormat knnVectorsFormat() { + return knnVectorsFormat; + } + + /** + * Returns the postings format that should be used for writing new segments of field. + * + *

    The default implementation always returns "Lucene103". + * + *

    WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public PostingsFormat getPostingsFormatForField(String field) { + return defaultPostingsFormat; + } + + /** + * Returns the docvalues format that should be used for writing new segments of field. + * + *

    The default implementation always returns "Lucene90". + * + *

    WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public DocValuesFormat getDocValuesFormatForField(String field) { + return defaultDVFormat; + } + + /** + * Returns the vectors format that should be used for writing new segments of field. + * + *

    The default implementation always returns "Lucene99HnswVectorsFormat". + * + *

    WARNING: if you subclass, you are responsible for index backwards compatibility: + * future versions of Lucene are only guaranteed to be able to read the default implementation. + */ + public KnnVectorsFormat getKnnVectorsFormatForField(String field) { + return defaultKnnVectorsFormat; + } + + @Override + public final DocValuesFormat docValuesFormat() { + return docValuesFormat; + } + + @Override + public final NormsFormat normsFormat() { + return normsFormat; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsFormat.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsFormat.java new file mode 100644 index 000000000000..3c489f9d3ca3 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsFormat.java @@ -0,0 +1,514 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene103; + +import java.io.IOException; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsFormat; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.codecs.lucene103.blocktree.Lucene103BlockTreeTermsReader; +import org.apache.lucene.codecs.lucene103.blocktree.Lucene103BlockTreeTermsWriter; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.TermState; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.packed.PackedInts; + +/** + * Lucene 10.3 postings format, which encodes postings in packed integer blocks for fast decode. + * + *

    Basic idea: + * + *

      + *
    • Packed Blocks and VInt Blocks: + *

      In packed blocks, integers are encoded with the same bit width ({@link PackedInts packed + * format}): the block size (i.e. the number of integers inside a block) is fixed (currently 128). + * Additionally, blocks that are all the same value are encoded in an optimized way. + *

      In VInt blocks, integers are encoded as {@link DataOutput#writeVInt VInt}: the block + * size is variable. + *

    • Block structure: + *

      When the postings are long enough, Lucene103PostingsFormat will try to encode most + * integer data as a packed block. + *

      Take a term with 259 documents as an example: the first 256 document ids are encoded as + * two packed blocks, while the remaining 3 are encoded as one VInt block (see the arithmetic sketch after this list). + *

      Different kinds of data are always encoded separately into different packed blocks, but + * may possibly be interleaved into the same VInt block. + *

      This strategy is applied to pairs: &lt;document number, frequency&gt;, &lt;position, + * payload length&gt;, &lt;position, offset start, offset length&gt;, and &lt;position, + * payload length, offset start, offset length&gt;. + *

    • Skipdata: + *

      Skipdata is interleaved with blocks on 2 levels. Level 0 skip data is interleaved + * between every packed block. Level 1 skip data is interleaved between every 32 packed + * blocks. + *

    • Positions, Payloads, and Offsets: + *

      A position is an integer indicating where the term occurs within one document. A payload + * is a blob of metadata associated with the current position. An offset is a pair of integers + * indicating the tokenized start/end offsets for the given term at the current position: it is + * essentially a specialized payload. + *

      When payloads and offsets are not omitted, numPositions==numPayloads==numOffsets + * (assuming a null payload contributes one count). As mentioned in block structure, it is + * possible to encode these three either combined or separately. + *

      In all cases, payloads and offsets are stored together. When encoded as a packed block, + * position data is separated out as .pos, while payloads and offsets are encoded in .pay + * (payload metadata will also be stored directly in .pay). When encoded as VInt blocks, all + * these three are stored interleaved into the .pos (so is payload metadata). + *

      With this strategy, the majority of payload and offset data will be outside .pos file. + * So for queries that require only position data, running on a full index with payloads and + * offsets, this reduces disk pre-fetches. + *
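    To make the block layout described in this list concrete, here is an illustrative sketch (not Lucene code; the helper name blockLayout and the hard-coded sizes merely restate the constants given above):

      // Sketch: how a posting list of docFreq documents is split into packed
      // blocks, a VInt tail block, and (approximately) level 1 skip entries.
      static int[] blockLayout(int docFreq) {
        final int blockSize = 128;                        // packed block size
        final int level1Factor = 32;                      // level 1 skip interval, in blocks
        int packedBlocks = docFreq / blockSize;           // full packed blocks
        int vIntTailDocs = docFreq % blockSize;           // docs left for the VInt block
        int level1Entries = packedBlocks / level1Factor;  // level 1 skip entries (approx.)
        return new int[] {packedBlocks, vIntTailDocs, level1Entries};
      }
      // blockLayout(259) -> {2, 3, 0}: two packed blocks, a 3-document VInt block,
      // and no level 1 skip entry (fewer than 32 packed blocks).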

    + * + *

    Files and detailed format: + * + *

    + * + * + * + *
    + *
    Term Dictionary + *

    The .tim file contains the list of terms in each field along with per-term statistics + * (such as docfreq) and pointers to the frequencies, positions, payload and skip data in the + * .doc, .pos, and .pay files. See {@link Lucene103BlockTreeTermsWriter} for more details on + * the format. + *

    NOTE: The term dictionary can plug into different postings implementations: the postings + * writer/reader are actually responsible for encoding and decoding the PostingsHeader and + * TermMetadata sections described here: + *

      + *
    • PostingsHeader --> Header, PackedBlockSize + *
    • TermMetadata --> (DocFPDelta|SingletonDocID), PosFPDelta?, PosVIntBlockFPDelta?, + * PayFPDelta? + *
    • Header --> {@link CodecUtil#writeIndexHeader IndexHeader}
    • PackedBlockSize, SingletonDocID --> {@link DataOutput#writeVInt VInt} + *
    • DocFPDelta, PosFPDelta, PayFPDelta, PosVIntBlockFPDelta --> {@link + * DataOutput#writeVLong VLong} + *
    • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
    + *

    Notes: + *

      + *
    • Header is a {@link CodecUtil#writeIndexHeader IndexHeader} storing the version + * information for the postings. + *
    • PackedBlockSize is the fixed block size for packed blocks. In a packed block, the bit width + * is determined by the largest integer. A smaller block size results in smaller variance + * among integer widths, hence smaller indexes. A larger block size results in more + * efficient bulk I/O, hence better acceleration. This value should always be a multiple + * of 64, currently fixed as 128 as a tradeoff. It is also the skip interval used to + * accelerate {@link org.apache.lucene.index.PostingsEnum#advance(int)} (a small sketch of the + * bit-width effect follows these notes). + *
    • DocFPDelta determines the position of this term's TermFreqs within the .doc file. In + * particular, it is the difference in file offset between this term's data and the previous + * term's data (or zero, for the first term in the block). On disk it is stored as the + * difference from the previous value in the sequence. + *
    • PosFPDelta determines the position of this term's TermPositions within the .pos file. + * While PayFPDelta determines the position of this term's <TermPayloads, + * TermOffsets?> within the .pay file. Similar to DocFPDelta, it is the difference + * between two file positions (or neglected, for fields that omit payloads and offsets). + *
    • PosVIntBlockFPDelta determines the position of this term's last TermPosition in the last + * pos packed block within the .pos file. It is a synonym for PayVIntBlockFPDelta or + * OffsetVIntBlockFPDelta. This is actually used to indicate whether it is necessary to + * load the following payloads and offsets from .pos instead of .pay. Every time a new block + * of positions is to be loaded, the PostingsReader will use this value to check + * whether the current block is in packed format or VInt. When in packed format, payloads and + * offsets are fetched from .pay, otherwise from .pos. (This value is neglected when the + * total number of positions, i.e. totalTermFreq, is less than or equal to PackedBlockSize.) + *
    • SingletonDocID is an optimization when a term only appears in one document. In this + * case, instead of writing a file pointer to the .doc file (DocFPDelta), and then a + * VIntBlock at that location, the single document ID is written to the term dictionary. + *
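    The PackedBlockSize note above can be illustrated with a small sketch (PackedInts.bitsRequired is the real utility; the surrounding method is made up for illustration and is not the actual writer):

      // Sketch: a packed block is encoded at the bit width required by its largest
      // delta, so a single large gap widens every value in that block.
      static long packedBlockBytes(long[] docDeltas) {     // the 128 deltas of one block
        int bitsPerValue = 0;
        for (long delta : docDeltas) {
          bitsPerValue = Math.max(bitsPerValue, PackedInts.bitsRequired(delta));
        }
        return (long) bitsPerValue * docDeltas.length / 8; // packed payload bytes
      }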
    + *
    + * + * + * + *
    + *
    Term Index + *

    The .tip file contains an index into the term dictionary, so that it can be accessed + * randomly. See {@link Lucene103BlockTreeTermsWriter} for more details on the format. + *

    + * + * + * + *
    + *
    Frequencies and Skip Data + *

    The .doc file contains the lists of documents which contain each term, along with the + * frequency of the term in that document (except when frequencies are omitted: {@link + * IndexOptions#DOCS}). Skip data is saved at the end of each term's postings. The skip data + * is saved once for the entire postings list. + *

      + *
    • docFile(.doc) --> Header, <TermFreqs>TermCount, Footer + *
    • Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
    • TermFreqs --> <PackedBlock32> PackedDocBlockNum/32, VIntBlock? + *
    • PackedBlock32 --> Level1SkipData, <PackedBlock> 32 + *
    • PackedBlock --> Level0SkipData, PackedDocDeltaBlock, PackedFreqBlock? + *
    • VIntBlock --> + * <DocDelta[,Freq?]>DocFreq-PackedBlockSize*PackedDocBlockNum + *
    • Level1SkipData --> DocDelta, DocFPDelta, Skip1NumBytes?, ImpactLength?, Impacts?, + * PosFPDelta?, NextPosUpto?, PayFPDelta?, NextPayByteUpto? + *
    • Level0SkipData --> Skip0NumBytes, DocDelta, DocFPDelta, PackedBlockLength, + * ImpactLength?, Impacts?, PosFPDelta?, NextPosUpto?, PayFPDelta?, NextPayByteUpto? + *
    • PackedFreqBlock --> {@link PackedInts PackedInts}, uses patching + *
    • PackedDocDeltaBlock --> {@link PackedInts PackedInts}, does not use patching + *
    • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
    + *

    Notes: + *

      + *
    • PackedDocDeltaBlock is theoretically generated from two steps: + *
        + *
      1. Calculate the difference between each document number and previous one, and get + * a d-gaps list (for the first document, use absolute value); + *
      2. For those d-gaps from first one to + * PackedDocBlockNum*PackedBlockSizeth, separately encode as packed + * blocks. + *
      + * If frequencies are not omitted, PackedFreqBlock will be generated without d-gap step. + *
    • VIntBlock stores remaining d-gaps (along with frequencies when possible) with a + * format that encodes DocDelta and Freq: + *

      DocDelta: if frequencies are indexed, this determines both the document number and + * the frequency. In particular, DocDelta/2 is the difference between this document + * number and the previous document number (or zero when this is the first document in a + * TermFreqs). When DocDelta is odd, the frequency is one. When DocDelta is even, the + * frequency is read as another VInt. If frequencies are omitted, DocDelta contains the + * gap (not multiplied by 2) between document numbers and no frequency information is + * stored (a short sketch of this encoding follows these notes). + *

      For example, the TermFreqs for a term which occurs once in document seven and + * three times in document eleven, with frequencies indexed, would be the following + * sequence of VInts: + *

      15, 8, 3 + *

      If frequencies were omitted ({@link IndexOptions#DOCS}) it would be this sequence + * of VInts instead: + *

      7, 4 + *

    • PackedDocBlockNum is the number of packed blocks for current term's docids or + * frequencies. In particular, PackedDocBlockNum = floor(DocFreq/PackedBlockSize) + *
    • On skip data, DocDelta is the delta between the last doc of the previous block - or + * -1 if there is no previous block - and the last doc of this block. This helps know by + * how much the doc ID should be incremented in case the block gets skipped. + *
    • Skip0Length is the length of the skip data at level 0. Encoding it makes it possible to + * quickly skip over the skip data when it is not needed, e.g. if only using nextDoc(). It + * is also used when only the first fields of the skip data are needed, in order to skip + * over the remaining fields without reading them. + *
    • ImpactLength and Impacts are only stored if frequencies are indexed. + *
    • Since positions and payloads are also block encoded, skipping should move to the related + * block first, then fetch the values according to the in-block offset. PosFPSkip and + * PayFPSkip record the file offsets of the related block in .pos and .pay, respectively, + * while PosBlockOffset indicates which value to fetch inside the related block + * (PayBlockOffset is unnecessary since it is always equal to PosBlockOffset). As with + * DocFPSkip, the file offsets are relative to the start of the current term's TermFreqs, + * and stored as a difference sequence. + *
    • PayByteUpto indicates the start offset of the current payload. It is equivalent to + * the sum of the payload lengths in the current block up to PosBlockOffset + *
    • ImpactLength is the total length of CompetitiveFreqDelta and CompetitiveNormDelta + * pairs. CompetitiveFreqDelta and CompetitiveNormDelta are used to safely skip score + * calculation for uncompetitive documents; See {@link + * org.apache.lucene.codecs.CompetitiveImpactAccumulator} for more details. + *
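    Here is a minimal sketch of the &lt;DocDelta, Freq?&gt; encoding described in these notes (illustrative only, not the actual writer; the class and method names are made up):

      import java.util.ArrayList;
      import java.util.List;

      class DocDeltaSketch {
        static List<Integer> encode(int[] docs, int[] freqs, boolean hasFreqs) {
          List<Integer> out = new ArrayList<>();
          int prevDoc = 0;
          for (int i = 0; i < docs.length; i++) {
            int gap = docs[i] - prevDoc;   // the first doc keeps its absolute doc id
            prevDoc = docs[i];
            if (hasFreqs == false) {
              out.add(gap);                // plain gap, no frequency information
            } else if (freqs[i] == 1) {
              out.add(gap << 1 | 1);       // odd value: the frequency is implicitly 1
            } else {
              out.add(gap << 1);           // even value: the frequency follows as a VInt
              out.add(freqs[i]);
            }
          }
          return out;
        }
      }
      // encode(new int[] {7, 11}, new int[] {1, 3}, true)  -> [15, 8, 3]
      // encode(new int[] {7, 11}, new int[] {1, 3}, false) -> [7, 4]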
    + *
    + * + * + * + *
    + *
    Positions + *

    The .pos file contains the lists of positions that each term occurs at within documents. + * It also sometimes stores part of payloads and offsets for speedup. + *

      + *
    • PosFile(.pos) --> Header, <TermPositions> TermCount, Footer + *
    • Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
    • TermPositions --> <PackedPosDeltaBlock> PackedPosBlockNum, + * VIntBlock? + *
    • VIntBlock --> <PositionDelta[, PayloadLength?], PayloadData?, OffsetDelta?, + * OffsetLength?>PosVIntCount + *
    • PackedPosDeltaBlock --> {@link PackedInts PackedInts} + *
    • PositionDelta, OffsetDelta, OffsetLength --> {@link DataOutput#writeVInt VInt} + *
    • PayloadData --> {@link DataOutput#writeByte byte}PayLength + *
    • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
    + *

    Notes: + *

      + *
    • TermPositions are ordered by term (terms are implicit, from the term dictionary), and + * position values for each term-document pair are incremental, and ordered by document + * number. + *
    • PackedPosBlockNum is the number of packed blocks for current term's positions, + * payloads or offsets. In particular, PackedPosBlockNum = + * floor(totalTermFreq/PackedBlockSize) + *
    • PosVIntCount is the number of positions encoded as VInt format. In particular, + * PosVIntCount = totalTermFreq - PackedPosBlockNum*PackedBlockSize + *
    • The procedure by which PackedPosDeltaBlock is generated is the same as for PackedDocDeltaBlock + * in chapter Frequencies and Skip Data. + *
    • PositionDelta is, if payloads are disabled for the term's field, the difference + * between the position of the current occurrence in the document and the previous + * occurrence (or zero, if this is the first occurrence in this document). If payloads + * are enabled for the term's field, then PositionDelta/2 is the difference between the + * current and the previous position. If payloads are enabled and PositionDelta is odd, + * then PayloadLength is stored, indicating the length of the payload at the current + * term position. + *
    • For example, the TermPositions for a term which occurs as the fourth term in one + * document, and as the fifth and ninth term in a subsequent document, would be the + * following sequence of VInts (payloads disabled; reproduced by the sketch after these notes): + *

      4, 5, 4 + *

    • PayloadData is metadata associated with the current term position. If PayloadLength + * is stored at the current position, then it indicates the length of this payload. If + * PayloadLength is not stored, then this payload has the same length as the payload at + * the previous position. + *
    • OffsetDelta/2 is the difference between this position's startOffset from the previous + * occurrence (or zero, if this is the first occurrence in this document). If + * OffsetDelta is odd, then the length (endOffset-startOffset) differs from the previous + * occurrence and an OffsetLength follows. Offset data is only written for {@link + * IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}. + *
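    The payload-free PositionDelta encoding above can be sketched as follows (illustrative only; the method name is made up):

      // Sketch: position deltas restart at zero for every document, which is what
      // produces the "4, 5, 4" example above.
      static int[] encodePositionDeltas(int[][] positionsPerDoc) {
        int total = 0;
        for (int[] positions : positionsPerDoc) {
          total += positions.length;
        }
        int[] out = new int[total];
        int upto = 0;
        for (int[] positions : positionsPerDoc) {
          int prev = 0;                    // the delta base resets on every document
          for (int pos : positions) {
            out[upto++] = pos - prev;
            prev = pos;
          }
        }
        return out;
      }
      // encodePositionDeltas(new int[][] {{4}, {5, 9}}) -> [4, 5, 4]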
    + *
    + * + * + * + *
    + *
    Payloads and Offsets + *

    The .pay file will store payloads and offsets associated with certain term-document + * positions. Some payloads and offsets will be separated out into .pos file, for performance + * reasons. + *

      + *
    • PayFile(.pay): --> Header, <TermPayloads?, TermOffsets?> + * TermCount, Footer + *
    • Header --> {@link CodecUtil#writeIndexHeader IndexHeader} + *
    • TermPayloads --> <PackedPayLengthBlock, SumPayLength, PayData> + * PackedPayBlockNum + *
    • TermOffsets --> <PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock> + * PackedPayBlockNum + *
    • PackedPayLengthBlock, PackedOffsetStartDeltaBlock, PackedOffsetLengthBlock --> + * {@link PackedInts PackedInts} + *
    • SumPayLength --> {@link DataOutput#writeVInt VInt} + *
    • PayData --> {@link DataOutput#writeByte byte}SumPayLength + *
    • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
    + *

    Notes: + *

      + *
    • The order of TermPayloads/TermOffsets will be the same as TermPositions; note that + * part of the payloads/offsets are stored in .pos. + *
    • The procedure by which PackedPayLengthBlock and PackedOffsetLengthBlock are generated is + * the same as for PackedFreqBlock in chapter Frequencies and Skip + * Data, while PackedOffsetStartDeltaBlock follows the same procedure as + * PackedDocDeltaBlock. + *
    • PackedPayBlockNum is always equal to PackedPosBlockNum, for the same term. It is also + * synonym for PackedOffsetBlockNum. + *
    • SumPayLength is the total length of the payloads written within one block; it should be the + * sum of the PayLengths in one packed block (see the sketch after these notes). + *
    • PayLength in PackedPayLengthBlock is the length of each payload associated with the + * current position. + *
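    As a rough sketch of one TermPayloads block above (PackedPayLengthBlock, SumPayLength, PayData), assuming a made-up helper name:

      // Sketch: one packed block of payloads is written as 128 packed lengths,
      // then SumPayLength (VInt), then the concatenated payload bytes (PayData).
      static byte[] concatPayData(byte[][] payloads) {     // one block of payloads
        int sumPayLength = 0;
        for (byte[] p : payloads) {
          sumPayLength += p.length;                        // lengths go into PackedPayLengthBlock
        }
        byte[] payData = new byte[sumPayLength];
        int upto = 0;
        for (byte[] p : payloads) {
          System.arraycopy(p, 0, payData, upto, p.length);
          upto += p.length;
        }
        return payData;
      }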
    + *
    + * + * @lucene.experimental + */ +public final class Lucene103PostingsFormat extends PostingsFormat { + + /** Filename extension for some small metadata about how postings are encoded. */ + public static final String META_EXTENSION = "psm"; + + /** + * Filename extension for document number, frequencies, and skip data. See chapter: Frequencies and Skip Data + */ + public static final String DOC_EXTENSION = "doc"; + + /** Filename extension for positions. See chapter: Positions */ + public static final String POS_EXTENSION = "pos"; + + /** + * Filename extension for payloads and offsets. See chapter: Payloads and + * Offsets + */ + public static final String PAY_EXTENSION = "pay"; + + /** Size of blocks. */ + public static final int BLOCK_SIZE = ForUtil.BLOCK_SIZE; + + public static final int BLOCK_MASK = BLOCK_SIZE - 1; + + /** We insert skip data on every block and every SKIP_FACTOR=32 blocks. */ + public static final int LEVEL1_FACTOR = 32; + + /** Total number of docs covered by level 1 skip data: 32 * 128 = 4,096 */ + public static final int LEVEL1_NUM_DOCS = LEVEL1_FACTOR * BLOCK_SIZE; + + public static final int LEVEL1_MASK = LEVEL1_NUM_DOCS - 1; + + /** + * Return the class that implements {@link ImpactsEnum} in this {@link PostingsFormat}. This is + * internally used to help the JVM make good inlining decisions. + * + * @lucene.internal + */ + public static Class getImpactsEnumImpl() { + return Lucene103PostingsReader.BlockPostingsEnum.class; + } + + static final String TERMS_CODEC = "Lucene103PostingsWriterTerms"; + static final String META_CODEC = "Lucene103PostingsWriterMeta"; + static final String DOC_CODEC = "Lucene103PostingsWriterDoc"; + static final String POS_CODEC = "Lucene103PostingsWriterPos"; + static final String PAY_CODEC = "Lucene103PostingsWriterPay"; + + static final int VERSION_START = 0; + + static final int VERSION_CURRENT = VERSION_START; + + private final int version; + private final int minTermBlockSize; + private final int maxTermBlockSize; + + /** Creates {@code Lucene103PostingsFormat} with default settings. */ + public Lucene103PostingsFormat() { + this( + Lucene103BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, + Lucene103BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); + } + + /** + * Creates {@code Lucene103PostingsFormat} with custom values for {@code minBlockSize} and {@code + * maxBlockSize} passed to block terms dictionary. + * + * @see + * Lucene103BlockTreeTermsWriter#Lucene103BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int) + */ + public Lucene103PostingsFormat(int minTermBlockSize, int maxTermBlockSize) { + this(minTermBlockSize, maxTermBlockSize, VERSION_CURRENT); + } + + /** Expert constructor that allows setting the version. 
*/ + public Lucene103PostingsFormat(int minTermBlockSize, int maxTermBlockSize, int version) { + super("Lucene103"); + if (version < VERSION_START || version > VERSION_CURRENT) { + throw new IllegalArgumentException("Version out of range: " + version); + } + this.version = version; + Lucene103BlockTreeTermsWriter.validateSettings(minTermBlockSize, maxTermBlockSize); + this.minTermBlockSize = minTermBlockSize; + this.maxTermBlockSize = maxTermBlockSize; + } + + @Override + public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { + PostingsWriterBase postingsWriter = new Lucene103PostingsWriter(state, version); + boolean success = false; + try { + FieldsConsumer ret = + new Lucene103BlockTreeTermsWriter( + state, postingsWriter, minTermBlockSize, maxTermBlockSize); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsWriter); + } + } + } + + @Override + public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { + PostingsReaderBase postingsReader = new Lucene103PostingsReader(state); + boolean success = false; + try { + FieldsProducer ret = new Lucene103BlockTreeTermsReader(postingsReader, state); + success = true; + return ret; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(postingsReader); + } + } + } + + /** + * Holds all state required for {@link Lucene103PostingsReader} to produce a {@link + * org.apache.lucene.index.PostingsEnum} without re-seeking the terms dict. + * + * @lucene.internal + */ + public static final class IntBlockTermState extends BlockTermState { + /** file pointer to the start of the doc ids enumeration, in {@link #DOC_EXTENSION} file */ + public long docStartFP; + + /** file pointer to the start of the positions enumeration, in {@link #POS_EXTENSION} file */ + public long posStartFP; + + /** file pointer to the start of the payloads enumeration, in {@link #PAY_EXTENSION} file */ + public long payStartFP; + + /** + * file offset for the last position in the last block, if there are more than {@link + * ForUtil#BLOCK_SIZE} positions; otherwise -1 + * + *

    One might think to use total term frequency to track how many positions are left to read + * as we decode the blocks, and decode the last block differently when num_left_positions < + * BLOCK_SIZE. Unfortunately this won't work since the tracking will be messed up when we skip + * blocks as the skipper will only tell us new position offset (start of block) and number of + * positions to skip for that block, without telling us how many positions it has skipped. + */ + public long lastPosBlockOffset; + + /** + * docid when there is a single pulsed posting, otherwise -1. freq is always implicitly + * totalTermFreq in this case. + */ + public int singletonDocID; + + /** Sole constructor. */ + public IntBlockTermState() { + lastPosBlockOffset = -1; + singletonDocID = -1; + } + + @Override + public IntBlockTermState clone() { + IntBlockTermState other = new IntBlockTermState(); + other.copyFrom(this); + return other; + } + + @Override + public void copyFrom(TermState _other) { + super.copyFrom(_other); + IntBlockTermState other = (IntBlockTermState) _other; + docStartFP = other.docStartFP; + posStartFP = other.posStartFP; + payStartFP = other.payStartFP; + lastPosBlockOffset = other.lastPosBlockOffset; + singletonDocID = other.singletonDocID; + } + + @Override + public String toString() { + return super.toString() + + " docStartFP=" + + docStartFP + + " posStartFP=" + + posStartFP + + " payStartFP=" + + payStartFP + + " lastPosBlockOffset=" + + lastPosBlockOffset + + " singletonDocID=" + + singletonDocID; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java new file mode 100644 index 000000000000..ccacfc4658a8 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsReader.java @@ -0,0 +1,1439 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene103; + +import static org.apache.lucene.codecs.lucene103.ForUtil.BLOCK_SIZE; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.DOC_CODEC; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.LEVEL1_NUM_DOCS; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.META_CODEC; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.PAY_CODEC; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.POS_CODEC; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.TERMS_CODEC; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.VERSION_CURRENT; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.VERSION_START; + +import java.io.IOException; +import java.util.AbstractList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.RandomAccess; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.Impacts; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.internal.vectorization.PostingDecodingUtil; +import org.apache.lucene.internal.vectorization.VectorizationProvider; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.VectorUtil; + +/** + * Concrete class that reads docId(maybe frq,pos,offset,payloads) list with postings format. + * + * @lucene.experimental + */ +public final class Lucene103PostingsReader extends PostingsReaderBase { + + static final VectorizationProvider VECTORIZATION_PROVIDER = VectorizationProvider.getInstance(); + // Dummy impacts, composed of the maximum possible term frequency and the lowest possible + // (unsigned) norm value. This is typically used on tail blocks, which don't actually record + // impacts as the storage overhead would not be worth any query evaluation speedup, since there's + // less than 128 docs left to evaluate anyway. + private static final List DUMMY_IMPACTS = + Collections.singletonList(new Impact(Integer.MAX_VALUE, 1L)); + + private final IndexInput docIn; + private final IndexInput posIn; + private final IndexInput payIn; + + private final int maxNumImpactsAtLevel0; + private final int maxImpactNumBytesAtLevel0; + private final int maxNumImpactsAtLevel1; + private final int maxImpactNumBytesAtLevel1; + + /** Sole constructor. 
*/ + public Lucene103PostingsReader(SegmentReadState state) throws IOException { + String metaName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene103PostingsFormat.META_EXTENSION); + final long expectedDocFileLength, expectedPosFileLength, expectedPayFileLength; + ChecksumIndexInput metaIn = null; + boolean success = false; + int version; + try { + metaIn = state.directory.openChecksumInput(metaName); + version = + CodecUtil.checkIndexHeader( + metaIn, + META_CODEC, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + maxNumImpactsAtLevel0 = metaIn.readInt(); + maxImpactNumBytesAtLevel0 = metaIn.readInt(); + maxNumImpactsAtLevel1 = metaIn.readInt(); + maxImpactNumBytesAtLevel1 = metaIn.readInt(); + expectedDocFileLength = metaIn.readLong(); + if (state.fieldInfos.hasProx()) { + expectedPosFileLength = metaIn.readLong(); + if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { + expectedPayFileLength = metaIn.readLong(); + } else { + expectedPayFileLength = -1; + } + } else { + expectedPosFileLength = -1; + expectedPayFileLength = -1; + } + CodecUtil.checkFooter(metaIn, null); + success = true; + } catch (Throwable t) { + if (metaIn != null) { + CodecUtil.checkFooter(metaIn, t); + throw new AssertionError("unreachable"); + } else { + throw t; + } + } finally { + if (success) { + metaIn.close(); + } else { + IOUtils.closeWhileHandlingException(metaIn); + } + } + + success = false; + IndexInput docIn = null; + IndexInput posIn = null; + IndexInput payIn = null; + + // NOTE: these data files are too costly to verify checksum against all the bytes on open, + // but for now we at least verify proper structure of the checksum footer: which looks + // for FOOTER_MAGIC + algorithmID. This is cheap and can detect some forms of corruption + // such as file truncation. + + String docName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene103PostingsFormat.DOC_EXTENSION); + try { + // Postings have a forward-only access pattern, so pass ReadAdvice.NORMAL to perform + // readahead. 
+ docIn = state.directory.openInput(docName, state.context.withReadAdvice(ReadAdvice.NORMAL)); + CodecUtil.checkIndexHeader( + docIn, DOC_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.retrieveChecksum(docIn, expectedDocFileLength); + + if (state.fieldInfos.hasProx()) { + String proxName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene103PostingsFormat.POS_EXTENSION); + posIn = state.directory.openInput(proxName, state.context); + CodecUtil.checkIndexHeader( + posIn, POS_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.retrieveChecksum(posIn, expectedPosFileLength); + + if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { + String payName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene103PostingsFormat.PAY_EXTENSION); + payIn = state.directory.openInput(payName, state.context); + CodecUtil.checkIndexHeader( + payIn, PAY_CODEC, version, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.retrieveChecksum(payIn, expectedPayFileLength); + } + } + + this.docIn = docIn; + this.posIn = posIn; + this.payIn = payIn; + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(docIn, posIn, payIn); + } + } + } + + @Override + public void init(IndexInput termsIn, SegmentReadState state) throws IOException { + // Make sure we are talking to the matching postings writer + CodecUtil.checkIndexHeader( + termsIn, + TERMS_CODEC, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + final int indexBlockSize = termsIn.readVInt(); + if (indexBlockSize != BLOCK_SIZE) { + throw new IllegalStateException( + "index-time BLOCK_SIZE (" + + indexBlockSize + + ") != read-time BLOCK_SIZE (" + + BLOCK_SIZE + + ")"); + } + } + + static void prefixSum(int[] buffer, int count, long base) { + buffer[0] += base; + for (int i = 1; i < count; ++i) { + buffer[i] += buffer[i - 1]; + } + } + + @Override + public BlockTermState newTermState() { + return new IntBlockTermState(); + } + + @Override + public void close() throws IOException { + IOUtils.close(docIn, posIn, payIn); + } + + @Override + public void decodeTerm( + DataInput in, FieldInfo fieldInfo, BlockTermState _termState, boolean absolute) + throws IOException { + final IntBlockTermState termState = (IntBlockTermState) _termState; + if (absolute) { + termState.docStartFP = 0; + termState.posStartFP = 0; + termState.payStartFP = 0; + } + + final long l = in.readVLong(); + if ((l & 0x01) == 0) { + termState.docStartFP += l >>> 1; + if (termState.docFreq == 1) { + termState.singletonDocID = in.readVInt(); + } else { + termState.singletonDocID = -1; + } + } else { + assert absolute == false; + assert termState.singletonDocID != -1; + termState.singletonDocID += BitUtil.zigZagDecode(l >>> 1); + } + + if (fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0) { + termState.posStartFP += in.readVLong(); + if (fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0 + || fieldInfo.hasPayloads()) { + termState.payStartFP += in.readVLong(); + } + if (termState.totalTermFreq > BLOCK_SIZE) { + termState.lastPosBlockOffset = in.readVLong(); + } else { + termState.lastPosBlockOffset = -1; + } + } + } + + @Override + public PostingsEnum postings( + FieldInfo fieldInfo, BlockTermState termState, PostingsEnum reuse, int flags) + throws IOException { 
+ return (reuse instanceof BlockPostingsEnum everythingEnum + && everythingEnum.canReuse(docIn, fieldInfo, flags, false) + ? everythingEnum + : new BlockPostingsEnum(fieldInfo, flags, false)) + .reset((IntBlockTermState) termState, flags); + } + + @Override + public ImpactsEnum impacts(FieldInfo fieldInfo, BlockTermState state, int flags) + throws IOException { + return new BlockPostingsEnum(fieldInfo, flags, true).reset((IntBlockTermState) state, flags); + } + + private static int sumOverRange(int[] arr, int start, int end) { + int res = 0; + for (int i = start; i < end; i++) { + res += arr[i]; + } + return res; + } + + final class BlockPostingsEnum extends ImpactsEnum { + + private enum DeltaEncoding { + /** + * Deltas between consecutive docs are stored as packed integers, ie. the block is encoded + * using Frame Of Reference (FOR). + */ + PACKED, + /** + * Deltas between consecutive docs are stored using unary coding, ie. {@code delta-1} zero + * bits followed by a one bit, ie. the block is encoded as an offset plus a bit set. + */ + UNARY + } + + private ForDeltaUtil forDeltaUtil; + private PForUtil pforUtil; + + /* Variables that store the content of a block and the current position within this block */ + /* Shared variables */ + private DeltaEncoding encoding; + private int doc; // doc we last read + + /* Variables when the block is stored as packed deltas (Frame Of Reference) */ + private final int[] docBuffer = new int[BLOCK_SIZE]; + + /* Variables when the block is stored as a bit set */ + // Since we use a bit set when it's more storage-efficient, the bit set cannot have more than + // BLOCK_SIZE*32 bits, which is the maximum possible storage requirement with FOR. + private final FixedBitSet docBitSet = new FixedBitSet(BLOCK_SIZE * Integer.SIZE); + private int docBitSetBase; + // Reuse docBuffer for cumulative pop counts of the words of the bit set. 
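+ // These cumulative counts are only filled and read when a block is bit-set encoded and frequencies are requested (see refillFullBlock and the UNARY branch of advance below).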
+ private final int[] docCumulativeWordPopCounts = docBuffer; + + // level 0 skip data + private int level0LastDocID; + private long level0DocEndFP; + + // level 1 skip data + private int level1LastDocID; + private long level1DocEndFP; + private int level1DocCountUpto; + + private int docFreq; // number of docs in this posting list + private long totalTermFreq; // sum of freqBuffer in this posting list (or docFreq when omitted) + + private int singletonDocID; // docid when there is a single pulsed posting, otherwise -1 + + private int docCountLeft; // number of remaining docs in this postings list + private int prevDocID; // last doc ID of the previous block + + private int docBufferSize; + private int docBufferUpto; + + private IndexInput docIn; + private PostingDecodingUtil docInUtil; + + private final int[] freqBuffer = new int[BLOCK_SIZE]; + private final int[] posDeltaBuffer; + + private final int[] payloadLengthBuffer; + private final int[] offsetStartDeltaBuffer; + private final int[] offsetLengthBuffer; + + private byte[] payloadBytes; + private int payloadByteUpto; + private int payloadLength; + + private int lastStartOffset; + private int startOffset; + private int endOffset; + + private int posBufferUpto; + + final IndexInput posIn; + final PostingDecodingUtil posInUtil; + final IndexInput payIn; + final PostingDecodingUtil payInUtil; + final BytesRef payload; + + final IndexOptions options; + final boolean indexHasFreq; + final boolean indexHasPos; + final boolean indexHasOffsets; + final boolean indexHasPayloads; + final boolean indexHasOffsetsOrPayloads; + + final int flags; + final boolean needsFreq; + final boolean needsPos; + final boolean needsOffsets; + final boolean needsPayloads; + final boolean needsOffsetsOrPayloads; + final boolean needsImpacts; + final boolean needsDocsAndFreqsOnly; + + private long freqFP; // offset of the freq block + + private int position; // current position + + // value of docBufferUpto on the last doc ID when positions have been read + private int posDocBufferUpto; + + // how many positions "behind" we are; nextPosition must + // skip these to "catch up": + private int posPendingCount; + + // File pointer where the last (vInt encoded) pos delta + // block is. 
We need this to know whether to bulk + // decode vs vInt decode the block: + private long lastPosBlockFP; + + // level 0 skip data + private long level0PosEndFP; + private int level0BlockPosUpto; + private long level0PayEndFP; + private int level0BlockPayUpto; + private final BytesRef level0SerializedImpacts; + private final MutableImpactList level0Impacts; + + // level 1 skip data + private long level1PosEndFP; + private int level1BlockPosUpto; + private long level1PayEndFP; + private int level1BlockPayUpto; + private final BytesRef level1SerializedImpacts; + private final MutableImpactList level1Impacts; + + // true if we shallow-advanced to a new block that we have not decoded yet + private boolean needsRefilling; + + public BlockPostingsEnum(FieldInfo fieldInfo, int flags, boolean needsImpacts) + throws IOException { + options = fieldInfo.getIndexOptions(); + indexHasFreq = options.compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + indexHasPos = options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + indexHasOffsets = + options.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0; + indexHasPayloads = fieldInfo.hasPayloads(); + indexHasOffsetsOrPayloads = indexHasOffsets || indexHasPayloads; + + this.flags = flags; + needsFreq = indexHasFreq && PostingsEnum.featureRequested(flags, PostingsEnum.FREQS); + needsPos = indexHasPos && PostingsEnum.featureRequested(flags, PostingsEnum.POSITIONS); + needsOffsets = indexHasOffsets && PostingsEnum.featureRequested(flags, PostingsEnum.OFFSETS); + needsPayloads = + indexHasPayloads && PostingsEnum.featureRequested(flags, PostingsEnum.PAYLOADS); + needsOffsetsOrPayloads = needsOffsets || needsPayloads; + this.needsImpacts = needsImpacts; + needsDocsAndFreqsOnly = needsPos == false && needsImpacts == false; + + if (needsFreq == false) { + Arrays.fill(freqBuffer, 1); + } + + if (needsFreq && needsImpacts) { + level0SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel0); + level1SerializedImpacts = new BytesRef(maxImpactNumBytesAtLevel1); + level0Impacts = new MutableImpactList(maxNumImpactsAtLevel0); + level1Impacts = new MutableImpactList(maxNumImpactsAtLevel1); + } else { + level0SerializedImpacts = null; + level1SerializedImpacts = null; + level0Impacts = null; + level1Impacts = null; + } + + if (needsPos) { + this.posIn = Lucene103PostingsReader.this.posIn.clone(); + posInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(posIn); + posDeltaBuffer = new int[BLOCK_SIZE]; + } else { + this.posIn = null; + this.posInUtil = null; + posDeltaBuffer = null; + } + + if (needsOffsets || needsPayloads) { + this.payIn = Lucene103PostingsReader.this.payIn.clone(); + payInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(payIn); + } else { + this.payIn = null; + payInUtil = null; + } + + if (needsOffsets) { + offsetStartDeltaBuffer = new int[BLOCK_SIZE]; + offsetLengthBuffer = new int[BLOCK_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + startOffset = -1; + endOffset = -1; + } + + if (indexHasPayloads) { + payloadLengthBuffer = new int[BLOCK_SIZE]; + payloadBytes = new byte[128]; + payload = new BytesRef(); + } else { + payloadLengthBuffer = null; + payloadBytes = null; + payload = null; + } + } + + public boolean canReuse( + IndexInput docIn, FieldInfo fieldInfo, int flags, boolean needsImpacts) { + return docIn == Lucene103PostingsReader.this.docIn + && options == fieldInfo.getIndexOptions() + && indexHasPayloads == fieldInfo.hasPayloads() + && this.flags == flags + && this.needsImpacts == 
needsImpacts; + } + + public BlockPostingsEnum reset(IntBlockTermState termState, int flags) throws IOException { + docFreq = termState.docFreq; + singletonDocID = termState.singletonDocID; + if (docFreq > 1) { + if (docIn == null) { + // lazy init + docIn = Lucene103PostingsReader.this.docIn.clone(); + docInUtil = VECTORIZATION_PROVIDER.newPostingDecodingUtil(docIn); + } + prefetchPostings(docIn, termState); + } + + if (forDeltaUtil == null && docFreq >= BLOCK_SIZE) { + forDeltaUtil = new ForDeltaUtil(); + } + totalTermFreq = indexHasFreq ? termState.totalTermFreq : termState.docFreq; + if (needsFreq && pforUtil == null && totalTermFreq >= BLOCK_SIZE) { + pforUtil = new PForUtil(); + } + + // Where this term's postings start in the .pos file: + final long posTermStartFP = termState.posStartFP; + // Where this term's payloads/offsets start in the .pay + // file: + final long payTermStartFP = termState.payStartFP; + if (posIn != null) { + posIn.seek(posTermStartFP); + if (payIn != null) { + payIn.seek(payTermStartFP); + } + } + level1PosEndFP = posTermStartFP; + level1PayEndFP = payTermStartFP; + level0PosEndFP = posTermStartFP; + level0PayEndFP = payTermStartFP; + posPendingCount = 0; + payloadByteUpto = 0; + if (termState.totalTermFreq < BLOCK_SIZE) { + lastPosBlockFP = posTermStartFP; + } else if (termState.totalTermFreq == BLOCK_SIZE) { + lastPosBlockFP = -1; + } else { + lastPosBlockFP = posTermStartFP + termState.lastPosBlockOffset; + } + + level1BlockPosUpto = 0; + level1BlockPayUpto = 0; + level0BlockPosUpto = 0; + level0BlockPayUpto = 0; + posBufferUpto = BLOCK_SIZE; + + doc = -1; + prevDocID = -1; + docCountLeft = docFreq; + freqFP = -1L; + level0LastDocID = -1; + if (docFreq < LEVEL1_NUM_DOCS) { + level1LastDocID = NO_MORE_DOCS; + if (docFreq > 1) { + docIn.seek(termState.docStartFP); + } + } else { + level1LastDocID = -1; + level1DocEndFP = termState.docStartFP; + } + level1DocCountUpto = 0; + docBufferSize = BLOCK_SIZE; + docBufferUpto = BLOCK_SIZE; + posDocBufferUpto = BLOCK_SIZE; + + return this; + } + + @Override + public int docID() { + return doc; + } + + @Override + public int freq() throws IOException { + if (freqFP != -1) { + docIn.seek(freqFP); + pforUtil.decode(docInUtil, freqBuffer); + freqFP = -1; + } + return freqBuffer[docBufferUpto - 1]; + } + + private void refillFullBlock() throws IOException { + int bitsPerValue = docIn.readByte(); + if (bitsPerValue > 0) { + // block is encoded as 128 packed integers that record the delta between doc IDs + forDeltaUtil.decodeAndPrefixSum(bitsPerValue, docInUtil, prevDocID, docBuffer); + encoding = DeltaEncoding.PACKED; + } else { + // block is encoded as a bit set + assert level0LastDocID != NO_MORE_DOCS; + docBitSetBase = prevDocID + 1; + int numLongs; + if (bitsPerValue == 0) { + // 0 is used to record that all 128 docs in the block are consecutive + numLongs = BLOCK_SIZE / Long.SIZE; // 2 + docBitSet.set(0, BLOCK_SIZE); + } else { + numLongs = -bitsPerValue; + docIn.readLongs(docBitSet.getBits(), 0, numLongs); + } + if (needsFreq) { + // Note: we know that BLOCK_SIZE bits are set, so no need to compute the cumulative pop + // count at the last index, it will be BLOCK_SIZE. 
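+ // Two passes below: the first records the pop count of each word, the second accumulates them into cumulative counts.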
+ // Note: this for loop auto-vectorizes + for (int i = 0; i < numLongs - 1; ++i) { + docCumulativeWordPopCounts[i] = Long.bitCount(docBitSet.getBits()[i]); + } + for (int i = 1; i < numLongs - 1; ++i) { + docCumulativeWordPopCounts[i] += docCumulativeWordPopCounts[i - 1]; + } + docCumulativeWordPopCounts[numLongs - 1] = BLOCK_SIZE; + assert docCumulativeWordPopCounts[numLongs - 2] + + Long.bitCount(docBitSet.getBits()[numLongs - 1]) + == BLOCK_SIZE; + } + encoding = DeltaEncoding.UNARY; + } + if (indexHasFreq) { + if (needsFreq) { + freqFP = docIn.getFilePointer(); + } + PForUtil.skip(docIn); + } + docCountLeft -= BLOCK_SIZE; + prevDocID = docBuffer[BLOCK_SIZE - 1]; + docBufferUpto = 0; + posDocBufferUpto = 0; + } + + private void refillRemainder() throws IOException { + assert docCountLeft >= 0 && docCountLeft < BLOCK_SIZE; + if (docFreq == 1) { + docBuffer[0] = singletonDocID; + freqBuffer[0] = (int) totalTermFreq; + docBuffer[1] = NO_MORE_DOCS; + assert freqFP == -1; + docCountLeft = 0; + docBufferSize = 1; + } else { + // Read vInts: + PostingsUtil.readVIntBlock( + docIn, docBuffer, freqBuffer, docCountLeft, indexHasFreq, needsFreq); + prefixSum(docBuffer, docCountLeft, prevDocID); + docBuffer[docCountLeft] = NO_MORE_DOCS; + freqFP = -1L; + docBufferSize = docCountLeft; + docCountLeft = 0; + } + prevDocID = docBuffer[BLOCK_SIZE - 1]; + docBufferUpto = 0; + posDocBufferUpto = 0; + encoding = DeltaEncoding.PACKED; + assert docBuffer[docBufferSize] == NO_MORE_DOCS; + } + + private void refillDocs() throws IOException { + assert docCountLeft >= 0; + + if (docCountLeft >= BLOCK_SIZE) { + refillFullBlock(); + } else { + refillRemainder(); + } + } + + private void skipLevel1To(int target) throws IOException { + while (true) { + prevDocID = level1LastDocID; + level0LastDocID = level1LastDocID; + docIn.seek(level1DocEndFP); + level0PosEndFP = level1PosEndFP; + level0BlockPosUpto = level1BlockPosUpto; + level0PayEndFP = level1PayEndFP; + level0BlockPayUpto = level1BlockPayUpto; + docCountLeft = docFreq - level1DocCountUpto; + level1DocCountUpto += LEVEL1_NUM_DOCS; + + if (docCountLeft < LEVEL1_NUM_DOCS) { + level1LastDocID = NO_MORE_DOCS; + break; + } + + level1LastDocID += docIn.readVInt(); + long delta = docIn.readVLong(); + level1DocEndFP = delta + docIn.getFilePointer(); + + if (indexHasFreq) { + long skip1EndFP = docIn.readShort() + docIn.getFilePointer(); + int numImpactBytes = docIn.readShort(); + if (needsImpacts && level1LastDocID >= target) { + docIn.readBytes(level1SerializedImpacts.bytes, 0, numImpactBytes); + level1SerializedImpacts.length = numImpactBytes; + } else { + docIn.skipBytes(numImpactBytes); + } + if (indexHasPos) { + level1PosEndFP += docIn.readVLong(); + level1BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + level1PayEndFP += docIn.readVLong(); + level1BlockPayUpto = docIn.readVInt(); + } + } + assert docIn.getFilePointer() == skip1EndFP; + } + + if (level1LastDocID >= target) { + break; + } + } + } + + private void doMoveToNextLevel0Block() throws IOException { + assert doc == level0LastDocID; + if (posIn != null) { + if (level0PosEndFP >= posIn.getFilePointer()) { + posIn.seek(level0PosEndFP); + posPendingCount = level0BlockPosUpto; + if (payIn != null) { + assert level0PayEndFP >= payIn.getFilePointer(); + payIn.seek(level0PayEndFP); + payloadByteUpto = level0BlockPayUpto; + } + posBufferUpto = BLOCK_SIZE; + } else { + assert freqFP == -1L; + posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); + } + } + + if (docCountLeft 
>= BLOCK_SIZE) { + docIn.readVLong(); // level0NumBytes + int docDelta = readVInt15(docIn); + level0LastDocID += docDelta; + long blockLength = readVLong15(docIn); + level0DocEndFP = docIn.getFilePointer() + blockLength; + if (indexHasFreq) { + int numImpactBytes = docIn.readVInt(); + if (needsImpacts) { + docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); + level0SerializedImpacts.length = numImpactBytes; + } else { + docIn.skipBytes(numImpactBytes); + } + + if (indexHasPos) { + level0PosEndFP += docIn.readVLong(); + level0BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + level0PayEndFP += docIn.readVLong(); + level0BlockPayUpto = docIn.readVInt(); + } + } + } + refillFullBlock(); + } else { + level0LastDocID = NO_MORE_DOCS; + refillRemainder(); + } + } + + private void moveToNextLevel0Block() throws IOException { + if (doc == level1LastDocID) { // advance level 1 skip data + skipLevel1To(doc + 1); + } + + // Now advance level 0 skip data + prevDocID = level0LastDocID; + + if (needsDocsAndFreqsOnly && docCountLeft >= BLOCK_SIZE) { + // Optimize the common path for exhaustive evaluation + long level0NumBytes = docIn.readVLong(); + long level0End = docIn.getFilePointer() + level0NumBytes; + level0LastDocID += readVInt15(docIn); + docIn.seek(level0End); + refillFullBlock(); + } else { + doMoveToNextLevel0Block(); + } + } + + private void readLevel0PosData() throws IOException { + level0PosEndFP += docIn.readVLong(); + level0BlockPosUpto = docIn.readByte(); + if (indexHasOffsetsOrPayloads) { + level0PayEndFP += docIn.readVLong(); + level0BlockPayUpto = docIn.readVInt(); + } + } + + private void seekPosData(long posFP, int posUpto, long payFP, int payUpto) throws IOException { + // If nextBlockPosFP is less than the current FP, it means that the block of positions for + // the first docs of the next block are already decoded. In this case we just accumulate + // frequencies into posPendingCount instead of seeking backwards and decoding the same pos + // block again. 
+ if (posFP >= posIn.getFilePointer()) { + posIn.seek(posFP); + posPendingCount = posUpto; + if (payIn != null) { // needs payloads or offsets + assert level0PayEndFP >= payIn.getFilePointer(); + payIn.seek(payFP); + payloadByteUpto = payUpto; + } + posBufferUpto = BLOCK_SIZE; + } else { + posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, BLOCK_SIZE); + } + } + + private void skipLevel0To(int target) throws IOException { + long posFP; + int posUpto; + long payFP; + int payUpto; + + while (true) { + prevDocID = level0LastDocID; + + posFP = level0PosEndFP; + posUpto = level0BlockPosUpto; + payFP = level0PayEndFP; + payUpto = level0BlockPayUpto; + + if (docCountLeft >= BLOCK_SIZE) { + long numSkipBytes = docIn.readVLong(); + long skip0End = docIn.getFilePointer() + numSkipBytes; + int docDelta = readVInt15(docIn); + level0LastDocID += docDelta; + boolean found = target <= level0LastDocID; + long blockLength = readVLong15(docIn); + level0DocEndFP = docIn.getFilePointer() + blockLength; + + if (indexHasFreq) { + if (found == false && needsPos == false) { + docIn.seek(skip0End); + } else { + int numImpactBytes = docIn.readVInt(); + if (needsImpacts && found) { + docIn.readBytes(level0SerializedImpacts.bytes, 0, numImpactBytes); + level0SerializedImpacts.length = numImpactBytes; + } else { + docIn.skipBytes(numImpactBytes); + } + + if (needsPos) { + readLevel0PosData(); + } else { + docIn.seek(skip0End); + } + } + } + + if (found) { + break; + } + + docIn.seek(level0DocEndFP); + docCountLeft -= BLOCK_SIZE; + } else { + level0LastDocID = NO_MORE_DOCS; + break; + } + } + + if (posIn != null) { // needs positions + seekPosData(posFP, posUpto, payFP, payUpto); + } + } + + @Override + public void advanceShallow(int target) throws IOException { + if (target > level0LastDocID) { // advance level 0 skip data + doAdvanceShallow(target); + needsRefilling = true; + } + } + + private void doAdvanceShallow(int target) throws IOException { + if (target > level1LastDocID) { // advance skip data on level 1 + skipLevel1To(target); + } else if (needsRefilling) { + docIn.seek(level0DocEndFP); + docCountLeft -= BLOCK_SIZE; + } + + skipLevel0To(target); + } + + @Override + public int nextDoc() throws IOException { + if (doc == level0LastDocID || needsRefilling) { + if (needsRefilling) { + refillDocs(); + needsRefilling = false; + } else { + moveToNextLevel0Block(); + } + } + + switch (encoding) { + case PACKED: + doc = docBuffer[docBufferUpto]; + break; + case UNARY: + int next = docBitSet.nextSetBit(doc - docBitSetBase + 1); + assert next != NO_MORE_DOCS; + doc = docBitSetBase + next; + break; + } + + ++docBufferUpto; + return this.doc; + } + + @Override + public int advance(int target) throws IOException { + if (target > level0LastDocID || needsRefilling) { + if (target > level0LastDocID) { + doAdvanceShallow(target); + } + refillDocs(); + needsRefilling = false; + } + + switch (encoding) { + case PACKED: + { + int next = VectorUtil.findNextGEQ(docBuffer, target, docBufferUpto, docBufferSize); + this.doc = docBuffer[next]; + docBufferUpto = next + 1; + } + break; + case UNARY: + { + int next = docBitSet.nextSetBit(target - docBitSetBase); + assert next != NO_MORE_DOCS; + this.doc = docBitSetBase + next; + if (needsFreq) { + int wordIndex = next >> 6; + // Take the cumulative pop count for the given word, and subtract bits on the left of + // the current doc. 
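+ // The result is the 1-based rank of the current doc's bit within this block, so freqBuffer[docBufferUpto - 1] lines up with this doc once freqs are decoded.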
+ docBufferUpto = + 1 + + docCumulativeWordPopCounts[wordIndex] + - Long.bitCount(docBitSet.getBits()[wordIndex] >>> next); + } else { + // When only docs needed and block is UNARY encoded, we do not need to maintain + // docBufferUpTo to record the iteration position in the block. + // docBufferUpTo == 0 means the block has not been iterated. + // docBufferUpTo != 0 means the block has been iterated. + docBufferUpto = 1; + } + } + break; + } + + return doc; + } + + @Override + public void intoBitSet(int upTo, FixedBitSet bitSet, int offset) throws IOException { + if (doc >= upTo) { + return; + } + + // Handle the current doc separately, it may be on the previous docBuffer. + bitSet.set(doc - offset); + + for (; ; ) { + if (doc == level0LastDocID) { + // refill + moveToNextLevel0Block(); + } + + switch (encoding) { + case PACKED: + { + int start = docBufferUpto; + int end = computeBufferEndBoundary(upTo); + if (end != 0) { + bufferIntoBitSet(start, end, bitSet, offset); + doc = docBuffer[end - 1]; + } + docBufferUpto = end; + if (end != BLOCK_SIZE) { + // Either the block is a tail block, or the block did not fully match, we're done. + nextDoc(); + assert doc >= upTo; + return; + } + } + break; + case UNARY: + { + int sourceFrom; + if (docBufferUpto == 0) { + // start from beginning + sourceFrom = 0; + } else { + // start after the current doc + sourceFrom = doc - docBitSetBase + 1; + } + + int destFrom = docBitSetBase - offset + sourceFrom; + + assert level0LastDocID != NO_MORE_DOCS; + int sourceTo = Math.min(upTo, level0LastDocID + 1) - docBitSetBase; + + if (sourceTo > sourceFrom) { + FixedBitSet.orRange(docBitSet, sourceFrom, bitSet, destFrom, sourceTo - sourceFrom); + } + if (docBitSetBase + sourceTo <= level0LastDocID) { + // We stopped before the end of the current bit set, which means that we're done. + // Set the current doc before returning. 
+ advance(docBitSetBase + sourceTo); + return; + } + doc = level0LastDocID; + docBufferUpto = BLOCK_SIZE; + } + break; + } + } + } + + private int computeBufferEndBoundary(int upTo) { + if (docBufferSize != 0 && docBuffer[docBufferSize - 1] < upTo) { + // All docs in the buffer are under upTo + return docBufferSize; + } else { + // Find the index of the first doc that is greater than or equal to upTo + return VectorUtil.findNextGEQ(docBuffer, upTo, docBufferUpto, docBufferSize); + } + } + + private void bufferIntoBitSet(int start, int end, FixedBitSet bitSet, int offset) + throws IOException { + // bitSet#set and `doc - offset` get auto-vectorized + for (int i = start; i < end; ++i) { + int doc = docBuffer[i]; + bitSet.set(doc - offset); + } + } + + private void skipPositions(int freq) throws IOException { + // Skip positions now: + int toSkip = posPendingCount - freq; + // if (DEBUG) { + // System.out.println(" FPR.skipPositions: toSkip=" + toSkip); + // } + + final int leftInBlock = BLOCK_SIZE - posBufferUpto; + if (toSkip < leftInBlock) { + int end = posBufferUpto + toSkip; + if (needsPayloads) { + payloadByteUpto += sumOverRange(payloadLengthBuffer, posBufferUpto, end); + } + posBufferUpto = end; + } else { + toSkip -= leftInBlock; + while (toSkip >= BLOCK_SIZE) { + assert posIn.getFilePointer() != lastPosBlockFP; + PForUtil.skip(posIn); + + if (payIn != null) { + if (indexHasPayloads) { + // Skip payloadLength block: + PForUtil.skip(payIn); + + // Skip payloadBytes block: + int numBytes = payIn.readVInt(); + payIn.seek(payIn.getFilePointer() + numBytes); + } + + if (indexHasOffsets) { + PForUtil.skip(payIn); + PForUtil.skip(payIn); + } + } + toSkip -= BLOCK_SIZE; + } + refillPositions(); + if (needsPayloads) { + payloadByteUpto = sumOverRange(payloadLengthBuffer, 0, toSkip); + } + posBufferUpto = toSkip; + } + } + + private void refillLastPositionBlock() throws IOException { + final int count = (int) (totalTermFreq % BLOCK_SIZE); + int payloadLength = 0; + int offsetLength = 0; + payloadByteUpto = 0; + for (int i = 0; i < count; i++) { + int code = posIn.readVInt(); + if (indexHasPayloads) { + if ((code & 1) != 0) { + payloadLength = posIn.readVInt(); + } + if (payloadLengthBuffer != null) { // needs payloads + payloadLengthBuffer[i] = payloadLength; + posDeltaBuffer[i] = code >>> 1; + if (payloadLength != 0) { + if (payloadByteUpto + payloadLength > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payloadLength); + } + posIn.readBytes(payloadBytes, payloadByteUpto, payloadLength); + payloadByteUpto += payloadLength; + } + } else { + posIn.skipBytes(payloadLength); + } + } else { + posDeltaBuffer[i] = code; + } + + if (indexHasOffsets) { + int deltaCode = posIn.readVInt(); + if ((deltaCode & 1) != 0) { + offsetLength = posIn.readVInt(); + } + if (offsetStartDeltaBuffer != null) { // needs offsets + offsetStartDeltaBuffer[i] = deltaCode >>> 1; + offsetLengthBuffer[i] = offsetLength; + } + } + } + payloadByteUpto = 0; + } + + private void refillOffsetsOrPayloads() throws IOException { + if (indexHasPayloads) { + if (needsPayloads) { + pforUtil.decode(payInUtil, payloadLengthBuffer); + int numBytes = payIn.readVInt(); + + if (numBytes > payloadBytes.length) { + payloadBytes = ArrayUtil.growNoCopy(payloadBytes, numBytes); + } + payIn.readBytes(payloadBytes, 0, numBytes); + } else if (payIn != null) { // needs offsets + // this works, because when writing a vint block we always force the first length to be + // written + PForUtil.skip(payIn); // skip 
over lengths + int numBytes = payIn.readVInt(); // read length of payloadBytes + payIn.seek(payIn.getFilePointer() + numBytes); // skip over payloadBytes + } + payloadByteUpto = 0; + } + + if (indexHasOffsets) { + if (needsOffsets) { + pforUtil.decode(payInUtil, offsetStartDeltaBuffer); + pforUtil.decode(payInUtil, offsetLengthBuffer); + } else if (payIn != null) { // needs payloads + // this works, because when writing a vint block we always force the first length to be + // written + PForUtil.skip(payIn); // skip over starts + PForUtil.skip(payIn); // skip over lengths + } + } + } + + private void refillPositions() throws IOException { + if (posIn.getFilePointer() == lastPosBlockFP) { + refillLastPositionBlock(); + return; + } + pforUtil.decode(posInUtil, posDeltaBuffer); + + if (indexHasOffsetsOrPayloads) { + refillOffsetsOrPayloads(); + } + } + + private void accumulatePendingPositions() throws IOException { + int freq = freq(); // trigger lazy decoding of freqs + posPendingCount += sumOverRange(freqBuffer, posDocBufferUpto, docBufferUpto); + posDocBufferUpto = docBufferUpto; + + assert posPendingCount > 0; + + if (posPendingCount > freq) { + skipPositions(freq); + posPendingCount = freq; + } + } + + private void accumulatePayloadAndOffsets() { + if (needsPayloads) { + payloadLength = payloadLengthBuffer[posBufferUpto]; + payload.bytes = payloadBytes; + payload.offset = payloadByteUpto; + payload.length = payloadLength; + payloadByteUpto += payloadLength; + } + + if (needsOffsets) { + startOffset = lastStartOffset + offsetStartDeltaBuffer[posBufferUpto]; + endOffset = startOffset + offsetLengthBuffer[posBufferUpto]; + lastStartOffset = startOffset; + } + } + + @Override + public int nextPosition() throws IOException { + if (needsPos == false) { + return -1; + } + + assert posDocBufferUpto <= docBufferUpto; + if (posDocBufferUpto != docBufferUpto) { + // First position we're reading on this doc + accumulatePendingPositions(); + position = 0; + lastStartOffset = 0; + } + + if (posBufferUpto == BLOCK_SIZE) { + refillPositions(); + posBufferUpto = 0; + } + position += posDeltaBuffer[posBufferUpto]; + + if (needsOffsetsOrPayloads) { + accumulatePayloadAndOffsets(); + } + + posBufferUpto++; + posPendingCount--; + return position; + } + + @Override + public int startOffset() { + if (needsOffsets == false) { + return -1; + } + return startOffset; + } + + @Override + public int endOffset() { + if (needsOffsets == false) { + return -1; + } + return endOffset; + } + + @Override + public BytesRef getPayload() { + if (needsPayloads == false || payloadLength == 0) { + return null; + } else { + return payload; + } + } + + @Override + public long cost() { + return docFreq; + } + + private final Impacts impacts = + new Impacts() { + + private final ByteArrayDataInput scratch = new ByteArrayDataInput(); + + @Override + public int numLevels() { + return indexHasFreq == false || level1LastDocID == NO_MORE_DOCS ? 1 : 2; + } + + @Override + public int getDocIdUpTo(int level) { + if (indexHasFreq == false) { + return NO_MORE_DOCS; + } + if (level == 0) { + return level0LastDocID; + } + return level == 1 ? 
level1LastDocID : NO_MORE_DOCS; + } + + @Override + public List<Impact> getImpacts(int level) { + if (indexHasFreq) { + if (level == 0 && level0LastDocID != NO_MORE_DOCS) { + return readImpacts(level0SerializedImpacts, level0Impacts); + } + if (level == 1) { + return readImpacts(level1SerializedImpacts, level1Impacts); + } + } + return DUMMY_IMPACTS; + } + + private List<Impact> readImpacts(BytesRef serialized, MutableImpactList impactsList) { + var scratch = this.scratch; + scratch.reset(serialized.bytes, 0, serialized.length); + Lucene103PostingsReader.readImpacts(scratch, impactsList); + return impactsList; + } + }; + + @Override + public Impacts getImpacts() { + assert needsImpacts; + return impacts; + } + } + + /** + * @see Lucene103PostingsWriter#writeVInt15(org.apache.lucene.store.DataOutput, int) + */ + static int readVInt15(DataInput in) throws IOException { + short s = in.readShort(); + if (s >= 0) { + return s; + } else { + return (s & 0x7FFF) | (in.readVInt() << 15); + } + } + + /** + * @see Lucene103PostingsWriter#writeVLong15(org.apache.lucene.store.DataOutput, long) + */ + static long readVLong15(DataInput in) throws IOException { + short s = in.readShort(); + if (s >= 0) { + return s; + } else { + return (s & 0x7FFFL) | (in.readVLong() << 15); + } + } + + private static void prefetchPostings(IndexInput docIn, IntBlockTermState state) + throws IOException { + assert state.docFreq > 1; // Singletons are inlined in the terms dict, nothing to prefetch + if (docIn.getFilePointer() != state.docStartFP) { + // Don't prefetch if the input is already positioned at the right offset, which suggests that + // the caller is streaming the entire inverted index (e.g. for merging), let the read-ahead + // logic do its work instead. Note that this heuristic doesn't work for terms that have skip + // data, since skip data is stored after the last term, but handling all terms that have <128 + // docs is a good start already. + docIn.prefetch(state.docStartFP, 1); + } + // Note: we don't prefetch positions or offsets, which are less likely to be needed.
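+ // For illustration (an assumption about typical callers, not spelled out above): a merge streams + // terms in file order, so docIn usually already sits at docStartFP and the prefetch is skipped, + // while a one-off term lookup typically lands elsewhere and benefits from prefetching the first + // bytes of its doc block.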
+ } + + static class MutableImpactList extends AbstractList<Impact> implements RandomAccess { + int length; + final Impact[] impacts; + + MutableImpactList(int capacity) { + impacts = new Impact[capacity]; + for (int i = 0; i < capacity; ++i) { + impacts[i] = new Impact(Integer.MAX_VALUE, 1L); + } + } + + @Override + public Impact get(int index) { + return impacts[index]; + } + + @Override + public int size() { + return length; + } + } + + static MutableImpactList readImpacts(ByteArrayDataInput in, MutableImpactList reuse) { + int freq = 0; + long norm = 0; + int length = 0; + while (in.getPosition() < in.length()) { + int freqDelta = in.readVInt(); + if ((freqDelta & 0x01) != 0) { + freq += 1 + (freqDelta >>> 1); + try { + norm += 1 + in.readZLong(); + } catch (IOException e) { + throw new RuntimeException(e); // cannot happen on a BADI + } + } else { + freq += 1 + (freqDelta >>> 1); + norm++; + } + Impact impact = reuse.impacts[length]; + impact.freq = freq; + impact.norm = norm; + length++; + } + reuse.length = length; + return reuse; + } + + @Override + public void checkIntegrity() throws IOException { + if (docIn != null) { + CodecUtil.checksumEntireFile(docIn); + } + if (posIn != null) { + CodecUtil.checksumEntireFile(posIn); + } + if (payIn != null) { + CodecUtil.checksumEntireFile(payIn); + } + } + + @Override + public String toString() { + return getClass().getSimpleName() + + "(positions=" + + (posIn != null) + + ",payloads=" + + (payIn != null) + + ")"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsWriter.java new file mode 100644 index 000000000000..fc43c9053de2 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/Lucene103PostingsWriter.java @@ -0,0 +1,731 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ +package org.apache.lucene.codecs.lucene103; + +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.*; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.DOC_CODEC; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.LEVEL1_MASK; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.META_CODEC; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.PAY_CODEC; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.POS_CODEC; +import static org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.TERMS_CODEC; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.CompetitiveImpactAccumulator; +import org.apache.lucene.codecs.PushPostingsWriterBase; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat.IntBlockTermState; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BitUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; + +/** Writer for {@link Lucene103PostingsFormat}. */ +public class Lucene103PostingsWriter extends PushPostingsWriterBase { + + static final IntBlockTermState EMPTY_STATE = new IntBlockTermState(); + + private final int version; + + IndexOutput metaOut; + IndexOutput docOut; + IndexOutput posOut; + IndexOutput payOut; + + IntBlockTermState lastState; + + // Holds starting file pointers for current term: + private long docStartFP; + private long posStartFP; + private long payStartFP; + + final int[] docDeltaBuffer; + final int[] freqBuffer; + private int docBufferUpto; + + final int[] posDeltaBuffer; + final int[] payloadLengthBuffer; + final int[] offsetStartDeltaBuffer; + final int[] offsetLengthBuffer; + private int posBufferUpto; + + private byte[] payloadBytes; + private int payloadByteUpto; + + private int level0LastDocID; + private long level0LastPosFP; + private long level0LastPayFP; + + private int level1LastDocID; + private long level1LastPosFP; + private long level1LastPayFP; + + private int docID; + private int lastDocID; + private int lastPosition; + private int lastStartOffset; + private int docCount; + + private final PForUtil pforUtil; + private final ForDeltaUtil forDeltaUtil; + + private boolean fieldHasNorms; + private NumericDocValues norms; + private final CompetitiveImpactAccumulator level0FreqNormAccumulator = + new CompetitiveImpactAccumulator(); + private final CompetitiveImpactAccumulator level1CompetitiveFreqNormAccumulator = + new CompetitiveImpactAccumulator(); + + private int maxNumImpactsAtLevel0; + private int maxImpactNumBytesAtLevel0; + private int maxNumImpactsAtLevel1; + private int maxImpactNumBytesAtLevel1; + + /** Scratch output that we use to be able to prepend the encoded length, e.g. impacts. 
*/ + private final ByteBuffersDataOutput scratchOutput = ByteBuffersDataOutput.newResettableInstance(); + + /** + * Output for a single block. This is useful to be able to prepend skip data before each block, + * which can only be computed once the block is encoded. The content is then typically copied to + * {@link #level1Output}. + */ + private final ByteBuffersDataOutput level0Output = ByteBuffersDataOutput.newResettableInstance(); + + /** + * Output for groups of 32 blocks. This is useful to prepend skip data for these 32 blocks, which + * can only be done once we have encoded these 32 blocks. The content is then typically copied to + * {@link #docOut}. + */ + private final ByteBuffersDataOutput level1Output = ByteBuffersDataOutput.newResettableInstance(); + + /** + * Reusable FixedBitSet, for dense blocks that are more efficiently stored by storing them as a + * bit set than as packed deltas. + */ + // Since we use a bit set when it's more storage-efficient, the bit set cannot have more than + // BLOCK_SIZE*32 bits, which is the maximum possible storage requirement with FOR. + private final FixedBitSet spareBitSet = new FixedBitSet(BLOCK_SIZE * Integer.SIZE); + + /** Sole public constructor. */ + public Lucene103PostingsWriter(SegmentWriteState state) throws IOException { + this(state, Lucene103PostingsFormat.VERSION_CURRENT); + } + + /** Constructor that takes a version. */ + Lucene103PostingsWriter(SegmentWriteState state, int version) throws IOException { + this.version = version; + String metaFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene103PostingsFormat.META_EXTENSION); + String docFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene103PostingsFormat.DOC_EXTENSION); + metaOut = state.directory.createOutput(metaFileName, state.context); + IndexOutput posOut = null; + IndexOutput payOut = null; + boolean success = false; + try { + docOut = state.directory.createOutput(docFileName, state.context); + CodecUtil.writeIndexHeader( + metaOut, META_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); + CodecUtil.writeIndexHeader( + docOut, DOC_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); + forDeltaUtil = new ForDeltaUtil(); + pforUtil = new PForUtil(); + if (state.fieldInfos.hasProx()) { + posDeltaBuffer = new int[BLOCK_SIZE]; + String posFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, state.segmentSuffix, Lucene103PostingsFormat.POS_EXTENSION); + posOut = state.directory.createOutput(posFileName, state.context); + CodecUtil.writeIndexHeader( + posOut, POS_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); + + if (state.fieldInfos.hasPayloads()) { + payloadBytes = new byte[128]; + payloadLengthBuffer = new int[BLOCK_SIZE]; + } else { + payloadBytes = null; + payloadLengthBuffer = null; + } + + if (state.fieldInfos.hasOffsets()) { + offsetStartDeltaBuffer = new int[BLOCK_SIZE]; + offsetLengthBuffer = new int[BLOCK_SIZE]; + } else { + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + } + + if (state.fieldInfos.hasPayloads() || state.fieldInfos.hasOffsets()) { + String payFileName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene103PostingsFormat.PAY_EXTENSION); + payOut = state.directory.createOutput(payFileName, state.context); + CodecUtil.writeIndexHeader( + payOut, PAY_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); + } + } else { + posDeltaBuffer =
null; + payloadLengthBuffer = null; + offsetStartDeltaBuffer = null; + offsetLengthBuffer = null; + payloadBytes = null; + } + this.payOut = payOut; + this.posOut = posOut; + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(metaOut, docOut, posOut, payOut); + } + } + + docDeltaBuffer = new int[BLOCK_SIZE]; + freqBuffer = new int[BLOCK_SIZE]; + } + + @Override + public IntBlockTermState newTermState() { + return new IntBlockTermState(); + } + + @Override + public void init(IndexOutput termsOut, SegmentWriteState state) throws IOException { + CodecUtil.writeIndexHeader( + termsOut, TERMS_CODEC, version, state.segmentInfo.getId(), state.segmentSuffix); + termsOut.writeVInt(BLOCK_SIZE); + } + + @Override + public void setField(FieldInfo fieldInfo) { + super.setField(fieldInfo); + lastState = EMPTY_STATE; + fieldHasNorms = fieldInfo.hasNorms(); + } + + @Override + public void startTerm(NumericDocValues norms) { + docStartFP = docOut.getFilePointer(); + if (writePositions) { + posStartFP = posOut.getFilePointer(); + level1LastPosFP = level0LastPosFP = posStartFP; + if (writePayloads || writeOffsets) { + payStartFP = payOut.getFilePointer(); + level1LastPayFP = level0LastPayFP = payStartFP; + } + } + lastDocID = -1; + level0LastDocID = -1; + level1LastDocID = -1; + this.norms = norms; + if (writeFreqs) { + level0FreqNormAccumulator.clear(); + } + } + + @Override + public void startDoc(int docID, int termDocFreq) throws IOException { + if (docBufferUpto == BLOCK_SIZE) { + flushDocBlock(false); + docBufferUpto = 0; + } + + final int docDelta = docID - lastDocID; + + if (docID < 0 || docDelta <= 0) { + throw new CorruptIndexException( + "docs out of order (" + docID + " <= " + lastDocID + " )", docOut); + } + + docDeltaBuffer[docBufferUpto] = docDelta; + if (writeFreqs) { + freqBuffer[docBufferUpto] = termDocFreq; + } + + this.docID = docID; + lastPosition = 0; + lastStartOffset = 0; + + if (writeFreqs) { + long norm; + if (fieldHasNorms) { + boolean found = norms.advanceExact(docID); + if (found == false) { + // This can happen if indexing hits a problem after adding a doc to the + // postings but before buffering the norm. Such documents are written + // deleted and will go away on the first merge. 
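+ // Note: the 1L fallback matches the default norm used below when the field has no norms at + // all, so the competitive impact accumulator always receives a non-zero norm.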
+ norm = 1L; + } else { + norm = norms.longValue(); + assert norm != 0 : docID; + } + } else { + norm = 1L; + } + + level0FreqNormAccumulator.add(termDocFreq, norm); + } + } + + @Override + public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) + throws IOException { + if (position > IndexWriter.MAX_POSITION) { + throw new CorruptIndexException( + "position=" + + position + + " is too large (> IndexWriter.MAX_POSITION=" + + IndexWriter.MAX_POSITION + + ")", + docOut); + } + if (position < 0) { + throw new CorruptIndexException("position=" + position + " is < 0", docOut); + } + posDeltaBuffer[posBufferUpto] = position - lastPosition; + if (writePayloads) { + if (payload == null || payload.length == 0) { + // no payload + payloadLengthBuffer[posBufferUpto] = 0; + } else { + payloadLengthBuffer[posBufferUpto] = payload.length; + if (payloadByteUpto + payload.length > payloadBytes.length) { + payloadBytes = ArrayUtil.grow(payloadBytes, payloadByteUpto + payload.length); + } + System.arraycopy( + payload.bytes, payload.offset, payloadBytes, payloadByteUpto, payload.length); + payloadByteUpto += payload.length; + } + } + + if (writeOffsets) { + assert startOffset >= lastStartOffset; + assert endOffset >= startOffset; + offsetStartDeltaBuffer[posBufferUpto] = startOffset - lastStartOffset; + offsetLengthBuffer[posBufferUpto] = endOffset - startOffset; + lastStartOffset = startOffset; + } + + posBufferUpto++; + lastPosition = position; + if (posBufferUpto == BLOCK_SIZE) { + pforUtil.encode(posDeltaBuffer, posOut); + + if (writePayloads) { + pforUtil.encode(payloadLengthBuffer, payOut); + payOut.writeVInt(payloadByteUpto); + payOut.writeBytes(payloadBytes, 0, payloadByteUpto); + payloadByteUpto = 0; + } + if (writeOffsets) { + pforUtil.encode(offsetStartDeltaBuffer, payOut); + pforUtil.encode(offsetLengthBuffer, payOut); + } + posBufferUpto = 0; + } + } + + @Override + public void finishDoc() { + docBufferUpto++; + docCount++; + + lastDocID = docID; + } + + /** + * Special vints that are encoded on 2 bytes if they require 15 bits or less. 
VInt becomes + * especially slow when the number of bytes is variable, so this special layout helps in the case + * when the number likely requires 15 bits or less. + */ + static void writeVInt15(DataOutput out, int v) throws IOException { + assert v >= 0; + writeVLong15(out, v); + } + + /** + * @see #writeVInt15(DataOutput, int) + */ + static void writeVLong15(DataOutput out, long v) throws IOException { + assert v >= 0; + if ((v & ~0x7FFFL) == 0) { + out.writeShort((short) v); + } else { + out.writeShort((short) (0x8000 | (v & 0x7FFF))); + out.writeVLong(v >> 15); + } + } + + private void flushDocBlock(boolean finishTerm) throws IOException { + assert docBufferUpto != 0; + + if (docBufferUpto < BLOCK_SIZE) { + assert finishTerm; + PostingsUtil.writeVIntBlock( + level0Output, docDeltaBuffer, freqBuffer, docBufferUpto, writeFreqs); + } else { + if (writeFreqs) { + List<Impact> impacts = level0FreqNormAccumulator.getCompetitiveFreqNormPairs(); + if (impacts.size() > maxNumImpactsAtLevel0) { + maxNumImpactsAtLevel0 = impacts.size(); + } + writeImpacts(impacts, scratchOutput); + assert level0Output.size() == 0; + if (scratchOutput.size() > maxImpactNumBytesAtLevel0) { + maxImpactNumBytesAtLevel0 = Math.toIntExact(scratchOutput.size()); + } + level0Output.writeVLong(scratchOutput.size()); + scratchOutput.copyTo(level0Output); + scratchOutput.reset(); + if (writePositions) { + level0Output.writeVLong(posOut.getFilePointer() - level0LastPosFP); + level0Output.writeByte((byte) posBufferUpto); + level0LastPosFP = posOut.getFilePointer(); + + if (writeOffsets || writePayloads) { + level0Output.writeVLong(payOut.getFilePointer() - level0LastPayFP); + level0Output.writeVInt(payloadByteUpto); + level0LastPayFP = payOut.getFilePointer(); + } + } + } + long numSkipBytes = level0Output.size(); + // Now we need to decide whether to encode block deltas as packed integers (FOR) or unary + // codes (bit set). FOR makes #nextDoc() a bit faster while the bit set approach makes + // #advance() usually faster and #intoBitSet() much faster. In the end, we make the decision + // based on storage requirements, picking the bit set approach whenever it's more + // storage-efficient than the next number of bits per value (which effectively slightly biases + // towards the bit set approach). + int bitsPerValue = forDeltaUtil.bitsRequired(docDeltaBuffer); + int sum = Math.toIntExact(Arrays.stream(docDeltaBuffer).sum()); + int numBitSetLongs = FixedBitSet.bits2words(sum); + int numBitsNextBitsPerValue = Math.min(Integer.SIZE, bitsPerValue + 1) * BLOCK_SIZE; + if (sum == BLOCK_SIZE) { + level0Output.writeByte((byte) 0); + } else if (numBitsNextBitsPerValue <= sum) { + level0Output.writeByte((byte) bitsPerValue); + forDeltaUtil.encodeDeltas(bitsPerValue, docDeltaBuffer, level0Output); + } else { + // Storing doc deltas is more efficient using unary coding (ie. storing doc IDs as a bit + // set) + spareBitSet.clear(0, numBitSetLongs << 6); + int s = -1; + for (int i : docDeltaBuffer) { + s += i; + spareBitSet.set(s); + } + // We never use the bit set encoding when it requires more than Integer.SIZE=32 bits per + // value. So the bit set cannot have more than BLOCK_SIZE * Integer.SIZE / Long.SIZE = 64 + // longs, which fits on a byte.
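+ // Sketch of the bound (derived from the checks above): this branch is only taken when + // numBitsNextBitsPerValue > sum, and numBitsNextBitsPerValue <= Integer.SIZE * BLOCK_SIZE = 4096, + // so sum < 4096 and numBitSetLongs = FixedBitSet.bits2words(sum) <= 4096 / Long.SIZE = 64, + // i.e. BLOCK_SIZE / 2.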
+ assert numBitSetLongs <= BLOCK_SIZE / 2; + level0Output.writeByte((byte) -numBitSetLongs); + for (int i = 0; i < numBitSetLongs; ++i) { + level0Output.writeLong(spareBitSet.getBits()[i]); + } + } + + if (writeFreqs) { + pforUtil.encode(freqBuffer, level0Output); + } + + // docID - lastBlockDocID is at least 128, so it can never fit a single byte with a vint + // Even if we subtracted 128, only extremely dense blocks would be eligible to a single byte + // so let's go with 2 bytes right away + writeVInt15(scratchOutput, docID - level0LastDocID); + writeVLong15(scratchOutput, level0Output.size()); + numSkipBytes += scratchOutput.size(); + level1Output.writeVLong(numSkipBytes); + scratchOutput.copyTo(level1Output); + scratchOutput.reset(); + } + + level0Output.copyTo(level1Output); + level0Output.reset(); + level0LastDocID = docID; + if (writeFreqs) { + level1CompetitiveFreqNormAccumulator.addAll(level0FreqNormAccumulator); + level0FreqNormAccumulator.clear(); + } + + if ((docCount & LEVEL1_MASK) == 0) { // true every 32 blocks (4,096 docs) + writeLevel1SkipData(); + level1LastDocID = docID; + level1CompetitiveFreqNormAccumulator.clear(); + } else if (finishTerm) { + level1Output.copyTo(docOut); + level1Output.reset(); + level1CompetitiveFreqNormAccumulator.clear(); + } + } + + private void writeLevel1SkipData() throws IOException { + docOut.writeVInt(docID - level1LastDocID); + final long level1End; + if (writeFreqs) { + List<Impact> impacts = level1CompetitiveFreqNormAccumulator.getCompetitiveFreqNormPairs(); + if (impacts.size() > maxNumImpactsAtLevel1) { + maxNumImpactsAtLevel1 = impacts.size(); + } + writeImpacts(impacts, scratchOutput); + long numImpactBytes = scratchOutput.size(); + if (numImpactBytes > maxImpactNumBytesAtLevel1) { + maxImpactNumBytesAtLevel1 = Math.toIntExact(numImpactBytes); + } + if (writePositions) { + scratchOutput.writeVLong(posOut.getFilePointer() - level1LastPosFP); + scratchOutput.writeByte((byte) posBufferUpto); + level1LastPosFP = posOut.getFilePointer(); + if (writeOffsets || writePayloads) { + scratchOutput.writeVLong(payOut.getFilePointer() - level1LastPayFP); + scratchOutput.writeVInt(payloadByteUpto); + level1LastPayFP = payOut.getFilePointer(); + } + } + final long level1Len = 2 * Short.BYTES + scratchOutput.size() + level1Output.size(); + docOut.writeVLong(level1Len); + level1End = docOut.getFilePointer() + level1Len; + // There are at most 128 impacts, that require at most 2 bytes each + assert numImpactBytes <= Short.MAX_VALUE; + // Like impacts plus a few vlongs, still way under the max short value + assert scratchOutput.size() + Short.BYTES <= Short.MAX_VALUE; + docOut.writeShort((short) (scratchOutput.size() + Short.BYTES)); + docOut.writeShort((short) numImpactBytes); + scratchOutput.copyTo(docOut); + scratchOutput.reset(); + } else { + docOut.writeVLong(level1Output.size()); + level1End = docOut.getFilePointer() + level1Output.size(); + } + level1Output.copyTo(docOut); + level1Output.reset(); + assert docOut.getFilePointer() == level1End : docOut.getFilePointer() + " " + level1End; + } + + static void writeImpacts(Collection<Impact> impacts, DataOutput out) throws IOException { + Impact previous = new Impact(0, 0); + for (Impact impact : impacts) { + assert impact.freq > previous.freq; + assert Long.compareUnsigned(impact.norm, previous.norm) > 0; + int freqDelta = impact.freq - previous.freq - 1; + long normDelta = impact.norm - previous.norm - 1; + if (normDelta == 0) { + // most of time, norm only increases by 1, so we can fold everything in a single byte
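+ // Worked example (hypothetical values): moving from (freq=3, norm=5) to (freq=7, norm=6) gives + // freqDelta=3 and normDelta=0, so a single writeVInt(3 << 1) = 6 is emitted; had norm been 8, + // normDelta would be 2 and we would emit writeVInt((3 << 1) | 1) = 7 followed by writeZLong(2). + // readImpacts() on the reader side reverses exactly this.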
+ out.writeVInt(freqDelta << 1); + } else { + out.writeVInt((freqDelta << 1) | 1); + out.writeZLong(normDelta); + } + previous = impact; + } + } + + /** Called when we are done adding docs to this term */ + @Override + public void finishTerm(BlockTermState _state) throws IOException { + IntBlockTermState state = (IntBlockTermState) _state; + assert state.docFreq > 0; + + // TODO: wasteful we are counting this (counting # docs + // for this term) in two places? + assert state.docFreq == docCount : state.docFreq + " vs " + docCount; + + // docFreq == 1, don't write the single docid/freq to a separate file along with a pointer to + // it. + final int singletonDocID; + if (state.docFreq == 1) { + // pulse the singleton docid into the term dictionary, freq is implicitly totalTermFreq + singletonDocID = docDeltaBuffer[0] - 1; + } else { + singletonDocID = -1; + flushDocBlock(true); + } + + final long lastPosBlockOffset; + + if (writePositions) { + // totalTermFreq is just total number of positions(or payloads, or offsets) + // associated with current term. + assert state.totalTermFreq != -1; + if (state.totalTermFreq > BLOCK_SIZE) { + // record file offset for last pos in last block + lastPosBlockOffset = posOut.getFilePointer() - posStartFP; + } else { + lastPosBlockOffset = -1; + } + if (posBufferUpto > 0) { + assert posBufferUpto < BLOCK_SIZE; + // TODO: should we send offsets/payloads to + // .pay...? seems wasteful (have to store extra + // vLong for low (< BLOCK_SIZE) DF terms = vast vast + // majority) + + // vInt encode the remaining positions/payloads/offsets: + int lastPayloadLength = -1; // force first payload length to be written + int lastOffsetLength = -1; // force first offset length to be written + int payloadBytesReadUpto = 0; + for (int i = 0; i < posBufferUpto; i++) { + final int posDelta = posDeltaBuffer[i]; + if (writePayloads) { + final int payloadLength = payloadLengthBuffer[i]; + if (payloadLength != lastPayloadLength) { + lastPayloadLength = payloadLength; + posOut.writeVInt((posDelta << 1) | 1); + posOut.writeVInt(payloadLength); + } else { + posOut.writeVInt(posDelta << 1); + } + + if (payloadLength != 0) { + posOut.writeBytes(payloadBytes, payloadBytesReadUpto, payloadLength); + payloadBytesReadUpto += payloadLength; + } + } else { + posOut.writeVInt(posDelta); + } + + if (writeOffsets) { + int delta = offsetStartDeltaBuffer[i]; + int length = offsetLengthBuffer[i]; + if (length == lastOffsetLength) { + posOut.writeVInt(delta << 1); + } else { + posOut.writeVInt(delta << 1 | 1); + posOut.writeVInt(length); + lastOffsetLength = length; + } + } + } + + if (writePayloads) { + assert payloadBytesReadUpto == payloadByteUpto; + payloadByteUpto = 0; + } + } + } else { + lastPosBlockOffset = -1; + } + + state.docStartFP = docStartFP; + state.posStartFP = posStartFP; + state.payStartFP = payStartFP; + state.singletonDocID = singletonDocID; + + state.lastPosBlockOffset = lastPosBlockOffset; + docBufferUpto = 0; + posBufferUpto = 0; + lastDocID = -1; + docCount = 0; + } + + @Override + public void encodeTerm( + DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) + throws IOException { + IntBlockTermState state = (IntBlockTermState) _state; + if (absolute) { + lastState = EMPTY_STATE; + assert lastState.docStartFP == 0; + } + + if (lastState.singletonDocID != -1 + && state.singletonDocID != -1 + && state.docStartFP == lastState.docStartFP) { + // With runs of rare values such as ID fields, the increment of pointers in the docs file is + // often 0. 
+ // Furthermore some ID schemes like auto-increment IDs or Flake IDs are monotonic, so we + // encode the delta + // between consecutive doc IDs to save space. + final long delta = (long) state.singletonDocID - lastState.singletonDocID; + out.writeVLong((BitUtil.zigZagEncode(delta) << 1) | 0x01); + } else { + out.writeVLong((state.docStartFP - lastState.docStartFP) << 1); + if (state.singletonDocID != -1) { + out.writeVInt(state.singletonDocID); + } + } + + if (writePositions) { + out.writeVLong(state.posStartFP - lastState.posStartFP); + if (writePayloads || writeOffsets) { + out.writeVLong(state.payStartFP - lastState.payStartFP); + } + } + if (writePositions) { + if (state.lastPosBlockOffset != -1) { + out.writeVLong(state.lastPosBlockOffset); + } + } + lastState = state; + } + + @Override + public void close() throws IOException { + // TODO: add a finish() at least to PushBase? DV too...? + boolean success = false; + try { + if (docOut != null) { + CodecUtil.writeFooter(docOut); + } + if (posOut != null) { + CodecUtil.writeFooter(posOut); + } + if (payOut != null) { + CodecUtil.writeFooter(payOut); + } + if (metaOut != null) { + metaOut.writeInt(maxNumImpactsAtLevel0); + metaOut.writeInt(maxImpactNumBytesAtLevel0); + metaOut.writeInt(maxNumImpactsAtLevel1); + metaOut.writeInt(maxImpactNumBytesAtLevel1); + metaOut.writeLong(docOut.getFilePointer()); + if (posOut != null) { + metaOut.writeLong(posOut.getFilePointer()); + if (payOut != null) { + metaOut.writeLong(payOut.getFilePointer()); + } + } + CodecUtil.writeFooter(metaOut); + } + success = true; + } finally { + if (success) { + IOUtils.close(metaOut, docOut, posOut, payOut); + } else { + IOUtils.closeWhileHandlingException(metaOut, docOut, posOut, payOut); + } + metaOut = docOut = posOut = payOut = null; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PForUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/PForUtil.java similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene101/PForUtil.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene103/PForUtil.java index fd8ecd056b49..81aef85cd031 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PForUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/PForUtil.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.codecs.lucene103; import java.io.IOException; import java.util.Arrays; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PostingIndexInput.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/PostingIndexInput.java similarity index 98% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene101/PostingIndexInput.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene103/PostingIndexInput.java index 59388e3446b9..e66ed6351468 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PostingIndexInput.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/PostingIndexInput.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.codecs.lucene103; import java.io.IOException; import org.apache.lucene.internal.vectorization.PostingDecodingUtil; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PostingsUtil.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/PostingsUtil.java similarity index 98% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene101/PostingsUtil.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene103/PostingsUtil.java index 34431a3689fb..e28d59166dcc 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/PostingsUtil.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/PostingsUtil.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.codecs.lucene103; import java.io.IOException; import org.apache.lucene.store.DataOutput; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/CompressionAlgorithm.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/CompressionAlgorithm.java similarity index 97% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/CompressionAlgorithm.java rename to lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/CompressionAlgorithm.java index de24dc3fb05b..120d009f0122 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene90/blocktree/CompressionAlgorithm.java +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/CompressionAlgorithm.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene90.blocktree; +package org.apache.lucene.codecs.lucene103.blocktree; import java.io.IOException; import org.apache.lucene.store.DataInput; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/FieldReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/FieldReader.java new file mode 100644 index 000000000000..ee204d47ee62 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/FieldReader.java @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene103.blocktree; + +import java.io.IOException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.automaton.CompiledAutomaton; + +/** + * BlockTree's implementation of {@link Terms}. + * + * @lucene.internal + */ +public final class FieldReader extends Terms { + + // private final boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + final long numTerms; + final FieldInfo fieldInfo; + final long sumTotalTermFreq; + final long sumDocFreq; + final int docCount; + final long rootBlockFP; + final BytesRef minTerm; + final BytesRef maxTerm; + final long indexStart; + final long rootFP; + final long indexEnd; + final Lucene103BlockTreeTermsReader parent; + final IndexInput indexIn; + + // private boolean DEBUG; + + FieldReader( + Lucene103BlockTreeTermsReader parent, + FieldInfo fieldInfo, + long numTerms, + long sumTotalTermFreq, + long sumDocFreq, + int docCount, + IndexInput metaIn, + IndexInput indexIn, + BytesRef minTerm, + BytesRef maxTerm) + throws IOException { + assert numTerms > 0; + this.fieldInfo = fieldInfo; + // DEBUG = BlockTreeTermsReader.DEBUG && fieldInfo.name.equals("id"); + this.parent = parent; + this.numTerms = numTerms; + this.sumTotalTermFreq = sumTotalTermFreq; + this.sumDocFreq = sumDocFreq; + this.docCount = docCount; + this.minTerm = minTerm; + this.maxTerm = maxTerm; + + // if (DEBUG) { + // System.out.println("BTTR: seg=" + segment + " field=" + fieldInfo.name + " rootBlockCode=" + // + rootCode + " divisor=" + indexDivisor); + // } + + this.indexStart = metaIn.readVLong(); + this.rootFP = metaIn.readVLong(); + this.indexEnd = metaIn.readVLong(); + this.indexIn = indexIn; + + TrieReader trieReader = newReader(); + this.rootBlockFP = trieReader.root.outputFp; + } + + private TrieReader newReader() throws IOException { + return new TrieReader(indexIn.slice("trie index", indexStart, indexEnd - indexStart), rootFP); + } + + @Override + public BytesRef getMin() throws IOException { + if (minTerm == null) { + // Older index that didn't store min/maxTerm + return super.getMin(); + } else { + return minTerm; + } + } + + @Override + public BytesRef getMax() throws IOException { + if (maxTerm == null) { + // Older index that didn't store min/maxTerm + return super.getMax(); + } else { + return maxTerm; + } + } + + /** For debugging -- used by CheckIndex too */ + @Override + public Stats getStats() throws IOException { + return new SegmentTermsEnum(this, newReader()).computeBlockStats(); + } + + @Override + public boolean hasFreqs() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0; + } + + @Override + public boolean hasOffsets() { + return fieldInfo + .getIndexOptions() + .compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) + >= 0; + } + + @Override + public boolean hasPositions() { + return fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) >= 0; + } + + @Override + public boolean hasPayloads() { + return fieldInfo.hasPayloads(); + } + + @Override + public TermsEnum iterator() throws IOException { + return new SegmentTermsEnum(this, newReader()); + } + + @Override + public long size() { + return numTerms; + } + + @Override + public long getSumTotalTermFreq() { + return sumTotalTermFreq; + } + + @Override + public long 
getSumDocFreq() { + return sumDocFreq; + } + + @Override + public int getDocCount() { + return docCount; + } + + @Override + public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException { + // if (DEBUG) System.out.println(" FieldReader.intersect startTerm=" + + // ToStringUtils.bytesRefToString(startTerm)); + // System.out.println("intersect: " + compiled.type + " a=" + compiled.automaton); + // TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum? + // can we optimize knowing that...? + if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { + throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); + } + return new IntersectTermsEnum( + this, + newReader(), + compiled.getTransitionAccessor(), + compiled.getByteRunnable(), + compiled.commonSuffixRef, + startTerm); + } + + @Override + public String toString() { + return "BlockTreeTerms(seg=" + + parent.segment + + " terms=" + + numTerms + + ",postings=" + + sumDocFreq + + ",positions=" + + sumTotalTermFreq + + ",docs=" + + docCount + + ")"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/IntersectTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/IntersectTermsEnum.java new file mode 100644 index 000000000000..75cdc1c88c33 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/IntersectTermsEnum.java @@ -0,0 +1,573 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene103.blocktree; + +import java.io.IOException; +import org.apache.lucene.index.BaseTermsEnum; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.RamUsageEstimator; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.automaton.ByteRunnable; +import org.apache.lucene.util.automaton.Transition; +import org.apache.lucene.util.automaton.TransitionAccessor; + +/** + * This is used to implement efficient {@link Terms#intersect} for block-tree. Note that it cannot + * seek, except for the initial term on init. It just "nexts" through the intersection of the + * automaton and the terms. It does not use the terms index at all: on init, it loads the root + * block, and scans its way to the initial term. Likewise, in next it scans until it finds a term + * that matches the current automaton transition. 
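+ * The trie index is used to load floor data for the target block when pushing a frame, so that + * floor blocks can be skipped when possible.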
+ */ +final class IntersectTermsEnum extends BaseTermsEnum { + + // static boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + final IndexInput in; + + IntersectTermsEnumFrame[] stack; + + private TrieReader.Node[] nodes = new TrieReader.Node[5]; + + final ByteRunnable runAutomaton; + final TransitionAccessor automaton; + final BytesRef commonSuffix; + + private IntersectTermsEnumFrame currentFrame; + private Transition currentTransition; + + private final BytesRef term = new BytesRef(); + + final TrieReader trieReader; + + final FieldReader fr; + + private BytesRef savedStartTerm; + + // TODO: in some cases we can filter by length? eg + // regexp foo*bar must be at least length 6 bytes + public IntersectTermsEnum( + FieldReader fr, + TrieReader trieReader, + TransitionAccessor automaton, + ByteRunnable runAutomaton, + BytesRef commonSuffix, + BytesRef startTerm) + throws IOException { + this.fr = fr; + + assert automaton != null; + assert runAutomaton != null; + + this.runAutomaton = runAutomaton; + this.automaton = automaton; + this.commonSuffix = commonSuffix; + + in = fr.parent.termsIn.clone(); + stack = new IntersectTermsEnumFrame[5]; + for (int idx = 0; idx < stack.length; idx++) { + stack[idx] = new IntersectTermsEnumFrame(this, idx); + } + for (int nodeIdx = 1; nodeIdx < nodes.length; nodeIdx++) { + nodes[nodeIdx] = new TrieReader.Node(); + } + + this.trieReader = trieReader; + + // TODO: if the automaton is "smallish" we really + // should use the terms index to seek at least to + // the initial term and likely to subsequent terms + // (or, maybe just fallback to ATE for such cases). + // Else the seek cost of loading the frames will be + // too costly. + + final TrieReader.Node node = nodes[0] = trieReader.root; + // Empty string prefix must have an output in the index! + assert node.hasOutput(); + + // Special pushFrame since it's the first one: + final IntersectTermsEnumFrame f = stack[0]; + f.fp = f.fpOrig = fr.rootBlockFP; + f.prefix = 0; + f.setState(0); + f.node = node; + f.load(node); + + // for assert: + assert setSavedStartTerm(startTerm); + + currentFrame = f; + + if (startTerm != null) { + seekToStartTerm(startTerm); + } + currentTransition = currentFrame.transition; + } + + // only for assert: + private boolean setSavedStartTerm(BytesRef startTerm) { + savedStartTerm = startTerm == null ? 
null : BytesRef.deepCopyOf(startTerm); + return true; + } + + @Override + public TermState termState() throws IOException { + currentFrame.decodeMetaData(); + return currentFrame.termState.clone(); + } + + private IntersectTermsEnumFrame getFrame(int ord) throws IOException { + if (ord >= stack.length) { + final IntersectTermsEnumFrame[] next = + new IntersectTermsEnumFrame + [ArrayUtil.oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, next, 0, stack.length); + for (int stackOrd = stack.length; stackOrd < next.length; stackOrd++) { + next[stackOrd] = new IntersectTermsEnumFrame(this, stackOrd); + } + stack = next; + } + assert stack[ord].ord == ord; + return stack[ord]; + } + + private TrieReader.Node getNode(int ord) { + if (ord >= nodes.length) { + final TrieReader.Node[] next = + new TrieReader.Node[ArrayUtil.oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(nodes, 0, next, 0, nodes.length); + for (int nodeOrd = nodes.length; nodeOrd < next.length; nodeOrd++) { + next[nodeOrd] = new TrieReader.Node(); + } + nodes = next; + } + return nodes[ord]; + } + + private IntersectTermsEnumFrame pushFrame(int state) throws IOException { + assert currentFrame != null; + + final IntersectTermsEnumFrame f = getFrame(currentFrame == null ? 0 : 1 + currentFrame.ord); + + f.fp = f.fpOrig = currentFrame.lastSubFP; + f.prefix = currentFrame.prefix + currentFrame.suffix; + f.setState(state); + + // Walk the node through the index -- we only + // "bother" with this so we can get the floor data + // from the index and skip floor blocks when + // possible: + TrieReader.Node node = currentFrame.node; + int idx = currentFrame.prefix; + assert currentFrame.suffix > 0; + + while (idx < f.prefix) { + final int target = term.bytes[idx] & 0xff; + // TODO: we could be more efficient for the next() + // case by using current node as starting point, + // passed to findTargetNode + TrieReader.Node parent = node; + node = trieReader.lookupChild(target, parent, getNode(1 + idx)); + assert node != null; + idx++; + } + + f.node = node; + assert node.hasOutput(); + f.load(node); + return f; + } + + @Override + public BytesRef term() { + return term; + } + + @Override + public int docFreq() throws IOException { + currentFrame.decodeMetaData(); + return currentFrame.termState.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + currentFrame.decodeMetaData(); + return currentFrame.termState.totalTermFreq; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + currentFrame.decodeMetaData(); + return fr.parent.postingsReader.postings(fr.fieldInfo, currentFrame.termState, reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + currentFrame.decodeMetaData(); + return fr.parent.postingsReader.impacts(fr.fieldInfo, currentFrame.termState, flags); + } + + private int getState() { + int state = currentFrame.state; + for (int idx = 0; idx < currentFrame.suffix; idx++) { + state = + runAutomaton.step( + state, currentFrame.suffixBytes[currentFrame.startBytePos + idx] & 0xff); + assert state != -1; + } + return state; + } + + // NOTE: specialized to only doing the first-time + // seek, but we could generalize it to allow + // arbitrary seekExact/Ceil. Note that this is a + // seekFloor! 
+ private void seekToStartTerm(BytesRef target) throws IOException { + assert currentFrame.ord == 0; + if (term.length < target.length) { + term.bytes = ArrayUtil.grow(term.bytes, target.length); + } + TrieReader.Node node = nodes[0]; + assert node == currentFrame.node; + + for (int idx = 0; idx <= target.length; idx++) { + + while (true) { + final int savNextEnt = currentFrame.nextEnt; + final int savePos = currentFrame.suffixesReader.getPosition(); + final int saveLengthPos = currentFrame.suffixLengthsReader.getPosition(); + final int saveStartBytePos = currentFrame.startBytePos; + final int saveSuffix = currentFrame.suffix; + final long saveLastSubFP = currentFrame.lastSubFP; + final int saveTermBlockOrd = currentFrame.termState.termBlockOrd; + + final boolean isSubBlock = currentFrame.next(); + + term.length = currentFrame.prefix + currentFrame.suffix; + if (term.bytes.length < term.length) { + term.bytes = ArrayUtil.grow(term.bytes, term.length); + } + System.arraycopy( + currentFrame.suffixBytes, + currentFrame.startBytePos, + term.bytes, + currentFrame.prefix, + currentFrame.suffix); + + if (isSubBlock && StringHelper.startsWith(target, term)) { + // Recurse + currentFrame = pushFrame(getState()); + break; + } else { + final int cmp = term.compareTo(target); + if (cmp < 0) { + if (currentFrame.nextEnt == currentFrame.entCount) { + if (!currentFrame.isLastInFloor) { + // Advance to next floor block + currentFrame.loadNextFloorBlock(); + continue; + } else { + return; + } + } + continue; + } else if (cmp == 0) { + return; + } else { + // Fallback to prior entry: the semantics of + // this method is that the first call to + // next() will return the term after the + // requested term + currentFrame.nextEnt = savNextEnt; + currentFrame.lastSubFP = saveLastSubFP; + currentFrame.startBytePos = saveStartBytePos; + currentFrame.suffix = saveSuffix; + currentFrame.suffixesReader.setPosition(savePos); + currentFrame.suffixLengthsReader.setPosition(saveLengthPos); + currentFrame.termState.termBlockOrd = saveTermBlockOrd; + System.arraycopy( + currentFrame.suffixBytes, + currentFrame.startBytePos, + term.bytes, + currentFrame.prefix, + currentFrame.suffix); + term.length = currentFrame.prefix + currentFrame.suffix; + // If the last entry was a block we don't + // need to bother recursing and pushing to + // the last term under it because the first + // next() will simply skip the frame anyway + return; + } + } + } + } + + assert false; + } + + private boolean popPushNext() throws IOException { + // Pop finished frames + while (currentFrame.nextEnt == currentFrame.entCount) { + if (!currentFrame.isLastInFloor) { + // Advance to next floor block + currentFrame.loadNextFloorBlock(); + break; + } else { + if (currentFrame.ord == 0) { + throw NoMoreTermsException.INSTANCE; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord - 1]; + currentTransition = currentFrame.transition; + assert currentFrame.lastSubFP == lastFP; + } + } + + return currentFrame.next(); + } + + // Only used internally when there are no more terms in next(): + private static final class NoMoreTermsException extends RuntimeException { + + // Only used internally when there are no more terms in next(): + public static final NoMoreTermsException INSTANCE = new NoMoreTermsException(); + + private NoMoreTermsException() {} + + @Override + public Throwable fillInStackTrace() { + // Do nothing: + return this; + } + } + + @Override + public BytesRef next() throws IOException { + try { + return _next(); 
+ } catch ( + @SuppressWarnings("unused") + NoMoreTermsException eoi) { + // Provoke NPE if we are (illegally!) called again: + currentFrame = null; + return null; + } + } + + private BytesRef _next() throws IOException { + + boolean isSubBlock = popPushNext(); + + nextTerm: + while (true) { + assert currentFrame.transition == currentTransition; + + int state; + int lastState; + + // NOTE: suffix == 0 can only happen on the first term in a block, when + // there is a term exactly matching a prefix in the index. If we + // could somehow re-org the code so we only checked this case immediately + // after pushing a frame... + if (currentFrame.suffix != 0) { + + final byte[] suffixBytes = currentFrame.suffixBytes; + + // This is the first byte of the suffix of the term we are now on: + final int label = suffixBytes[currentFrame.startBytePos] & 0xff; + + if (label < currentTransition.min) { + // Common case: we are scanning terms in this block to "catch up" to + // current transition in the automaton: + int minTrans = currentTransition.min; + while (currentFrame.nextEnt < currentFrame.entCount) { + isSubBlock = currentFrame.next(); + if ((suffixBytes[currentFrame.startBytePos] & 0xff) >= minTrans) { + continue nextTerm; + } + } + + // End of frame: + isSubBlock = popPushNext(); + continue nextTerm; + } + + // Advance where we are in the automaton to match this label: + + while (label > currentTransition.max) { + if (currentFrame.transitionIndex >= currentFrame.transitionCount - 1) { + // Pop this frame: no further matches are possible because + // we've moved beyond what the max transition will allow + if (currentFrame.ord == 0) { + // Provoke NPE if we are (illegally!) called again: + currentFrame = null; + return null; + } + currentFrame = stack[currentFrame.ord - 1]; + currentTransition = currentFrame.transition; + isSubBlock = popPushNext(); + continue nextTerm; + } + currentFrame.transitionIndex++; + automaton.getNextTransition(currentTransition); + + if (label < currentTransition.min) { + int minTrans = currentTransition.min; + while (currentFrame.nextEnt < currentFrame.entCount) { + isSubBlock = currentFrame.next(); + if ((suffixBytes[currentFrame.startBytePos] & 0xff) >= minTrans) { + continue nextTerm; + } + } + + // End of frame: + isSubBlock = popPushNext(); + continue nextTerm; + } + } + + if (commonSuffix != null && !isSubBlock) { + final int termLen = currentFrame.prefix + currentFrame.suffix; + if (termLen < commonSuffix.length) { + // No match + isSubBlock = popPushNext(); + continue nextTerm; + } + + final byte[] commonSuffixBytes = commonSuffix.bytes; + + final int lenInPrefix = commonSuffix.length - currentFrame.suffix; + assert commonSuffix.offset == 0; + int suffixBytesPos; + int commonSuffixBytesPos = 0; + + if (lenInPrefix > 0) { + // A prefix of the common suffix overlaps with + // the suffix of the block prefix so we first + // test whether the prefix part matches: + final byte[] termBytes = term.bytes; + int termBytesPos = currentFrame.prefix - lenInPrefix; + assert termBytesPos >= 0; + final int termBytesPosEnd = currentFrame.prefix; + while (termBytesPos < termBytesPosEnd) { + if (termBytes[termBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { + isSubBlock = popPushNext(); + continue nextTerm; + } + } + suffixBytesPos = currentFrame.startBytePos; + } else { + suffixBytesPos = currentFrame.startBytePos + currentFrame.suffix - commonSuffix.length; + } + + // Test overlapping suffix part: + final int commonSuffixBytesPosEnd = commonSuffix.length; + while 
(commonSuffixBytesPos < commonSuffixBytesPosEnd) { + if (suffixBytes[suffixBytesPos++] != commonSuffixBytes[commonSuffixBytesPos++]) { + isSubBlock = popPushNext(); + continue nextTerm; + } + } + } + + // TODO: maybe we should do the same linear test + // that AutomatonTermsEnum does, so that if we + // reach a part of the automaton where .* is + // "temporarily" accepted, we just blindly .next() + // until the limit + + // See if the term suffix matches the automaton: + + // We know from above that the first byte in our suffix (label) matches + // the current transition, so we step from the 2nd byte + // in the suffix: + lastState = currentFrame.state; + state = currentTransition.dest; + + int end = currentFrame.startBytePos + currentFrame.suffix; + for (int idx = currentFrame.startBytePos + 1; idx < end; idx++) { + lastState = state; + state = runAutomaton.step(state, suffixBytes[idx] & 0xff); + if (state == -1) { + // No match + isSubBlock = popPushNext(); + continue nextTerm; + } + } + } else { + state = currentFrame.state; + lastState = currentFrame.lastState; + } + + if (isSubBlock) { + // Match! Recurse: + copyTerm(); + currentFrame = pushFrame(state); + currentTransition = currentFrame.transition; + currentFrame.lastState = lastState; + } else if (runAutomaton.isAccept(state)) { + copyTerm(); + assert savedStartTerm == null || term.compareTo(savedStartTerm) > 0 + : "saveStartTerm=" + savedStartTerm.utf8ToString() + " term=" + term.utf8ToString(); + return term; + } else { + // This term is a prefix of a term accepted by the automaton, but is not itself accepted + } + + isSubBlock = popPushNext(); + } + } + + private void copyTerm() { + final int len = currentFrame.prefix + currentFrame.suffix; + if (term.bytes.length < len) { + term.bytes = ArrayUtil.grow(term.bytes, len); + } + System.arraycopy( + currentFrame.suffixBytes, + currentFrame.startBytePos, + term.bytes, + currentFrame.prefix, + currentFrame.suffix); + term.length = len; + } + + @Override + public boolean seekExact(BytesRef text) { + throw new UnsupportedOperationException(); + } + + @Override + public void seekExact(long ord) { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } + + @Override + public SeekStatus seekCeil(BytesRef text) { + throw new UnsupportedOperationException(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/IntersectTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/IntersectTermsEnumFrame.java new file mode 100644 index 000000000000..8bb6f84b7980 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/IntersectTermsEnumFrame.java @@ -0,0 +1,324 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene103.blocktree; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.automaton.Transition; + +// TODO: can we share this with the frame in STE? +final class IntersectTermsEnumFrame { + final int ord; + long fp; + long fpOrig; + long fpEnd; + long lastSubFP; + + // private static boolean DEBUG = IntersectTermsEnum.DEBUG; + + // State in automaton + int state; + + // State just before the last label + int lastState; + + int metaDataUpto; + + byte[] suffixBytes = new byte[128]; + final ByteArrayDataInput suffixesReader = new ByteArrayDataInput(); + + byte[] suffixLengthBytes; + final ByteArrayDataInput suffixLengthsReader; + + byte[] statBytes = new byte[64]; + int statsSingletonRunLength = 0; + final ByteArrayDataInput statsReader = new ByteArrayDataInput(); + + long floorDataPos; + IndexInput floorDataReader; + + // Length of prefix shared by all terms in this block + int prefix; + + // Number of entries (term or sub-block) in this block + int entCount; + + // Which term we will next read + int nextEnt; + + // True if this block is either not a floor block, + // or, it's the last sub-block of a floor block + boolean isLastInFloor; + + // True if all entries are terms + boolean isLeafBlock; + + int numFollowFloorBlocks; + int nextFloorLabel; + + final Transition transition = new Transition(); + int transitionIndex; + int transitionCount; + + TrieReader.Node node; + + final BlockTermState termState; + + // metadata buffer + byte[] bytes = new byte[32]; + + final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + + int startBytePos; + int suffix; + + private final IntersectTermsEnum ite; + + public IntersectTermsEnumFrame(IntersectTermsEnum ite, int ord) throws IOException { + this.ite = ite; + this.ord = ord; + this.termState = ite.fr.parent.postingsReader.newTermState(); + this.termState.totalTermFreq = -1; + suffixLengthBytes = new byte[32]; + suffixLengthsReader = new ByteArrayDataInput(); + } + + void loadNextFloorBlock() throws IOException { + assert numFollowFloorBlocks > 0 : "nextFloorLabel=" + nextFloorLabel; + + floorDataReader.seek(floorDataPos); + do { + fp = fpOrig + (floorDataReader.readVLong() >>> 1); + numFollowFloorBlocks--; + if (numFollowFloorBlocks != 0) { + nextFloorLabel = floorDataReader.readByte() & 0xff; + } else { + nextFloorLabel = 256; + } + } while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min); + load(null); + floorDataPos = floorDataReader.getFilePointer(); + } + + public void setState(int state) { + this.state = state; + transitionIndex = 0; + transitionCount = ite.automaton.getNumTransitions(state); + if (transitionCount != 0) { + ite.automaton.initTransition(state, transition); + ite.automaton.getNextTransition(transition); + } else { + + // Must set min to -1 so the "label < min" check never falsely triggers: + transition.min = -1; + + // Must set max to -1 so we immediately realize we need to step to the next transition and + // then pop this frame: + transition.max = -1; + } + } + + void load(TrieReader.Node node) throws IOException { + if (node != null) { + // This 
block is the first one in a possible sequence of floor blocks corresponding to a + // single seek point from the trie terms index + if (node.isFloor()) { + floorDataReader = node.floorData(ite.trieReader); + // Floor frame + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + + // If current state is not accept, and has transitions, we must process + // first block in case it has empty suffix: + if (ite.runAutomaton.isAccept(state) == false && transitionCount != 0) { + // Maybe skip floor blocks: + assert transitionIndex == 0 : "transitionIndex=" + transitionIndex; + while (numFollowFloorBlocks != 0 && nextFloorLabel <= transition.min) { + fp = fpOrig + (floorDataReader.readVLong() >>> 1); + numFollowFloorBlocks--; + if (numFollowFloorBlocks != 0) { + nextFloorLabel = floorDataReader.readByte() & 0xff; + } else { + nextFloorLabel = 256; + } + } + } + floorDataPos = floorDataReader.getFilePointer(); + } + } + + ite.in.seek(fp); + int code = ite.in.readVInt(); + entCount = code >>> 1; + assert entCount > 0; + isLastInFloor = (code & 1) != 0; + + // term suffixes: + final long codeL = ite.in.readVLong(); + isLeafBlock = (codeL & 0x04) != 0; + final int numSuffixBytes = (int) (codeL >>> 3); + if (suffixBytes.length < numSuffixBytes) { + suffixBytes = new byte[ArrayUtil.oversize(numSuffixBytes, 1)]; + } + final CompressionAlgorithm compressionAlg; + try { + compressionAlg = CompressionAlgorithm.byCode((int) codeL & 0x03); + } catch (IllegalArgumentException e) { + throw new CorruptIndexException(e.getMessage(), ite.in, e); + } + compressionAlg.read(ite.in, suffixBytes, numSuffixBytes); + suffixesReader.reset(suffixBytes, 0, numSuffixBytes); + + int numSuffixLengthBytes = ite.in.readVInt(); + final boolean allEqual = (numSuffixLengthBytes & 0x01) != 0; + numSuffixLengthBytes >>>= 1; + if (suffixLengthBytes.length < numSuffixLengthBytes) { + suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)]; + } + if (allEqual) { + Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ite.in.readByte()); + } else { + ite.in.readBytes(suffixLengthBytes, 0, numSuffixLengthBytes); + } + suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes); + + // stats + int numBytes = ite.in.readVInt(); + if (statBytes.length < numBytes) { + statBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ite.in.readBytes(statBytes, 0, numBytes); + statsReader.reset(statBytes, 0, numBytes); + statsSingletonRunLength = 0; + metaDataUpto = 0; + + termState.termBlockOrd = 0; + nextEnt = 0; + + // metadata + numBytes = ite.in.readVInt(); + if (bytes.length < numBytes) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ite.in.readBytes(bytes, 0, numBytes); + bytesReader.reset(bytes, 0, numBytes); + + if (!isLastInFloor) { + // Sub-blocks of a single floor block are always + // written one after another -- tail recurse: + fpEnd = ite.in.getFilePointer(); + } + } + + // TODO: maybe add scanToLabel; should give perf boost + + // Decodes next entry; returns true if it's a sub-block + public boolean next() { + if (isLeafBlock) { + nextLeaf(); + return false; + } else { + return nextNonLeaf(); + } + } + + public void nextLeaf() { + assert nextEnt != -1 && nextEnt < entCount + : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + suffix = suffixLengthsReader.readVInt(); + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffix); + } + + public boolean nextNonLeaf() { + assert nextEnt != -1 && 
nextEnt < entCount + : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + final int code = suffixLengthsReader.readVInt(); + suffix = code >>> 1; + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffix); + if ((code & 1) == 0) { + // A normal term + termState.termBlockOrd++; + return false; + } else { + // A sub-block; make sub-FP absolute: + lastSubFP = fp - suffixLengthsReader.readVLong(); + return true; + } + } + + public int getTermBlockOrd() { + return isLeafBlock ? nextEnt : termState.termBlockOrd; + } + + public void decodeMetaData() throws IOException { + + // lazily catch up on metadata decode: + final int limit = getTermBlockOrd(); + boolean absolute = metaDataUpto == 0; + assert limit > 0; + + // TODO: better API would be "jump straight to term=N"??? + while (metaDataUpto < limit) { + + // TODO: we could make "tiers" of metadata, ie, + // decode docFreq/totalTF but don't decode postings + // metadata; this way caller could get + // docFreq/totalTF w/o paying decode cost for + // postings + + // TODO: if docFreq were bulk decoded we could + // just skipN here: + + // stats + if (statsSingletonRunLength > 0) { + termState.docFreq = 1; + termState.totalTermFreq = 1; + statsSingletonRunLength--; + } else { + int token = statsReader.readVInt(); + if ((token & 1) == 1) { + termState.docFreq = 1; + termState.totalTermFreq = 1; + statsSingletonRunLength = token >>> 1; + } else { + termState.docFreq = token >>> 1; + if (ite.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) { + termState.totalTermFreq = termState.docFreq; + } else { + termState.totalTermFreq = termState.docFreq + statsReader.readVLong(); + } + } + } + // metadata + ite.fr.parent.postingsReader.decodeTerm(bytesReader, ite.fr.fieldInfo, termState, absolute); + + metaDataUpto++; + absolute = false; + } + termState.termBlockOrd = metaDataUpto; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/Lucene103BlockTreeTermsReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/Lucene103BlockTreeTermsReader.java new file mode 100644 index 000000000000..1eea2b1b2063 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/Lucene103BlockTreeTermsReader.java @@ -0,0 +1,326 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene103.blocktree; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsProducer; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.internal.hppc.IntCursor; +import org.apache.lucene.internal.hppc.IntObjectHashMap; +import org.apache.lucene.store.ChecksumIndexInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.ReadAdvice; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IOUtils; + +/** + * A block-based terms index and dictionary that assigns terms to variable length blocks according + * to how they share prefixes. The terms index is a prefix trie whose leaves are term blocks. The + * advantage of this approach is that seekExact is often able to determine a term cannot exist + * without doing any IO, and intersection with Automata is very fast. Note that this terms + * dictionary has its own fixed terms index (ie, it does not support a pluggable terms index + * implementation). + * + *

    NOTE: this terms dictionary supports min/maxItemsPerBlock during indexing to control + * how much memory the terms index uses. + * + *

    The data structure used by this implementation is very similar to a burst trie + * (http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.18.3499), but with added logic to break + * up too-large blocks of all terms sharing a given prefix into smaller ones. + * + *

    Use {@link org.apache.lucene.index.CheckIndex} with the -verbose option to see + * summary statistics on the blocks in the dictionary. + * + *

    See {@link Lucene103BlockTreeTermsWriter}. + * + * @lucene.experimental + */ +public final class Lucene103BlockTreeTermsReader extends FieldsProducer { + + /** Extension of terms file */ + static final String TERMS_EXTENSION = "tim"; + + static final String TERMS_CODEC_NAME = "BlockTreeTermsDict"; + + /** Initial terms format. */ + public static final int VERSION_START = 0; + + /** Current terms format. */ + public static final int VERSION_CURRENT = VERSION_START; + + /** Extension of terms index file */ + static final String TERMS_INDEX_EXTENSION = "tip"; + + static final String TERMS_INDEX_CODEC_NAME = "BlockTreeTermsIndex"; + + /** Extension of terms meta file */ + static final String TERMS_META_EXTENSION = "tmd"; + + static final String TERMS_META_CODEC_NAME = "BlockTreeTermsMeta"; + + // Open input to the main terms dict file (_X.tib) + final IndexInput termsIn; + // Open input to the terms index file (_X.tip) + final IndexInput indexIn; + + // private static final boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + // Reads the terms dict entries, to gather state to + // produce DocsEnum on demand + final PostingsReaderBase postingsReader; + + private final FieldInfos fieldInfos; + private final IntObjectHashMap fieldMap; + private final List fieldList; + + final String segment; + + final int version; + + /** Sole constructor. */ + public Lucene103BlockTreeTermsReader(PostingsReaderBase postingsReader, SegmentReadState state) + throws IOException { + boolean success = false; + + this.postingsReader = postingsReader; + this.segment = state.segmentInfo.name; + + try { + String termsName = + IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_EXTENSION); + termsIn = state.directory.openInput(termsName, state.context); + version = + CodecUtil.checkIndexHeader( + termsIn, + TERMS_CODEC_NAME, + VERSION_START, + VERSION_CURRENT, + state.segmentInfo.getId(), + state.segmentSuffix); + + String indexName = + IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_INDEX_EXTENSION); + indexIn = + state.directory.openInput( + indexName, state.context.withReadAdvice(ReadAdvice.RANDOM_PRELOAD)); + CodecUtil.checkIndexHeader( + indexIn, + TERMS_INDEX_CODEC_NAME, + version, + version, + state.segmentInfo.getId(), + state.segmentSuffix); + + // Read per-field details + String metaName = + IndexFileNames.segmentFileName(segment, state.segmentSuffix, TERMS_META_EXTENSION); + IntObjectHashMap fieldMap = null; + Throwable priorE = null; + long indexLength = -1, termsLength = -1; + try (ChecksumIndexInput metaIn = state.directory.openChecksumInput(metaName)) { + try { + CodecUtil.checkIndexHeader( + metaIn, + TERMS_META_CODEC_NAME, + version, + version, + state.segmentInfo.getId(), + state.segmentSuffix); + postingsReader.init(metaIn, state); + + final int numFields = metaIn.readVInt(); + if (numFields < 0) { + throw new CorruptIndexException("invalid numFields: " + numFields, metaIn); + } + fieldMap = new IntObjectHashMap<>(numFields); + for (int i = 0; i < numFields; ++i) { + final int field = metaIn.readVInt(); + final long numTerms = metaIn.readVLong(); + if (numTerms <= 0) { + throw new CorruptIndexException( + "Illegal numTerms for field number: " + field, metaIn); + } + final FieldInfo fieldInfo = state.fieldInfos.fieldInfo(field); + if (fieldInfo == null) { + throw new CorruptIndexException("invalid field number: " + field, metaIn); + } + final long sumTotalTermFreq = metaIn.readVLong(); + // when frequencies are omitted, sumDocFreq=sumTotalTermFreq and only one value is + 
// written. + final long sumDocFreq = + fieldInfo.getIndexOptions() == IndexOptions.DOCS + ? sumTotalTermFreq + : metaIn.readVLong(); + final int docCount = metaIn.readVInt(); + BytesRef minTerm = readBytesRef(metaIn); + BytesRef maxTerm = readBytesRef(metaIn); + if (numTerms == 1) { + assert maxTerm.equals(minTerm); + // save heap for edge case of a single term only so min == max + maxTerm = minTerm; + } + if (docCount < 0 + || docCount > state.segmentInfo.maxDoc()) { // #docs with field must be <= #docs + throw new CorruptIndexException( + "invalid docCount: " + docCount + " maxDoc: " + state.segmentInfo.maxDoc(), + metaIn); + } + if (sumDocFreq < docCount) { // #postings must be >= #docs with field + throw new CorruptIndexException( + "invalid sumDocFreq: " + sumDocFreq + " docCount: " + docCount, metaIn); + } + if (sumTotalTermFreq < sumDocFreq) { // #positions must be >= #postings + throw new CorruptIndexException( + "invalid sumTotalTermFreq: " + sumTotalTermFreq + " sumDocFreq: " + sumDocFreq, + metaIn); + } + FieldReader previous = + fieldMap.put( + fieldInfo.number, + new FieldReader( + this, + fieldInfo, + numTerms, + sumTotalTermFreq, + sumDocFreq, + docCount, + metaIn, + indexIn, + minTerm, + maxTerm)); + if (previous != null) { + throw new CorruptIndexException("duplicate field: " + fieldInfo.name, metaIn); + } + } + indexLength = metaIn.readLong(); + termsLength = metaIn.readLong(); + } catch (Throwable exception) { + priorE = exception; + } finally { + if (metaIn != null) { + CodecUtil.checkFooter(metaIn, priorE); + } else if (priorE != null) { + IOUtils.rethrowAlways(priorE); + } + } + } + // At this point the checksum of the meta file has been verified so the lengths are likely + // correct + CodecUtil.retrieveChecksum(indexIn, indexLength); + CodecUtil.retrieveChecksum(termsIn, termsLength); + fieldInfos = state.fieldInfos; + this.fieldMap = fieldMap; + this.fieldList = sortFieldNames(fieldMap, state.fieldInfos); + success = true; + } finally { + if (!success) { + // this.close() will close in: + IOUtils.closeWhileHandlingException(this); + } + } + } + + private static BytesRef readBytesRef(IndexInput in) throws IOException { + int numBytes = in.readVInt(); + if (numBytes < 0) { + throw new CorruptIndexException("invalid bytes length: " + numBytes, in); + } + + BytesRef bytes = new BytesRef(numBytes); + bytes.length = numBytes; + in.readBytes(bytes.bytes, 0, numBytes); + + return bytes; + } + + private static List sortFieldNames( + IntObjectHashMap fieldMap, FieldInfos fieldInfos) { + List fieldNames = new ArrayList<>(fieldMap.size()); + for (IntCursor fieldNumber : fieldMap.keys()) { + fieldNames.add(fieldInfos.fieldInfo(fieldNumber.value).name); + } + fieldNames.sort(null); + return Collections.unmodifiableList(fieldNames); + } + + // for debugging + // private static String toHex(int v) { + // return "0x" + Integer.toHexString(v); + // } + + @Override + public void close() throws IOException { + try { + IOUtils.close(indexIn, termsIn, postingsReader); + } finally { + // Clear so refs to terms index is GCable even if + // app hangs onto us: + fieldMap.clear(); + } + } + + @Override + public Iterator iterator() { + return fieldList.iterator(); + } + + @Override + public Terms terms(String field) throws IOException { + assert field != null; + FieldInfo fieldInfo = fieldInfos.fieldInfo(field); + return fieldInfo == null ? 
null : fieldMap.get(fieldInfo.number); + } + + @Override + public int size() { + return fieldMap.size(); + } + + @Override + public void checkIntegrity() throws IOException { + // terms index + CodecUtil.checksumEntireFile(indexIn); + + // term dictionary + CodecUtil.checksumEntireFile(termsIn); + + // postings + postingsReader.checkIntegrity(); + } + + @Override + public String toString() { + return getClass().getSimpleName() + + "(fields=" + + fieldMap.size() + + ",delegate=" + + postingsReader + + ")"; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/Lucene103BlockTreeTermsWriter.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/Lucene103BlockTreeTermsWriter.java new file mode 100644 index 000000000000..ee4689183aba --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/Lucene103BlockTreeTermsWriter.java @@ -0,0 +1,1160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene103.blocktree; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.codecs.FieldsConsumer; +import org.apache.lucene.codecs.NormsProducer; +import org.apache.lucene.codecs.PostingsWriterBase; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.FieldInfos; +import org.apache.lucene.index.Fields; +import org.apache.lucene.index.IndexFileNames; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.SegmentWriteState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.ByteBuffersDataOutput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.FixedBitSet; +import org.apache.lucene.util.IOUtils; +import org.apache.lucene.util.StringHelper; +import org.apache.lucene.util.ToStringUtils; +import org.apache.lucene.util.compress.LZ4; +import org.apache.lucene.util.compress.LowercaseAsciiCompression; + +/* + TODO: + + - Currently there is a one-to-one mapping of indexed + term to term block, but we could decouple the two, ie, + put more terms into the index than there are blocks. + The index would take up more RAM but then it'd be able + to avoid seeking more often and could make PK/FuzzyQ + faster if the additional indexed terms could store + the offset into the terms block. 
+ + - The blocks are not written in true depth-first + order, meaning if you just next() the file pointer will + sometimes jump backwards. For example, block foo* will + be written before block f* because it finished before. + This could possibly hurt performance if the terms dict is + not hot, since OSs anticipate sequential file access. We + could fix the writer to re-order the blocks as a 2nd + pass. + + - Each block encodes the term suffixes packed + sequentially using a separate vInt per term, which is + 1) wasteful and 2) slow (must linear scan to find a + particular suffix). We should instead 1) make + random-access array so we can directly access the Nth + suffix, and 2) bulk-encode this array using bulk int[] + codecs; then at search time we can binary search when + we seek a particular term. +*/ + +/** + * Block-based terms index and dictionary writer. + * + *

    Writes terms dict and index, block-encoding (column stride) each term's metadata for each set + * of terms between two index terms. + * + *

    Files: + *
    • .tim: Term Dictionary + *
    • .tmd: Term Metadata + *
    • .tip: Term Index + *
    + * + *
    Term Dictionary
    + * + *

    The .tim file contains the list of terms in each field along with per-term statistics (such as + * docfreq) and per-term metadata (typically pointers to the postings list for that term in the + * inverted index). + * + *

    The .tim file is arranged in blocks, each containing a variable number of entries (by + * default 25-48), where each entry is either a term or a reference to a sub-block. + * + *

    NOTE: The term dictionary can plug into different postings implementations: the postings + * writer/reader are actually responsible for encoding and decoding the Postings Metadata and Term + * Metadata sections. + * + *

      + *
    • TermsDict (.tim) --> Header, FieldDict^NumFields, Footer + *
    • FieldDict --> PostingsHeader, NodeBlock^NumBlocks + *
    • NodeBlock --> (OuterNode | InnerNode) + *
    • OuterNode --> EntryCount, SuffixLength, Byte^SuffixLength, StatsLength, <TermStats>^EntryCount, MetaLength, <TermMetadata>^EntryCount + *
    • InnerNode --> EntryCount, SuffixLength[,Sub?], Byte^SuffixLength, StatsLength, <TermStats ?>^EntryCount, MetaLength, <TermMetadata ?>^EntryCount + *
    • TermStats --> DocFreq, TotalTermFreq + *
    • Header --> {@link CodecUtil#writeHeader CodecHeader} + *
    • EntryCount,SuffixLength,StatsLength,DocFreq,MetaLength --> {@link DataOutput#writeVInt + * VInt} + *
    • TotalTermFreq --> {@link DataOutput#writeVLong VLong} + *
    • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
    + * + *
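A hedged, value-level illustration of the OuterNode/InnerNode headers listed above (the numbers are made up; the bit layout mirrors writeBlock() in this patch and the decoding in IntersectTermsEnumFrame.load()): EntryCount shares one vInt with the last-in-floor flag, and the suffix-bytes length shares one vLong with the leaf flag and the 2-bit suffix compression code.

    // Entry-count header: vInt = (entCount << 1) | lastInFloorBit
    int entCount = 37;
    boolean isLastInFloor = true;
    int header = (entCount << 1) | (isLastInFloor ? 1 : 0);   // written with writeVInt(header)
    assert (header >>> 1) == 37 && (header & 1) != 0;         // decoded this way in load()

    // Suffix-blob token: vLong = (numSuffixBytes << 3) | leafBit(0x04) | compressionCode(0x03)
    long numSuffixBytes = 512;
    boolean isLeafBlock = true;
    int compressionCode = 2;                                  // 2-bit CompressionAlgorithm code (illustrative value)
    long token = (numSuffixBytes << 3) | (isLeafBlock ? 0x04 : 0) | compressionCode;
    assert (token >>> 3) == 512 && (token & 0x04) != 0 && (token & 0x03) == 2;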

    Notes: + * + *

      + *
    • Header is a {@link CodecUtil#writeHeader CodecHeader} storing the version information for + * the BlockTree implementation. + *
    • DocFreq is the count of documents which contain the term. + *
    • TotalTermFreq is the total number of occurrences of the term. This is encoded as the + * difference between the total number of occurrences and the DocFreq. + *
    • PostingsHeader and TermMetadata are plugged into by the specific postings implementation: + * these contain arbitrary per-file data (such as parameters or versioning information) and + * per-term data (such as pointers to inverted files). + *
    • For inner nodes of the tree, every entry steals one bit to mark whether it points to a child node (sub-block); if so, the corresponding TermStats and TermMetadata are omitted. + *
    + * + *
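A hedged, value-level sketch of the TermStats encoding noted above (numbers are illustrative; the bit tricks mirror StatsWriter in this patch and decodeMetaData() in IntersectTermsEnumFrame): DocFreq is shifted left so the low bit can flag singleton runs, and TotalTermFreq is stored as its delta against DocFreq.

    // Explicit entry: low bit 0; the freq delta is only written when frequencies are indexed.
    int docFreq = 7;
    long totalTermFreq = 42;
    int statsToken = docFreq << 1;                  // writeVInt(statsToken)
    long freqDelta = totalTermFreq - docFreq;       // writeVLong(freqDelta)
    assert (statsToken & 1) == 0 && (statsToken >>> 1) == 7;
    assert (statsToken >>> 1) + freqDelta == 42;

    // Singleton run: N consecutive terms with docFreq == 1 and totalTermFreq == 1 collapse into
    // one vInt ((N - 1) << 1) | 1 whose low bit marks the run (see StatsWriter.finish()).
    int singletonCount = 5;
    int runToken = ((singletonCount - 1) << 1) | 1;
    assert (runToken & 1) == 1 && (runToken >>> 1) == singletonCount - 1;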

    + * + *

    Term Metadata

    + * + *

    The .tmd file contains the list of term metadata (such as trie index metadata) and field level + * statistics (such as sum of total term freq). + * + *

      + *
    • TermsMeta (.tmd) --> Header, NumFields, <FieldStats>^NumFields, + * TermIndexLength, TermDictLength, Footer + *
    • FieldStats --> FieldNumber, NumTerms, RootCodeLength, Byte^RootCodeLength, + * SumTotalTermFreq?, SumDocFreq, DocCount, MinTerm, MaxTerm, IndexStartFP, TrieRootNodeFp, + * IndexEndFp + *
    • Header --> {@link CodecUtil#writeHeader CodecHeader} + *
    • TermIndexLength, TermDictLength --> {@link DataOutput#writeLong Uint64} + *
    • MinTerm,MaxTerm --> {@link DataOutput#writeVInt VInt} length followed by the byte[] + *
    • NumFields,FieldNumber,RootCodeLength,DocCount --> {@link DataOutput#writeVInt VInt} + *
    • NumTerms,SumTotalTermFreq,SumDocFreq,IndexStartFP,TrieRootNodeFp,IndexEndFp --> {@link + * DataOutput#writeVLong VLong} + *
    • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
    + * + *
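A hedged sketch of the optional SumTotalTermFreq entry above (metaOut, metaIn, fieldInfo and the sum variables are illustrative stand-ins for the writer's and reader's state): for fields indexed with IndexOptions.DOCS the writer omits the value, and the reader reuses the single vLong for both sums, mirroring TermsWriter.finish() and the reader constructor in this patch.

    boolean hasFreqs = fieldInfo.getIndexOptions() != IndexOptions.DOCS;

    // Writer side (TermsWriter.finish): SumTotalTermFreq is skipped when freqs are omitted.
    if (hasFreqs) {
      metaOut.writeVLong(sumTotalTermFreq);
    }
    metaOut.writeVLong(sumDocFreq);

    // Reader side (Lucene103BlockTreeTermsReader constructor): one vLong serves both sums
    // when frequencies are omitted, because sumDocFreq == sumTotalTermFreq in that case.
    long sumTotalTermFreqRead = metaIn.readVLong();
    long sumDocFreqRead = hasFreqs ? metaIn.readVLong() : sumTotalTermFreqRead;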

    Notes: + * + *

      + *
    • FieldNumber is the field's number from {@link FieldInfos} (.fnm). + *
    • NumTerms is the number of unique terms for the field. + *
    • RootCode points to the root block for the field. + *
    • SumDocFreq is the total number of postings, the number of term-document pairs across the + * entire field. + *
    • DocCount is the number of documents that have at least one posting for this field. + *
    • MinTerm, MaxTerm are the lowest and highest term in this field. + *
    + * + * + * + *
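A hedged sketch of how MinTerm/MaxTerm are serialized (out and in are illustrative stand-ins for the meta output and input; the logic mirrors writeBytesRef() and readBytesRef() in this patch): a vInt length followed by the raw term bytes.

    BytesRef minTerm = new BytesRef("aardvark");

    // Write side (writeBytesRef): length-prefixed bytes.
    out.writeVInt(minTerm.length);
    out.writeBytes(minTerm.bytes, minTerm.offset, minTerm.length);

    // Read side (readBytesRef): allocate to the announced length, then fill the bytes.
    int numBytes = in.readVInt();
    BytesRef readTerm = new BytesRef(numBytes);
    readTerm.length = numBytes;
    in.readBytes(readTerm.bytes, 0, numBytes);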

    Term Index

    + * + *

    The .tip file contains an index into the term dictionary, so that it can be accessed randomly. + * The index is also used to determine when a given term cannot exist on disk (in the .tim file), + * saving a disk seek. + * + *

      + *
    • TermsIndex (.tip) --> Header, TrieIndex^NumFields, Footer + *
    • Header --> {@link CodecUtil#writeHeader CodecHeader} + * + *
    • TrieIndex --> {trie<byte[]>} + *
    • Footer --> {@link CodecUtil#writeFooter CodecFooter} + *
    + * + *

    Notes: + * + *

      + *
    • The .tip file contains a separate trie for each field. The trie maps a term prefix to the + * on-disk block that holds all terms starting with that prefix. Each field's IndexStartFP + * points to its trie. + *
    • The trie stores its nodes in a depth-first order. Each node stores its output (if any) and + * children's labels and file pointers. Nodes have various strategies to store their children, + * decided by the distribution of the children's labels. + *
    • It's possible that an on-disk block would contain too many terms (more than the allowed + * maximum (default: 48)). When this happens, the block is sub-divided into new blocks (called + * "floor blocks"), and then the output in the trie for the block's prefix encodes the leading + * byte of each sub-block, and its file pointer. + *
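A hedged, value-level illustration of the floor-block pointers described in the last note (the numbers are made up; the shifts mirror PendingBlock.compileIndex() and IntersectTermsEnumFrame.loadNextFloorBlock() in this patch): each follow-on floor block is recorded as its leading suffix byte plus a delta file pointer whose low bit says whether that block contains any terms.

    // Writer side (compileIndex): floorLeadByte, then ((sub.fp - fp) << 1) | hasTermsBit.
    long firstBlockFP = 1_000L;                 // fp of the block for the shared prefix
    long subBlockFP = 1_250L;                   // fp of one follow-on floor block
    int floorLeadByte = 'm';                    // leading suffix byte of that floor block
    boolean subHasTerms = true;
    long encoded = ((subBlockFP - firstBlockFP) << 1) | (subHasTerms ? 1 : 0);   // writeVLong

    // Reader side (loadNextFloorBlock): fp = fpOrig + (readVLong() >>> 1).
    long decodedFP = firstBlockFP + (encoded >>> 1);
    assert decodedFP == subBlockFP;
    assert (encoded & 1) != 0;                  // low bit preserves hasTerms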
    + * + * @see Lucene103BlockTreeTermsReader + * @lucene.experimental + */ +public final class Lucene103BlockTreeTermsWriter extends FieldsConsumer { + + /** + * Suggested default value for the {@code minItemsInBlock} parameter to {@link + * #Lucene103BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. + */ + public static final int DEFAULT_MIN_BLOCK_SIZE = 25; + + /** + * Suggested default value for the {@code maxItemsInBlock} parameter to {@link + * #Lucene103BlockTreeTermsWriter(SegmentWriteState,PostingsWriterBase,int,int)}. + */ + public static final int DEFAULT_MAX_BLOCK_SIZE = 48; + + // public static boolean DEBUG = false; + // public static boolean DEBUG2 = false; + + // private final static boolean SAVE_DOT_FILES = false; + + private final IndexOutput metaOut; + private final IndexOutput termsOut; + private final IndexOutput indexOut; + final int maxDoc; + final int minItemsInBlock; + final int maxItemsInBlock; + final int version; + + final PostingsWriterBase postingsWriter; + final FieldInfos fieldInfos; + + private final List fields = new ArrayList<>(); + + /** + * Create a new writer. The number of items (terms or sub-blocks) per block will aim to be between + * minItemsPerBlock and maxItemsPerBlock, though in some cases the blocks may be smaller than the + * min. + */ + public Lucene103BlockTreeTermsWriter( + SegmentWriteState state, + PostingsWriterBase postingsWriter, + int minItemsInBlock, + int maxItemsInBlock) + throws IOException { + this( + state, + postingsWriter, + minItemsInBlock, + maxItemsInBlock, + Lucene103BlockTreeTermsReader.VERSION_CURRENT); + } + + /** Expert constructor that allows configuring the version, used for bw tests. */ + public Lucene103BlockTreeTermsWriter( + SegmentWriteState state, + PostingsWriterBase postingsWriter, + int minItemsInBlock, + int maxItemsInBlock, + int version) + throws IOException { + validateSettings(minItemsInBlock, maxItemsInBlock); + + this.minItemsInBlock = minItemsInBlock; + this.maxItemsInBlock = maxItemsInBlock; + if (version < Lucene103BlockTreeTermsReader.VERSION_START + || version > Lucene103BlockTreeTermsReader.VERSION_CURRENT) { + throw new IllegalArgumentException( + "Expected version in range [" + + Lucene103BlockTreeTermsReader.VERSION_START + + ", " + + Lucene103BlockTreeTermsReader.VERSION_CURRENT + + "], but got " + + version); + } + this.version = version; + + this.maxDoc = state.segmentInfo.maxDoc(); + this.fieldInfos = state.fieldInfos; + this.postingsWriter = postingsWriter; + + final String termsName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene103BlockTreeTermsReader.TERMS_EXTENSION); + termsOut = state.directory.createOutput(termsName, state.context); + boolean success = false; + IndexOutput metaOut = null, indexOut = null; + try { + CodecUtil.writeIndexHeader( + termsOut, + Lucene103BlockTreeTermsReader.TERMS_CODEC_NAME, + version, + state.segmentInfo.getId(), + state.segmentSuffix); + + final String indexName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + Lucene103BlockTreeTermsReader.TERMS_INDEX_EXTENSION); + indexOut = state.directory.createOutput(indexName, state.context); + CodecUtil.writeIndexHeader( + indexOut, + Lucene103BlockTreeTermsReader.TERMS_INDEX_CODEC_NAME, + version, + state.segmentInfo.getId(), + state.segmentSuffix); + // segment = state.segmentInfo.name; + + final String metaName = + IndexFileNames.segmentFileName( + state.segmentInfo.name, + state.segmentSuffix, + 
Lucene103BlockTreeTermsReader.TERMS_META_EXTENSION); + metaOut = state.directory.createOutput(metaName, state.context); + CodecUtil.writeIndexHeader( + metaOut, + Lucene103BlockTreeTermsReader.TERMS_META_CODEC_NAME, + version, + state.segmentInfo.getId(), + state.segmentSuffix); + + postingsWriter.init(metaOut, state); // have consumer write its format/header + + this.metaOut = metaOut; + this.indexOut = indexOut; + success = true; + } finally { + if (!success) { + IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut); + } + } + } + + /** Throws {@code IllegalArgumentException} if any of these settings is invalid. */ + public static void validateSettings(int minItemsInBlock, int maxItemsInBlock) { + if (minItemsInBlock <= 1) { + throw new IllegalArgumentException("minItemsInBlock must be >= 2; got " + minItemsInBlock); + } + if (minItemsInBlock > maxItemsInBlock) { + throw new IllegalArgumentException( + "maxItemsInBlock must be >= minItemsInBlock; got maxItemsInBlock=" + + maxItemsInBlock + + " minItemsInBlock=" + + minItemsInBlock); + } + if (2 * (minItemsInBlock - 1) > maxItemsInBlock) { + throw new IllegalArgumentException( + "maxItemsInBlock must be at least 2*(minItemsInBlock-1); got maxItemsInBlock=" + + maxItemsInBlock + + " minItemsInBlock=" + + minItemsInBlock); + } + } + + @Override + public void write(Fields fields, NormsProducer norms) throws IOException { + // if (DEBUG) System.out.println("\nBTTW.write seg=" + segment); + + String lastField = null; + for (String field : fields) { + assert lastField == null || lastField.compareTo(field) < 0; + lastField = field; + + // if (DEBUG) System.out.println("\nBTTW.write seg=" + segment + " field=" + field); + Terms terms = fields.terms(field); + if (terms == null) { + continue; + } + + TermsEnum termsEnum = terms.iterator(); + TermsWriter termsWriter = new TermsWriter(fieldInfos.fieldInfo(field)); + while (true) { + BytesRef term = termsEnum.next(); + // if (DEBUG) System.out.println("BTTW: next term " + term); + + if (term == null) { + break; + } + + // if (DEBUG) System.out.println("write field=" + fieldInfo.name + " term=" + + // ToStringUtils.bytesRefToString(term)); + termsWriter.write(term, termsEnum, norms); + } + + termsWriter.finish(); + + // if (DEBUG) System.out.println("\nBTTW.write done seg=" + segment + " field=" + field); + } + } + + private static class PendingEntry { + public final boolean isTerm; + + protected PendingEntry(boolean isTerm) { + this.isTerm = isTerm; + } + } + + private static final class PendingTerm extends PendingEntry { + public final byte[] termBytes; + // stats + metadata + public final BlockTermState state; + + public PendingTerm(BytesRef term, BlockTermState state) { + super(true); + this.termBytes = new byte[term.length]; + System.arraycopy(term.bytes, term.offset, termBytes, 0, term.length); + this.state = state; + } + + @Override + public String toString() { + return "TERM: " + ToStringUtils.bytesRefToString(termBytes); + } + } + + private final class PendingBlock extends PendingEntry { + public final BytesRef prefix; + public final long fp; + public TrieBuilder index; + public List subIndices; + public final boolean hasTerms; + public final boolean isFloor; + public final int floorLeadByte; + + public PendingBlock( + BytesRef prefix, + long fp, + boolean hasTerms, + boolean isFloor, + int floorLeadByte, + List subIndices) { + super(false); + this.prefix = prefix; + this.fp = fp; + this.hasTerms = hasTerms; + this.isFloor = isFloor; + this.floorLeadByte = floorLeadByte; + 
this.subIndices = subIndices; + } + + @Override + public String toString() { + return "BLOCK: prefix=" + ToStringUtils.bytesRefToString(prefix); + } + + public void compileIndex(List blocks, ByteBuffersDataOutput scratchBytes) + throws IOException { + + assert (isFloor && blocks.size() > 1) || (isFloor == false && blocks.size() == 1) + : "isFloor=" + isFloor + " blocks=" + blocks; + assert this == blocks.get(0); + + assert scratchBytes.size() == 0; + + BytesRef floorData = null; + if (isFloor) { + scratchBytes.writeVInt(blocks.size() - 1); + for (int i = 1; i < blocks.size(); i++) { + PendingBlock sub = blocks.get(i); + assert sub.floorLeadByte != -1; + // if (DEBUG) { + // System.out.println(" write floorLeadByte=" + + // Integer.toHexString(sub.floorLeadByte&0xff)); + // } + scratchBytes.writeByte((byte) sub.floorLeadByte); + assert sub.fp > fp; + scratchBytes.writeVLong((sub.fp - fp) << 1 | (sub.hasTerms ? 1 : 0)); + } + floorData = new BytesRef(scratchBytes.toArrayCopy()); + } + + TrieBuilder trieBuilder = + TrieBuilder.bytesRefToTrie(prefix, new TrieBuilder.Output(fp, hasTerms, floorData)); + scratchBytes.reset(); + + // Copy over index for all sub-blocks + for (PendingBlock block : blocks) { + if (block.subIndices != null) { + for (TrieBuilder subIndex : block.subIndices) { + trieBuilder.append(subIndex); + } + block.subIndices = null; + } + } + + index = trieBuilder; + + assert subIndices == null; + + /* + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot")); + Util.toDot(index, w, false, false); + System.out.println("SAVED to out.dot"); + w.close(); + */ + } + } + + private final ByteBuffersDataOutput scratchBytes = ByteBuffersDataOutput.newResettableInstance(); + + private static class StatsWriter { + + private final DataOutput out; + private final boolean hasFreqs; + private int singletonCount; + + StatsWriter(DataOutput out, boolean hasFreqs) { + this.out = out; + this.hasFreqs = hasFreqs; + } + + void add(int df, long ttf) throws IOException { + // Singletons (DF==1, TTF==1) are run-length encoded + if (df == 1 && (hasFreqs == false || ttf == 1)) { + singletonCount++; + } else { + finish(); + out.writeVInt(df << 1); + if (hasFreqs) { + out.writeVLong(ttf - df); + } + } + } + + void finish() throws IOException { + if (singletonCount > 0) { + out.writeVInt(((singletonCount - 1) << 1) | 1); + singletonCount = 0; + } + } + } + + class TermsWriter { + private final FieldInfo fieldInfo; + private long numTerms; + final FixedBitSet docsSeen; + long sumTotalTermFreq; + long sumDocFreq; + + // Records index into pending where the current prefix at that + // length "started"; for example, if current term starts with 't', + // startsByPrefix[0] is the index into pending for the first + // term/sub-block starting with 't'. We use this to figure out when + // to write a new block: + private final BytesRefBuilder lastTerm = new BytesRefBuilder(); + private int[] prefixStarts = new int[8]; + + // Pending stack of terms and blocks. As terms arrive (in sorted order) + // we append to this stack, and once the top of the stack has enough + // terms starting with a common prefix, we write a new block with + // those terms and replace those terms in the stack with a new block: + private final List pending = new ArrayList<>(); + + // Reused in writeBlocks: + private final List newBlocks = new ArrayList<>(); + + private PendingTerm firstPendingTerm; + private PendingTerm lastPendingTerm; + + /** Writes the top count entries in pending, using prevTerm to compute the prefix. 
*/ + void writeBlocks(int prefixLength, int count) throws IOException { + + assert count > 0; + + // if (DEBUG2) { + // BytesRef br = new BytesRef(lastTerm.bytes()); + // br.length = prefixLength; + // System.out.println("writeBlocks: seg=" + segment + " prefix=" + + // ToStringUtils.bytesRefToString(br) + " count=" + count); + // } + + // Root block better write all remaining pending entries: + assert prefixLength > 0 || count == pending.size(); + + int lastSuffixLeadLabel = -1; + + // True if we saw at least one term in this block (we record if a block + // only points to sub-blocks in the terms index so we can avoid seeking + // to it when we are looking for a term): + boolean hasTerms = false; + boolean hasSubBlocks = false; + + int start = pending.size() - count; + int end = pending.size(); + int nextBlockStart = start; + int nextFloorLeadLabel = -1; + + for (int i = start; i < end; i++) { + + PendingEntry ent = pending.get(i); + + int suffixLeadLabel; + + if (ent.isTerm) { + PendingTerm term = (PendingTerm) ent; + if (term.termBytes.length == prefixLength) { + // Suffix is 0, i.e. prefix 'foo' and term is + // 'foo' so the term has empty string suffix + // in this block + assert lastSuffixLeadLabel == -1 + : "i=" + i + " lastSuffixLeadLabel=" + lastSuffixLeadLabel; + suffixLeadLabel = -1; + } else { + suffixLeadLabel = term.termBytes[prefixLength] & 0xff; + } + } else { + PendingBlock block = (PendingBlock) ent; + assert block.prefix.length > prefixLength; + suffixLeadLabel = block.prefix.bytes[block.prefix.offset + prefixLength] & 0xff; + } + // if (DEBUG) System.out.println(" i=" + i + " ent=" + ent + " suffixLeadLabel=" + + // suffixLeadLabel); + + if (suffixLeadLabel != lastSuffixLeadLabel) { + int itemsInBlock = i - nextBlockStart; + if (itemsInBlock >= minItemsInBlock && end - nextBlockStart > maxItemsInBlock) { + // The count is too large for one block, so we must break it into "floor" blocks, where + // we record + // the leading label of the suffix of the first term in each floor block, so at search + // time we can + // jump to the right floor block. We just use a naive greedy segmenter here: make a new + // floor + // block as soon as we have at least minItemsInBlock. 
This is not always best: it often + // produces + // a too-small block as the final block: + boolean isFloor = itemsInBlock < count; + newBlocks.add( + writeBlock( + prefixLength, + isFloor, + nextFloorLeadLabel, + nextBlockStart, + i, + hasTerms, + hasSubBlocks)); + + hasTerms = false; + hasSubBlocks = false; + nextFloorLeadLabel = suffixLeadLabel; + nextBlockStart = i; + } + + lastSuffixLeadLabel = suffixLeadLabel; + } + + if (ent.isTerm) { + hasTerms = true; + } else { + hasSubBlocks = true; + } + } + + // Write last block, if any: + if (nextBlockStart < end) { + int itemsInBlock = end - nextBlockStart; + boolean isFloor = itemsInBlock < count; + newBlocks.add( + writeBlock( + prefixLength, + isFloor, + nextFloorLeadLabel, + nextBlockStart, + end, + hasTerms, + hasSubBlocks)); + } + + assert newBlocks.isEmpty() == false; + + PendingBlock firstBlock = newBlocks.get(0); + + assert firstBlock.isFloor || newBlocks.size() == 1; + + firstBlock.compileIndex(newBlocks, scratchBytes); + + // Remove slice from the top of the pending stack, that we just wrote: + pending.subList(pending.size() - count, pending.size()).clear(); + + // Append new block + pending.add(firstBlock); + + newBlocks.clear(); + } + + private boolean allEqual(byte[] b, int startOffset, int endOffset, byte value) { + Objects.checkFromToIndex(startOffset, endOffset, b.length); + for (int i = startOffset; i < endOffset; ++i) { + if (b[i] != value) { + return false; + } + } + return true; + } + + /** + * Writes the specified slice (start is inclusive, end is exclusive) from pending stack as a new + * block. If isFloor is true, there were too many (more than maxItemsInBlock) entries sharing + * the same prefix, and so we broke it into multiple floor blocks where we record the starting + * label of the suffix of each floor block. + */ + private PendingBlock writeBlock( + int prefixLength, + boolean isFloor, + int floorLeadLabel, + int start, + int end, + boolean hasTerms, + boolean hasSubBlocks) + throws IOException { + + assert end > start; + + long startFP = termsOut.getFilePointer(); + + boolean hasFloorLeadLabel = isFloor && floorLeadLabel != -1; + + final BytesRef prefix = new BytesRef(prefixLength + (hasFloorLeadLabel ? 1 : 0)); + System.arraycopy(lastTerm.get().bytes, 0, prefix.bytes, 0, prefixLength); + prefix.length = prefixLength; + + // if (DEBUG2) System.out.println(" writeBlock field=" + fieldInfo.name + " prefix=" + + // ToStringUtils.bytesRefToString(prefix) + " fp=" + startFP + " isFloor=" + isFloor + + // " isLastInFloor=" + (end == pending.size()) + " floorLeadLabel=" + floorLeadLabel + + // " start=" + start + " end=" + end + " hasTerms=" + hasTerms + " hasSubBlocks=" + + // hasSubBlocks); + + // Write block header: + int numEntries = end - start; + int code = numEntries << 1; + if (end == pending.size()) { + // Last block: + code |= 1; + } + termsOut.writeVInt(code); + + /* + if (DEBUG) { + System.out.println(" writeBlock " + (isFloor ? "(floor) " : "") + "seg=" + segment + " pending.size()=" + + pending.size() + " prefixLength=" + prefixLength + " indexPrefix=" + ToStringUtils.bytesRefToString(prefix) + + " entCount=" + (end-start+1) + " startFP=" + startFP + (isFloor ? (" floorLeadLabel=" + Integer.toHexString(floorLeadLabel)) : "")); + } + */ + + // 1st pass: pack term suffix bytes into byte[] blob + // TODO: cutover to bulk int codec... simple64? 
+ + // We optimize the leaf block case (block has only terms), writing a more + // compact format in this case: + boolean isLeafBlock = hasSubBlocks == false; + + // System.out.println(" isLeaf=" + isLeafBlock); + + final List subIndices; + + boolean absolute = true; + + if (isLeafBlock) { + // Block contains only ordinary terms: + subIndices = null; + StatsWriter statsWriter = + new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS); + for (int i = start; i < end; i++) { + PendingEntry ent = pending.get(i); + assert ent.isTerm : "i=" + i; + + PendingTerm term = (PendingTerm) ent; + + assert StringHelper.startsWith(term.termBytes, prefix) : term + " prefix=" + prefix; + BlockTermState state = term.state; + final int suffix = term.termBytes.length - prefixLength; + // if (DEBUG2) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(term.termBytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write term suffix=" + + // ToStringUtils.bytesRefToString(suffixBytes)); + // } + + // For leaf block we write suffix straight + suffixLengthsWriter.writeVInt(suffix); + suffixWriter.append(term.termBytes, prefixLength, suffix); + assert floorLeadLabel == -1 || (term.termBytes[prefixLength] & 0xff) >= floorLeadLabel; + + // Write term stats, to separate byte[] blob: + statsWriter.add(state.docFreq, state.totalTermFreq); + + // Write term meta data + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + absolute = false; + } + statsWriter.finish(); + } else { + // Block has at least one prefix term or a sub block: + subIndices = new ArrayList<>(); + StatsWriter statsWriter = + new StatsWriter(this.statsWriter, fieldInfo.getIndexOptions() != IndexOptions.DOCS); + for (int i = start; i < end; i++) { + PendingEntry ent = pending.get(i); + if (ent.isTerm) { + PendingTerm term = (PendingTerm) ent; + + assert StringHelper.startsWith(term.termBytes, prefix) : term + " prefix=" + prefix; + BlockTermState state = term.state; + final int suffix = term.termBytes.length - prefixLength; + // if (DEBUG2) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(term.termBytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write term suffix=" + + // ToStringUtils.bytesRefToString(suffixBytes)); + // } + + // For non-leaf block we borrow 1 bit to record + // if entry is term or sub-block, and 1 bit to record if + // it's a prefix term. Terms cannot be larger than ~32 KB + // so we won't run out of bits: + + suffixLengthsWriter.writeVInt(suffix << 1); + suffixWriter.append(term.termBytes, prefixLength, suffix); + + // Write term stats, to separate byte[] blob: + statsWriter.add(state.docFreq, state.totalTermFreq); + + // TODO: now that terms dict "sees" these longs, + // we can explore better column-stride encodings + // to encode all long[0]s for this block at + // once, all long[1]s, etc., e.g. using + // Simple64. Alternatively, we could interleave + // stats + meta ... 
no reason to have them + // separate anymore: + + // Write term meta data + postingsWriter.encodeTerm(metaWriter, fieldInfo, state, absolute); + absolute = false; + } else { + PendingBlock block = (PendingBlock) ent; + assert StringHelper.startsWith(block.prefix, prefix); + final int suffix = block.prefix.length - prefixLength; + assert StringHelper.startsWith(block.prefix, prefix); + + assert suffix > 0; + + // For non-leaf block we borrow 1 bit to record + // if entry is term or sub-block:f + suffixLengthsWriter.writeVInt((suffix << 1) | 1); + suffixWriter.append(block.prefix.bytes, prefixLength, suffix); + + // if (DEBUG2) { + // BytesRef suffixBytes = new BytesRef(suffix); + // System.arraycopy(block.prefix.bytes, prefixLength, suffixBytes.bytes, 0, suffix); + // suffixBytes.length = suffix; + // System.out.println(" write sub-block suffix=" + + // ToStringUtils.bytesRefToString(suffixBytes) + " subFP=" + block.fp + " subCode=" + + // (startFP-block.fp) + " floor=" + block.isFloor); + // } + + assert floorLeadLabel == -1 + || (block.prefix.bytes[prefixLength] & 0xff) >= floorLeadLabel + : "floorLeadLabel=" + + floorLeadLabel + + " suffixLead=" + + (block.prefix.bytes[prefixLength] & 0xff); + assert block.fp < startFP; + + suffixLengthsWriter.writeVLong(startFP - block.fp); + subIndices.add(block.index); + } + } + statsWriter.finish(); + + assert subIndices.size() != 0; + } + + // Write suffixes byte[] blob to terms dict output, either uncompressed, compressed with LZ4 + // or with LowercaseAsciiCompression. + CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION; + // If there are 2 suffix bytes or less per term, then we don't bother compressing as suffix + // are unlikely what + // makes the terms dictionary large, and it also tends to be frequently the case for dense IDs + // like + // auto-increment IDs, so not compressing in that case helps not hurt ID lookups by too much. + // We also only start compressing when the prefix length is greater than 2 since blocks whose + // prefix length is + // 1 or 2 always all get visited when running a fuzzy query whose max number of edits is 2. + if (suffixWriter.length() > 2L * numEntries && prefixLength > 2) { + // LZ4 inserts references whenever it sees duplicate strings of 4 chars or more, so only try + // it out if the + // average suffix length is greater than 6. 
+ if (suffixWriter.length() > 6L * numEntries) { + if (compressionHashTable == null) { + compressionHashTable = new LZ4.HighCompressionHashTable(); + } + LZ4.compress( + suffixWriter.bytes(), 0, suffixWriter.length(), spareWriter, compressionHashTable); + if (spareWriter.size() < suffixWriter.length() - (suffixWriter.length() >>> 2)) { + // LZ4 saved more than 25%, go for it + compressionAlg = CompressionAlgorithm.LZ4; + } + } + if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) { + spareWriter.reset(); + if (spareBytes.length < suffixWriter.length()) { + spareBytes = new byte[ArrayUtil.oversize(suffixWriter.length(), 1)]; + } + if (LowercaseAsciiCompression.compress( + suffixWriter.bytes(), suffixWriter.length(), spareBytes, spareWriter)) { + compressionAlg = CompressionAlgorithm.LOWERCASE_ASCII; + } + } + } + long token = ((long) suffixWriter.length()) << 3; + if (isLeafBlock) { + token |= 0x04; + } + token |= compressionAlg.code; + termsOut.writeVLong(token); + if (compressionAlg == CompressionAlgorithm.NO_COMPRESSION) { + termsOut.writeBytes(suffixWriter.bytes(), suffixWriter.length()); + } else { + spareWriter.copyTo(termsOut); + } + suffixWriter.setLength(0); + spareWriter.reset(); + + // Write suffix lengths + final int numSuffixBytes = Math.toIntExact(suffixLengthsWriter.size()); + spareBytes = ArrayUtil.growNoCopy(spareBytes, numSuffixBytes); + suffixLengthsWriter.copyTo(new ByteArrayDataOutput(spareBytes)); + suffixLengthsWriter.reset(); + if (allEqual(spareBytes, 1, numSuffixBytes, spareBytes[0])) { + // Structured fields like IDs often have most values of the same length + termsOut.writeVInt((numSuffixBytes << 1) | 1); + termsOut.writeByte(spareBytes[0]); + } else { + termsOut.writeVInt(numSuffixBytes << 1); + termsOut.writeBytes(spareBytes, numSuffixBytes); + } + + // Stats + final int numStatsBytes = Math.toIntExact(statsWriter.size()); + termsOut.writeVInt(numStatsBytes); + statsWriter.copyTo(termsOut); + statsWriter.reset(); + + // Write term meta data byte[] blob + termsOut.writeVInt((int) metaWriter.size()); + metaWriter.copyTo(termsOut); + metaWriter.reset(); + + // if (DEBUG) { + // System.out.println(" fpEnd=" + out.getFilePointer()); + // } + + if (hasFloorLeadLabel) { + // We already allocated to length+1 above: + prefix.bytes[prefix.length++] = (byte) floorLeadLabel; + } + + return new PendingBlock(prefix, startFP, hasTerms, isFloor, floorLeadLabel, subIndices); + } + + TermsWriter(FieldInfo fieldInfo) { + this.fieldInfo = fieldInfo; + assert fieldInfo.getIndexOptions() != IndexOptions.NONE; + docsSeen = new FixedBitSet(maxDoc); + postingsWriter.setField(fieldInfo); + } + + /** Writes one term's worth of postings. 
*/ + public void write(BytesRef text, TermsEnum termsEnum, NormsProducer norms) throws IOException { + /* + if (DEBUG) { + int[] tmp = new int[lastTerm.length]; + System.arraycopy(prefixStarts, 0, tmp, 0, tmp.length); + System.out.println("BTTW: write term=" + ToStringUtils.bytesRefToString(text) + " prefixStarts=" + Arrays.toString(tmp) + + " pending.size()=" + pending.size()); + } + */ + + BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen, norms); + if (state != null) { + + assert state.docFreq != 0; + assert fieldInfo.getIndexOptions() == IndexOptions.DOCS + || state.totalTermFreq >= state.docFreq + : "postingsWriter=" + postingsWriter; + pushTerm(text); + + PendingTerm term = new PendingTerm(text, state); + pending.add(term); + // if (DEBUG) System.out.println(" add pending term = " + text + " pending.size()=" + + // pending.size()); + + sumDocFreq += state.docFreq; + sumTotalTermFreq += state.totalTermFreq; + numTerms++; + if (firstPendingTerm == null) { + firstPendingTerm = term; + } + lastPendingTerm = term; + } + } + + /** Pushes the new term to the top of the stack, and writes new blocks. */ + private void pushTerm(BytesRef text) throws IOException { + // Find common prefix between last term and current term: + int prefixLength = + Arrays.mismatch( + lastTerm.bytes(), + 0, + lastTerm.length(), + text.bytes, + text.offset, + text.offset + text.length); + if (prefixLength == -1) { // Only happens for the first term, if it is empty + assert lastTerm.length() == 0; + prefixLength = 0; + } + + // if (DEBUG) System.out.println(" shared=" + pos + " lastTerm.length=" + lastTerm.length); + + // Close the "abandoned" suffix now: + for (int i = lastTerm.length() - 1; i >= prefixLength; i--) { + + // How many items on top of the stack share the current suffix + // we are closing: + int prefixTopSize = pending.size() - prefixStarts[i]; + if (prefixTopSize >= minItemsInBlock) { + // if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + + // " minItemsInBlock=" + minItemsInBlock); + writeBlocks(i + 1, prefixTopSize); + prefixStarts[i] -= prefixTopSize - 1; + } + } + + if (prefixStarts.length < text.length) { + prefixStarts = ArrayUtil.grow(prefixStarts, text.length); + } + + // Init new tail: + for (int i = prefixLength; i < text.length; i++) { + prefixStarts[i] = pending.size(); + } + + lastTerm.copyBytes(text); + } + + // Finishes all terms in this field + public void finish() throws IOException { + if (numTerms > 0) { + // if (DEBUG) System.out.println("BTTW: finish prefixStarts=" + + // Arrays.toString(prefixStarts)); + + // Add empty term to force closing of all final blocks: + pushTerm(new BytesRef()); + + // TODO: if pending.size() is already 1 with a non-zero prefix length + // we can save writing a "degenerate" root block, but we have to + // fix all the places that assume the root block's prefix is the empty string: + pushTerm(new BytesRef()); + writeBlocks(0, pending.size()); + + // We better have one final "root" block: + assert pending.size() == 1 && !pending.get(0).isTerm + : "pending.size()=" + pending.size() + " pending=" + pending; + final PendingBlock root = (PendingBlock) pending.get(0); + assert root.prefix.length == 0; + assert root.index.getEmptyOutput() != null; + + ByteBuffersDataOutput metaOut = new ByteBuffersDataOutput(); + fields.add(metaOut); + + metaOut.writeVInt(fieldInfo.number); + metaOut.writeVLong(numTerms); + assert fieldInfo.getIndexOptions() != IndexOptions.NONE; + if (fieldInfo.getIndexOptions() != 
IndexOptions.DOCS) { + metaOut.writeVLong(sumTotalTermFreq); + } + metaOut.writeVLong(sumDocFreq); + metaOut.writeVInt(docsSeen.cardinality()); + writeBytesRef(metaOut, new BytesRef(firstPendingTerm.termBytes)); + writeBytesRef(metaOut, new BytesRef(lastPendingTerm.termBytes)); + root.index.save(metaOut, indexOut); + // System.out.println(" write trie " + indexStartFP + " field=" + fieldInfo.name); + + /* + if (DEBUG) { + final String dotFileName = segment + "_" + fieldInfo.name + ".dot"; + Writer w = new OutputStreamWriter(new FileOutputStream(dotFileName)); + Util.toDot(root.index, w, false, false); + System.out.println("SAVED to " + dotFileName); + w.close(); + } + */ + + } else { + assert sumTotalTermFreq == 0 + || fieldInfo.getIndexOptions() == IndexOptions.DOCS && sumTotalTermFreq == -1; + assert sumDocFreq == 0; + assert docsSeen.cardinality() == 0; + } + } + + private final ByteBuffersDataOutput suffixLengthsWriter = + ByteBuffersDataOutput.newResettableInstance(); + private final BytesRefBuilder suffixWriter = new BytesRefBuilder(); + private final ByteBuffersDataOutput statsWriter = ByteBuffersDataOutput.newResettableInstance(); + private final ByteBuffersDataOutput metaWriter = ByteBuffersDataOutput.newResettableInstance(); + private final ByteBuffersDataOutput spareWriter = ByteBuffersDataOutput.newResettableInstance(); + private byte[] spareBytes = BytesRef.EMPTY_BYTES; + private LZ4.HighCompressionHashTable compressionHashTable; + } + + private boolean closed; + + @Override + public void close() throws IOException { + if (closed) { + return; + } + closed = true; + + boolean success = false; + try { + metaOut.writeVInt(fields.size()); + for (ByteBuffersDataOutput fieldMeta : fields) { + fieldMeta.copyTo(metaOut); + } + CodecUtil.writeFooter(indexOut); + metaOut.writeLong(indexOut.getFilePointer()); + CodecUtil.writeFooter(termsOut); + metaOut.writeLong(termsOut.getFilePointer()); + CodecUtil.writeFooter(metaOut); + success = true; + } finally { + if (success) { + IOUtils.close(metaOut, termsOut, indexOut, postingsWriter); + } else { + IOUtils.closeWhileHandlingException(metaOut, termsOut, indexOut, postingsWriter); + } + } + } + + private static void writeBytesRef(DataOutput out, BytesRef bytes) throws IOException { + out.writeVInt(bytes.length); + out.writeBytes(bytes.bytes, bytes.offset, bytes.length); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/SegmentTermsEnum.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/SegmentTermsEnum.java new file mode 100644 index 000000000000..99acaae560d0 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/SegmentTermsEnum.java @@ -0,0 +1,1070 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene103.blocktree; + +import java.io.IOException; +import java.io.PrintStream; +import java.util.Arrays; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.index.BaseTermsEnum; +import org.apache.lucene.index.ImpactsEnum; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.TermState; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; +import org.apache.lucene.util.IOBooleanSupplier; +import org.apache.lucene.util.RamUsageEstimator; + +/** Iterates through terms in this field. */ +final class SegmentTermsEnum extends BaseTermsEnum { + + // Lazy init: + IndexInput in; + private SegmentTermsEnumFrame[] stack = new SegmentTermsEnumFrame[0]; + private final SegmentTermsEnumFrame staticFrame; + SegmentTermsEnumFrame currentFrame; + boolean termExists; + final FieldReader fr; + private int targetBeforeCurrentLength; + + // static boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + // What prefix of the current term was present in the index; when we only next() through the + // index, this stays at 0. It's only set when + // we seekCeil/Exact: + private int validIndexPrefix; + + // assert only: + private boolean eof; + + final BytesRefBuilder term = new BytesRefBuilder(); + private final TrieReader trieReader; + private TrieReader.Node[] nodes = new TrieReader.Node[1]; + + public SegmentTermsEnum(FieldReader fr, TrieReader reader) throws IOException { + this.fr = fr; + // Used to hold seek by TermState, or cached seek + staticFrame = new SegmentTermsEnumFrame(this, -1); + trieReader = reader; + currentFrame = staticFrame; + nodes[0] = trieReader.root; + + // currentFrame = pushFrame(arc, rootCode, 0); + // currentFrame.loadBlock(); + validIndexPrefix = 0; + // if (DEBUG) { + // System.out.println("init frame state " + currentFrame.ord); + // printSeekState(); + // } + + // System.out.println(); + // computeBlockStats().print(System.out); + } + + // Not private to avoid synthetic access$NNN methods + void initIndexInput() { + if (this.in == null) { + this.in = fr.parent.termsIn.clone(); + } + } + + /** Runs next() through the entire terms dict, computing aggregate statistics. */ + public Stats computeBlockStats() throws IOException { + Stats stats = new Stats(fr.parent.segment, fr.fieldInfo.name); + + currentFrame = staticFrame; + TrieReader.Node node = nodes[0] = trieReader.root; + + // Empty string prefix must have an output in the + // index! 
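+ // The root node's output carries the file pointer of the field's root block, so pushing it + // with prefix length 0 lands this enum on that root block before the stats walk starts.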
+ currentFrame = pushFrame(node, 0); + currentFrame.fpOrig = currentFrame.fp; + currentFrame.loadBlock(); + validIndexPrefix = 0; + + stats.startBlock(currentFrame, !currentFrame.isLastInFloor); + + allTerms: + while (true) { + + // Pop finished blocks + while (currentFrame.nextEnt == currentFrame.entCount) { + stats.endBlock(currentFrame); + if (!currentFrame.isLastInFloor) { + // Advance to next floor block + currentFrame.loadNextFloorBlock(); + stats.startBlock(currentFrame, true); + break; + } else { + if (currentFrame.ord == 0) { + break allTerms; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord - 1]; + assert lastFP == currentFrame.lastSubFP; + // if (DEBUG) { + // System.out.println(" reset validIndexPrefix=" + validIndexPrefix); + // } + } + } + + while (true) { + if (currentFrame.next()) { + // Push to new block: + currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length()); + currentFrame.fpOrig = currentFrame.fp; + // This is a "next" frame -- even if it's + // floor'd we must pretend it isn't so we don't + // try to scan to the right floor frame: + currentFrame.loadBlock(); + stats.startBlock(currentFrame, !currentFrame.isLastInFloor); + } else { + stats.term(term.get()); + break; + } + } + } + + stats.finish(); + + // Put root frame back: + currentFrame = staticFrame; + + node = nodes[0] = trieReader.root; + // Empty string prefix must have an output in the index! + assert node.hasOutput(); + + currentFrame = pushFrame(node, 0); + currentFrame.rewind(); + currentFrame.loadBlock(); + validIndexPrefix = 0; + term.clear(); + + return stats; + } + + private SegmentTermsEnumFrame getFrame(int ord) throws IOException { + if (ord >= stack.length) { + final SegmentTermsEnumFrame[] next = + new SegmentTermsEnumFrame + [ArrayUtil.oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(stack, 0, next, 0, stack.length); + for (int stackOrd = stack.length; stackOrd < next.length; stackOrd++) { + next[stackOrd] = new SegmentTermsEnumFrame(this, stackOrd); + } + stack = next; + } + assert stack[ord].ord == ord; + return stack[ord]; + } + + private TrieReader.Node getNode(int ord) { + if (ord >= nodes.length) { + final TrieReader.Node[] next = + new TrieReader.Node[ArrayUtil.oversize(1 + ord, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; + System.arraycopy(nodes, 0, next, 0, nodes.length); + for (int nodeOrd = nodes.length; nodeOrd < next.length; nodeOrd++) { + next[nodeOrd] = new TrieReader.Node(); + } + nodes = next; + } + return nodes[ord]; + } + + // Pushes a frame we seek'd to + SegmentTermsEnumFrame pushFrame(TrieReader.Node node, int length) throws IOException { + final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); + f.hasTerms = node.hasTerms; + f.hasTermsOrig = f.hasTerms; + f.isFloor = node.isFloor(); + if (f.isFloor) { + f.setFloorData(node.floorData(trieReader)); + } + pushFrame(node, node.outputFp, length); + + return f; + } + + // Pushes next'd frame or seek'd frame; we later + // lazy-load the frame only when needed + SegmentTermsEnumFrame pushFrame(TrieReader.Node node, long fp, int length) throws IOException { + final SegmentTermsEnumFrame f = getFrame(1 + currentFrame.ord); + f.node = node; + if (f.fpOrig == fp && f.nextEnt != -1) { + // if (DEBUG) System.out.println(" push reused frame ord=" + f.ord + " fp=" + f.fp + + // " isFloor?=" + f.isFloor + " hasTerms=" + f.hasTerms + " pref=" + term + " nextEnt=" + + // f.nextEnt + " targetBeforeCurrentLength=" + targetBeforeCurrentLength + " 
term.length=" + + // term.length + " vs prefix=" + f.prefix); + // if (f.prefix > targetBeforeCurrentLength) { + if (f.ord > targetBeforeCurrentLength) { + f.rewind(); + } else { + // if (DEBUG) { + // System.out.println(" skip rewind!"); + // } + } + assert length == f.prefixLength; + } else { + f.nextEnt = -1; + f.prefixLength = length; + f.state.termBlockOrd = 0; + f.fpOrig = f.fp = fp; + f.lastSubFP = -1; + // if (DEBUG) { + // final int sav = term.length; + // term.length = length; + // System.out.println(" push new frame ord=" + f.ord + " fp=" + f.fp + " hasTerms=" + + // f.hasTerms + " isFloor=" + f.isFloor + " pref=" + ToStringUtils.bytesRefToString(term)); + // term.length = sav; + // } + } + + return f; + } + + // asserts only + private boolean clearEOF() { + eof = false; + return true; + } + + // asserts only + private boolean setEOF() { + eof = true; + return true; + } + + private IOBooleanSupplier prepareSeekExact(BytesRef target, boolean prefetch) throws IOException { + if (fr.size() > 0 && (target.compareTo(fr.getMin()) < 0 || target.compareTo(fr.getMax()) > 0)) { + return null; + } + + term.grow(1 + target.length); + + assert clearEOF(); + + // if (DEBUG) { + // System.out.println("\nBTTR.seekExact seg=" + fr.parent.segment + " target=" + + // fr.fieldInfo.name + ":" + ToStringUtils.bytesRefToString(target) + " current=" + + // ToStringUtils.bytesRefToString(term) + + // " (exists?=" + termExists + ") validIndexPrefix=" + validIndexPrefix); + // printSeekState(System.out); + // } + + TrieReader.Node node; + int targetUpto; + + targetBeforeCurrentLength = currentFrame.ord; + + if (currentFrame != staticFrame) { + + // We are already seek'd; find the common + // prefix of new seek term vs current term and + // re-use the corresponding seek state. For + // example, if app first seeks to foobar, then + // seeks to foobaz, we can re-use the seek state + // for the first 5 bytes. 
+ + // if (DEBUG) { + // System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); + // } + + node = nodes[0]; + assert node.hasOutput(); + targetUpto = 0; + + SegmentTermsEnumFrame lastFrame = stack[0]; + assert validIndexPrefix <= term.length(); + + final int targetLimit = Math.min(target.length, validIndexPrefix); + + int cmp = 0; + + // First compare up to valid seek frames: + while (targetUpto < targetLimit) { + cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); + // if (DEBUG) { + // System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + // + ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + + // " vs termLabel=" + (char) (term.bytes[targetUpto]) + ")" + " node.output=" + + // node.output + // + " output=" + output); + // } + if (cmp != 0) { + break; + } + node = nodes[1 + targetUpto]; + assert node.label == (target.bytes[target.offset + targetUpto] & 0xFF) + : "node.label=" + + (char) node.label + + " targetLabel=" + + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + + if (node.hasOutput()) { + lastFrame = stack[1 + lastFrame.ord]; + } + targetUpto++; + } + + if (cmp == 0) { + // Second compare the rest of the term, but + // don't save node/output/frame; we only do this + // to find out if the target term is before, + // equal or after the current term + cmp = + Arrays.compareUnsigned( + term.bytes(), + targetUpto, + term.length(), + target.bytes, + target.offset + targetUpto, + target.offset + target.length); + } + + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + // if (DEBUG) { + // System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); + // frame.ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + + } else if (cmp > 0) { + // Uncommon case: target term + // is before current term; this means we can + // keep the currentFrame but we must rewind it + // (so we scan from the start) + targetBeforeCurrentLength = lastFrame.ord; + // if (DEBUG) { + // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); + // rewind frame ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + currentFrame.rewind(); + } else { + // Target is exactly the same as current term + assert term.length() == target.length; + if (termExists) { + // if (DEBUG) { + // System.out.println(" target is same as current; return true"); + // } + return () -> true; + } else { + // if (DEBUG) { + // System.out.println(" target is same as current but term doesn't exist"); + // } + } + // validIndexPrefix = currentFrame.depth; + // term.length = target.length; + // return termExists; + } + + } else { + + targetBeforeCurrentLength = -1; + node = trieReader.root; + + // Empty string prefix must have an output (block) in the index! 
+ assert node.hasOutput(); + + // if (DEBUG) { + // System.out.println(" no seek state; push root frame"); + // } + + currentFrame = staticFrame; + + // term.length = 0; + targetUpto = 0; + currentFrame = pushFrame(node, 0); + } + + // if (DEBUG) { + // System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + + // " currentFrame.ord=" + currentFrame.ord + " targetBeforeCurrentLength=" + + // targetBeforeCurrentLength); + // } + + // We are done sharing the common prefix with the incoming target and where we are currently + // seek'd; now continue walking the index: + while (targetUpto < target.length) { + + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; + + final TrieReader.Node nextNode = + trieReader.lookupChild(targetLabel, node, getNode(1 + targetUpto)); + + if (nextNode == null) { + + // Index is exhausted + // if (DEBUG) { + // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + + // toHex(targetLabel)); + // } + + validIndexPrefix = currentFrame.prefixLength; + // validIndexPrefix = targetUpto; + + currentFrame.scanToFloorFrame(target); + + if (!currentFrame.hasTerms) { + termExists = false; + term.setByteAt(targetUpto, (byte) targetLabel); + term.setLength(1 + targetUpto); + // if (DEBUG) { + // System.out.println(" FAST NOT_FOUND term=" + ToStringUtils.bytesRefToString(term)); + // } + return null; + } + + if (prefetch) { + currentFrame.prefetchBlock(); + } + + return () -> { + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, true); + if (result == SeekStatus.FOUND) { + // if (DEBUG) { + // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" got " + result + "; return NOT_FOUND term=" + + // ToStringUtils.bytesRefToString(term)); + // } + return false; + } + }; + } else { + // Follow this node + node = nextNode; + term.setByteAt(targetUpto, (byte) targetLabel); + // Aggregate output as we go: + + // if (DEBUG) { + // System.out.println(" index: follow label=" + toHex(target.bytes[target.offset + + // targetUpto]&0xff) + " node.output=" + node.output + " node.nfo=" + node.nextFinalOutput); + // } + targetUpto++; + + if (node.hasOutput()) { + // if (DEBUG) System.out.println(" node is final!"); + currentFrame = pushFrame(node, targetUpto); + // if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + + // currentFrame.hasTerms); + } + } + } + + // validIndexPrefix = targetUpto; + validIndexPrefix = currentFrame.prefixLength; + + currentFrame.scanToFloorFrame(target); + + // Target term is entirely contained in the index: + if (!currentFrame.hasTerms) { + termExists = false; + term.setLength(targetUpto); + // if (DEBUG) { + // System.out.println(" FAST NOT_FOUND term=" + ToStringUtils.bytesRefToString(term)); + // } + return null; + } + + if (prefetch) { + currentFrame.prefetchBlock(); + } + + return () -> { + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, true); + if (result == SeekStatus.FOUND) { + // if (DEBUG) { + // System.out.println(" return FOUND term=" + term.utf8ToString() + " " + term); + // } + return true; + } else { + // if (DEBUG) { + // System.out.println(" got result " + result + "; return NOT_FOUND term=" + + // term.utf8ToString()); + // } + + return false; + } + }; + } + + @Override + public IOBooleanSupplier prepareSeekExact(BytesRef target) throws IOException { + return 
prepareSeekExact(target, true); + } + + @Override + public boolean seekExact(BytesRef target) throws IOException { + IOBooleanSupplier termExistsSupplier = prepareSeekExact(target, false); + return termExistsSupplier != null && termExistsSupplier.get(); + } + + @Override + public SeekStatus seekCeil(BytesRef target) throws IOException { + + term.grow(1 + target.length); + + assert clearEOF(); + + // if (DEBUG) { + // System.out.println("\nBTTR.seekCeil seg=" + fr.parent.segment + " target=" + + // fr.fieldInfo.name + ":" + ToStringUtils.bytesRefToString(target) + " current=" + + // ToStringUtils.bytesRefToString(term) + " (exists?=" + termExists + + // ") validIndexPrefix= " + validIndexPrefix); + // printSeekState(System.out); + // } + + TrieReader.Node node; + int targetUpto; + + targetBeforeCurrentLength = currentFrame.ord; + + if (currentFrame != staticFrame) { + + // We are already seek'd; find the common + // prefix of new seek term vs current term and + // re-use the corresponding seek state. For + // example, if app first seeks to foobar, then + // seeks to foobaz, we can re-use the seek state + // for the first 5 bytes. + + // if (DEBUG) { + // System.out.println(" re-use current seek state validIndexPrefix=" + validIndexPrefix); + // } + + node = nodes[0]; + assert node.hasOutput(); + targetUpto = 0; + + SegmentTermsEnumFrame lastFrame = stack[0]; + assert validIndexPrefix <= term.length(); + + final int targetLimit = Math.min(target.length, validIndexPrefix); + + int cmp = 0; + + // First compare up to valid seek frames: + while (targetUpto < targetLimit) { + cmp = (term.byteAt(targetUpto) & 0xFF) - (target.bytes[target.offset + targetUpto] & 0xFF); + // if (DEBUG) { + // System.out.println(" cycle targetUpto=" + targetUpto + " (vs limit=" + targetLimit + + // ") cmp=" + cmp + " (targetLabel=" + (char) (target.bytes[target.offset + targetUpto]) + + // " vs termLabel=" + (char) (term.byteAt(targetUpto)) + ")" + " node.output=" + + // node.output + // + " output=" + output); + // } + if (cmp != 0) { + break; + } + node = nodes[1 + targetUpto]; + assert node.label == (target.bytes[target.offset + targetUpto] & 0xFF) + : "node.label=" + + (char) node.label + + " targetLabel=" + + (char) (target.bytes[target.offset + targetUpto] & 0xFF); + + if (node.hasOutput()) { + lastFrame = stack[1 + lastFrame.ord]; + } + targetUpto++; + } + + if (cmp == 0) { + // Second compare the rest of the term, but + // don't save node/output/frame: + cmp = + Arrays.compareUnsigned( + term.bytes(), + targetUpto, + term.length(), + target.bytes, + target.offset + targetUpto, + target.offset + target.length); + } + + if (cmp < 0) { + // Common case: target term is after current + // term, ie, app is seeking multiple terms + // in sorted order + // if (DEBUG) { + // System.out.println(" target is after current (shares prefixLen=" + targetUpto + "); + // clear frame.scanned ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + + } else if (cmp > 0) { + // Uncommon case: target term + // is before current term; this means we can + // keep the currentFrame but we must rewind it + // (so we scan from the start) + targetBeforeCurrentLength = 0; + // if (DEBUG) { + // System.out.println(" target is before current (shares prefixLen=" + targetUpto + "); + // rewind frame ord=" + lastFrame.ord); + // } + currentFrame = lastFrame; + currentFrame.rewind(); + } else { + // Target is exactly the same as current term + assert term.length() == target.length; + if (termExists) { + // if (DEBUG) { + // 
System.out.println(" target is same as current; return FOUND"); + // } + return SeekStatus.FOUND; + } else { + // if (DEBUG) { + // System.out.println(" target is same as current but term doesn't exist"); + // } + } + } + + } else { + + targetBeforeCurrentLength = -1; + node = nodes[0] = trieReader.root; + + // Empty string prefix must have an output (block) in the index! + assert node.hasOutput(); + + // if (DEBUG) { + // System.out.println(" no seek state; push root frame"); + // } + + currentFrame = staticFrame; + + // term.length = 0; + targetUpto = 0; + currentFrame = pushFrame(node, 0); + } + + // if (DEBUG) { + // System.out.println(" start index loop targetUpto=" + targetUpto + " output=" + output + + // " currentFrame.ord+1=" + currentFrame.ord + " targetBeforeCurrentLength=" + + // targetBeforeCurrentLength); + // } + + // We are done sharing the common prefix with the incoming target and where we are currently + // seek'd; now continue walking the index: + while (targetUpto < target.length) { + + final int targetLabel = target.bytes[target.offset + targetUpto] & 0xFF; + + final TrieReader.Node nextNode = + trieReader.lookupChild(targetLabel, node, getNode(1 + targetUpto)); + + if (nextNode == null) { + + // Index is exhausted + // if (DEBUG) { + // System.out.println(" index: index exhausted label=" + ((char) targetLabel) + " " + + // targetLabel); + // } + + validIndexPrefix = currentFrame.prefixLength; + // validIndexPrefix = targetUpto; + + currentFrame.scanToFloorFrame(target); + + currentFrame.loadBlock(); + + // if (DEBUG) System.out.println(" now scanToTerm"); + final SeekStatus result = currentFrame.scanToTerm(target, false); + if (result == SeekStatus.END) { + term.copyBytes(target); + termExists = false; + + if (next() != null) { + // if (DEBUG) { + // System.out.println(" return NOT_FOUND term=" + + // ToStringUtils.bytesRefToString(term)); + // } + return SeekStatus.NOT_FOUND; + } else { + // if (DEBUG) { + // System.out.println(" return END"); + // } + return SeekStatus.END; + } + } else { + // if (DEBUG) { + // System.out.println(" return " + result + " term=" + + // ToStringUtils.bytesRefToString(term)); + // } + return result; + } + } else { + // Follow this node + term.setByteAt(targetUpto, (byte) targetLabel); + node = nextNode; + + // if (DEBUG) { + // System.out.println(" index: follow label=" + (target.bytes[target.offset + + // targetUpto]&0xff) + " node.output=" + node.output + " node.nfo=" + node.nextFinalOutput); + // } + targetUpto++; + + if (node.hasOutput()) { + // if (DEBUG) System.out.println(" node is final!"); + currentFrame = pushFrame(node, targetUpto); + // if (DEBUG) System.out.println(" curFrame.ord=" + currentFrame.ord + " hasTerms=" + + // currentFrame.hasTerms); + } + } + } + + // validIndexPrefix = targetUpto; + validIndexPrefix = currentFrame.prefixLength; + + currentFrame.scanToFloorFrame(target); + + currentFrame.loadBlock(); + + final SeekStatus result = currentFrame.scanToTerm(target, false); + + if (result == SeekStatus.END) { + term.copyBytes(target); + termExists = false; + if (next() != null) { + // if (DEBUG) { + // System.out.println(" return NOT_FOUND term=" + term.get().utf8ToString() + " " + term); + // } + return SeekStatus.NOT_FOUND; + } else { + // if (DEBUG) { + // System.out.println(" return END"); + // } + return SeekStatus.END; + } + } else { + return result; + } + } + + @SuppressWarnings("unused") + private void printSeekState(PrintStream out) throws IOException { + if (currentFrame == staticFrame) { + out.println(" 
no prior seek"); + } else { + out.println(" prior seek state:"); + int ord = 0; + boolean isSeekFrame = true; + while (true) { + SegmentTermsEnumFrame f = getFrame(ord); + assert f != null; + final BytesRef prefix = new BytesRef(term.get().bytes, 0, f.prefixLength); + if (f.nextEnt == -1) { + out.println( + " frame " + + (isSeekFrame ? "(seek)" : "(next)") + + " ord=" + + ord + + " fp=" + + f.fp + + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + + " prefixLen=" + + f.prefixLength + + " prefix=" + + prefix + + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + + " hasTerms=" + + f.hasTerms + + " isFloor=" + + f.isFloor + + " isLastInFloor=" + + f.isLastInFloor + + " mdUpto=" + + f.metaDataUpto + + " tbOrd=" + + f.getTermBlockOrd()); + } else { + out.println( + " frame " + + (isSeekFrame ? "(seek, loaded)" : "(next, loaded)") + + " ord=" + + ord + + " fp=" + + f.fp + + (f.isFloor ? (" (fpOrig=" + f.fpOrig + ")") : "") + + " prefixLen=" + + f.prefixLength + + " prefix=" + + prefix + + " nextEnt=" + + f.nextEnt + + (f.nextEnt == -1 ? "" : (" (of " + f.entCount + ")")) + + " hasTerms=" + + f.hasTerms + + " isFloor=" + + f.isFloor + + " lastSubFP=" + + f.lastSubFP + + " isLastInFloor=" + + f.isLastInFloor + + " mdUpto=" + + f.metaDataUpto + + " tbOrd=" + + f.getTermBlockOrd()); + } + assert !isSeekFrame || f.node != null : "isSeekFrame=" + isSeekFrame + " f.node=" + f.node; + if (f.prefixLength > 0 + && isSeekFrame + && f.node.label != (term.byteAt(f.prefixLength - 1) & 0xFF)) { + out.println( + " broken seek state: node.label=" + + (char) f.node.label + + " vs term byte=" + + (char) (term.byteAt(f.prefixLength - 1) & 0xFF)); + throw new RuntimeException("seek state is broken"); + } + + TrieReader.Node node = trieReader.root; + TrieReader.Node child = new TrieReader.Node(); + for (int i = 0; i < prefix.length; i++) { + TrieReader.Node found = + trieReader.lookupChild(prefix.bytes[i + prefix.offset] & 0xFF, node, child); + if (found == null) { + throw new RuntimeException("seek state is broken, prefix not exist in index"); + } + node = child; + child = new TrieReader.Node(); + } + if (!node.hasOutput()) { + out.println(" broken seek state: prefix is not final in index"); + throw new RuntimeException("seek state is broken"); + } else if (isSeekFrame && !f.isFloor) { + if (f.fp != node.outputFp || f.hasTerms != node.hasTerms || f.isFloor != node.isFloor()) { + out.println( + " broken seek state: output fp=" + + node.outputFp + + ", hasTerms=" + + node.hasTerms + + ", isFloor=" + + node.isFloor() + + " doesn't match frame fp=" + + f.fp + + ", hasTerms=" + + f.hasTerms + + ", isFloor=" + + f.isFloor); + throw new RuntimeException("seek state is broken"); + } + } + + if (f == currentFrame) { + break; + } + if (f.prefixLength == validIndexPrefix) { + isSeekFrame = false; + } + ord++; + } + } + } + + /* Decodes only the term bytes of the next term. If caller then asks for + metadata, ie docFreq, totalTermFreq or pulls a D/&PEnum, we then (lazily) + decode all metadata up to the current term. 
*/ + @Override + public BytesRef next() throws IOException { + if (in == null) { + // Fresh TermsEnum; seek to first term: + final TrieReader.Node node = nodes[0] = trieReader.root; + currentFrame = pushFrame(node, 0); + currentFrame.loadBlock(); + } + + targetBeforeCurrentLength = currentFrame.ord; + + assert !eof; + // if (DEBUG) { + // System.out.println("\nBTTR.next seg=" + fr.parent.segment + " term=" + + // ToStringUtils.bytesRefToString(term) + " termExists?=" + termExists + " field=" + + // fr.fieldInfo.name + " termBlockOrd=" + currentFrame.state.termBlockOrd + + // " validIndexPrefix=" + validIndexPrefix); + // printSeekState(System.out); + // } + + if (currentFrame == staticFrame) { + // If seek was previously called and the term was + // cached, or seek(TermState) was called, usually + // caller is just going to pull a D/&PEnum or get + // docFreq, etc. But, if they then call next(), + // this method catches up all internal state so next() + // works properly: + // if (DEBUG) System.out.println(" re-seek to pending term=" + term.utf8ToString() + " " + + // term); + final boolean result = seekExact(term.get()); + assert result; + } + + // Pop finished blocks + while (currentFrame.nextEnt == currentFrame.entCount) { + if (!currentFrame.isLastInFloor) { + // Advance to next floor block + currentFrame.loadNextFloorBlock(); + break; + } else { + // if (DEBUG) System.out.println(" pop frame"); + if (currentFrame.ord == 0) { + // if (DEBUG) System.out.println(" return null"); + assert setEOF(); + term.clear(); + validIndexPrefix = 0; + currentFrame.rewind(); + termExists = false; + return null; + } + final long lastFP = currentFrame.fpOrig; + currentFrame = stack[currentFrame.ord - 1]; + + if (currentFrame.nextEnt == -1 || currentFrame.lastSubFP != lastFP) { + // We popped into a frame that's not loaded + // yet or not scan'd to the right entry + currentFrame.scanToFloorFrame(term.get()); + currentFrame.loadBlock(); + currentFrame.scanToSubBlock(lastFP); + } + + // Note that the seek state (last seek) has been + // invalidated beyond this depth + validIndexPrefix = Math.min(validIndexPrefix, currentFrame.prefixLength); + // if (DEBUG) { + // System.out.println(" reset validIndexPrefix=" + validIndexPrefix); + // } + } + } + + while (true) { + if (currentFrame.next()) { + // Push to new block: + // if (DEBUG) System.out.println(" push frame"); + currentFrame = pushFrame(null, currentFrame.lastSubFP, term.length()); + // This is a "next" frame -- even if it's + // floor'd we must pretend it isn't so we don't + // try to scan to the right floor frame: + currentFrame.loadBlock(); + } else { + // if (DEBUG) System.out.println(" return term=" + ToStringUtils.bytesRefToString(term) + + // " currentFrame.ord=" + currentFrame.ord); + return term.get(); + } + } + } + + @Override + public BytesRef term() { + assert !eof; + return term.get(); + } + + @Override + public int docFreq() throws IOException { + assert !eof; + // if (DEBUG) System.out.println("BTR.docFreq"); + currentFrame.decodeMetaData(); + // if (DEBUG) System.out.println(" return " + currentFrame.state.docFreq); + return currentFrame.state.docFreq; + } + + @Override + public long totalTermFreq() throws IOException { + assert !eof; + currentFrame.decodeMetaData(); + return currentFrame.state.totalTermFreq; + } + + @Override + public PostingsEnum postings(PostingsEnum reuse, int flags) throws IOException { + assert !eof; + // if (DEBUG) { + // System.out.println("BTTR.docs seg=" + segment); + // } + currentFrame.decodeMetaData(); + // 
if (DEBUG) { + // System.out.println(" state=" + currentFrame.state); + // } + return fr.parent.postingsReader.postings(fr.fieldInfo, currentFrame.state, reuse, flags); + } + + @Override + public ImpactsEnum impacts(int flags) throws IOException { + assert !eof; + // if (DEBUG) { + // System.out.println("BTTR.docs seg=" + segment); + // } + currentFrame.decodeMetaData(); + // if (DEBUG) { + // System.out.println(" state=" + currentFrame.state); + // } + return fr.parent.postingsReader.impacts(fr.fieldInfo, currentFrame.state, flags); + } + + @Override + public void seekExact(BytesRef target, TermState otherState) { + // if (DEBUG) { + // System.out.println("BTTR.seekExact termState seg=" + segment + " target=" + + // target.utf8ToString() + " " + target + " state=" + otherState); + // } + assert clearEOF(); + if (target.compareTo(term.get()) != 0 || !termExists) { + assert otherState != null && otherState instanceof BlockTermState; + currentFrame = staticFrame; + currentFrame.state.copyFrom(otherState); + term.copyBytes(target); + currentFrame.metaDataUpto = currentFrame.getTermBlockOrd(); + assert currentFrame.metaDataUpto > 0; + validIndexPrefix = 0; + } else { + // if (DEBUG) { + // System.out.println(" skip seek: already on target state=" + currentFrame.state); + // } + } + } + + @Override + public TermState termState() throws IOException { + assert !eof; + currentFrame.decodeMetaData(); + TermState ts = currentFrame.state.clone(); + // if (DEBUG) System.out.println("BTTR.termState seg=" + segment + " state=" + ts); + return ts; + } + + @Override + public void seekExact(long ord) { + throw new UnsupportedOperationException(); + } + + @Override + public long ord() { + throw new UnsupportedOperationException(); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/SegmentTermsEnumFrame.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/SegmentTermsEnumFrame.java new file mode 100644 index 000000000000..1f3fcfcd05cd --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/SegmentTermsEnumFrame.java @@ -0,0 +1,879 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene103.blocktree; + +import java.io.IOException; +import java.util.Arrays; +import org.apache.lucene.codecs.BlockTermState; +import org.apache.lucene.index.CorruptIndexException; +import org.apache.lucene.index.IndexOptions; +import org.apache.lucene.index.TermsEnum.SeekStatus; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; + +final class SegmentTermsEnumFrame { + // Our index in stack[]: + final int ord; + + boolean hasTerms; + boolean hasTermsOrig; + boolean isFloor; + + TrieReader.Node node; + + // static boolean DEBUG = BlockTreeTermsWriter.DEBUG; + + // File pointer where this block was loaded from + long fp; + long fpOrig; + long fpEnd; + long totalSuffixBytes; // for stats + + byte[] suffixBytes = new byte[128]; + final ByteArrayDataInput suffixesReader = new ByteArrayDataInput(); + + byte[] suffixLengthBytes; + final ByteArrayDataInput suffixLengthsReader; + + byte[] statBytes = new byte[64]; + int statsSingletonRunLength = 0; + final ByteArrayDataInput statsReader = new ByteArrayDataInput(); + + long rewindPos; + + long floorDataPos; + IndexInput floorDataReader; + + // Length of prefix shared by all terms in this block + int prefixLength; + + // Number of entries (term or sub-block) in this block + int entCount; + + // Which term we will next read, or -1 if the block + // isn't loaded yet + int nextEnt; + + // True if this block is either not a floor block, + // or, it's the last sub-block of a floor block + boolean isLastInFloor; + + // True if all entries are terms + boolean isLeafBlock; + + // True if all entries have the same length. + boolean allEqual; + + long lastSubFP; + + int nextFloorLabel; + int numFollowFloorBlocks; + + // Next term to decode metaData; we decode metaData + // lazily so that scanning to find the matching term is + // fast and only if you find a match and app wants the + // stats or docs/positions enums, will we decode the + // metaData + int metaDataUpto; + + final BlockTermState state; + + // metadata buffer + byte[] bytes = new byte[32]; + final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); + + private final SegmentTermsEnum ste; + + public SegmentTermsEnumFrame(SegmentTermsEnum ste, int ord) throws IOException { + this.ste = ste; + this.ord = ord; + this.state = ste.fr.parent.postingsReader.newTermState(); + this.state.totalTermFreq = -1; + suffixLengthBytes = new byte[32]; + suffixLengthsReader = new ByteArrayDataInput(); + } + + public void setFloorData(IndexInput in) throws IOException { + floorDataReader = in; + rewindPos = in.getFilePointer(); + numFollowFloorBlocks = in.readVInt(); + nextFloorLabel = in.readByte() & 0xff; + floorDataPos = in.getFilePointer(); + // if (DEBUG) { + // System.out.println(" setFloorData fpOrig=" + fpOrig + " bytes=" + new + // BytesRef(source.bytes, source.offset + in.getPosition(), numBytes) + " numFollowFloorBlocks=" + // + numFollowFloorBlocks + " nextFloorLabel=" + toHex(nextFloorLabel)); + // } + } + + public int getTermBlockOrd() { + return isLeafBlock ? 
nextEnt : state.termBlockOrd; + } + + void loadNextFloorBlock() throws IOException { + // if (DEBUG) { + // System.out.println(" loadNextFloorBlock fp=" + fp + " fpEnd=" + fpEnd); + // } + assert node == null || isFloor : "node=" + node + " isFloor=" + isFloor; + fp = fpEnd; + nextEnt = -1; + loadBlock(); + } + + void prefetchBlock() throws IOException { + if (nextEnt != -1) { + // Already loaded + return; + } + + // Clone the IndexInput lazily, so that consumers + // that just pull a TermsEnum to + // seekExact(TermState) don't pay this cost: + ste.initIndexInput(); + + // TODO: Could we know the number of bytes to prefetch? + ste.in.prefetch(fp, 1); + } + + /* Does initial decode of next block of terms; this + doesn't actually decode the docFreq, totalTermFreq, + postings details (frq/prx offset, etc.) metadata; + it just loads them as byte[] blobs which are then + decoded on-demand if the metadata is ever requested + for any term in this block. This enables terms-only + intensive consumes (eg certain MTQs, respelling) to + not pay the price of decoding metadata they won't + use. */ + void loadBlock() throws IOException { + + // Clone the IndexInput lazily, so that consumers + // that just pull a TermsEnum to + // seekExact(TermState) don't pay this cost: + ste.initIndexInput(); + + if (nextEnt != -1) { + // Already loaded + return; + } + // System.out.println("blc=" + blockLoadCount); + + ste.in.seek(fp); + int code = ste.in.readVInt(); + entCount = code >>> 1; + assert entCount > 0; + isLastInFloor = (code & 1) != 0; + + assert node == null || (isLastInFloor || isFloor) + : "fp=" + fp + " node=" + node + " isFloor=" + isFloor + " isLastInFloor=" + isLastInFloor; + + // TODO: if suffixes were stored in random-access + // array structure, then we could do binary search + // instead of linear scan to find target term; eg + // we could have simple array of offsets + + final long startSuffixFP = ste.in.getFilePointer(); + // term suffixes: + final long codeL = ste.in.readVLong(); + isLeafBlock = (codeL & 0x04) != 0; + final int numSuffixBytes = (int) (codeL >>> 3); + if (suffixBytes.length < numSuffixBytes) { + suffixBytes = new byte[ArrayUtil.oversize(numSuffixBytes, 1)]; + } + try { + compressionAlg = CompressionAlgorithm.byCode((int) codeL & 0x03); + } catch (IllegalArgumentException e) { + throw new CorruptIndexException(e.getMessage(), ste.in, e); + } + compressionAlg.read(ste.in, suffixBytes, numSuffixBytes); + suffixesReader.reset(suffixBytes, 0, numSuffixBytes); + + int numSuffixLengthBytes = ste.in.readVInt(); + allEqual = (numSuffixLengthBytes & 0x01) != 0; + numSuffixLengthBytes >>>= 1; + if (suffixLengthBytes.length < numSuffixLengthBytes) { + suffixLengthBytes = new byte[ArrayUtil.oversize(numSuffixLengthBytes, 1)]; + } + if (allEqual) { + Arrays.fill(suffixLengthBytes, 0, numSuffixLengthBytes, ste.in.readByte()); + } else { + ste.in.readBytes(suffixLengthBytes, 0, numSuffixLengthBytes); + } + suffixLengthsReader.reset(suffixLengthBytes, 0, numSuffixLengthBytes); + totalSuffixBytes = ste.in.getFilePointer() - startSuffixFP; + + /*if (DEBUG) { + if (node == null) { + System.out.println(" loadBlock (next) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock); + } else { + System.out.println(" loadBlock (seek) fp=" + fp + " entCount=" + entCount + " prefixLen=" + prefix + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " isLastInFloor=" + isLastInFloor + " leaf?=" + isLeafBlock); + } + }*/ + + // stats + int 
numBytes = ste.in.readVInt(); + if (statBytes.length < numBytes) { + statBytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ste.in.readBytes(statBytes, 0, numBytes); + statsReader.reset(statBytes, 0, numBytes); + statsSingletonRunLength = 0; + metaDataUpto = 0; + + state.termBlockOrd = 0; + nextEnt = 0; + lastSubFP = -1; + + // TODO: we could skip this if !hasTerms; but + // that's rare so won't help much + // metadata + numBytes = ste.in.readVInt(); + if (bytes.length < numBytes) { + bytes = new byte[ArrayUtil.oversize(numBytes, 1)]; + } + ste.in.readBytes(bytes, 0, numBytes); + bytesReader.reset(bytes, 0, numBytes); + + // Sub-blocks of a single floor block are always + // written one after another -- tail recurse: + fpEnd = ste.in.getFilePointer(); + // if (DEBUG) { + // System.out.println(" fpEnd=" + fpEnd); + // } + } + + void rewind() throws IOException { + + // Force reload: + fp = fpOrig; + nextEnt = -1; + hasTerms = hasTermsOrig; + if (isFloor) { + floorDataReader.seek(rewindPos); + numFollowFloorBlocks = floorDataReader.readVInt(); + assert numFollowFloorBlocks > 0; + nextFloorLabel = floorDataReader.readByte() & 0xff; + floorDataPos = floorDataReader.getFilePointer(); + } + + /* + //System.out.println("rewind"); + // Keeps the block loaded, but rewinds its state: + if (nextEnt > 0 || fp != fpOrig) { + if (DEBUG) { + System.out.println(" rewind frame ord=" + ord + " fpOrig=" + fpOrig + " fp=" + fp + " hasTerms?=" + hasTerms + " isFloor?=" + isFloor + " nextEnt=" + nextEnt + " prefixLen=" + prefix); + } + if (fp != fpOrig) { + fp = fpOrig; + nextEnt = -1; + } else { + nextEnt = 0; + } + hasTerms = hasTermsOrig; + if (isFloor) { + floorDataReader.rewind(); + numFollowFloorBlocks = floorDataReader.readVInt(); + nextFloorLabel = floorDataReader.readByte() & 0xff; + } + assert suffixBytes != null; + suffixesReader.rewind(); + assert statBytes != null; + statsReader.rewind(); + metaDataUpto = 0; + state.termBlockOrd = 0; + // TODO: skip this if !hasTerms? 
Then postings + // impl wouldn't have to write useless 0 byte + postingsReader.resetTermsBlock(fieldInfo, state); + lastSubFP = -1; + } else if (DEBUG) { + System.out.println(" skip rewind fp=" + fp + " fpOrig=" + fpOrig + " nextEnt=" + nextEnt + " ord=" + ord); + } + */ + } + + // Decodes next entry; returns true if it's a sub-block + public boolean next() throws IOException { + if (isLeafBlock) { + nextLeaf(); + return false; + } else { + return nextNonLeaf(); + } + } + + public void nextLeaf() { + // if (DEBUG) System.out.println(" frame.next ord=" + ord + " nextEnt=" + nextEnt + + // " entCount=" + entCount); + assert nextEnt != -1 && nextEnt < entCount + : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + suffixLength = suffixLengthsReader.readVInt(); + startBytePos = suffixesReader.getPosition(); + ste.term.setLength(prefixLength + suffixLength); + ste.term.grow(ste.term.length()); + suffixesReader.readBytes(ste.term.bytes(), prefixLength, suffixLength); + ste.termExists = true; + } + + public boolean nextNonLeaf() throws IOException { + // if (DEBUG) System.out.println(" stef.next ord=" + ord + " nextEnt=" + nextEnt + " entCount=" + // + entCount + " fp=" + suffixesReader.getPosition()); + while (true) { + if (nextEnt == entCount) { + assert node == null || (isFloor && isLastInFloor == false) + : "isFloor=" + isFloor + " isLastInFloor=" + isLastInFloor; + loadNextFloorBlock(); + if (isLeafBlock) { + nextLeaf(); + return false; + } else { + continue; + } + } + + assert nextEnt != -1 && nextEnt < entCount + : "nextEnt=" + nextEnt + " entCount=" + entCount + " fp=" + fp; + nextEnt++; + final int code = suffixLengthsReader.readVInt(); + suffixLength = code >>> 1; + startBytePos = suffixesReader.getPosition(); + ste.term.setLength(prefixLength + suffixLength); + ste.term.grow(ste.term.length()); + suffixesReader.readBytes(ste.term.bytes(), prefixLength, suffixLength); + if ((code & 1) == 0) { + // A normal term + ste.termExists = true; + subCode = 0; + state.termBlockOrd++; + return false; + } else { + // A sub-block; make sub-FP absolute: + ste.termExists = false; + subCode = suffixLengthsReader.readVLong(); + lastSubFP = fp - subCode; + // if (DEBUG) { + // System.out.println(" lastSubFP=" + lastSubFP); + // } + return true; + } + } + } + + // TODO: make this array'd so we can do bin search? + // likely not worth it? 
need to measure how many + // floor blocks we "typically" get + public void scanToFloorFrame(BytesRef target) throws IOException { + + if (!isFloor || target.length <= prefixLength) { + // if (DEBUG) { + // System.out.println(" scanToFloorFrame skip: isFloor=" + isFloor + " target.length=" + + // target.length + " vs prefix=" + prefix); + // } + return; + } + + final int targetLabel = target.bytes[target.offset + prefixLength] & 0xFF; + + // if (DEBUG) { + // System.out.println(" scanToFloorFrame fpOrig=" + fpOrig + " targetLabel=" + + // toHex(targetLabel) + " vs nextFloorLabel=" + toHex(nextFloorLabel) + " numFollowFloorBlocks=" + // + numFollowFloorBlocks); + // } + + if (targetLabel < nextFloorLabel) { + // if (DEBUG) { + // System.out.println(" already on correct block"); + // } + return; + } + + assert numFollowFloorBlocks != 0; + + long newFP = fpOrig; + floorDataReader.seek(floorDataPos); + while (true) { + final long code = floorDataReader.readVLong(); + newFP = fpOrig + (code >>> 1); + hasTerms = (code & 1) != 0; + // if (DEBUG) { + // System.out.println(" label=" + toHex(nextFloorLabel) + " fp=" + newFP + + // " hasTerms?=" + hasTerms + " numFollowFloor=" + numFollowFloorBlocks); + // } + + isLastInFloor = numFollowFloorBlocks == 1; + numFollowFloorBlocks--; + + if (isLastInFloor) { + nextFloorLabel = 256; + // if (DEBUG) { + // System.out.println(" stop! last block nextFloorLabel=" + + // toHex(nextFloorLabel)); + // } + break; + } else { + nextFloorLabel = floorDataReader.readByte() & 0xff; + if (targetLabel < nextFloorLabel) { + // if (DEBUG) { + // System.out.println(" stop! nextFloorLabel=" + toHex(nextFloorLabel)); + // } + break; + } + } + } + floorDataPos = floorDataReader.getFilePointer(); + if (newFP != fp) { + // Force re-load of the block: + // if (DEBUG) { + // System.out.println(" force switch to fp=" + newFP + " oldFP=" + fp); + // } + nextEnt = -1; + fp = newFP; + } else { + // if (DEBUG) { + // System.out.println(" stay on same fp=" + newFP); + // } + } + } + + public void decodeMetaData() throws IOException { + + // if (DEBUG) System.out.println("\nBTTR.decodeMetadata seg=" + segment + " mdUpto=" + + // metaDataUpto + " vs termBlockOrd=" + state.termBlockOrd); + + // lazily catch up on metadata decode: + final int limit = getTermBlockOrd(); + boolean absolute = metaDataUpto == 0; + assert limit > 0; + + // TODO: better API would be "jump straight to term=N"??? 
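+ // Encoding sketch, as decoded by the loop below: a stats token with its low bit set marks + // the current term as a singleton (docFreq == totalTermFreq == 1) and the next (token >>> 1) + // terms as singletons too; otherwise docFreq = token >>> 1 and, unless the field is DOCS-only, + // totalTermFreq = docFreq plus a following vLong delta.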
+ while (metaDataUpto < limit) { + + // TODO: we could make "tiers" of metadata, ie, + // decode docFreq/totalTF but don't decode postings + // metadata; this way caller could get + // docFreq/totalTF w/o paying decode cost for + // postings + + // TODO: if docFreq were bulk decoded we could + // just skipN here: + if (statsSingletonRunLength > 0) { + state.docFreq = 1; + state.totalTermFreq = 1; + statsSingletonRunLength--; + } else { + int token = statsReader.readVInt(); + if ((token & 1) == 1) { + state.docFreq = 1; + state.totalTermFreq = 1; + statsSingletonRunLength = token >>> 1; + } else { + state.docFreq = token >>> 1; + if (ste.fr.fieldInfo.getIndexOptions() == IndexOptions.DOCS) { + state.totalTermFreq = state.docFreq; + } else { + state.totalTermFreq = state.docFreq + statsReader.readVLong(); + } + } + } + + // metadata + ste.fr.parent.postingsReader.decodeTerm(bytesReader, ste.fr.fieldInfo, state, absolute); + + metaDataUpto++; + absolute = false; + } + state.termBlockOrd = metaDataUpto; + } + + // Used only by assert + private boolean prefixMatches(BytesRef target) { + for (int bytePos = 0; bytePos < prefixLength; bytePos++) { + if (target.bytes[target.offset + bytePos] != ste.term.byteAt(bytePos)) { + return false; + } + } + + return true; + } + + // Scans to sub-block that has this target fp; only + // called by next(); NOTE: does not set + // startBytePos/suffix as a side effect + public void scanToSubBlock(long subFP) { + assert !isLeafBlock; + // if (DEBUG) System.out.println(" scanToSubBlock fp=" + fp + " subFP=" + subFP + " entCount=" + // + entCount + " lastSubFP=" + lastSubFP); + // assert nextEnt == 0; + if (lastSubFP == subFP) { + // if (DEBUG) System.out.println(" already positioned"); + return; + } + assert subFP < fp : "fp=" + fp + " subFP=" + subFP; + final long targetSubCode = fp - subFP; + // if (DEBUG) System.out.println(" targetSubCode=" + targetSubCode); + while (true) { + assert nextEnt < entCount; + nextEnt++; + final int code = suffixLengthsReader.readVInt(); + suffixesReader.skipBytes(code >>> 1); + if ((code & 1) != 0) { + final long subCode = suffixLengthsReader.readVLong(); + if (targetSubCode == subCode) { + // if (DEBUG) System.out.println(" match!"); + lastSubFP = subFP; + return; + } + } else { + state.termBlockOrd++; + } + } + } + + // NOTE: sets startBytePos/suffix as a side effect + public SeekStatus scanToTerm(BytesRef target, boolean exactOnly) throws IOException { + if (isLeafBlock) { + if (allEqual) { + return binarySearchTermLeaf(target, exactOnly); + } else { + return scanToTermLeaf(target, exactOnly); + } + } else { + return scanToTermNonLeaf(target, exactOnly); + } + } + + private int startBytePos; + private int suffixLength; + private long subCode; + CompressionAlgorithm compressionAlg = CompressionAlgorithm.NO_COMPRESSION; + + // Target's prefix matches this block's prefix; we + // scan the entries to check if the suffix matches. 
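+ // Returns FOUND on an exact suffix match, NOT_FOUND as soon as an entry sorts after the + // target, and END if the whole block is scanned without reaching the target.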
+ public SeekStatus scanToTermLeaf(BytesRef target, boolean exactOnly) throws IOException { + + // if (DEBUG) System.out.println(" scanToTermLeaf: block fp=" + fp + " prefix=" + prefix + + // " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + + // ToStringUtils.bytesRefToString(target) + + // " term=" + ToStringUtils.bytesRefToString(term)); + + assert nextEnt != -1; + + ste.termExists = true; + subCode = 0; + + if (nextEnt == entCount) { + if (exactOnly) { + fillTerm(); + } + return SeekStatus.END; + } + + assert prefixMatches(target); + + // Loop over each entry (term or sub-block) in this block: + do { + nextEnt++; + + suffixLength = suffixLengthsReader.readVInt(); + + // if (DEBUG) { + // BytesRef suffixBytesRef = new BytesRef(); + // suffixBytesRef.bytes = suffixBytes; + // suffixBytesRef.offset = suffixesReader.getPosition(); + // suffixBytesRef.length = suffix; + // System.out.println(" cycle: term " + (nextEnt-1) + " (of " + entCount + ") suffix=" + // + ToStringUtils.bytesRefToString(suffixBytesRef)); + // } + + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffixLength); + + // Compare suffix and target. + final int cmp = + Arrays.compareUnsigned( + suffixBytes, + startBytePos, + startBytePos + suffixLength, + target.bytes, + target.offset + prefixLength, + target.offset + target.length); + + if (cmp < 0) { + // Current entry is still before the target; + // keep scanning + } else if (cmp > 0) { + // Done! Current entry is after target -- + // return NOT_FOUND: + fillTerm(); + + // if (DEBUG) System.out.println(" not found"); + return SeekStatus.NOT_FOUND; + } else { + // Exact match! + + // This cannot be a sub-block because we + // would have followed the index to this + // sub-block from the start: + + fillTerm(); + // if (DEBUG) System.out.println(" found!"); + return SeekStatus.FOUND; + } + } while (nextEnt < entCount); + + // It is possible (and OK) that terms index pointed us + // at this block, but, we scanned the entire block and + // did not find the term to position to. This happens + // when the target is after the last term in the block + // (but, before the next term in the index). EG + // target could be foozzz, and terms index pointed us + // to the foo* block, but the last term in this block + // was fooz (and, eg, first term in the next block will + // bee fop). + // if (DEBUG) System.out.println(" block end"); + if (exactOnly) { + fillTerm(); + } + + // TODO: not consistent that in the + // not-exact case we don't next() into the next + // frame here + return SeekStatus.END; + } + + // Target's prefix matches this block's prefix; + // And all suffixes have the same length in this block, + // we binary search the entries to check if the suffix matches. + public SeekStatus binarySearchTermLeaf(BytesRef target, boolean exactOnly) throws IOException { + // if (DEBUG) System.out.println(" binarySearchTermLeaf: block fp=" + fp + " prefix=" + + // prefix + " + // nextEnt=" + nextEnt + " (of " + entCount + ") target=" + brToString(target) + " term=" + + // brToString(term)); + + assert nextEnt != -1; + + ste.termExists = true; + subCode = 0; + + if (nextEnt == entCount) { + if (exactOnly) { + fillTerm(); + } + return SeekStatus.END; + } + + assert prefixMatches(target); + + suffixLength = suffixLengthsReader.readVInt(); + // TODO early terminate when target length unequals suffix + prefix. + // But we need to keep the same status with scanToTermLeaf. 
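+ // All suffixes in this block have the same length (allEqual), so entry i starts at byte + // offset i * suffixLength and the suffix bytes can be binary searched directly.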
+ int start = nextEnt; + int end = entCount - 1; + // Binary search the entries (terms) in this leaf block: + int cmp = 0; + while (start <= end) { + int mid = (start + end) >>> 1; + nextEnt = mid + 1; + startBytePos = mid * suffixLength; + + // Compare suffix and target. + cmp = + Arrays.compareUnsigned( + suffixBytes, + startBytePos, + startBytePos + suffixLength, + target.bytes, + target.offset + prefixLength, + target.offset + target.length); + if (cmp < 0) { + start = mid + 1; + } else if (cmp > 0) { + end = mid - 1; + } else { + // Exact match! + suffixesReader.setPosition(startBytePos + suffixLength); + fillTerm(); + // if (DEBUG) System.out.println(" found!"); + return SeekStatus.FOUND; + } + } + + // It is possible (and OK) that terms index pointed us + // at this block, but, we searched the entire block and + // did not find the term to position to. This happens + // when the target is after the last term in the block + // (but, before the next term in the index). EG + // target could be foozzz, and terms index pointed us + // to the foo* block, but the last term in this block + // was fooz (and, eg, first term in the next block will + // bee fop). + // if (DEBUG) System.out.println(" block end"); + SeekStatus seekStatus; + if (end < entCount - 1) { + seekStatus = SeekStatus.NOT_FOUND; + // If binary search ended at the less term, and greater term exists. + // We need to advance to the greater term. + if (cmp < 0) { + startBytePos += suffixLength; + nextEnt++; + } + suffixesReader.setPosition(startBytePos + suffixLength); + fillTerm(); + } else { + seekStatus = SeekStatus.END; + suffixesReader.setPosition(startBytePos + suffixLength); + if (exactOnly) { + fillTerm(); + } + } + // TODO: not consistent that in the + // not-exact case we don't next() into the next + // frame here + return seekStatus; + } + + // Target's prefix matches this block's prefix; we + // scan the entries to check if the suffix matches. + public SeekStatus scanToTermNonLeaf(BytesRef target, boolean exactOnly) throws IOException { + + // if (DEBUG) System.out.println(" scanToTermNonLeaf: block fp=" + fp + " prefix=" + prefix + + // " nextEnt=" + nextEnt + " (of " + entCount + ") target=" + + // ToStringUtils.bytesRefToString(target) + + // " term=" + ToStringUtils.bytesRefToString(term)); + + assert nextEnt != -1; + + if (nextEnt == entCount) { + if (exactOnly) { + fillTerm(); + ste.termExists = subCode == 0; + } + return SeekStatus.END; + } + + assert prefixMatches(target); + + // Loop over each entry (term or sub-block) in this block: + while (nextEnt < entCount) { + + nextEnt++; + + final int code = suffixLengthsReader.readVInt(); + suffixLength = code >>> 1; + + // if (DEBUG) { + // BytesRef suffixBytesRef = new BytesRef(); + // suffixBytesRef.bytes = suffixBytes; + // suffixBytesRef.offset = suffixesReader.getPosition(); + // suffixBytesRef.length = suffix; + // System.out.println(" cycle: " + ((code&1)==1 ? "sub-block" : "term") + " " + + // (nextEnt-1) + " (of " + entCount + ") suffix=" + + // ToStringUtils.bytesRefToString(suffixBytesRef)); + // } + + startBytePos = suffixesReader.getPosition(); + suffixesReader.skipBytes(suffixLength); + ste.termExists = (code & 1) == 0; + if (ste.termExists) { + state.termBlockOrd++; + subCode = 0; + } else { + subCode = suffixLengthsReader.readVLong(); + lastSubFP = fp - subCode; + } + + // Compare suffix and target. 
+ final int cmp = + Arrays.compareUnsigned( + suffixBytes, + startBytePos, + startBytePos + suffixLength, + target.bytes, + target.offset + prefixLength, + target.offset + target.length); + + if (cmp < 0) { + // Current entry is still before the target; + // keep scanning + } else if (cmp > 0) { + // Done! Current entry is after target -- + // return NOT_FOUND: + fillTerm(); + + // if (DEBUG) System.out.println(" maybe done exactOnly=" + exactOnly + + // " ste.termExists=" + ste.termExists); + + if (!exactOnly && !ste.termExists) { + // System.out.println(" now pushFrame"); + // TODO this + // We are on a sub-block, and caller wants + // us to position to the next term after + // the target, so we must recurse into the + // sub-frame(s): + ste.currentFrame = + ste.pushFrame(null, ste.currentFrame.lastSubFP, prefixLength + suffixLength); + ste.currentFrame.loadBlock(); + while (ste.currentFrame.next()) { + ste.currentFrame = ste.pushFrame(null, ste.currentFrame.lastSubFP, ste.term.length()); + ste.currentFrame.loadBlock(); + } + } + + // if (DEBUG) System.out.println(" not found"); + return SeekStatus.NOT_FOUND; + } else { + // Exact match! + + // This cannot be a sub-block because we + // would have followed the index to this + // sub-block from the start: + + assert ste.termExists; + fillTerm(); + // if (DEBUG) System.out.println(" found!"); + return SeekStatus.FOUND; + } + } + + // It is possible (and OK) that terms index pointed us + // at this block, but, we scanned the entire block and + // did not find the term to position to. This happens + // when the target is after the last term in the block + // (but, before the next term in the index). EG + // target could be foozzz, and terms index pointed us + // to the foo* block, but the last term in this block + // was fooz (and, eg, first term in the next block will + // bee fop). + // if (DEBUG) System.out.println(" block end"); + if (exactOnly) { + fillTerm(); + } + + // TODO: not consistent that in the + // not-exact case we don't next() into the next + // frame here + return SeekStatus.END; + } + + private void fillTerm() { + final int termLength = prefixLength + suffixLength; + ste.term.setLength(termLength); + ste.term.grow(termLength); + System.arraycopy(suffixBytes, startBytePos, ste.term.bytes(), prefixLength, suffixLength); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/Stats.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/Stats.java new file mode 100644 index 000000000000..7d754a519ce8 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/Stats.java @@ -0,0 +1,276 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene103.blocktree; + +import static java.nio.charset.StandardCharsets.UTF_8; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.util.Locale; +import org.apache.lucene.codecs.PostingsReaderBase; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.BytesRef; + +/** + * BlockTree statistics for a single field returned by {@link FieldReader#getStats()}. + * + * @lucene.internal + */ +public class Stats { + /** Byte size of the index. */ + public long indexNumBytes; + + /** Total number of terms in the field. */ + public long totalTermCount; + + /** Total number of bytes (sum of term lengths) across all terms in the field. */ + public long totalTermBytes; + + /** The number of normal (non-floor) blocks in the terms file. */ + public int nonFloorBlockCount; + + /** + * The number of floor blocks (meta-blocks larger than the allowed {@code maxItemsPerBlock}) in + * the terms file. + */ + public int floorBlockCount; + + /** The number of sub-blocks within the floor blocks. */ + public int floorSubBlockCount; + + /** The number of "internal" blocks (that have both terms and sub-blocks). */ + public int mixedBlockCount; + + /** The number of "leaf" blocks (blocks that have only terms). */ + public int termsOnlyBlockCount; + + /** The number of "internal" blocks that do not contain terms (have only sub-blocks). */ + public int subBlocksOnlyBlockCount; + + /** Total number of blocks. */ + public int totalBlockCount; + + /** Number of blocks at each prefix depth. */ + public int[] blockCountByPrefixLen = new int[10]; + + private int startBlockCount; + private int endBlockCount; + + /** Total number of bytes used to store term suffixes. */ + public long totalBlockSuffixBytes; + + /** + * Number of times each compression method has been used. 0 = uncompressed 1 = lowercase_ascii 2 = + * LZ4 + */ + public final long[] compressionAlgorithms = new long[3]; + + /** Total number of suffix bytes before compression. */ + public long totalUncompressedBlockSuffixBytes; + + /** + * Total number of bytes used to store term stats (not including what the {@link + * PostingsReaderBase} stores). + */ + public long totalBlockStatsBytes; + + /** + * Total bytes stored by the {@link PostingsReaderBase}, plus the other few vInts stored in the + * frame. + */ + public long totalBlockOtherBytes; + + /** Segment name. */ + public final String segment; + + /** Field name. */ + public final String field; + + Stats(String segment, String field) { + this.segment = segment; + this.field = field; + } + + void startBlock(SegmentTermsEnumFrame frame, boolean isFloor) { + totalBlockCount++; + if (isFloor) { + if (frame.fp == frame.fpOrig) { + floorBlockCount++; + } + floorSubBlockCount++; + } else { + nonFloorBlockCount++; + } + + if (blockCountByPrefixLen.length <= frame.prefixLength) { + blockCountByPrefixLen = ArrayUtil.grow(blockCountByPrefixLen, 1 + frame.prefixLength); + } + blockCountByPrefixLen[frame.prefixLength]++; + startBlockCount++; + totalBlockSuffixBytes += frame.totalSuffixBytes; + totalUncompressedBlockSuffixBytes += frame.suffixesReader.length(); + if (frame.suffixesReader != frame.suffixLengthsReader) { + totalUncompressedBlockSuffixBytes += frame.suffixLengthsReader.length(); + } + totalBlockStatsBytes += frame.statsReader.length(); + compressionAlgorithms[frame.compressionAlg.code]++; + } + + void endBlock(SegmentTermsEnumFrame frame) { + final int termCount = frame.isLeafBlock ? 
frame.entCount : frame.state.termBlockOrd; + final int subBlockCount = frame.entCount - termCount; + totalTermCount += termCount; + if (termCount != 0 && subBlockCount != 0) { + mixedBlockCount++; + } else if (termCount != 0) { + termsOnlyBlockCount++; + } else if (subBlockCount != 0) { + subBlocksOnlyBlockCount++; + } else { + throw new IllegalStateException(); + } + endBlockCount++; + final long otherBytes = + frame.fpEnd - frame.fp - frame.totalSuffixBytes - frame.statsReader.length(); + assert otherBytes > 0 + : "otherBytes=" + otherBytes + " frame.fp=" + frame.fp + " frame.fpEnd=" + frame.fpEnd; + totalBlockOtherBytes += otherBytes; + } + + void term(BytesRef term) { + totalTermBytes += term.length; + } + + void finish() { + assert startBlockCount == endBlockCount + : "startBlockCount=" + startBlockCount + " endBlockCount=" + endBlockCount; + assert totalBlockCount == floorSubBlockCount + nonFloorBlockCount + : "floorSubBlockCount=" + + floorSubBlockCount + + " nonFloorBlockCount=" + + nonFloorBlockCount + + " totalBlockCount=" + + totalBlockCount; + assert totalBlockCount == mixedBlockCount + termsOnlyBlockCount + subBlocksOnlyBlockCount + : "totalBlockCount=" + + totalBlockCount + + " mixedBlockCount=" + + mixedBlockCount + + " subBlocksOnlyBlockCount=" + + subBlocksOnlyBlockCount + + " termsOnlyBlockCount=" + + termsOnlyBlockCount; + } + + @Override + public String toString() { + final ByteArrayOutputStream bos = new ByteArrayOutputStream(1024); + PrintStream out = new PrintStream(bos, false, UTF_8); + + out.println(" index trie:"); + out.println(" " + indexNumBytes + " bytes"); + out.println(" terms:"); + out.println(" " + totalTermCount + " terms"); + out.println( + " " + + totalTermBytes + + " bytes" + + (totalTermCount != 0 + ? " (" + + String.format(Locale.ROOT, "%.1f", ((double) totalTermBytes) / totalTermCount) + + " bytes/term)" + : "")); + out.println(" blocks:"); + out.println(" " + totalBlockCount + " blocks"); + out.println(" " + termsOnlyBlockCount + " terms-only blocks"); + out.println(" " + subBlocksOnlyBlockCount + " sub-block-only blocks"); + out.println(" " + mixedBlockCount + " mixed blocks"); + out.println(" " + floorBlockCount + " floor blocks"); + out.println(" " + (totalBlockCount - floorSubBlockCount) + " non-floor blocks"); + out.println(" " + floorSubBlockCount + " floor sub-blocks"); + out.println( + " " + + totalUncompressedBlockSuffixBytes + + " term suffix bytes before compression" + + (totalBlockCount != 0 + ? " (" + + String.format( + Locale.ROOT, "%.1f", ((double) totalBlockSuffixBytes) / totalBlockCount) + + " suffix-bytes/block)" + : "")); + StringBuilder compressionCounts = new StringBuilder(); + for (int code = 0; code < compressionAlgorithms.length; ++code) { + if (compressionAlgorithms[code] == 0) { + continue; + } + if (compressionCounts.length() > 0) { + compressionCounts.append(", "); + } + compressionCounts.append(CompressionAlgorithm.byCode(code)); + compressionCounts.append(": "); + compressionCounts.append(compressionAlgorithms[code]); + } + out.println( + " " + + totalBlockSuffixBytes + + " compressed term suffix bytes" + + (totalBlockCount != 0 + ? " (" + + String.format( + Locale.ROOT, + "%.2f", + ((double) totalBlockSuffixBytes) / totalUncompressedBlockSuffixBytes) + + " compression ratio - compression count by algorithm: " + + compressionCounts + : "") + + ")"); + out.println( + " " + + totalBlockStatsBytes + + " term stats bytes " + + (totalBlockCount != 0 + ? 
" (" + + String.format( + Locale.ROOT, "%.1f", ((double) totalBlockStatsBytes) / totalBlockCount) + + " stats-bytes/block)" + : "")); + out.println( + " " + + totalBlockOtherBytes + + " other bytes" + + (totalBlockCount != 0 + ? " (" + + String.format( + Locale.ROOT, "%.1f", ((double) totalBlockOtherBytes) / totalBlockCount) + + " other-bytes/block)" + : "")); + if (totalBlockCount != 0) { + out.println(" by prefix length:"); + int total = 0; + for (int prefix = 0; prefix < blockCountByPrefixLen.length; prefix++) { + final int blockCount = blockCountByPrefixLen[prefix]; + total += blockCount; + if (blockCount != 0) { + out.println(" " + String.format(Locale.ROOT, "%2d", prefix) + ": " + blockCount); + } + } + assert totalBlockCount == total; + } + + return bos.toString(UTF_8); + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieBuilder.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieBuilder.java new file mode 100644 index 000000000000..63215d18e6e8 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieBuilder.java @@ -0,0 +1,641 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene103.blocktree; + +import java.io.IOException; +import java.util.ArrayDeque; +import java.util.Arrays; +import java.util.Deque; +import java.util.function.BiConsumer; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.RandomAccessInput; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.BytesRefBuilder; + +/** + * A builder to build prefix tree (trie) as the index of block tree, and can be saved to disk. + * + *

    TODO make this trie builder a more memory efficient structure. + */ +class TrieBuilder { + + static final int SIGN_NO_CHILDREN = 0x00; + static final int SIGN_SINGLE_CHILD_WITH_OUTPUT = 0x01; + static final int SIGN_SINGLE_CHILD_WITHOUT_OUTPUT = 0x02; + static final int SIGN_MULTI_CHILDREN = 0x03; + + static final int LEAF_NODE_HAS_TERMS = 1 << 5; + static final int LEAF_NODE_HAS_FLOOR = 1 << 6; + static final long NON_LEAF_NODE_HAS_TERMS = 1L << 1; + static final long NON_LEAF_NODE_HAS_FLOOR = 1L << 0; + + /** + * The output describing the term block the prefix point to. + * + * @param fp the file pointer to the on-disk terms block which a trie node points to. + * @param hasTerms false if this on-disk block consists entirely of pointers to child blocks. + * @param floorData will be non-null when a large block of terms sharing a single trie prefix is + * split into multiple on-disk blocks. + */ + record Output(long fp, boolean hasTerms, BytesRef floorData) {} + + private enum Status { + BUILDING, + SAVED, + DESTROYED + } + + private static class Node { + + // The utf8 digit that leads to this Node, 0 for root node + private final int label; + // The output of this node. + private Output output; + // The number of children of this node. + private int childrenNum; + // Pointers to relative nodes + private Node next; + private Node firstChild; + private Node lastChild; + + // Vars used during saving: + + // The file pointer point to where the node saved. -1 means the node has not been saved. + private long fp = -1; + // The latest child that have been saved. null means no child has been saved. + private Node savedTo; + + Node(int label, Output output) { + this.label = label; + this.output = output; + } + } + + private final Node root = new Node(0, null); + private final BytesRef minKey; + private BytesRef maxKey; + private Status status = Status.BUILDING; + + static TrieBuilder bytesRefToTrie(BytesRef k, Output v) { + return new TrieBuilder(k, v); + } + + private TrieBuilder(BytesRef k, Output v) { + minKey = maxKey = BytesRef.deepCopyOf(k); + if (k.length == 0) { + root.output = v; + return; + } + Node parent = root; + for (int i = 0; i < k.length; i++) { + int b = k.bytes[i + k.offset] & 0xFF; + Output output = i == k.length - 1 ? v : null; + Node node = new Node(b, output); + parent.firstChild = parent.lastChild = node; + parent.childrenNum = 1; + parent = node; + } + } + + /** + * Append all (K, V) pairs from the given trie into this one. The given trie builder need to + * ensure its keys greater or equals than max key of this one. + * + *
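Editor's note, as a quick orientation before the append() details below: each TrieBuilder starts from exactly one (key, output) pair, and larger tries are stitched together by appending builders in key order. A hypothetical usage fragment (it would have to live in the same package since the classes are package-private; the keys and file pointers are invented for illustration):

// Hypothetical test-style usage; keys must be appended in strictly increasing order.
static void buildSmallTrie() {
  TrieBuilder trie =
      TrieBuilder.bytesRefToTrie(new BytesRef("foo"), new TrieBuilder.Output(0L, true, null));
  trie.append(
      TrieBuilder.bytesRefToTrie(new BytesRef("foobar"), new TrieBuilder.Output(128L, true, null)));
  trie.append(
      TrieBuilder.bytesRefToTrie(new BytesRef("fop"), new TrieBuilder.Output(256L, false, null)));
  // visit() is test-only per its javadoc; it walks the (key, output) pairs in key order.
  trie.visit((key, output) -> System.out.println(key.utf8ToString() + " -> fp=" + output.fp()));
}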

    Note: the given trie will be destroyed after appending. + */ + void append(TrieBuilder trieBuilder) { + if (status != Status.BUILDING || trieBuilder.status != Status.BUILDING) { + throw new IllegalStateException( + "tries have wrong status, got this: " + status + ", append: " + trieBuilder.status); + } + assert this.maxKey.compareTo(trieBuilder.minKey) < 0; + + int mismatch = + Arrays.mismatch( + this.maxKey.bytes, + this.maxKey.offset, + this.maxKey.offset + this.maxKey.length, + trieBuilder.minKey.bytes, + trieBuilder.minKey.offset, + trieBuilder.minKey.offset + trieBuilder.minKey.length); + Node a = this.root; + Node b = trieBuilder.root; + + for (int i = 0; i < mismatch; i++) { + final Node aLast = a.lastChild; + final Node bFirst = b.firstChild; + assert aLast.label == bFirst.label; + + if (b.childrenNum > 1) { + aLast.next = bFirst.next; + a.childrenNum += b.childrenNum - 1; + a.lastChild = b.lastChild; + assert assertChildrenLabelInOrder(a); + } + + a = aLast; + b = bFirst; + } + + assert b.childrenNum > 0; + if (a.childrenNum == 0) { + a.firstChild = b.firstChild; + a.lastChild = b.lastChild; + a.childrenNum = b.childrenNum; + } else { + assert a.lastChild.label < b.firstChild.label; + a.lastChild.next = b.firstChild; + a.lastChild = b.lastChild; + a.childrenNum += b.childrenNum; + } + assert assertChildrenLabelInOrder(a); + + this.maxKey = trieBuilder.maxKey; + trieBuilder.status = Status.DESTROYED; + } + + Output getEmptyOutput() { + return root.output; + } + + /** + * Used for tests only. The recursive impl need to be avoided if someone plans to use for + * production one day. + */ + void visit(BiConsumer consumer) { + assert status == Status.BUILDING; + if (root.output != null) { + consumer.accept(new BytesRef(), root.output); + } + visit(root.firstChild, new BytesRefBuilder(), consumer); + } + + private void visit(Node first, BytesRefBuilder key, BiConsumer consumer) { + while (first != null) { + key.append((byte) first.label); + if (first.output != null) { + consumer.accept(key.toBytesRef(), first.output); + } + visit(first.firstChild, key, consumer); + key.setLength(key.length() - 1); + first = first.next; + } + } + + void save(DataOutput meta, IndexOutput index) throws IOException { + if (status != Status.BUILDING) { + throw new IllegalStateException("only unsaved trie can be saved, got: " + status); + } + meta.writeVLong(index.getFilePointer()); + saveNodes(index); + meta.writeVLong(root.fp); + index.writeLong(0L); // additional 8 bytes for over-reading + meta.writeVLong(index.getFilePointer()); + status = Status.SAVED; + } + + void saveNodes(IndexOutput index) throws IOException { + final long startFP = index.getFilePointer(); + Deque stack = new ArrayDeque<>(); + stack.push(root); + + // Visit and save nodes of this trie in a post-order depth-first traversal. + while (stack.isEmpty() == false) { + Node node = stack.peek(); + assert node.fp == -1; + assert assertChildrenLabelInOrder(node); + + final int childrenNum = node.childrenNum; + + if (childrenNum == 0) { // leaf node + assert node.output != null : "leaf nodes should have output."; + + node.fp = index.getFilePointer() - startFP; + stack.pop(); + + // [n bytes] floor data + // [n bytes] output fp + // [1bit] x | [1bit] has floor | [1bit] has terms | [3bit] output fp bytes | [2bit] sign + + Output output = node.output; + int outputFpBytes = bytesRequiredVLong(output.fp); + int header = + SIGN_NO_CHILDREN + | ((outputFpBytes - 1) << 2) + | (output.hasTerms ? LEAF_NODE_HAS_TERMS : 0) + | (output.floorData != null ? 
LEAF_NODE_HAS_FLOOR : 0); + index.writeByte(((byte) header)); + writeLongNBytes(output.fp, outputFpBytes, index); + if (output.floorData != null) { + index.writeBytes( + output.floorData.bytes, output.floorData.offset, output.floorData.length); + } + continue; + } + + // If there are any children have not been saved, push the first one into stack and continue. + // We want to ensure saving children before parent. + + if (node.savedTo == null) { + node.savedTo = node.firstChild; + stack.push(node.savedTo); + continue; + } + if (node.savedTo.next != null) { + assert node.savedTo.fp >= 0; + node.savedTo = node.savedTo.next; + stack.push(node.savedTo); + continue; + } + + // All children have been written, now it's time to write the parent! + + assert assertNonLeafNodePreparingSaving(node); + node.fp = index.getFilePointer() - startFP; + stack.pop(); + + if (childrenNum == 1) { + + // [n bytes] floor data + // [n bytes] encoded output fp | [n bytes] child fp | [1 byte] label + // [3bit] encoded output fp bytes | [3bit] child fp bytes | [2bit] sign + + long childDeltaFp = node.fp - node.firstChild.fp; + assert childDeltaFp > 0 : "parent node is always written after children: " + childDeltaFp; + int childFpBytes = bytesRequiredVLong(childDeltaFp); + int encodedOutputFpBytes = + node.output == null ? 0 : bytesRequiredVLong(node.output.fp << 2); + + // TODO if we have only one child and no output, we can store child labels in this node. + // E.g. for a single term trie [foobar], we can save only two nodes [fooba] and [r] + + int sign = + node.output != null ? SIGN_SINGLE_CHILD_WITH_OUTPUT : SIGN_SINGLE_CHILD_WITHOUT_OUTPUT; + int header = sign | ((childFpBytes - 1) << 2) | ((encodedOutputFpBytes - 1) << 5); + index.writeByte((byte) header); + index.writeByte((byte) node.firstChild.label); + writeLongNBytes(childDeltaFp, childFpBytes, index); + + if (node.output != null) { + Output output = node.output; + long encodedFp = encodeFP(output); + writeLongNBytes(encodedFp, encodedOutputFpBytes, index); + if (output.floorData != null) { + index.writeBytes( + output.floorData.bytes, output.floorData.offset, output.floorData.length); + } + } + } else { + + // [n bytes] floor data + // [n bytes] children fps | [n bytes] strategy data + // [1 byte] children count (if floor data) | [n bytes] encoded output fp | [1 byte] label + // [5bit] strategy bytes | 2bit children strategy | [3bit] encoded output fp bytes + // [1bit] has output | [3bit] children fp bytes | [2bit] sign + + final int minLabel = node.firstChild.label; + final int maxLabel = node.lastChild.label; + assert maxLabel > minLabel; + ChildSaveStrategy childSaveStrategy = + ChildSaveStrategy.choose(minLabel, maxLabel, childrenNum); + int strategyBytes = childSaveStrategy.needBytes(minLabel, maxLabel, childrenNum); + assert strategyBytes > 0 && strategyBytes <= 32; + + // children fps are in order, so the first child's fp is min, then delta is max. + long maxChildDeltaFp = node.fp - node.firstChild.fp; + assert maxChildDeltaFp > 0 : "parent always written after all children"; + + int childrenFpBytes = bytesRequiredVLong(maxChildDeltaFp); + int encodedOutputFpBytes = + node.output == null ? 1 : bytesRequiredVLong(node.output.fp << 2); + int header = + SIGN_MULTI_CHILDREN + | ((childrenFpBytes - 1) << 2) + | ((node.output != null ? 
1 : 0) << 5) + | ((encodedOutputFpBytes - 1) << 6) + | (childSaveStrategy.code << 9) + | ((strategyBytes - 1) << 11) + | (minLabel << 16); + + writeLongNBytes(header, 3, index); + + if (node.output != null) { + Output output = node.output; + long encodedFp = encodeFP(output); + writeLongNBytes(encodedFp, encodedOutputFpBytes, index); + if (output.floorData != null) { + // We need this childrenNum to compute where the floor data start. + index.writeByte((byte) (childrenNum - 1)); + } + } + + long strategyStartFp = index.getFilePointer(); + childSaveStrategy.save(node, childrenNum, strategyBytes, index); + assert index.getFilePointer() == strategyStartFp + strategyBytes + : childSaveStrategy.name() + + " strategy bytes compute error, computed: " + + strategyBytes + + " actual: " + + (index.getFilePointer() - strategyStartFp); + + for (Node child = node.firstChild; child != null; child = child.next) { + assert node.fp > child.fp : "parent always written after all children"; + writeLongNBytes(node.fp - child.fp, childrenFpBytes, index); + } + + if (node.output != null && node.output.floorData != null) { + BytesRef floorData = node.output.floorData; + index.writeBytes(floorData.bytes, floorData.offset, floorData.length); + } + } + } + } + + private long encodeFP(Output output) { + assert output.fp < 1L << 62; + return (output.floorData != null ? NON_LEAF_NODE_HAS_FLOOR : 0) + | (output.hasTerms ? NON_LEAF_NODE_HAS_TERMS : 0) + | (output.fp << 2); + } + + private static int bytesRequiredVLong(long v) { + return Long.BYTES - (Long.numberOfLeadingZeros(v | 1) >>> 3); + } + + /** + * Write the first (LSB order) n bytes of the given long v into the DataOutput. + * + *
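Editor's note: several headers above pack a file pointer into the minimal number of bytes, so a small self-contained round-trip sketch may help. It mirrors bytesRequiredVLong/writeLongNBytes and the mask-based read that TrieReader performs later in this patch (the class and variable names here are the editor's, not from the patch):

// Encode a long LSB-first into exactly the bytes it needs, then read it back by masking a
// little-endian word, the way TrieReader masks readLong() with BYTES_MINUS_1_MASK[n - 1].
public class FixedWidthFpDemo {
  static int bytesRequired(long v) {
    return Long.BYTES - (Long.numberOfLeadingZeros(v | 1) >>> 3);
  }

  public static void main(String[] args) {
    long fp = 0x1FF; // 511 fits in two bytes
    int n = bytesRequired(fp);
    byte[] encoded = new byte[n];
    long v = fp;
    for (int i = 0; i < n; i++) { // LSB first, exactly n bytes
      encoded[i] = (byte) v;
      v >>>= 8;
    }
    // Reader side: rebuild a little-endian word from the bytes and mask off anything beyond
    // n bytes (n < 8 here, so the shift below does not overflow).
    long word = 0;
    for (int i = 0; i < n; i++) {
      word |= (encoded[i] & 0xFFL) << (8 * i);
    }
    long mask = (1L << (8 * n)) - 1;
    System.out.println((word & mask) == fp); // true
  }
}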

    This differs from writeVLong because it can write more bytes than would be needed for vLong + * when the incoming int n is larger. + */ + private static void writeLongNBytes(long v, int n, DataOutput out) throws IOException { + for (int i = 0; i < n; i++) { + // Note that we sometimes write trailing 0 bytes here, when the incoming int n is bigger than + // would be required for a "normal" vLong + out.writeByte((byte) v); + v >>>= 8; + } + assert v == 0; + } + + private static boolean assertChildrenLabelInOrder(Node node) { + if (node.childrenNum == 0) { + assert node.firstChild == null; + assert node.lastChild == null; + } else if (node.childrenNum == 1) { + assert node.firstChild == node.lastChild; + assert node.firstChild.next == null; + } else if (node.childrenNum > 1) { + int n = 0; + for (Node child = node.firstChild; child != null; child = child.next) { + n++; + assert child.next == null || child.label < child.next.label + : " the label of children nodes should always be in strictly increasing order."; + } + assert node.childrenNum == n; + } + return true; + } + + private static boolean assertNonLeafNodePreparingSaving(Node node) { + assert assertChildrenLabelInOrder(node); + assert node.childrenNum != 0; + if (node.childrenNum == 1) { + assert node.firstChild == node.lastChild; + assert node.firstChild.next == null; + assert node.savedTo == node.firstChild; + assert node.firstChild.fp >= 0; + } else { + int n = 0; + for (Node child = node.firstChild; child != null; child = child.next) { + n++; + assert child.fp >= 0; + assert child.next == null || child.fp < child.next.fp + : " the fp or children nodes should always be in order."; + } + assert node.childrenNum == n; + assert node.lastChild == node.savedTo; + assert node.savedTo.next == null; + } + return true; + } + + enum ChildSaveStrategy { + + /** + * Store children labels in a bitset, this is likely the most efficient storage as we can + * compute position with bitCount instruction, so we give it the highest priority. + */ + BITS(2) { + @Override + int needBytes(int minLabel, int maxLabel, int labelCnt) { + int byteDistance = maxLabel - minLabel + 1; + return (byteDistance + 7) >>> 3; + } + + @Override + void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output) + throws IOException { + byte presenceBits = 1; // The first arc is always present. + int presenceIndex = 0; + int previousLabel = parent.firstChild.label; + for (Node child = parent.firstChild.next; child != null; child = child.next) { + int label = child.label; + assert label > previousLabel; + presenceIndex += label - previousLabel; + while (presenceIndex >= Byte.SIZE) { + output.writeByte(presenceBits); + presenceBits = 0; + presenceIndex -= Byte.SIZE; + } + // Set the bit at presenceIndex to flag that the corresponding arc is present. + presenceBits |= 1 << presenceIndex; + previousLabel = label; + } + assert presenceIndex == (parent.lastChild.label - parent.firstChild.label) % 8; + assert presenceBits != 0; // The last byte is not 0. + assert (presenceBits & (1 << presenceIndex)) != 0; // The last arc is always present. 
+ output.writeByte(presenceBits); + } + + @Override + int lookup( + int targetLabel, RandomAccessInput in, long offset, int strategyBytes, int minLabel) + throws IOException { + int bitIndex = targetLabel - minLabel; + if (bitIndex >= (strategyBytes << 3)) { + return -1; + } + int wordIndex = bitIndex >>> 6; + long wordFp = offset + (wordIndex << 3); + long word = in.readLong(wordFp); + long mask = 1L << bitIndex; + if ((word & mask) == 0) { + return -1; + } + int pos = 0; + for (long fp = offset; fp < wordFp; fp += 8L) { + pos += Long.bitCount(in.readLong(fp)); + } + pos += Long.bitCount(word & (mask - 1)); + return pos; + } + }, + + /** + * Store labels in an array and lookup with binary search. + * + *

    TODO: Can we use VectorAPI to speed up the lookup? we can check 64 labels once on AVX512! + */ + ARRAY(1) { + @Override + int needBytes(int minLabel, int maxLabel, int labelCnt) { + return labelCnt - 1; // min label saved + } + + @Override + void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output) + throws IOException { + for (Node child = parent.firstChild.next; child != null; child = child.next) { + output.writeByte((byte) child.label); + } + } + + @Override + int lookup( + int targetLabel, RandomAccessInput in, long offset, int strategyBytes, int minLabel) + throws IOException { + int low = 0; + int high = strategyBytes - 1; + while (low <= high) { + int mid = (low + high) >>> 1; + int midLabel = in.readByte(offset + mid) & 0xFF; + if (midLabel < targetLabel) { + low = mid + 1; + } else if (midLabel > targetLabel) { + high = mid - 1; + } else { + return mid + 1; // min label not included, plus 1 + } + } + return -1; + } + }, + + /** + * Store labels that not existing within the range. E.g. store 10(max label) and 3, 5(absent + * label) for [1, 2, 4, 6, 7, 8, 9, 10]. + * + *
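Editor's note: for the label set used in the example above ([1, 2, 4, 6, 7, 8, 9, 10]) it may help to see the three needBytes() formulas evaluated side by side; this is only a worked example of how choose() would pick a strategy:

// Worked example of the three storage costs for children labels 1,2,4,6,7,8,9,10.
public class ChildStrategyCostDemo {
  public static void main(String[] args) {
    int minLabel = 1, maxLabel = 10, labelCnt = 8;
    int bits = ((maxLabel - minLabel + 1) + 7) >>> 3;            // BITS: presence bitset -> 2 bytes
    int array = labelCnt - 1;                                    // ARRAY: labels except min -> 7 bytes
    int reverseArray = (maxLabel - minLabel + 1) - labelCnt + 1; // REVERSE_ARRAY: max + absent -> 3 bytes
    System.out.println("BITS=" + bits + " ARRAY=" + array + " REVERSE_ARRAY=" + reverseArray);
    // choose() takes the cheapest, preferring BITS and then ARRAY on ties, so BITS wins here.
  }
}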

    TODO: Can we use VectorAPI to speed up the lookup? we can check 64 labels once on AVX512! + */ + REVERSE_ARRAY(0) { + + @Override + int needBytes(int minLabel, int maxLabel, int labelCnt) { + int byteDistance = maxLabel - minLabel + 1; + return byteDistance - labelCnt + 1; + } + + @Override + void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output) + throws IOException { + output.writeByte((byte) parent.lastChild.label); + int lastLabel = parent.firstChild.label; + for (Node child = parent.firstChild.next; child != null; child = child.next) { + while (++lastLabel < child.label) { + output.writeByte((byte) lastLabel); + } + } + } + + @Override + int lookup( + int targetLabel, RandomAccessInput in, long offset, int strategyBytes, int minLabel) + throws IOException { + int maxLabel = in.readByte(offset++) & 0xFF; + if (targetLabel >= maxLabel) { + return targetLabel == maxLabel ? maxLabel - minLabel - strategyBytes + 1 : -1; + } + if (strategyBytes == 1) { + return targetLabel - minLabel; + } + + int low = 0; + int high = strategyBytes - 2; + while (low <= high) { + int mid = (low + high) >>> 1; + int midLabel = in.readByte(offset + mid) & 0xFF; + if (midLabel < targetLabel) { + low = mid + 1; + } else if (midLabel > targetLabel) { + high = mid - 1; + } else { + return -1; + } + } + return targetLabel - minLabel - low; + } + }; + + private static final ChildSaveStrategy[] STRATEGIES_IN_PRIORITY_ORDER = + new ChildSaveStrategy[] {BITS, ARRAY, REVERSE_ARRAY}; + private static final ChildSaveStrategy[] STRATEGIES_BY_CODE; + + static { + STRATEGIES_BY_CODE = new ChildSaveStrategy[ChildSaveStrategy.values().length]; + for (ChildSaveStrategy strategy : ChildSaveStrategy.values()) { + assert STRATEGIES_BY_CODE[strategy.code] == null; + STRATEGIES_BY_CODE[strategy.code] = strategy; + } + } + + final int code; + + ChildSaveStrategy(int code) { + this.code = code; + } + + abstract int needBytes(int minLabel, int maxLabel, int labelCnt); + + abstract void save(Node parent, int labelCnt, int strategyBytes, IndexOutput output) + throws IOException; + + abstract int lookup( + int targetLabel, RandomAccessInput in, long offset, int strategyBytes, int minLabel) + throws IOException; + + static ChildSaveStrategy byCode(int code) { + return STRATEGIES_BY_CODE[code]; + } + + static ChildSaveStrategy choose(int minLabel, int maxLabel, int labelCnt) { + ChildSaveStrategy childSaveStrategy = null; + int strategyBytes = Integer.MAX_VALUE; + for (ChildSaveStrategy strategy : ChildSaveStrategy.STRATEGIES_IN_PRIORITY_ORDER) { + int strategyCost = strategy.needBytes(minLabel, maxLabel, labelCnt); + if (strategyCost < strategyBytes) { + childSaveStrategy = strategy; + strategyBytes = strategyCost; + } + } + assert childSaveStrategy != null; + assert strategyBytes > 0 && strategyBytes <= 32; + return childSaveStrategy; + } + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieReader.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieReader.java new file mode 100644 index 000000000000..367be262cc64 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/TrieReader.java @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene103.blocktree; + +import java.io.IOException; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.RandomAccessInput; + +class TrieReader { + + private static final long NO_OUTPUT = -1; + private static final long NO_FLOOR_DATA = -1; + private static final long[] BYTES_MINUS_1_MASK = + new long[] { + 0xFFL, + 0xFFFFL, + 0xFFFFFFL, + 0xFFFFFFFFL, + 0xFFFFFFFFFFL, + 0xFFFFFFFFFFFFL, + 0xFFFFFFFFFFFFFFL, + 0xFFFFFFFFFFFFFFFFL + }; + + static class Node { + + // single child + private long childDeltaFp; + + // multi children + private long strategyFp; + private int childSaveStrategy; + private int strategyBytes; + private int childrenDeltaFpBytes; + + // common + private int sign; + private long fp; + private int minChildrenLabel; + int label; + long outputFp; + boolean hasTerms; + long floorDataFp; + + boolean hasOutput() { + return outputFp != NO_OUTPUT; + } + + boolean isFloor() { + return floorDataFp != NO_FLOOR_DATA; + } + + IndexInput floorData(TrieReader r) throws IOException { + assert isFloor(); + r.input.seek(floorDataFp); + return r.input; + } + } + + final RandomAccessInput access; + final IndexInput input; + final Node root; + + TrieReader(IndexInput input, long rootFP) throws IOException { + this.access = input.randomAccessSlice(0, input.length()); + this.input = input; + this.root = new Node(); + load(root, rootFP); + } + + private void load(Node node, long fp) throws IOException { + node.fp = fp; + long termFlagsLong = access.readLong(fp); + int termFlags = (int) termFlagsLong; + int sign = node.sign = termFlags & 0x03; + + if (sign == TrieBuilder.SIGN_NO_CHILDREN) { + loadLeafNode(fp, termFlags, termFlagsLong, node); + } else if (sign == TrieBuilder.SIGN_MULTI_CHILDREN) { + loadMultiChildrenNode(fp, termFlags, termFlagsLong, node); + } else { + loadSingleChildNode(fp, sign, termFlags, termFlagsLong, node); + } + } + + private void loadLeafNode(long fp, int term, long termLong, Node node) throws IOException { + + // [n bytes] floor data + // [n bytes] output fp + // [1bit] x | [1bit] has floor | [1bit] has terms | [3bit] output fp bytes | [2bit] sign + + int fpBytesMinus1 = (term >>> 2) & 0x07; + if (fpBytesMinus1 <= 6) { + node.outputFp = (termLong >>> 8) & BYTES_MINUS_1_MASK[fpBytesMinus1]; + } else { + node.outputFp = access.readLong(fp + 1); + } + node.hasTerms = (term & TrieBuilder.LEAF_NODE_HAS_TERMS) != 0; + if ((term & TrieBuilder.LEAF_NODE_HAS_FLOOR) != 0) { // has floor + node.floorDataFp = fp + 2 + fpBytesMinus1; + } else { + node.floorDataFp = NO_FLOOR_DATA; + } + } + + private void loadSingleChildNode(long fp, int sign, int term, long termLong, Node node) + throws IOException { + + // [n bytes] floor data + // [n bytes] encoded output fp | [n bytes] child fp | [1 byte] label + // [3bit] encoded output fp bytes | [3bit] child fp bytes | [2bit] sign + + int childDeltaFpBytesMinus1 = (term >>> 2) & 0x07; + long l = 
childDeltaFpBytesMinus1 <= 5 ? termLong >>> 16 : access.readLong(fp + 2); + node.childDeltaFp = l & BYTES_MINUS_1_MASK[childDeltaFpBytesMinus1]; + node.minChildrenLabel = (term >>> 8) & 0xFF; + + if (sign == TrieBuilder.SIGN_SINGLE_CHILD_WITHOUT_OUTPUT) { + node.outputFp = NO_OUTPUT; + } else { // has output + assert sign == TrieBuilder.SIGN_SINGLE_CHILD_WITH_OUTPUT; + int encodedOutputFpBytesMinus1 = (term >>> 5) & 0x07; + long offset = fp + childDeltaFpBytesMinus1 + 3; + long encodedFp = access.readLong(offset) & BYTES_MINUS_1_MASK[encodedOutputFpBytesMinus1]; + node.outputFp = encodedFp >>> 2; + node.hasTerms = (encodedFp & TrieBuilder.NON_LEAF_NODE_HAS_TERMS) != 0; + if ((encodedFp & TrieBuilder.NON_LEAF_NODE_HAS_FLOOR) != 0) { // has floor + node.floorDataFp = offset + encodedOutputFpBytesMinus1 + 1; + } else { + node.floorDataFp = NO_FLOOR_DATA; + } + } + } + + private void loadMultiChildrenNode(long fp, int term, long termLong, Node node) + throws IOException { + + // [n bytes] floor data + // [n bytes] children fps | [n bytes] strategy data + // [1 byte] children count (if floor data) | [n bytes] encoded output fp | [1 byte] label + // [5bit] strategy bytes | 2bit children strategy | [3bit] encoded output fp bytes + // [1bit] has output | [3bit] children fp bytes | [2bit] sign + + node.childrenDeltaFpBytes = ((term >>> 2) & 0x07) + 1; + node.childSaveStrategy = (term >>> 9) & 0x03; + node.strategyBytes = ((term >>> 11) & 0x1F) + 1; + node.minChildrenLabel = (term >>> 16) & 0xFF; + + if ((term & 0x20) != 0) { // has output + int encodedOutputFpBytesMinus1 = (term >>> 6) & 0x07; + long l = encodedOutputFpBytesMinus1 <= 4 ? termLong >>> 24 : access.readLong(fp + 3); + long encodedFp = l & BYTES_MINUS_1_MASK[encodedOutputFpBytesMinus1]; + node.outputFp = encodedFp >>> 2; + node.hasTerms = (encodedFp & TrieBuilder.NON_LEAF_NODE_HAS_TERMS) != 0; + + if ((encodedFp & TrieBuilder.NON_LEAF_NODE_HAS_FLOOR) != 0) { // has floor + long offset = fp + 4 + encodedOutputFpBytesMinus1; + long childrenNum = (access.readByte(offset) & 0xFFL) + 1L; + node.strategyFp = offset + 1L; + node.floorDataFp = + node.strategyFp + node.strategyBytes + childrenNum * node.childrenDeltaFpBytes; + } else { + node.floorDataFp = NO_FLOOR_DATA; + node.strategyFp = fp + 4 + encodedOutputFpBytesMinus1; + } + } else { + node.outputFp = NO_OUTPUT; + node.strategyFp = fp + 3; + } + } + + /** Overwrite (and return) the incoming Node child, or null if the targetLabel was not found. 
*/ + Node lookupChild(int targetLabel, Node parent, Node child) throws IOException { + final int sign = parent.sign; + if (sign == TrieBuilder.SIGN_NO_CHILDREN) { + return null; + } + + if (sign != TrieBuilder.SIGN_MULTI_CHILDREN) { + // single child + if (targetLabel != parent.minChildrenLabel) { + return null; + } + child.label = targetLabel; + load(child, parent.fp - parent.childDeltaFp); + return child; + } + + final long strategyBytesStartFp = parent.strategyFp; + final int minLabel = parent.minChildrenLabel; + final int strategyBytes = parent.strategyBytes; + + int position = -1; + if (targetLabel == minLabel) { + position = 0; + } else if (targetLabel > minLabel) { + position = + TrieBuilder.ChildSaveStrategy.byCode(parent.childSaveStrategy) + .lookup(targetLabel, access, strategyBytesStartFp, strategyBytes, minLabel); + } + + if (position < 0) { + return null; + } + + final int bytesPerEntry = parent.childrenDeltaFpBytes; + final long pos = strategyBytesStartFp + strategyBytes + (long) bytesPerEntry * position; + final long fp = parent.fp - (access.readLong(pos) & BYTES_MINUS_1_MASK[bytesPerEntry - 1]); + child.label = targetLabel; + load(child, fp); + + return child; + } +} diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/package-info.java new file mode 100644 index 000000000000..71c963ac0d37 --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/blocktree/package-info.java @@ -0,0 +1,29 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * BlockTree terms dictionary. + * + *

    This terms dictionary organizes all terms into blocks according to shared prefix, such that + * each block has enough terms, and then stores the prefixes in memory as a trie, which serves as + * the index structure. It allows you to plug in your own {@link org.apache.lucene.codecs.PostingsWriterBase} + * to implement the postings. + * + *

    See {@link org.apache.lucene.codecs.lucene103.blocktree.Lucene103BlockTreeTermsWriter} for the + * file format. + */ +package org.apache.lucene.codecs.lucene103.blocktree; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/gen_ForDeltaUtil.py similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py rename to lucene/core/src/java/org/apache/lucene/codecs/lucene103/gen_ForDeltaUtil.py index b1b36db096a7..f4f3fed630c5 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForDeltaUtil.py +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/gen_ForDeltaUtil.py @@ -40,14 +40,14 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.codecs.lucene103; import java.io.IOException; import org.apache.lucene.internal.vectorization.PostingDecodingUtil; import org.apache.lucene.store.DataOutput; import org.apache.lucene.util.packed.PackedInts; -import static org.apache.lucene.codecs.lucene101.ForUtil.*; +import static org.apache.lucene.codecs.lucene103.ForUtil.*; /** * Inspired from https://fulmicoton.com/posts/bitpacking/ diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForUtil.py b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/gen_ForUtil.py similarity index 99% rename from lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForUtil.py rename to lucene/core/src/java/org/apache/lucene/codecs/lucene103/gen_ForUtil.py index 4640c9bd8863..6b1a977a99d4 100644 --- a/lucene/core/src/java/org/apache/lucene/codecs/lucene101/gen_ForUtil.py +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/gen_ForUtil.py @@ -40,7 +40,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.codecs.lucene103; import java.io.IOException; import org.apache.lucene.internal.vectorization.PostingDecodingUtil; diff --git a/lucene/core/src/java/org/apache/lucene/codecs/lucene103/package-info.java b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/package-info.java new file mode 100644 index 000000000000..0c4bb5a1ddee --- /dev/null +++ b/lucene/core/src/java/org/apache/lucene/codecs/lucene103/package-info.java @@ -0,0 +1,436 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/** + * Lucene 10.3 file format. + * + *

    Apache Lucene - Index File Formats


    Introduction


    This document defines the index file formats used in this version of Lucene. If you are using + * a different version of Lucene, please consult the copy of docs/ that was distributed + * with the version you are using. + * + *

    This document attempts to provide a high-level definition of the Apache Lucene file formats. + *


    Definitions


    The fundamental concepts in Lucene are index, document, field and term. + * + *

    An index contains a sequence of documents. + * + *

    • A document is a sequence of fields. + *
    • A field is a named sequence of terms. + *
    • A term is a sequence of bytes. + *

    The same sequence of bytes in two different fields is considered a different term. Thus terms + * are represented as a pair: the string naming the field, and the bytes within the field. + * + *

    Inverted Indexing


    Lucene's index stores terms and statistics about those terms in order to make term-based + * search more efficient. Lucene's terms index falls into the family of indexes known as an + * inverted index. This is because it can list, for a term, the documents that contain it. + * This is the inverse of the natural relationship, in which documents list terms. + * + *

    Types of Fields


    In Lucene, fields may be stored, in which case their text is stored in the index + * literally, in a non-inverted manner. Fields that are inverted are called indexed. A field + * may be both stored and indexed. + * + *

    The text of a field may be tokenized into terms to be indexed, or the text of a field + * may be used literally as a term to be indexed. Most fields are tokenized, but sometimes it is + * useful for certain identifier fields to be indexed literally. + * + *

    See the {@link org.apache.lucene.document.Field Field} java docs for more information on + * Fields. + * + *

    Segments


    Lucene indexes may be composed of multiple sub-indexes, or segments. Each segment is a + * fully independent index, which could be searched separately. Indexes evolve by: + * + *

    1. Creating new segments for newly added documents. + *
    2. Merging existing segments. + *

    Searches may involve multiple segments and/or multiple indexes, each index potentially + * composed of a set of segments. + * + *

    Document Numbers


    Internally, Lucene refers to documents by an integer document number. The first + * document added to an index is numbered zero, and each subsequent document added gets a number one + * greater than the previous. + * + *

    Note that a document's number may change, so caution should be taken when storing these + * numbers outside of Lucene. In particular, numbers may change in the following situations: + * + *

    • + *

      The numbers stored in each segment are unique only within the segment, and must be + * converted before they can be used in a larger context. The standard technique is to + * allocate each segment a range of values, based on the range of numbers used in that + * segment. To convert a document number from a segment to an external value, the segment's + * base document number is added. To convert an external value back to a + * segment-specific value, the segment is identified by the range that the external value is + * in, and the segment's base value is subtracted. For example two five document segments + * might be combined, so that the first segment has a base value of zero, and the second of + * five. Document three from the second segment would have an external value of eight. + *

    • + *

      When documents are deleted, gaps are created in the numbering. These are eventually + * removed as the index evolves through merging. Deleted documents are dropped when segments + * are merged. A freshly-merged segment thus has no gaps in its numbering. + *

    + * + *
    + * + *
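Editor's note: the base-offset arithmetic described in the first bullet above is simple enough to show directly; a minimal sketch using the five-plus-five document example from the text:

// Two 5-document segments: bases are 0 and 5, so segment-local doc 3 of the second segment
// maps to external doc 8, and the reverse mapping subtracts the owning segment's base.
public class DocNumberRebaseDemo {
  public static void main(String[] args) {
    int[] segmentBases = {0, 5}; // the first segment holds 5 docs, so the second starts at 5
    int segment = 1, localDoc = 3;
    int externalDoc = segmentBases[segment] + localDoc;
    System.out.println("external=" + externalDoc); // 8
    // External -> (segment, local): find the last base <= externalDoc, then subtract it.
    int owner = externalDoc >= segmentBases[1] ? 1 : 0;
    System.out.println("segment=" + owner + " local=" + (externalDoc - segmentBases[owner])); // 1, 3
  }
}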

    Index Structure Overview


    Each segment index maintains the following: + * + *

    • {@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment info}. This + * contains metadata about a segment, such as the number of documents, what files it uses, and + * information about how the segment is sorted + *
    • {@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Field names}. This + * contains metadata about the set of named fields used in the index. + *
    • {@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Stored Field values}. + * This contains, for each document, a list of attribute-value pairs, where the attributes are + * field names. These are used to store auxiliary information about the document, such as its + * title, url, or an identifier to access a database. The set of stored fields are what is + * returned for each hit when searching. This is keyed by document number. + *
    • {@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Term dictionary}. A + * dictionary containing all of the terms used in all of the indexed fields of all of the + * documents. The dictionary also contains the number of documents which contain the term, and + * pointers to the term's frequency and proximity data. + *
    • {@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Term Frequency data}. For + * each term in the dictionary, the numbers of all the documents that contain that term, and + * the frequency of the term in that document, unless frequencies are omitted ({@link + * org.apache.lucene.index.IndexOptions#DOCS IndexOptions.DOCS}) + *
    • {@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Term Proximity data}. For + * each term in the dictionary, the positions that the term occurs in each document. Note that + * this will not exist if all fields in all documents omit position data. + *
    • {@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Normalization factors}. For + * each field in each document, a value is stored that is multiplied into the score for hits + * on that field. + *
    • {@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vectors}. For each + * field in each document, the term vector (sometimes called document vector) may be stored. A + * term vector consists of term text and term frequency. To add Term Vectors to your index see + * the {@link org.apache.lucene.document.Field Field} constructors + *
    • {@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-document values}. Like + * stored values, these are also keyed by document number, but are generally intended to be + * loaded into main memory for fast access. Whereas stored values are generally intended for + * summary results from searches, per-document values are useful for things like scoring + * factors. + *
    • {@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live documents}. An + * optional file indicating which documents are live. + *
    • {@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values}. Optional pair + * of files, recording dimensionally indexed fields, to enable fast numeric range filtering + * and large numeric values like BigInteger and BigDecimal (1D) and geographic shape + * intersection (2D, 3D). + *
    • {@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values}. The + * vector format stores numeric vectors in a format optimized for random access and + * computation, supporting high-dimensional nearest-neighbor search. + *

    Details on each of these are provided in their linked pages.


    File Naming


    All files belonging to a segment have the same name with varying extensions. The extensions + * correspond to the different file formats described below. When using the Compound File format + * (default for small segments) these files (except for the Segment info file, the Lock file, and + * Deleted documents file) are collapsed into a single .cfs file (see below for details) + * + *

    Typically, all segments in an index are stored in a single directory, although this is not + * required. + * + *

    File names are never re-used. That is, when any file is saved to the Directory it is given a + * never before used filename. This is achieved using a simple generations approach. For example, + * the first segments file is segments_1, then segments_2, etc. The generation is a sequential long + * integer represented in alpha-numeric (base 36) form.
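Editor's note: because the generation is written in base 36, the mapping from generation number to segments file name is easy to illustrate (a sketch of the idea only, not Lucene's own helper):

// Generations count up 1, 2, 3, ... and are rendered in base 36, so generation 46656 becomes
// "segments_1000".
public class SegmentsFileNameDemo {
  public static void main(String[] args) {
    for (long gen : new long[] {1, 2, 35, 36, 46656}) {
      System.out.println("segments_" + Long.toString(gen, Character.MAX_RADIX));
    }
    // prints segments_1, segments_2, segments_z, segments_10, segments_1000
  }
}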


    Summary of File Extensions


The following table summarizes the names and extensions of the files in Lucene:

lucene filenames by extension

Name | Extension | Brief Description
{@link org.apache.lucene.index.SegmentInfos Segments File} | segments_N | Stores information about a commit point
Lock File | write.lock | The Write lock prevents multiple IndexWriters from writing to the same file.
{@link org.apache.lucene.codecs.lucene99.Lucene99SegmentInfoFormat Segment Info} | .si | Stores metadata about a segment
{@link org.apache.lucene.codecs.lucene90.Lucene90CompoundFormat Compound File} | .cfs, .cfe | An optional "virtual" file consisting of all the other index files for systems that frequently run out of file handles.
{@link org.apache.lucene.codecs.lucene94.Lucene94FieldInfosFormat Fields} | .fnm | Stores information about the fields
{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Index} | .fdx | Contains pointers to field data
{@link org.apache.lucene.codecs.lucene90.Lucene90StoredFieldsFormat Field Data} | .fdt | The stored fields for documents
{@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Term Dictionary} | .tim | The term dictionary, stores term info
{@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Term Index} | .tip | The index into the Term Dictionary
{@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Frequencies} | .doc | Contains the list of docs which contain each term along with frequency
{@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Positions} | .pos | Stores position information about where a term occurs in the index
{@link org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat Payloads} | .pay | Stores additional per-position metadata information such as character offsets and user payloads
{@link org.apache.lucene.codecs.lucene90.Lucene90NormsFormat Norms} | .nvd, .nvm | Encodes length and boost factors for docs and fields
{@link org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat Per-Document Values} | .dvd, .dvm | Encodes additional scoring factors or other per-document information.
{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Index} | .tvx | Stores offset into the document data file
{@link org.apache.lucene.codecs.lucene90.Lucene90TermVectorsFormat Term Vector Data} | .tvd | Contains term vector data.
{@link org.apache.lucene.codecs.lucene90.Lucene90LiveDocsFormat Live Documents} | .liv | Info about what documents are live
{@link org.apache.lucene.codecs.lucene90.Lucene90PointsFormat Point values} | .kdd, .kdi, .kdm | Holds indexed points
{@link org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat Vector values} | .vec, .vem, .veq, .vex | Holds indexed vectors; .vec files contain the raw vector data, .vem the vector metadata, .veq the quantized vector data, and .vex the hnsw graph data.

    + * + *

    Lock File

    + * + * The write lock, which is stored in the index directory by default, is named "write.lock". If the + * lock directory is different from the index directory then the write lock will be named + * "XXXX-write.lock" where XXXX is a unique prefix derived from the full path to the index + * directory. When this file is present, a writer is currently modifying the index (adding or + * removing documents). This lock file ensures that only one writer is modifying the index at a + * time. + * + *
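A hedged sketch of touching that lock through the public Directory API (the index path and class name are placeholders, and real applications normally let IndexWriter manage the lock for them):

```java
import java.nio.file.Paths;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.LockObtainFailedException;

public class WriteLockSketch {
  public static void main(String[] args) throws Exception {
    try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"))) {
      // IndexWriter.WRITE_LOCK_NAME is the "write.lock" name described above.
      try (Lock lock = dir.obtainLock(IndexWriter.WRITE_LOCK_NAME)) {
        // While this lock is held, no other IndexWriter may modify the index.
        System.out.println("obtained " + lock);
      } catch (LockObtainFailedException e) {
        System.out.println("another writer holds the lock: " + e.getMessage());
      }
    }
  }
}
```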

    History

    + * + *

    Compatibility notes are provided in this document, describing how file formats have changed + * from prior versions: + * + *

      + *
    • In version 2.1, the file format was changed to allow lock-less commits (ie, no more commit + * lock). The change is fully backwards compatible: you can open a pre-2.1 index for searching + * or adding/deleting of docs. When the new segments file is saved (committed), it will be + * written in the new file format (meaning no specific "upgrade" process is needed). But note + * that once a commit has occurred, pre-2.1 Lucene will not be able to read the index. + *
    • In version 2.3, the file format was changed to allow segments to share a single set of doc + * store (vectors & stored fields) files. This allows for faster indexing in certain + * cases. The change is fully backwards compatible (in the same way as the lock-less commits + * change in 2.1). + *
    • In version 2.4, Strings are now written as a true UTF-8 byte sequence, not Java's modified + * UTF-8. See LUCENE-510 for + * details. + *
    • In version 2.9, an optional opaque Map<String,String> CommitUserData may be passed to + * IndexWriter's commit methods (and later retrieved), which is recorded in the segments_N + * file. See LUCENE-1382 for + * details. Also, diagnostics were added to each segment written recording details about why + * it was written (due to flush, merge; which OS/JRE was used; etc.). See issue LUCENE-1654 for details. + *
    • In version 3.0, compressed fields are no longer written to the index (they can still be + * read, but on merge the new segment will write them, uncompressed). See issue LUCENE-1960 for details. + *
    • In version 3.1, segments records the code version that created them. See LUCENE-2720 for details. + * Additionally segments track explicitly whether or not they have term vectors. See LUCENE-2811 for details. + *
    • In version 3.2, numeric fields are written natively to the stored fields file; previously + * they were stored in text format only. + *
    • In version 3.4, fields can omit position data while still indexing term frequencies. + *
    • In version 4.0, the format of the inverted index became extensible via the {@link + * org.apache.lucene.codecs.Codec Codec} api. Fast per-document storage ({@code DocValues}) + * was introduced. Normalization factors need no longer be a single byte, they can be any + * {@link org.apache.lucene.index.NumericDocValues NumericDocValues}. Terms need not be + * unicode strings, they can be any byte sequence. Term offsets can optionally be indexed into + * the postings lists. Payloads can be stored in the term vectors. + *
    • In version 4.1, the format of the postings list changed to use either of FOR compression or + * variable-byte encoding, depending upon the frequency of the term. Terms appearing only once + * were changed to inline directly into the term dictionary. Stored fields are compressed by + * default. + *
    • In version 4.2, term vectors are compressed by default. DocValues has a new multi-valued + * type (SortedSet), that can be used for faceting/grouping/joining on multi-valued fields. + *
    • In version 4.5, DocValues were extended to explicitly represent missing values. + *
    • In version 4.6, FieldInfos were extended to support per-field DocValues generation, to + * allow updating NumericDocValues fields. + *
    • In version 4.8, checksum footers were added to the end of each index file for improved data + * integrity. Specifically, the last 8 bytes of every index file contain the zlib-crc32 + * checksum of the file. + *
    • In version 4.9, DocValues has a new multi-valued numeric type (SortedNumeric) that is + * suitable for faceting/sorting/analytics. + *
    • In version 5.4, DocValues have been improved to store more information on disk: addresses + * for binary fields and ord indexes for multi-valued fields. + *
    • In version 6.0, Points were added, for multi-dimensional range/distance search. + *
    • In version 6.2, new Segment info format that reads/writes the index sort, to support index + * sorting. + *
    • In version 7.0, DocValues have been improved to better support sparse doc values thanks to + * an iterator API. + *
    • In version 8.0, postings have been enhanced to record, for each block of doc ids, the (term + * freq, normalization factor) pairs that may trigger the maximum score of the block. This + * information is recorded alongside skip data in order to be able to skip blocks of doc ids + * if they may not produce high enough scores. Additionally doc values and norms have been + * extended with jump-tables to make access O(1) instead of O(n), where n is the number of + * elements to skip when advancing in the data. + *
    • In version 8.4, postings, positions, offsets and payload lengths have moved to a more + * performant encoding that is vectorized. + *
    • In version 8.6, index sort serialization is delegated to the sorts themselves, to allow + * user-defined sorts to be used + *
    • In version 8.6, points fields split the index tree and leaf data into separate files, to + * allow for different access patterns to the different data structures + *
    • In version 8.7, stored fields compression became adaptive to better handle documents with + * smaller stored fields. + *
    • In version 9.0, vector-valued fields were added. + *
    • In version 9.1, vector-valued fields were modified to add a graph hierarchy. + *
    • In version 9.2, docs of vector-valued fields were moved from .vem to .vec and encoded by + * IndexDISI. ordToDoc mappings were added to .vem. + *
    • In version 9.5, HNSW graph connections were changed to be delta-encoded with vints. + * Additionally, metadata file size improvements were made by delta-encoding nodes by graph + * layer and not writing the node ids for the zeroth layer. + *
    • In version 9.9, Vector scalar quantization support was added. Allowing the HNSW vector + * format to utilize int8 quantized vectors for float32 vector search. + *
    • In version 9.12, skip data was refactored to have only two levels: every 128 docs and every + * 4,096 docs, and to be inlined in postings lists. This resulted in a speedup for queries that + * need skipping, especially conjunctions. + *
    • In version 10.1, block encoding changed to be optimized for int[] storage instead of + * long[]. + *
    • In version 10.3, the index of the block tree changed to be a specialized trie instead of an FST. + *
    + * + * + * + *

    Limitations

    + * + *
    + * + *

    Lucene uses a Java int to refer to document numbers, and the index file format + * uses an Int32 on-disk to store document numbers. This is a limitation of both the + * index file format and the current implementation. Eventually these should be replaced with either + * UInt64 values, or better yet, {@link org.apache.lucene.store.DataOutput#writeVInt + * VInt} values which have no limit.
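For readers unfamiliar with the VInt encoding referenced above, the following standalone sketch (class name invented for the example) round-trips a small value through DataOutput#writeVInt / DataInput#readVInt, using the same ByteArrayDataOutput/ByteArrayDataInput helpers that appear in the tests added by this change:

```java
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.store.ByteArrayDataOutput;

public class VIntRoundTrip {
  public static void main(String[] args) throws Exception {
    byte[] buf = new byte[5]; // a vInt never needs more than 5 bytes for a 32-bit value
    ByteArrayDataOutput out = new ByteArrayDataOutput(buf);
    out.writeVInt(300); // small doc numbers occupy fewer bytes on disk (here: 2)
    ByteArrayDataInput in = new ByteArrayDataInput(buf);
    System.out.println(in.readVInt() + " decoded from " + out.getPosition() + " bytes");
  }
}
```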

    + */ +package org.apache.lucene.codecs.lucene103; diff --git a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java index b9b7b2dbd411..f725de389a6a 100644 --- a/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java +++ b/lucene/core/src/java/org/apache/lucene/internal/vectorization/VectorizationProvider.java @@ -215,8 +215,8 @@ private static Optional lookupVectorModule() { Set.of( "org.apache.lucene.codecs.hnsw.FlatVectorScorerUtil", "org.apache.lucene.util.VectorUtil", - "org.apache.lucene.codecs.lucene101.Lucene101PostingsReader", - "org.apache.lucene.codecs.lucene101.PostingIndexInput", + "org.apache.lucene.codecs.lucene103.Lucene103PostingsReader", + "org.apache.lucene.codecs.lucene103.PostingIndexInput", "org.apache.lucene.tests.util.TestSysoutsLimits"); private static void ensureCaller() { diff --git a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java index b1d283cc3899..e478c08e05e5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java +++ b/lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java @@ -21,8 +21,8 @@ import java.util.Arrays; import java.util.List; import java.util.Objects; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsReader; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; @@ -399,10 +399,10 @@ public boolean equals(Object obj) { /** * A guess of the average number of simple operations for the initial seek and buffer refill per * document for the positions of a term. See also {@link - * Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}. + * Lucene103PostingsReader.BlockPostingsEnum#nextPosition()}. * *

    Aside: Instead of being constant this could depend among others on {@link - * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link + * Lucene103PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link * TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs), * {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block * size of the device storing the index. @@ -410,7 +410,7 @@ public boolean equals(Object obj) { private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128; /** - * Number of simple operations in {@link Lucene101PostingsReader.BlockPostingsEnum#nextPosition()} + * Number of simple operations in {@link Lucene103PostingsReader.BlockPostingsEnum#nextPosition()} * when no seek or buffer refill is done. */ private static final int TERM_OPS_PER_POS = 7; diff --git a/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java b/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java index de304ea27222..a08b36413a58 100644 --- a/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java +++ b/lucene/core/src/java/org/apache/lucene/search/ScorerUtil.java @@ -18,7 +18,7 @@ import java.util.stream.LongStream; import java.util.stream.StreamSupport; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; import org.apache.lucene.index.ImpactsEnum; import org.apache.lucene.util.Bits; import org.apache.lucene.util.FixedBitSet; @@ -28,7 +28,7 @@ class ScorerUtil { private static final Class DEFAULT_IMPACTS_ENUM_CLASS = - Lucene101PostingsFormat.getImpactsEnumImpl(); + Lucene103PostingsFormat.getImpactsEnumImpl(); private static final Class DEFAULT_ACCEPT_DOCS_CLASS = new FixedBitSet(1).asReadOnlyBits().getClass(); diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec index f7a246c76849..b85efc3c37fe 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.Codec @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -org.apache.lucene.codecs.lucene101.Lucene101Codec +org.apache.lucene.codecs.lucene103.Lucene103Codec diff --git a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat index 874ebafd971b..12d44d82f7dc 100644 --- a/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat +++ b/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat +org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestForDeltaUtil.java similarity index 97% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java rename to lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestForDeltaUtil.java index d41ab472ea60..224731bdaf18 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForDeltaUtil.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestForDeltaUtil.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.codecs.lucene103; import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import java.io.IOException; @@ -68,7 +68,7 @@ public void testEncodeDecode() throws IOException { // decode IndexInput in = d.openInput("test.bin", IOContext.READONCE); PostingDecodingUtil pdu = - Lucene101PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in); + Lucene103PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in); ForDeltaUtil forDeltaUtil = new ForDeltaUtil(); for (int i = 0; i < iterations; ++i) { int base = 0; diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestForUtil.java similarity index 97% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForUtil.java rename to lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestForUtil.java index d93cb0ab3ecd..935eff6e8fde 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestForUtil.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestForUtil.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.codecs.lucene103; import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import java.io.IOException; @@ -71,7 +71,7 @@ public void testEncodeDecode() throws IOException { // decode IndexInput in = d.openInput("test.bin", IOContext.READONCE); PostingDecodingUtil pdu = - Lucene101PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in); + Lucene103PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in); ForUtil forUtil = new ForUtil(); for (int i = 0; i < iterations; ++i) { final int bitsPerValue = in.readByte(); diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestLucene103PostingsFormat.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestLucene103PostingsFormat.java new file mode 100644 index 000000000000..210fde9fa23c --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestLucene103PostingsFormat.java @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.codecs.lucene103; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import org.apache.lucene.codecs.Codec; +import org.apache.lucene.codecs.CompetitiveImpactAccumulator; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsReader.MutableImpactList; +import org.apache.lucene.codecs.lucene103.blocktree.FieldReader; +import org.apache.lucene.codecs.lucene103.blocktree.Stats; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.Impact; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.store.ByteArrayDataInput; +import org.apache.lucene.store.ByteArrayDataOutput; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.analysis.MockAnalyzer; +import org.apache.lucene.tests.index.BasePostingsFormatTestCase; +import org.apache.lucene.tests.util.TestUtil; + +public class TestLucene103PostingsFormat extends BasePostingsFormatTestCase { + + @Override + protected Codec getCodec() { + return TestUtil.alwaysPostingsFormat(new Lucene103PostingsFormat()); + } + + public void testVInt15() throws IOException { + byte[] bytes = new byte[5]; + ByteArrayDataOutput out = new ByteArrayDataOutput(bytes); + ByteArrayDataInput in = new ByteArrayDataInput(); + for (int i : new int[] {0, 1, 127, 128, 32767, 32768, Integer.MAX_VALUE}) { + out.reset(bytes); + Lucene103PostingsWriter.writeVInt15(out, i); + in.reset(bytes, 0, out.getPosition()); + assertEquals(i, Lucene103PostingsReader.readVInt15(in)); + assertEquals(out.getPosition(), in.getPosition()); + } + } + + public void testVLong15() throws IOException { + byte[] bytes = new byte[9]; + ByteArrayDataOutput out = new ByteArrayDataOutput(bytes); + ByteArrayDataInput in = new ByteArrayDataInput(); + for (long i : new long[] {0, 1, 127, 128, 32767, 32768, Integer.MAX_VALUE, Long.MAX_VALUE}) { + out.reset(bytes); + Lucene103PostingsWriter.writeVLong15(out, i); + in.reset(bytes, 0, out.getPosition()); + assertEquals(i, Lucene103PostingsReader.readVLong15(in)); + assertEquals(out.getPosition(), in.getPosition()); + } + } + + /** Make sure the final sub-block(s) are not skipped. 
*/ + public void testFinalBlock() throws Exception { + Directory d = newDirectory(); + IndexWriter w = new IndexWriter(d, new IndexWriterConfig(new MockAnalyzer(random()))); + for (int i = 0; i < 25; i++) { + Document doc = new Document(); + doc.add(newStringField("field", Character.toString((char) (97 + i)), Field.Store.NO)); + doc.add(newStringField("field", "z" + Character.toString((char) (97 + i)), Field.Store.NO)); + w.addDocument(doc); + } + w.forceMerge(1); + + DirectoryReader r = DirectoryReader.open(w); + assertEquals(1, r.leaves().size()); + FieldReader field = (FieldReader) r.leaves().get(0).reader().terms("field"); + // We should see exactly two blocks: one root block (prefix empty string) and one block for z* + // terms (prefix z): + Stats stats = field.getStats(); + assertEquals(0, stats.floorBlockCount); + assertEquals(2, stats.nonFloorBlockCount); + r.close(); + w.close(); + d.close(); + } + + public void testImpactSerialization() throws IOException { + // omit norms and omit freqs + doTestImpactSerialization(Collections.singletonList(new Impact(1, 1L))); + + // omit freqs + doTestImpactSerialization(Collections.singletonList(new Impact(1, 42L))); + // omit freqs with very large norms + doTestImpactSerialization(Collections.singletonList(new Impact(1, -100L))); + + // omit norms + doTestImpactSerialization(Collections.singletonList(new Impact(30, 1L))); + // omit norms with large freq + doTestImpactSerialization(Collections.singletonList(new Impact(500, 1L))); + + // freqs and norms, basic + doTestImpactSerialization( + Arrays.asList( + new Impact(1, 7L), + new Impact(3, 9L), + new Impact(7, 10L), + new Impact(15, 11L), + new Impact(20, 13L), + new Impact(28, 14L))); + + // freqs and norms, high values + doTestImpactSerialization( + Arrays.asList( + new Impact(2, 2L), + new Impact(10, 10L), + new Impact(12, 50L), + new Impact(50, -100L), + new Impact(1000, -80L), + new Impact(1005, -3L))); + } + + private void doTestImpactSerialization(List impacts) throws IOException { + CompetitiveImpactAccumulator acc = new CompetitiveImpactAccumulator(); + for (Impact impact : impacts) { + acc.add(impact.freq, impact.norm); + } + try (Directory dir = newDirectory()) { + try (IndexOutput out = dir.createOutput("foo", IOContext.DEFAULT)) { + Lucene103PostingsWriter.writeImpacts(acc.getCompetitiveFreqNormPairs(), out); + } + try (IndexInput in = dir.openInput("foo", IOContext.DEFAULT)) { + byte[] b = new byte[Math.toIntExact(in.length())]; + in.readBytes(b, 0, b.length); + List impacts2 = + Lucene103PostingsReader.readImpacts( + new ByteArrayDataInput(b), + new MutableImpactList(impacts.size() + random().nextInt(3))); + assertEquals(impacts, impacts2); + } + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestPForUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestPForUtil.java similarity index 97% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestPForUtil.java rename to lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestPForUtil.java index 4e9ab4b55ee2..61879034ffed 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestPForUtil.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestPForUtil.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.codecs.lucene103; import com.carrotsearch.randomizedtesting.generators.RandomNumbers; import java.io.IOException; @@ -41,7 +41,7 @@ public void testEncodeDecode() throws IOException { IndexInput in = d.openInput("test.bin", IOContext.READONCE); PostingDecodingUtil pdu = - Lucene101PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in); + Lucene103PostingsReader.VECTORIZATION_PROVIDER.newPostingDecodingUtil(in); final PForUtil pforUtil = new PForUtil(); for (int i = 0; i < iterations; ++i) { if (random().nextInt(5) == 0) { diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestPostingsUtil.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestPostingsUtil.java similarity index 98% rename from lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestPostingsUtil.java rename to lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestPostingsUtil.java index 5d02d0561e33..32edd3608ba2 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene101/TestPostingsUtil.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/TestPostingsUtil.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.lucene.codecs.lucene101; +package org.apache.lucene.codecs.lucene103; import java.io.IOException; import org.apache.lucene.store.Directory; diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene103/blocktree/TestTrie.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/blocktree/TestTrie.java new file mode 100644 index 000000000000..9a0dc499c3e1 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene103/blocktree/TestTrie.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.lucene.codecs.lucene103.blocktree; + +import static org.apache.lucene.codecs.lucene103.blocktree.TrieBuilder.ChildSaveStrategy; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Map; +import java.util.TreeMap; +import java.util.function.Supplier; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.IOContext; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.tests.util.LuceneTestCase; +import org.apache.lucene.util.BytesRef; +import org.junit.Assert; + +public class TestTrie extends LuceneTestCase { + + public void testStrategyChoose() { + // bits use 32 bytes while reverse_array use 31 bytes, choose reverse_array + assertSame(ChildSaveStrategy.REVERSE_ARRAY, ChildSaveStrategy.choose(0, 255, 226)); + // bits use 32 bytes while array use 31 bytes, choose array + assertSame(ChildSaveStrategy.ARRAY, ChildSaveStrategy.choose(0, 255, 32)); + // array and bits both use 32 position bytes, we choose bits. + assertSame(ChildSaveStrategy.BITS, ChildSaveStrategy.choose(0, 255, 33)); + // reverse_array and bits both use 32 position bytes, we choose bits. + assertSame(ChildSaveStrategy.BITS, ChildSaveStrategy.choose(0, 255, 225)); + } + + public void testRandomTerms() throws Exception { + Supplier supplier = TestTrie::randomBytes; + testTrieBuilder(supplier, atLeast(10000)); + testTrieLookup(supplier, TEST_NIGHTLY ? 18 : 12); + } + + public void testVeryLongTerms() throws Exception { + Supplier supplier = + () -> { + byte[] bytes = new byte[65535]; + for (int i = 1; i < bytes.length; i++) { + bytes[i] = (byte) random().nextInt(i / 256 + 1); + } + return bytes; + }; + testTrieLookup(supplier, 5); + } + + public void testOneByteTerms() throws Exception { + // heavily test single byte terms to generate various label distribution. 
+ Supplier supplier = () -> new byte[] {(byte) random().nextInt()}; + int round = atLeast(50); + for (int i = 0; i < round; i++) { + testTrieLookup(supplier, 10); + } + } + + private void testTrieBuilder(Supplier randomBytesSupplier, int count) { + Map expected = new TreeMap<>(); + expected.put(new BytesRef(""), new TrieBuilder.Output(0L, false, new BytesRef("emptyOutput"))); + for (int i = 0; i < count; i++) { + BytesRef key = new BytesRef(randomBytesSupplier.get()); + TrieBuilder.Output value = + new TrieBuilder.Output( + random().nextLong(1L << 62), + random().nextBoolean(), + new BytesRef(randomBytesSupplier.get())); + expected.put(key, value); + } + + TrieBuilder trieBuilder = + TrieBuilder.bytesRefToTrie( + new BytesRef(""), new TrieBuilder.Output(0L, false, new BytesRef("emptyOutput"))); + for (var entry : expected.entrySet()) { + if (entry.getKey().equals(new BytesRef(""))) { + continue; + } + TrieBuilder add = TrieBuilder.bytesRefToTrie(entry.getKey(), entry.getValue()); + trieBuilder.append(add); + Assert.assertThrows(IllegalStateException.class, () -> add.append(trieBuilder)); + Assert.assertThrows(IllegalStateException.class, () -> trieBuilder.append(add)); + } + Map actual = new TreeMap<>(); + trieBuilder.visit(actual::put); + assertEquals(expected, actual); + } + + private void testTrieLookup(Supplier randomBytesSupplier, int round) throws IOException { + for (int iter = 1; iter <= round; iter++) { + Map expected = new TreeMap<>(); + expected.put( + new BytesRef(""), new TrieBuilder.Output(0L, false, new BytesRef("emptyOutput"))); + int n = 1 << iter; + for (int i = 0; i < n; i++) { + BytesRef key = new BytesRef(randomBytesSupplier.get()); + TrieBuilder.Output value = + new TrieBuilder.Output( + random().nextLong(1L << 62), + random().nextBoolean(), + random().nextBoolean() ? 
null : new BytesRef(randomBytesSupplier.get())); + expected.put(key, value); + } + + TrieBuilder trieBuilder = + TrieBuilder.bytesRefToTrie( + new BytesRef(""), new TrieBuilder.Output(0L, false, new BytesRef("emptyOutput"))); + for (var entry : expected.entrySet()) { + if (entry.getKey().equals(new BytesRef(""))) { + continue; + } + TrieBuilder add = TrieBuilder.bytesRefToTrie(entry.getKey(), entry.getValue()); + trieBuilder.append(add); + Assert.assertThrows(IllegalStateException.class, () -> add.append(trieBuilder)); + Assert.assertThrows(IllegalStateException.class, () -> trieBuilder.append(add)); + } + + try (Directory directory = newDirectory()) { + try (IndexOutput index = directory.createOutput("index", IOContext.DEFAULT); + IndexOutput meta = directory.createOutput("meta", IOContext.DEFAULT)) { + trieBuilder.save(meta, index); + assertThrows(IllegalStateException.class, () -> trieBuilder.save(meta, index)); + assertThrows( + IllegalStateException.class, + () -> + trieBuilder.append( + TrieBuilder.bytesRefToTrie( + new BytesRef(), new TrieBuilder.Output(0L, true, null)))); + } + + try (IndexInput indexIn = directory.openInput("index", IOContext.DEFAULT); + IndexInput metaIn = directory.openInput("meta", IOContext.DEFAULT)) { + long start = metaIn.readVLong(); + long rootFP = metaIn.readVLong(); + long end = metaIn.readVLong(); + TrieReader reader = new TrieReader(indexIn.slice("outputs", start, end - start), rootFP); + + for (Map.Entry entry : expected.entrySet()) { + assertResult(reader, entry.getKey(), entry.getValue()); + } + + int testNotFound = atLeast(100); + for (int i = 0; i < testNotFound; i++) { + BytesRef key = new BytesRef(randomBytes()); + while (expected.containsKey(key)) { + key = new BytesRef(randomBytes()); + } + BytesRef lastK = new BytesRef(); + for (BytesRef k : expected.keySet()) { + if (k.compareTo(key) > 0) { + assert lastK.compareTo(key) < 0; + int mismatch1 = + Arrays.mismatch( + lastK.bytes, + lastK.offset, + lastK.offset + lastK.length, + key.bytes, + key.offset, + key.offset + key.length); + int mismatch2 = + Arrays.mismatch( + k.bytes, + k.offset, + k.offset + k.length, + key.bytes, + key.offset, + key.offset + key.length); + assertNotFoundOnLevelN(reader, key, Math.max(mismatch1, mismatch2)); + break; + } + lastK = k; + } + } + } + } + } + } + + private static byte[] randomBytes() { + byte[] bytes = new byte[random().nextInt(256) + 1]; + for (int i = 1; i < bytes.length; i++) { + bytes[i] = (byte) random().nextInt(1 << (i % 9)); + } + return bytes; + } + + private static void assertResult(TrieReader reader, BytesRef term, TrieBuilder.Output expected) + throws IOException { + TrieReader.Node parent = reader.root; + TrieReader.Node child = new TrieReader.Node(); + for (int i = 0; i < term.length; i++) { + TrieReader.Node found = reader.lookupChild(term.bytes[i + term.offset] & 0xFF, parent, child); + Assert.assertNotNull(found); + parent = child; + child = new TrieReader.Node(); + } + assertTrue(parent.hasOutput()); + assertEquals(expected.fp(), parent.outputFp); + assertEquals(expected.hasTerms(), parent.hasTerms); + if (expected.floorData() == null) { + assertFalse(parent.isFloor()); + } else { + byte[] bytes = new byte[expected.floorData().length]; + parent.floorData(reader).readBytes(bytes, 0, bytes.length); + assertArrayEquals(BytesRef.deepCopyOf(expected.floorData()).bytes, bytes); + } + } + + private static void assertNotFoundOnLevelN(TrieReader reader, BytesRef term, int n) + throws IOException { + TrieReader.Node parent = reader.root; + 
TrieReader.Node child = new TrieReader.Node(); + for (int i = 0; i < term.length; i++) { + TrieReader.Node found = reader.lookupChild(term.bytes[i + term.offset] & 0xFF, parent, child); + if (i == n) { + assertNull(found); + break; + } + Assert.assertNotNull(found); + parent = child; + child = new TrieReader.Node(); + } + } +} diff --git a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java index 69fbf96f6da8..b41988631e68 100644 --- a/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java +++ b/lucene/core/src/test/org/apache/lucene/codecs/lucene90/TestLucene90StoredFieldsFormatHighCompression.java @@ -18,7 +18,7 @@ import com.carrotsearch.randomizedtesting.generators.RandomPicks; import org.apache.lucene.codecs.Codec; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; import org.apache.lucene.document.Document; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.DirectoryReader; @@ -31,7 +31,7 @@ public class TestLucene90StoredFieldsFormatHighCompression extends BaseStoredFieldsFormatTestCase { @Override protected Codec getCodec() { - return new Lucene101Codec(Lucene101Codec.Mode.BEST_COMPRESSION); + return new Lucene103Codec(Lucene103Codec.Mode.BEST_COMPRESSION); } /** @@ -42,7 +42,7 @@ public void testMixedCompressions() throws Exception { for (int i = 0; i < 10; i++) { IndexWriterConfig iwc = newIndexWriterConfig(); iwc.setCodec( - new Lucene101Codec(RandomPicks.randomFrom(random(), Lucene101Codec.Mode.values()))); + new Lucene103Codec(RandomPicks.randomFrom(random(), Lucene103Codec.Mode.values()))); IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig()); Document doc = new Document(); doc.add(new StoredField("field1", "value1")); @@ -72,7 +72,7 @@ public void testInvalidOptions() { expectThrows( NullPointerException.class, () -> { - new Lucene101Codec(null); + new Lucene103Codec(null); }); expectThrows( diff --git a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java index eadf9960518d..3bcca24d215b 100644 --- a/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java +++ b/lucene/core/src/test/org/apache/lucene/index/TestForTooMuchCloning.java @@ -87,7 +87,7 @@ public void test() throws Exception { // perform. 
assertTrue( "too many calls to IndexInput.clone during TermRangeQuery: " + queryCloneCount, - queryCloneCount <= Math.max(s.getLeafContexts().size(), s.getSlices().length) * 5); + queryCloneCount <= Math.max(s.getLeafContexts().size(), s.getSlices().length) * 6); r.close(); dir.close(); } diff --git a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestPostingDecodingUtil.java b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestPostingDecodingUtil.java index 6c914dfcc032..0a9fb4b1ce9d 100644 --- a/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestPostingDecodingUtil.java +++ b/lucene/core/src/test/org/apache/lucene/internal/vectorization/TestPostingDecodingUtil.java @@ -16,7 +16,7 @@ */ package org.apache.lucene.internal.vectorization; -import org.apache.lucene.codecs.lucene101.ForUtil; +import org.apache.lucene.codecs.lucene103.ForUtil; import org.apache.lucene.store.Directory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; diff --git a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java index 3b484ed2430a..ba22fad1ed04 100644 --- a/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java +++ b/lucene/misc/src/test/org/apache/lucene/misc/index/TestBpVectorReorderer.java @@ -26,7 +26,7 @@ import java.util.concurrent.ForkJoinPool; import java.util.concurrent.ForkJoinWorkerThread; import org.apache.lucene.codecs.KnnVectorsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101Codec; +import org.apache.lucene.codecs.lucene103.Lucene103Codec; import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat; import org.apache.lucene.document.Document; import org.apache.lucene.document.KnnFloatVectorField; @@ -64,7 +64,7 @@ public void setUp() throws Exception { private void createQuantizedIndex(Directory dir, List vectors) throws IOException { IndexWriterConfig cfg = new IndexWriterConfig(); cfg.setCodec( - new Lucene101Codec() { + new Lucene103Codec() { @Override public KnnVectorsFormat getKnnVectorsFormatForField(String field) { return new Lucene99HnswScalarQuantizedVectorsFormat(8, 32); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java index c1360a9d9a7e..e075aaccc515 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java @@ -21,8 +21,8 @@ import java.util.Collection; import java.util.Collections; import java.util.Objects; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsReader; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.PostingsEnum; @@ -261,10 +261,10 @@ public void visit(String field, QueryVisitor visitor) { /** * A guess of the average number of simple operations for the initial seek and buffer refill per * document for the positions of a term. See also {@link - * Lucene101PostingsReader.BlockPostingsEnum#nextPosition()}. + * Lucene103PostingsReader.BlockPostingsEnum#nextPosition()}. * *

    Aside: Instead of being constant this could depend among others on {@link - * Lucene101PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link + * Lucene103PostingsFormat#BLOCK_SIZE}, {@link TermsEnum#docFreq()}, {@link * TermsEnum#totalTermFreq()}, {@link DocIdSetIterator#cost()} (expected number of matching docs), * {@link LeafReader#maxDoc()} (total number of docs in the segment), and the seek time and block * size of the device storing the index. @@ -272,7 +272,7 @@ public void visit(String field, QueryVisitor visitor) { private static final int TERM_POSNS_SEEK_OPS_PER_DOC = 128; /** - * Number of simple operations in {@link Lucene101PostingsReader.BlockPostingsEnum#nextPosition()} + * Number of simple operations in {@link Lucene103PostingsReader.BlockPostingsEnum#nextPosition()} * when no seek or buffer refill is done. */ private static final int TERM_OPS_PER_POS = 7; diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionPostingsFormat.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionPostingsFormat.java index d4c08667cbb5..32072af9283a 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionPostingsFormat.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/IDVersionPostingsFormat.java @@ -22,7 +22,7 @@ import org.apache.lucene.codecs.PostingsFormat; import org.apache.lucene.codecs.PostingsReaderBase; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; +import org.apache.lucene.codecs.lucene103.blocktree.Lucene103BlockTreeTermsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; import org.apache.lucene.search.LiveFieldValues; @@ -66,15 +66,15 @@ public class IDVersionPostingsFormat extends PostingsFormat { public IDVersionPostingsFormat() { this( - Lucene90BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, - Lucene90BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); + Lucene103BlockTreeTermsWriter.DEFAULT_MIN_BLOCK_SIZE, + Lucene103BlockTreeTermsWriter.DEFAULT_MAX_BLOCK_SIZE); } public IDVersionPostingsFormat(int minTermsInBlock, int maxTermsInBlock) { super("IDVersion"); this.minTermsInBlock = minTermsInBlock; this.maxTermsInBlock = maxTermsInBlock; - Lucene90BlockTreeTermsWriter.validateSettings(minTermsInBlock, maxTermsInBlock); + Lucene103BlockTreeTermsWriter.validateSettings(minTermsInBlock, maxTermsInBlock); } @Override diff --git a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/VersionBlockTreeTermsWriter.java b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/VersionBlockTreeTermsWriter.java index 6d498ba950a6..c416e0083c5f 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/VersionBlockTreeTermsWriter.java +++ b/lucene/sandbox/src/java/org/apache/lucene/sandbox/codecs/idversion/VersionBlockTreeTermsWriter.java @@ -24,7 +24,7 @@ import org.apache.lucene.codecs.FieldsConsumer; import org.apache.lucene.codecs.NormsProducer; import org.apache.lucene.codecs.PostingsWriterBase; -import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter; +import org.apache.lucene.codecs.lucene103.blocktree.Lucene103BlockTreeTermsWriter; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInfos; import org.apache.lucene.index.Fields; @@ -83,9 +83,9 @@ order, meaning if you just next() the file pointer will */ /** - * This is 
just like {@link Lucene90BlockTreeTermsWriter}, except it also stores a version per term, - * and adds a method to its TermsEnum implementation to seekExact only if the version is >= the - * specified version. The version is added to the terms index to avoid seeking if no term in the + * This is just like {@link Lucene103BlockTreeTermsWriter}, except it also stores a version per + * term, and adds a method to its TermsEnum implementation to seekExact only if the version is >= + * the specified version. The version is added to the terms index to avoid seeking if no term in the * block has a high enough version. The term blocks file is .tiv and the terms index extension is * .tipv. * @@ -181,7 +181,7 @@ public VersionBlockTreeTermsWriter( int minItemsInBlock, int maxItemsInBlock) throws IOException { - Lucene90BlockTreeTermsWriter.validateSettings(minItemsInBlock, maxItemsInBlock); + Lucene103BlockTreeTermsWriter.validateSettings(minItemsInBlock, maxItemsInBlock); maxDoc = state.segmentInfo.maxDoc(); final String termsFileName = diff --git a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion101PostingsFormat.java b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion101PostingsFormat.java index e9dde1705b9e..5e419016945c 100644 --- a/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion101PostingsFormat.java +++ b/lucene/suggest/src/java/org/apache/lucene/search/suggest/document/Completion101PostingsFormat.java @@ -20,7 +20,7 @@ /** * {@link CompletionPostingsFormat} for {@link - * org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat} + * org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat} * * @lucene.experimental */ @@ -32,6 +32,6 @@ public Completion101PostingsFormat() { @Override protected PostingsFormat delegatePostingsFormat() { - return PostingsFormat.forName("Lucene101"); + return PostingsFormat.forName("Lucene103"); } } diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneFixedGap.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneFixedGap.java index 7c5a0bcf2545..17ceff0d4c21 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneFixedGap.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneFixedGap.java @@ -28,9 +28,9 @@ import org.apache.lucene.codecs.blockterms.FixedGapTermsIndexWriter; import org.apache.lucene.codecs.blockterms.TermsIndexReaderBase; import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsReader; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -38,7 +38,7 @@ // any PostingsFormat and make it ord-able... /** - * Customized version of {@link Lucene101PostingsFormat} that uses {@link FixedGapTermsIndexWriter}. + * Customized version of {@link Lucene103PostingsFormat} that uses {@link FixedGapTermsIndexWriter}. 
*/ public final class LuceneFixedGap extends PostingsFormat { final int termIndexInterval; @@ -54,7 +54,7 @@ public LuceneFixedGap(int termIndexInterval) { @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase docs = new Lucene101PostingsWriter(state); + PostingsWriterBase docs = new Lucene103PostingsWriter(state); // TODO: should we make the terms index more easily // pluggable? Ie so that this codec would record which @@ -91,7 +91,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException @Override public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException { - PostingsReaderBase postings = new Lucene101PostingsReader(state); + PostingsReaderBase postings = new Lucene103PostingsReader(state); TermsIndexReaderBase indexReader; boolean success = false; diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapDocFreqInterval.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapDocFreqInterval.java index ff5e0fc07dda..741ce7325185 100644 --- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapDocFreqInterval.java +++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapDocFreqInterval.java @@ -29,9 +29,9 @@ import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase; import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexReader; import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader; -import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsReader; +import org.apache.lucene.codecs.lucene103.Lucene103PostingsWriter; import org.apache.lucene.index.SegmentReadState; import org.apache.lucene.index.SegmentWriteState; @@ -39,7 +39,7 @@ // any PostingsFormat and make it ord-able... /** - * Customized version of {@link Lucene101PostingsFormat} that uses {@link + * Customized version of {@link Lucene103PostingsFormat} that uses {@link * VariableGapTermsIndexWriter} with a fixed interval, but forcing high docfreq terms to be indexed * terms. */ @@ -59,7 +59,7 @@ public LuceneVarGapDocFreqInterval(int docFreqThreshold, int termIndexInterval) @Override public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException { - PostingsWriterBase docs = new Lucene101PostingsWriter(state); + PostingsWriterBase docs = new Lucene103PostingsWriter(state); // TODO: should we make the terms index more easily // pluggable? 
Ie so that this codec would record which
@@ -100,7 +100,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException
 
   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase postings = new Lucene101PostingsReader(state);
+    PostingsReaderBase postings = new Lucene103PostingsReader(state);
     TermsIndexReaderBase indexReader;
 
     boolean success = false;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapFixedInterval.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapFixedInterval.java
index 7899f3d54363..93dd141b061a 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapFixedInterval.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/blockterms/LuceneVarGapFixedInterval.java
@@ -29,9 +29,9 @@
 import org.apache.lucene.codecs.blockterms.TermsIndexWriterBase;
 import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexReader;
 import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter;
-import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
-import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
-import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
+import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat;
+import org.apache.lucene.codecs.lucene103.Lucene103PostingsReader;
+import org.apache.lucene.codecs.lucene103.Lucene103PostingsWriter;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;
 
@@ -39,7 +39,7 @@
 // any PostingsFormat and make it ord-able...
 
 /**
- * Customized version of {@link Lucene101PostingsFormat} that uses {@link
+ * Customized version of {@link Lucene103PostingsFormat} that uses {@link
  * VariableGapTermsIndexWriter} with a fixed interval.
  */
 public final class LuceneVarGapFixedInterval extends PostingsFormat {
@@ -56,7 +56,7 @@ public LuceneVarGapFixedInterval(int termIndexInterval) {
 
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
-    PostingsWriterBase docs = new Lucene101PostingsWriter(state);
+    PostingsWriterBase docs = new Lucene103PostingsWriter(state);
 
     // TODO: should we make the terms index more easily
     // pluggable?  Ie so that this codec would record which
@@ -95,7 +95,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException
 
   @Override
   public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException {
-    PostingsReaderBase postings = new Lucene101PostingsReader(state);
+    PostingsReaderBase postings = new Lucene103PostingsReader(state);
     TermsIndexReaderBase indexReader;
 
     boolean success = false;
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/mockrandom/MockRandomPostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/mockrandom/MockRandomPostingsFormat.java
index 8770e0d81fa2..e660ccc44784 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/mockrandom/MockRandomPostingsFormat.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/mockrandom/MockRandomPostingsFormat.java
@@ -35,10 +35,10 @@
 import org.apache.lucene.codecs.blockterms.VariableGapTermsIndexWriter;
 import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsReader;
 import org.apache.lucene.codecs.blocktreeords.OrdsBlockTreeTermsWriter;
-import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
-import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
-import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsReader;
-import org.apache.lucene.codecs.lucene90.blocktree.Lucene90BlockTreeTermsWriter;
+import org.apache.lucene.codecs.lucene103.Lucene103PostingsReader;
+import org.apache.lucene.codecs.lucene103.Lucene103PostingsWriter;
+import org.apache.lucene.codecs.lucene103.blocktree.Lucene103BlockTreeTermsReader;
+import org.apache.lucene.codecs.lucene103.blocktree.Lucene103BlockTreeTermsWriter;
 import org.apache.lucene.codecs.memory.FSTTermsReader;
 import org.apache.lucene.codecs.memory.FSTTermsWriter;
 import org.apache.lucene.index.FieldInfo;
@@ -121,7 +121,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException
 
     random.nextInt(); // consume a random for buffersize
 
-    PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(state);
+    PostingsWriterBase postingsWriter = new Lucene103PostingsWriter(state);
 
     final FieldsConsumer fields;
     final int t1 = random.nextInt(4);
@@ -151,7 +151,7 @@ public FieldsConsumer fieldsConsumer(SegmentWriteState state) throws IOException
       boolean success = false;
       try {
         fields =
-            new Lucene90BlockTreeTermsWriter(
+            new Lucene103BlockTreeTermsWriter(
                 state, postingsWriter, minTermsInBlock, maxTermsInBlock);
         success = true;
       } finally {
@@ -289,7 +289,7 @@ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException
       System.out.println("MockRandomCodec: readBufferSize=" + readBufferSize);
     }
 
-    PostingsReaderBase postingsReader = new Lucene101PostingsReader(state);
+    PostingsReaderBase postingsReader = new Lucene103PostingsReader(state);
 
     final FieldsProducer fields;
     final int t1 = random.nextInt(4);
@@ -311,7 +311,7 @@ public FieldsProducer fieldsProducer(SegmentReadState state) throws IOException
 
       boolean success = false;
      try {
-        fields = new Lucene90BlockTreeTermsReader(postingsReader, state);
+        fields = new Lucene103BlockTreeTermsReader(postingsReader, state);
         success = true;
       } finally {
         if (!success) {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java
index 2b1613dc8e4b..63f7bcbf2cb7 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/codecs/uniformsplit/UniformSplitRot13PostingsFormat.java
@@ -23,8 +23,8 @@
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.PostingsReaderBase;
 import org.apache.lucene.codecs.PostingsWriterBase;
-import org.apache.lucene.codecs.lucene101.Lucene101PostingsReader;
-import org.apache.lucene.codecs.lucene101.Lucene101PostingsWriter;
+import org.apache.lucene.codecs.lucene103.Lucene103PostingsReader;
+import org.apache.lucene.codecs.lucene103.Lucene103PostingsWriter;
 import org.apache.lucene.codecs.uniformsplit.BlockDecoder;
 import org.apache.lucene.codecs.uniformsplit.BlockEncoder;
 import org.apache.lucene.codecs.uniformsplit.IndexDictionary;
@@ -68,7 +68,7 @@ public static void resetEncodingFlags() {
 
   @Override
   public FieldsConsumer fieldsConsumer(SegmentWriteState segmentWriteState) throws IOException {
-    PostingsWriterBase postingsWriter = new Lucene101PostingsWriter(segmentWriteState);
+    PostingsWriterBase postingsWriter = new Lucene103PostingsWriter(segmentWriteState);
     boolean success = false;
     try {
       FieldsConsumer fieldsConsumer = createFieldsConsumer(segmentWriteState, postingsWriter);
@@ -146,7 +146,7 @@ public void writeTo(DataOutput dataOutput) throws IOException {
 
   @Override
   public FieldsProducer fieldsProducer(SegmentReadState segmentReadState) throws IOException {
-    PostingsReaderBase postingsReader = new Lucene101PostingsReader(segmentReadState);
+    PostingsReaderBase postingsReader = new Lucene103PostingsReader(segmentReadState);
     boolean success = false;
     try {
       FieldsProducer fieldsProducer = createFieldsProducer(segmentReadState, postingsReader);
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java
index 1ace55a98a06..13db0d34e783 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestRuleSetupAndRestoreClassEnv.java
@@ -38,7 +38,7 @@
 import org.apache.lucene.codecs.Codec;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.PostingsFormat;
-import org.apache.lucene.codecs.lucene101.Lucene101Codec;
+import org.apache.lucene.codecs.lucene103.Lucene103Codec;
 import org.apache.lucene.codecs.simpletext.SimpleTextCodec;
 import org.apache.lucene.search.similarities.Similarity;
 import org.apache.lucene.tests.codecs.asserting.AssertingCodec;
@@ -190,7 +190,7 @@ public String toString() {
       codec = CompressingCodec.randomInstance(random);
     } else if ("Lucene100".equals(TEST_CODEC)
         || ("random".equals(TEST_CODEC) && randomVal == 5 && !shouldAvoidCodec("Lucene100"))) {
-      codec = new Lucene101Codec(RandomPicks.randomFrom(random, Lucene101Codec.Mode.values()));
+      codec = new Lucene103Codec(RandomPicks.randomFrom(random, Lucene103Codec.Mode.values()));
     } else if (!"random".equals(TEST_CODEC)) {
       codec = Codec.forName(TEST_CODEC);
     } else if ("random".equals(TEST_POSTINGSFORMAT)) {
diff --git a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java
index b83b92652731..f1e42ed2f471 100644
--- a/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java
+++ b/lucene/test-framework/src/java/org/apache/lucene/tests/util/TestUtil.java
@@ -55,8 +55,8 @@
 import org.apache.lucene.codecs.KnnVectorsFormat;
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
-import org.apache.lucene.codecs.lucene101.Lucene101Codec;
-import org.apache.lucene.codecs.lucene101.Lucene101PostingsFormat;
+import org.apache.lucene.codecs.lucene103.Lucene103Codec;
+import org.apache.lucene.codecs.lucene103.Lucene103PostingsFormat;
 import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
 import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
 import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
@@ -1334,7 +1334,7 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
    * different from {@link Codec#getDefault()} because that is randomized.
    */
   public static Codec getDefaultCodec() {
-    return new Lucene101Codec();
+    return new Lucene103Codec();
   }
 
   /**
@@ -1342,7 +1342,7 @@ public static Codec getDefaultCodec() {
    * Lucene.
    */
   public static PostingsFormat getDefaultPostingsFormat() {
-    return new Lucene101PostingsFormat();
+    return new Lucene103PostingsFormat();
   }
 
   /**
@@ -1353,7 +1353,7 @@ public static PostingsFormat getDefaultPostingsFormat() {
    */
   public static PostingsFormat getDefaultPostingsFormat(
       int minItemsPerBlock, int maxItemsPerBlock) {
-    return new Lucene101PostingsFormat(minItemsPerBlock, maxItemsPerBlock);
+    return new Lucene103PostingsFormat(minItemsPerBlock, maxItemsPerBlock);
   }
 
   /** Returns a random postings format that supports term ordinals */
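
Usage sketch (illustrative only, not part of the patch): with the TestUtil hunks above, TestUtil.getDefaultCodec() and TestUtil.getDefaultPostingsFormat() now hand back the Lucene103 implementations instead of the Lucene101 ones. A test that needs the real, non-randomized default codec might pin it as below; the class name PinDefaultCodecExample and the use of IndexWriterConfig are assumptions made for the example.

    import org.apache.lucene.codecs.Codec;
    import org.apache.lucene.index.IndexWriterConfig;
    import org.apache.lucene.tests.util.TestUtil;

    // Hypothetical example class; not part of this change.
    public class PinDefaultCodecExample {
      public static void main(String[] args) {
        // TestUtil.getDefaultCodec() returns the actual default codec (Lucene103Codec here),
        // unlike Codec.getDefault(), which the test framework randomizes per test class.
        Codec codec = TestUtil.getDefaultCodec();
        IndexWriterConfig iwc = new IndexWriterConfig();
        iwc.setCodec(codec);
        System.out.println("pinned codec: " + codec.getName());
      }
    }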