diff --git a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java index 40110dbdbe0..2914fcf4473 100644 --- a/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java +++ b/src/main/java/org/apache/commons/compress/compressors/bzip2/BZip2CompressorInputStream.java @@ -31,6 +31,7 @@ import org.apache.commons.compress.compressors.CompressorException; import org.apache.commons.compress.compressors.CompressorInputStream; +import org.apache.commons.compress.compressors.support.HuffmanDecoder; import org.apache.commons.compress.utils.BitInputStream; import org.apache.commons.compress.utils.InputStreamStatistics; import org.apache.commons.io.input.CloseShieldInputStream; @@ -59,16 +60,16 @@ static final class Data { */ final int[] unzftab = new int[256]; // 1024 byte - // Needs indexes from 0 to MAX_CODE_LEN inclusive. - final int[][] limit = new int[N_GROUPS][MAX_CODE_LEN + 1]; - // Needs indexes from 0 to MAX_CODE_LEN + 1 inclusive. - final int[][] base = new int[N_GROUPS][MAX_CODE_LEN + 2]; - final int[][] perm = new int[N_GROUPS][MAX_ALPHA_SIZE]; // 6192 byte - final int[] minLens = new int[N_GROUPS]; // 24 byte - + /** + * Huffman decoding tables. + */ + final HuffmanDecoder[] huffmanDecoders = new HuffmanDecoder[N_GROUPS]; + /** + * Number of non-null {@link #huffmanDecoders}. + */ + int huffmanDecodersCount; final int[] cftab = new int[257]; // 1028 byte final char[] getAndMoveToFrontDecode_yy = new char[256]; // 512 byte - final char[][] temp_charArray2d = new char[N_GROUPS][MAX_ALPHA_SIZE]; // 3096 // byte final byte[] recvDecodingTables_pos = new byte[N_GROUPS]; // 6 byte // --------------- @@ -158,85 +159,6 @@ private static void checkBounds(final int checkVal, final int limitExclusive, fi } } - /** - * Builds the Huffman decoding tables for use by {@code recvDecodingTables()}. - * - * @param alphaSize the alphabet size, guaranteed by the caller to be in the range [2, 258] - * (RUNA, RUNB, 255 byte values, and EOB). - * @param nGroups the number of Huffman coding groups, guaranteed by the caller to be in the range [0, 6]. - * @param dataShadow the data structure into which the tables are built; requires - * {@code temp_charArray2d} to be initialized. - */ - static void createHuffmanDecodingTables(final int alphaSize, final int nGroups, final Data dataShadow) { - final char[][] len = dataShadow.temp_charArray2d; - final int[] minLens = dataShadow.minLens; - final int[][] limit = dataShadow.limit; - final int[][] base = dataShadow.base; - final int[][] perm = dataShadow.perm; - - for (int t = 0; t < nGroups; t++) { - final char[] len_t = len[t]; - int minLen = len_t[0]; - int maxLen = len_t[0]; - for (int i = 1; i < alphaSize; i++) { - final char lent = len_t[i]; - if (lent > maxLen) { - maxLen = lent; - } - if (lent < minLen) { - minLen = lent; - } - } - hbCreateDecodeTables(limit[t], base[t], perm[t], len[t], minLen, maxLen, alphaSize); - minLens[t] = minLen; - } - } - - /** - * Called by createHuffmanDecodingTables() exclusively. - * - * @param minLen minimum code length in the range [1, {@value MAX_CODE_LEN}] guaranteed by the caller. - * @param maxLen maximum code length in the range [1, {@value MAX_CODE_LEN}] guaranteed by the caller. - */ - private static void hbCreateDecodeTables(final int[] limit, final int[] base, final int[] perm, final char[] length, final int minLen, final int maxLen, - final int alphaSize) { - for (int i = minLen, pp = 0; i <= maxLen; i++) { - for (int j = 0; j < alphaSize; j++) { - if (length[j] == i) { - perm[pp++] = j; - } - } - } - // Ensure the arrays were not reused. - Arrays.fill(base, 0); - Arrays.fill(limit, minLen, maxLen + 1, 0); - // Compute histogram of code lengths, shifted by 1. - for (int i = 0; i < alphaSize; i++) { - final int len = length[i] + 1; - base[len]++; - } - // Compute cumulative counts: base[len] = # of codes with length < len. - // In other terms: base[len] = index of the first code in the `perm` table. - for (int len = 1; len < base.length; len++) { - base[len] += base[len - 1]; - } - // Compute the last code for each length. - int vec = 0; - for (int len = minLen; len <= maxLen; len++) { - // increment by the number of length `len` codes - vec += base[len + 1] - base[len]; - // vec is now the last code of length `len` + 1 - limit[len] = vec - 1; - vec <<= 1; - } - // Compute the bias between code value and table index. - // base[minLen] cannot be computed using this rule, since limit[minLen - 1] does not exist, - // but has already the correct value 0. - for (int len = minLen + 1; len <= maxLen; len++) { - base[len] = (limit[len - 1] + 1 << 1) - base[len]; - } - } - private static void makeMaps(final Data data) throws IOException { final boolean[] inUse = data.inUse; final byte[] seqToUnseq = data.seqToUnseq; @@ -334,27 +256,24 @@ static void recvDecodingTables(final BitInputStream bin, final Data dataShadow) selector[i] = tmp; } - final char[][] len = dataShadow.temp_charArray2d; - - /* Now the coding tables */ + /* Now the Huffman coding tables */ for (int t = 0; t < nGroups; t++) { + final int[] codeLengths = new int[alphaSize]; int curr = bsR(bin, 5); - final char[] len_t = len[t]; for (int i = 0; i < alphaSize; i++) { while (bsGetBit(bin)) { curr += bsGetBit(bin) ? -1 : 1; } - // Same condition as in bzip2 - if (curr < 1 || curr > MAX_CODE_LEN) { - throw new CompressorException( - "Corrupted input, code length value out of range [%d, %d]: %d", 1, MAX_CODE_LEN, curr); - } - len_t[i] = (char) curr; + codeLengths[i] = curr; + } + try { + // Same limits as in the reference C implementation of bzip2 + dataShadow.huffmanDecoders[t] = new HuffmanDecoder(codeLengths, alphaSize, 1, MAX_CODE_LEN); + } catch (final IllegalArgumentException e) { + throw new CompressorException("Invalid Huffman data: " + e.getMessage(), e); } } - - // finally create the Huffman tables - createHuffmanDecodingTables(alphaSize, nGroups, dataShadow); + dataShadow.huffmanDecodersCount = nGroups; } // Variables used by setup* methods exclusively @@ -474,10 +393,6 @@ private void getAndMoveToFrontDecode() throws IOException { final byte[] selector = dataShadow.selector; final byte[] seqToUnseq = dataShadow.seqToUnseq; final char[] yy = dataShadow.getAndMoveToFrontDecode_yy; - final int[] minLens = dataShadow.minLens; - final int[][] limit = dataShadow.limit; - final int[][] base = dataShadow.base; - final int[][] perm = dataShadow.perm; final int limitLast = this.blockSize100k * 100000; /* * Setting up the unzftab entries here is not strictly necessary, but it does save having to do it later in a separate pass, and so saves a block's @@ -487,18 +402,15 @@ private void getAndMoveToFrontDecode() throws IOException { yy[i] = (char) i; unzftab[i] = 0; } - int groupNo = 0; int groupPos = G_SIZE - 1; final int eob = dataShadow.inUseCount + 1; - int nextSym = getAndMoveToFrontDecode0(); int lastShadow = -1; + + // Initialize group, selector and huffmanDecoder + int groupNo = 0; int zt = selector[groupNo] & 0xff; - // All arrays have the same length - checkBounds(zt, base.length, "zt"); - int[] base_zt = base[zt]; - int[] limit_zt = limit[zt]; - int[] perm_zt = perm[zt]; - int minLens_zt = minLens[zt]; + HuffmanDecoder currentDecoder = getHuffmanDecoder(dataShadow, zt); + int nextSym = currentDecoder.decodeSymbol(bin); while (nextSym != eob) { if (nextSym == RUNA || nextSym == RUNB) { int s = -1; @@ -514,25 +426,11 @@ private void getAndMoveToFrontDecode() throws IOException { groupPos = G_SIZE - 1; checkBounds(++groupNo, selector.length, "groupNo"); zt = selector[groupNo] & 0xff; - // All arrays have the same length - checkBounds(zt, base.length, "zt"); - base_zt = base[zt]; - limit_zt = limit[zt]; - perm_zt = perm[zt]; - minLens_zt = minLens[zt]; + currentDecoder = getHuffmanDecoder(dataShadow, zt); } else { groupPos--; } - int zn = minLens_zt; - checkBounds(zn, limit_zt.length, "zn"); - int zvec = bsR(bin, zn); - while (zvec > limit_zt[zn]) { - checkBounds(++zn, limit_zt.length, "zn"); - zvec = zvec << 1 | bsR(bin, 1); - } - final int tmp = zvec - base_zt[zn]; - checkBounds(tmp, perm_zt.length, "zvec"); - nextSym = perm_zt[tmp]; + nextSym = currentDecoder.decodeSymbol(bin); } checkBounds(s, this.data.ll8.length, "s"); final int yy0 = yy[0]; @@ -570,45 +468,19 @@ private void getAndMoveToFrontDecode() throws IOException { groupPos = G_SIZE - 1; checkBounds(++groupNo, selector.length, "groupNo"); zt = selector[groupNo] & 0xff; - // All arrays have the same length - checkBounds(zt, base.length, "zt"); - base_zt = base[zt]; - limit_zt = limit[zt]; - perm_zt = perm[zt]; - minLens_zt = minLens[zt]; + currentDecoder = getHuffmanDecoder(dataShadow, zt); } else { groupPos--; } - int zn = minLens_zt; - checkBounds(zn, limit_zt.length, "zn"); - int zvec = bsR(bin, zn); - while (zvec > limit_zt[zn]) { - checkBounds(++zn, limit_zt.length, "zn"); - zvec = zvec << 1 | bsR(bin, 1); - } - final int idx = zvec - base_zt[zn]; - checkBounds(idx, perm_zt.length, "zvec"); - nextSym = perm_zt[idx]; + nextSym = currentDecoder.decodeSymbol(bin); } } this.last = lastShadow; } - private int getAndMoveToFrontDecode0() throws IOException { - final Data dataShadow = this.data; - final int zt = dataShadow.selector[0] & 0xff; - checkBounds(zt, dataShadow.limit.length, "zt"); - final int[] limit_zt = dataShadow.limit[zt]; - int zn = dataShadow.minLens[zt]; - checkBounds(zn, limit_zt.length, "zn"); - int zvec = bsR(bin, zn); - while (zvec > limit_zt[zn]) { - checkBounds(++zn, limit_zt.length, "zn"); - zvec = zvec << 1 | bsR(bin, 1); - } - final int tmp = zvec - dataShadow.base[zt][zn]; - checkBounds(tmp, dataShadow.perm[zt].length, "zvec"); - return dataShadow.perm[zt][tmp]; + private static HuffmanDecoder getHuffmanDecoder(final Data dataShadow, final int zt) throws IOException { + checkBounds(zt, dataShadow.huffmanDecodersCount, "zt"); + return dataShadow.huffmanDecoders[zt]; } /** diff --git a/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64CompressorInputStream.java b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64CompressorInputStream.java index 0b5623b79d6..925bc6d5fd7 100644 --- a/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64CompressorInputStream.java +++ b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64CompressorInputStream.java @@ -35,11 +35,11 @@ */ public class Deflate64CompressorInputStream extends CompressorInputStream implements InputStreamStatistics { private InputStream originalStream; - private HuffmanDecoder decoder; + private Deflate64Decoder decoder; private long compressedBytesRead; private final byte[] oneByte = new byte[1]; - Deflate64CompressorInputStream(final HuffmanDecoder decoder) { + Deflate64CompressorInputStream(final Deflate64Decoder decoder) { this.decoder = decoder; } @@ -49,7 +49,7 @@ public class Deflate64CompressorInputStream extends CompressorInputStream implem * @param in the stream to read from */ public Deflate64CompressorInputStream(final InputStream in) { - this(new HuffmanDecoder(in)); + this(new Deflate64Decoder(in)); originalStream = in; } diff --git a/src/main/java/org/apache/commons/compress/compressors/deflate64/HuffmanDecoder.java b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64Decoder.java similarity index 80% rename from src/main/java/org/apache/commons/compress/compressors/deflate64/HuffmanDecoder.java rename to src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64Decoder.java index e9821f7abee..d4682956b5f 100644 --- a/src/main/java/org/apache/commons/compress/compressors/deflate64/HuffmanDecoder.java +++ b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64Decoder.java @@ -18,10 +18,10 @@ */ package org.apache.commons.compress.compressors.deflate64; -import static org.apache.commons.compress.compressors.deflate64.HuffmanState.DYNAMIC_CODES; -import static org.apache.commons.compress.compressors.deflate64.HuffmanState.FIXED_CODES; -import static org.apache.commons.compress.compressors.deflate64.HuffmanState.INITIAL; -import static org.apache.commons.compress.compressors.deflate64.HuffmanState.STORED; +import static org.apache.commons.compress.compressors.deflate64.Deflate64State.DYNAMIC_CODES; +import static org.apache.commons.compress.compressors.deflate64.Deflate64State.FIXED_CODES; +import static org.apache.commons.compress.compressors.deflate64.Deflate64State.INITIAL; +import static org.apache.commons.compress.compressors.deflate64.Deflate64State.STORED; import java.io.Closeable; import java.io.EOFException; @@ -30,6 +30,7 @@ import java.nio.ByteOrder; import java.util.Arrays; +import org.apache.commons.compress.compressors.support.HuffmanDecoder; import org.apache.commons.compress.utils.BitInputStream; import org.apache.commons.compress.utils.ExactMath; import org.apache.commons.lang3.ArrayFill; @@ -38,38 +39,7 @@ /** * TODO This class can't be final because it is mocked by Mockito. */ -class HuffmanDecoder implements Closeable { - - private static final class BinaryTreeNode { - private final int bits; - int literal = -1; - BinaryTreeNode leftNode; - BinaryTreeNode rightNode; - - private BinaryTreeNode(final int bits) { - this.bits = bits; - } - - void leaf(final int symbol) { - literal = symbol; - leftNode = null; - rightNode = null; - } - - BinaryTreeNode left() { - if (leftNode == null && literal == -1) { - leftNode = new BinaryTreeNode(bits + 1); - } - return leftNode; - } - - BinaryTreeNode right() { - if (rightNode == null && literal == -1) { - rightNode = new BinaryTreeNode(bits + 1); - } - return rightNode; - } - } +class Deflate64Decoder implements Closeable { private abstract static class DecoderState { abstract int available() throws IOException; @@ -78,7 +48,7 @@ private abstract static class DecoderState { abstract int read(byte[] b, int off, int len) throws IOException; - abstract HuffmanState state(); + abstract Deflate64State state(); } private static final class DecodingMemory { @@ -132,18 +102,18 @@ void recordToBuffer(final int distance, final int length, final byte[] buff) { private final class HuffmanCodes extends DecoderState { private boolean endOfBlock; - private final HuffmanState state; - private final BinaryTreeNode lengthTree; - private final BinaryTreeNode distanceTree; + private final Deflate64State state; + private final HuffmanDecoder symbolDecoder; + private final HuffmanDecoder distanceDecoder; private int runBufferPos; private byte[] runBuffer = ArrayUtils.EMPTY_BYTE_ARRAY; private int runBufferLength; - HuffmanCodes(final HuffmanState state, final int[] lengths, final int[] distance) { + HuffmanCodes(final Deflate64State state, final int[] lengths, final int[] distance) { this.state = state; - lengthTree = buildTree(lengths); - distanceTree = buildTree(distance); + symbolDecoder = new HuffmanDecoder(lengths); + distanceDecoder = new HuffmanDecoder(distance); } @Override @@ -169,7 +139,7 @@ private int decodeNext(final byte[] b, final int off, final int len) throws IOEx int result = copyFromRunBuffer(b, off, len); while (result < len) { - final int symbol = nextSymbol(reader, lengthTree); + final int symbol = symbolDecoder.decodeSymbol(reader); if (symbol < 256) { b[off + result++] = memory.add((byte) symbol); } else if (symbol > 256) { @@ -178,7 +148,7 @@ private int decodeNext(final byte[] b, final int off, final int len) throws IOEx final int runXtra = runMask & 0x1F; run = ExactMath.add(run, readBits(runXtra)); - final int distSym = nextSymbol(reader, distanceTree); + final int distSym = distanceDecoder.decodeSymbol(reader); final int distMask = DISTANCE_TABLE[distSym]; int dist = distMask >>> 4; @@ -209,14 +179,11 @@ boolean hasData() { @Override int read(final byte[] b, final int off, final int len) throws IOException { - if (len == 0) { - return 0; - } - return decodeNext(b, off, len); + return len == 0 ? 0 : decodeNext(b, off, len); } @Override - HuffmanState state() { + Deflate64State state() { return endOfBlock ? INITIAL : state; } } @@ -233,7 +200,7 @@ boolean hasData() { } @Override - int read(final byte[] b, final int off, final int len) throws IOException { + int read(final byte[] b, final int off, final int len) { if (len == 0) { return 0; } @@ -241,7 +208,7 @@ int read(final byte[] b, final int off, final int len) throws IOException { } @Override - HuffmanState state() { + Deflate64State state() { return INITIAL; } } @@ -292,7 +259,7 @@ int read(final byte[] b, final int off, final int len) throws IOException { } @Override - HuffmanState state() { + Deflate64State state() { return read < blockLength ? STORED : INITIAL; } } @@ -367,71 +334,16 @@ HuffmanState state() { FIXED_DISTANCE = ArrayFill.fill(new int[32], 5); } - private static BinaryTreeNode buildTree(final int[] litTable) { - final int[] literalCodes = getCodes(litTable); - - final BinaryTreeNode root = new BinaryTreeNode(0); - - for (int i = 0; i < litTable.length; i++) { - final int len = litTable[i]; - if (len != 0) { - BinaryTreeNode node = root; - final int lit = literalCodes[len - 1]; - for (int p = len - 1; p >= 0; p--) { - final int bit = lit & 1 << p; - node = bit == 0 ? node.left() : node.right(); - if (node == null) { - throw new IllegalStateException("node doesn't exist in Huffman tree"); - } - } - node.leaf(i); - literalCodes[len - 1]++; - } - } - return root; - } - - private static int[] getCodes(final int[] litTable) { - int max = 0; - int[] blCount = new int[65]; - - for (final int aLitTable : litTable) { - if (aLitTable < 0 || aLitTable > 64) { - throw new IllegalArgumentException("Invalid code " + aLitTable + " in literal table"); - } - max = Math.max(max, aLitTable); - blCount[aLitTable]++; - } - blCount = Arrays.copyOf(blCount, max + 1); - - int code = 0; - final int[] nextCode = new int[max + 1]; - for (int i = 0; i <= max; i++) { - code = code + blCount[i] << 1; - nextCode[i] = code; - } - - return nextCode; - } - - private static int nextSymbol(final BitInputStream reader, final BinaryTreeNode tree) throws IOException { - BinaryTreeNode node = tree; - while (node != null && node.literal == -1) { - final long bit = readBits(reader, 1); - node = bit == 0 ? node.leftNode : node.rightNode; - } - return node != null ? node.literal : -1; - } - private static void populateDynamicTables(final BitInputStream reader, final int[] literals, final int[] distances) throws IOException { final int codeLengths = (int) (readBits(reader, 4) + 4); - final int[] codeLengthValues = new int[19]; + final int[] codeLengthValues = new int[CODE_LENGTHS_ORDER.length]; for (int cLen = 0; cLen < codeLengths; cLen++) { codeLengthValues[CODE_LENGTHS_ORDER[cLen]] = (int) readBits(reader, 3); } - final BinaryTreeNode codeLengthTree = buildTree(codeLengthValues); + // Decoder to decode the code lengths for the literal and distance tables + final HuffmanDecoder codeLengthDecoder = new HuffmanDecoder(codeLengthValues); final int[] auxBuffer = new int[literals.length + distances.length]; @@ -443,7 +355,7 @@ private static void populateDynamicTables(final BitInputStream reader, final int auxBuffer[off++] = value; length--; } else { - final int symbol = nextSymbol(reader, codeLengthTree); + final int symbol = codeLengthDecoder.decodeSymbol(reader); if (symbol < 16) { value = symbol; auxBuffer[off++] = value; @@ -489,7 +401,7 @@ private static long readBits(final BitInputStream reader, final int numBits) thr private final DecodingMemory memory = new DecodingMemory(); - HuffmanDecoder(final InputStream in) { + Deflate64Decoder(final InputStream in) { this.reader = new BitInputStream(in, ByteOrder.LITTLE_ENDIAN); this.in = in; state = new InitialState(); diff --git a/src/main/java/org/apache/commons/compress/compressors/deflate64/HuffmanState.java b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64State.java similarity index 97% rename from src/main/java/org/apache/commons/compress/compressors/deflate64/HuffmanState.java rename to src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64State.java index 20f3979a91f..434b5d90ae1 100644 --- a/src/main/java/org/apache/commons/compress/compressors/deflate64/HuffmanState.java +++ b/src/main/java/org/apache/commons/compress/compressors/deflate64/Deflate64State.java @@ -18,6 +18,6 @@ */ package org.apache.commons.compress.compressors.deflate64; -enum HuffmanState { +enum Deflate64State { INITIAL, STORED, DYNAMIC_CODES, FIXED_CODES } diff --git a/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java b/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java new file mode 100644 index 00000000000..411b2043741 --- /dev/null +++ b/src/main/java/org/apache/commons/compress/compressors/support/HuffmanDecoder.java @@ -0,0 +1,270 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.commons.compress.compressors.support; + +import java.io.EOFException; +import java.io.IOException; +import java.nio.ByteOrder; +import java.util.Objects; + +import org.apache.commons.compress.compressors.CompressorException; +import org.apache.commons.compress.utils.BitInputStream; + +/** + * Canonical Huffman decoder. + *
+ * This class builds decoding tables from an array of code lengths (one entry per symbol) + * and then decodes symbols from a {@link BitInputStream}. The code set is expected to be a + * complete prefix code; i.e., the code lengths must satisfy Kraft's equality. + *
+ * + *{@code
+ * int[] codeLengths = ...; // length per symbol (0 => unused)
+ * int symbolCount = codeLengths.length;
+ * int maxLen = 15; // maximum non-zero code length in codeLengths
+ * HuffmanDecoder dec = new HuffmanDecoder(codeLengths, symbolCount, maxLen);
+ * int sym = dec.decodeSymbol(bitIn);
+ * }
+ *
+ * + * The {@code codeLengths} array provides, for each symbol index {@code i} in {@code [0, codeLengthSize)}, + * the length (in bits) of that symbol's code. + * A value of {@code 0} marks an unused symbol. + * All non-zero lengths must be {@code <= maxCodeLength}. + *
+ * + * @param codeLengths code length per symbol; {@code 0} means the symbol is not used; not {@code null} + * @throws NullPointerException if {@code codeLengths} is {@code null} + * @throws IllegalArgumentException if any code length is out of range [0, 30] + */ + public HuffmanDecoder(final int[] codeLengths) { + this(codeLengths, codeLengths.length, 0, MAX_SUPPORTED_CODE_LENGTH); + } + + /** + * Constructs a decoder from canonical code lengths. + *+ * The {@code codeLengths} array provides, for each symbol index {@code i} in {@code [0, codeLengthSize)}, + * the length (in bits) of that symbol's code. + * A value of {@code 0} marks an unused symbol. + * All non-zero lengths must be {@code <= maxCodeLength}. + *
+ * + * @param codeLengths code length per symbol; {@code 0} means the symbol is not used; not {@code null} + * @param codeLengthSize number of symbols to read from {@code codeLengths} + * (must be {@code > 0} and {@code <= codeLengths.length}) + * @param minCodeLength minimum allowed code length present in {@code codeLengths} + * @param maxCodeLength maximum allowed code length present in {@code codeLengths} + * @throws NullPointerException if {@code codeLengths} is {@code null} + * @throws IllegalArgumentException if {@code codeLengthSize} is out of range, if any code length is out of range + * or if {@code maxCodeLength} exceeds the implementation limit (30) + */ + public HuffmanDecoder( + final int[] codeLengths, final int codeLengthSize, final int minCodeLength, final int maxCodeLength) + throws IllegalArgumentException { + Objects.requireNonNull(codeLengths, "codeLengths"); + if (maxCodeLength > MAX_SUPPORTED_CODE_LENGTH) { + throw new IllegalArgumentException(String.format( + "maxCodeLength (%d) exceeds supported limit (%d)", maxCodeLength, MAX_SUPPORTED_CODE_LENGTH)); + } + if (codeLengthSize <= 0) { + throw new IllegalArgumentException(String.format("codeLengthSize must be > 0; was %d", codeLengthSize)); + } + if (codeLengths.length < codeLengthSize) { + throw new IllegalArgumentException(String.format( + "codeLengthSize (%d) exceeds codeLengths.length (%d)", codeLengthSize, codeLengths.length)); + } + // Validate and find min/max lengths + int min = maxCodeLength; + int max = minCodeLength; + + for (int i = 0; i < codeLengthSize; i++) { + final int len = codeLengths[i]; + if (len < minCodeLength || len > maxCodeLength) { + throw new IllegalArgumentException(String.format( + "invalid code length at symbol %d: %d (expected in [%d, %d])", + i, len, minCodeLength, maxCodeLength)); + } + if (len == 0) { + continue; // unused symbol + } + if (len < min) { + min = len; + } + if (len > max) { + max = len; + } + } + + this.minLength = min; + this.maxLength = max; + // Allocate outputs; we reuse them as scratch inside fillCodeTable + this.bias = new int[max + 1]; + this.limit = new int[max + 1]; + this.sorted = new int[codeLengthSize]; + + // Arrays are zero-initialized; no additional temps needed. + fillCodeTable(codeLengths, minLength, max, codeLengthSize, bias, limit, sorted); + } + + /** + * Returns the minimum code length (in bits) for this code set. + * + * @return minimum code length (in bits) + */ + public int getMinLength() { + return minLength; + } + + /** + * Returns the maximum code length (in bits) for this code set. + * + * @return maximum code length (in bits) + */ + public int getMaxLength() { + return maxLength; + } + + /** + * Build canonical decode tables. + */ + private static void fillCodeTable( + final int[] codeLengths, + final int minLen, + final int maxLen, + final int codeLengthSize, + final int[] bias, + final int[] limit, + final int[] sorted) { + + // 1) Histogram of code lengths + final int[] count = new int[maxLen + 1]; + for (int symbol = 0; symbol < codeLengthSize; symbol++) { + final int len = codeLengths[symbol]; + if (len == 0) { + continue; + } + count[codeLengths[symbol]]++; + } + + // 2) Generate starting offsets into sorted symbol table + // The offsets are biased by -1 to simplify code in the next step + final int[] offset = new int[maxLen + 1]; + offset[0] = -1; + for (int len = 1; len <= maxLen; len++) { + offset[len] = offset[len - 1] + count[len - 1]; + } + + // 3) Build table of symbols sorted by length, then by symbol + // Adjust offsets to point to the last element of each length + for (int symbol = 0; symbol < codeLengthSize; symbol++) { + final int len = codeLengths[symbol]; + if (len == 0) { + continue; + } + sorted[++offset[len]] = symbol; + } + + // 4) Compute the largest left-justified code for each length + int firstCode = 0; + for (int len = minLen; len <= maxLen; len++) { + firstCode += count[len]; + limit[len] = firstCode - 1; + firstCode <<= 1; // prepare for next length + } + + // 5) Compute the bias for each length + for (int len = minLen; len <= maxLen; len++) { + bias[len] = limit[len] - offset[len]; + } + } + + /** + * Decodes one symbol from the input bitstream. + * + * @param in the source of bits (MSB-first) to read from + * @return the decoded symbol index + * @throws EOFException if the input ends in the middle of a Huffman codeword + * @throws IOException if an I/O error occurs while reading from {@code in} + */ + public int decodeSymbol(final BitInputStream in) throws IOException { + int len = minLength; + int code = readBitsFully(in, len); + while (len <= maxLength && code > limit[len]) { + final int b = readBit(in); + code = (code << 1) | b; + len++; + } + if (len > maxLength) { + throw new CompressorException("Invalid Huffman code: " + code); + } + return sorted[code - bias[len]]; + } + + private static int readBit(final BitInputStream in) throws IOException { + final int bit = in.readBit(); + if (bit < 0) { + throw new EOFException("Truncated Huffman bitstream"); + } + return bit; + } + + private static int readBitsFully(final BitInputStream in, final int numBits) throws IOException { + final int code = (int) in.readBits(numBits); + if (code < 0) { + throw new EOFException("Truncated Huffman bitstream"); + } + // Adjust for bit order + return in.getByteOrder() == ByteOrder.BIG_ENDIAN ? code : Integer.reverse(code) >>> (32 - numBits); + } +} diff --git a/src/main/java/org/apache/commons/compress/compressors/support/package-info.java b/src/main/java/org/apache/commons/compress/compressors/support/package-info.java new file mode 100644 index 00000000000..542424d0748 --- /dev/null +++ b/src/main/java/org/apache/commons/compress/compressors/support/package-info.java @@ -0,0 +1,23 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +/** + * Algorithms reused by various compressors. + * @since 1.29.0 + */ +package org.apache.commons.compress.compressors.support; diff --git a/src/main/java/org/apache/commons/compress/utils/BitInputStream.java b/src/main/java/org/apache/commons/compress/utils/BitInputStream.java index 5086db56791..f2766b03114 100644 --- a/src/main/java/org/apache/commons/compress/utils/BitInputStream.java +++ b/src/main/java/org/apache/commons/compress/utils/BitInputStream.java @@ -45,10 +45,39 @@ public class BitInputStream implements Closeable { private int bitsCachedSize; /** - * Constructor taking an InputStream and its bit arrangement. + * Constructs a {@code BitInputStream} that reads individual bits from the + * given {@link InputStream}, interpreting them according to the specified + * bit ordering. * - * @param in the InputStream. - * @param byteOrder the bit arrangement across byte boundaries, either BIG_ENDIAN (aaaaabbb bb000000) or LITTLE_ENDIAN (bbbaaaaa 000000bb). + *The bit ordering determines how consecutive bits are packed into bytes:
+ * + *Bits are read from the high (bit 7) to the low (bit 0) + * position of each byte.
+ * + *{@code
+ * byte 0 byte 1
+ * bit 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0
+ * a4 a3 a2 a1 a0 b4 b3 b2 b1 b0 0 0 0 0 0 0
+ * }
+ * Bits are read from the low (bit 0) to the high (bit 7) + * position of each byte.
+ * + *{@code
+ * byte 0 byte 1
+ * bit 7 6 5 4 3 2 1 0 | 7 6 5 4 3 2 1 0
+ * b2 b1 b0 a4 a3 a2 a1 a0 0 0 0 0 0 b4 b3 b2
+ * }
+ *