Skip to content

Commit 59dede8

Browse files
committed
Add GzipCompressorInputStream.Builder.setOnMemberStart(Consumer) to monitor member parsing.
Add GzipCompressorInputStream.Builder.setOnMemberEnd(Consumer) to monitor member parsing.
1 parent 71d16d9 commit 59dede8

File tree

7 files changed

+395
-118
lines changed

7 files changed

+395
-118
lines changed

src/changes/changes.xml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,8 @@ The <action> type attribute can be add,update,fix,remove.
7272
<action type="add" dev="ggregory" due-to="Danny Deschenes, Gary Gregory">GzipCompressorInputStream writes the modification time (MTIME) the value incorrectly divided by 1,000.</action>
7373
<action type="add" dev="ggregory" due-to="Danny Deschenes, Gary Gregory">Add optional FHCRC to GZIP header #627.</action>
7474
<action type="add" dev="ggregory" due-to="Gary Gregory">Add GzipCompressorInputStream.Builder allowing to customize the file name and comment Charsets.</action>
75+
<action type="add" dev="ggregory" due-to="Gary Gregory">Add GzipCompressorInputStream.Builder.setOnMemberStart(Consumer) to monitor member parsing.</action>
76+
<action type="add" dev="ggregory" due-to="Gary Gregory">Add GzipCompressorInputStream.Builder.setOnMemberEnd(Consumer) to monitor member parsing.</action>
7577
<!-- UPDATE -->
7678
<action type="update" dev="ggregory" due-to="Dependabot, Gary Gregory">Bump org.apache.commons:commons-parent from 72 to 78 #563, #567, #574, #582, #587, #595.</action>
7779
<action type="update" dev="ggregory" due-to="Dependabot, Gary Gregory">Bump com.github.luben:zstd-jni from 1.5.6-4 to 1.5.6-8 #565, #578, #601, #616.</action>

src/main/java/org/apache/commons/compress/compressors/gzip/GzipCompressorInputStream.java

Lines changed: 73 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@
3434
import org.apache.commons.compress.compressors.CompressorInputStream;
3535
import org.apache.commons.compress.utils.ByteUtils;
3636
import org.apache.commons.compress.utils.InputStreamStatistics;
37+
import org.apache.commons.io.IOUtils;
3738
import org.apache.commons.io.build.AbstractStreamBuilder;
39+
import org.apache.commons.io.function.IOConsumer;
3840
import org.apache.commons.io.input.BoundedInputStream;
3941

4042
/**
@@ -45,29 +47,29 @@
4547
* </p>
4648
*
4749
* <p>
48-
* Instead of using {@code java.util.zip.GZIPInputStream}, this class has its own GZIP member decoder.
49-
* The actual decompression is done with {@link java.util.zip.Inflater}.
50+
* Instead of using {@code java.util.zip.GZIPInputStream}, this class has its own GZIP member decoder. The actual decompression is done with
51+
* {@link java.util.zip.Inflater}.
5052
* </p>
5153
*
5254
* <p>
53-
* If you use the constructor {@code GzipCompressorInputStream(in)} or {@code GzipCompressorInputStream(in, false)},
54-
* then {@link #read} will return -1 as soon as the first encoded GZIP member has been completely read. In this case,
55-
* if the underlying input stream supports {@link InputStream#mark mark()} and {@link InputStream#reset reset()},
56-
* then it will be left positioned just after the end of the encoded GZIP member; otherwise, some indeterminate number
57-
* of extra bytes following the encoded GZIP member will have been consumed and discarded.
55+
* If you use the constructor {@code GzipCompressorInputStream(in)} or {@code GzipCompressorInputStream(in, false)}, then {@link #read} will return -1 as soon
56+
* as the first encoded GZIP member has been completely read. In this case, if the underlying input stream supports {@link InputStream#mark mark()} and
57+
* {@link InputStream#reset reset()}, then it will be left positioned just after the end of the encoded GZIP member; otherwise, some indeterminate number of
58+
* extra bytes following the encoded GZIP member will have been consumed and discarded.
5859
* </p>
5960
*
6061
* <p>
61-
* If you use the constructor {@code GzipCompressorInputStream(in, true)} then {@link #read} will return -1 only after
62-
* the entire input stream has been exhausted; any bytes that follow an encoded GZIP member must constitute a new encoded
63-
* GZIP member, otherwise an {@link IOException} is thrown. The data read from a stream constructed this way will consist
64-
* of the concatenated data of all of the encoded GZIP members in order.
62+
* If you use the constructor {@code GzipCompressorInputStream(in, true)} then {@link #read} will return -1 only after the entire input stream has been
63+
* exhausted; any bytes that follow an encoded GZIP member must constitute a new encoded GZIP member, otherwise an {@link IOException} is thrown. The data read
64+
* from a stream constructed this way will consist of the concatenated data of all of the encoded GZIP members in order.
6565
* </p>
6666
*
6767
* @see <a href="https://datatracker.ietf.org/doc/html/rfc1952">RFC 1952 GZIP File Format Specification</a>
6868
*/
6969
public class GzipCompressorInputStream extends CompressorInputStream implements InputStreamStatistics {
7070

71+
private static final IOConsumer<GzipCompressorInputStream> NOOP = IOConsumer.noop();
72+
7173
/**
7274
* Constructs a new builder of {@link GzipCompressorInputStream}.
7375
*
@@ -80,17 +82,20 @@ public static class Builder extends AbstractStreamBuilder<GzipCompressorInputStr
8082

8183
private Charset fileNameCharset = GzipUtils.GZIP_ENCODING;
8284

85+
private IOConsumer<GzipCompressorInputStream> onMemberStart;
86+
87+
private IOConsumer<GzipCompressorInputStream> onMemberEnd;
88+
8389
/**
8490
* Constructs a new builder of {@link GzipCompressorInputStream}.
8591
*/
8692
public Builder() {
8793
// empty
8894
}
8995

90-
@SuppressWarnings("resource") // caller closes
9196
@Override
9297
public GzipCompressorInputStream get() throws IOException {
93-
return new GzipCompressorInputStream(getInputStream(), decompressConcatenated, fileNameCharset);
98+
return new GzipCompressorInputStream(this);
9499
}
95100

96101
/**
@@ -121,6 +126,34 @@ public Builder setFileNameCharset(final Charset fileNameCharset) {
121126
this.fileNameCharset = fileNameCharset;
122127
return this;
123128
}
129+
130+
/**
131+
* Sets the consumer called when a member trailer is parsed. Two values are set from the trailer in the current {@link GzipParameters}:
132+
* {@code trailerCrc} and {@code trailerISize}.
133+
*
134+
* @param onMemberEnd The consumer.
135+
* @return this instance.
136+
* @see GzipCompressorInputStream#getMetaData()
137+
*/
138+
public Builder setOnMemberEnd(final IOConsumer<GzipCompressorInputStream> onMemberEnd) {
139+
this.onMemberEnd = onMemberEnd;
140+
return this;
141+
}
142+
143+
/**
144+
* Sets the consumer called when a member header is parsed.
145+
* <p>
146+
* Note that the member size is unknown at call time; it is stored in the member <em>trailer</em> and used for validation.
147+
* </p>
148+
*
149+
* @param onMemberStart The consumer.
150+
* @return this instance.
151+
* @see GzipCompressorInputStream#getMetaData()
152+
*/
153+
public Builder setOnMemberStart(final IOConsumer<GzipCompressorInputStream> onMemberStart) {
154+
this.onMemberStart = onMemberStart;
155+
return this;
156+
}
124157
}
125158

126159
/**
@@ -175,8 +208,7 @@ private static byte[] readToNull(final DataInput inData) throws IOException {
175208
private final Charset fileNameCharset;
176209

177210
/**
178-
* Compressed input stream, possibly wrapped in a
179-
* BufferedInputStream, always wrapped in countingStream above
211+
* Compressed input stream, possibly wrapped in a BufferedInputStream, always wrapped in countingStream above
180212
*/
181213
private final InputStream in;
182214

@@ -186,7 +218,11 @@ private static byte[] readToNull(final DataInput inData) throws IOException {
186218
/** Buffer for no-argument read method. */
187219
private final byte[] oneByte = new byte[1];
188220

189-
private final GzipParameters parameters = new GzipParameters();
221+
private GzipParameters parameters;
222+
223+
private final IOConsumer<GzipCompressorInputStream> onMemberStart;
224+
225+
private final IOConsumer<GzipCompressorInputStream> onMemberEnd;
190226

191227
/**
192228
* Constructs a new input stream that decompresses gzip-compressed data from the specified input stream.
@@ -198,7 +234,7 @@ private static byte[] readToNull(final DataInput inData) throws IOException {
198234
* @throws IOException if the stream could not be created
199235
*/
200236
public GzipCompressorInputStream(final InputStream inputStream) throws IOException {
201-
this(inputStream, false, GzipUtils.GZIP_ENCODING);
237+
this(builder().setInputStream(inputStream));
202238
}
203239

204240
/**
@@ -216,16 +252,19 @@ public GzipCompressorInputStream(final InputStream inputStream) throws IOExcepti
216252
*/
217253
@Deprecated
218254
public GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated) throws IOException {
219-
this(inputStream, decompressConcatenated, GzipUtils.GZIP_ENCODING);
255+
this(builder().setInputStream(inputStream).setDecompressConcatenated(decompressConcatenated));
220256
}
221257

222-
private GzipCompressorInputStream(final InputStream inputStream, final boolean decompressConcatenated, final Charset fileNameCharset) throws IOException {
223-
countingStream = BoundedInputStream.builder().setInputStream(inputStream).get();
258+
@SuppressWarnings("resource") // caller closes
259+
private GzipCompressorInputStream(final Builder builder) throws IOException {
260+
countingStream = BoundedInputStream.builder().setInputStream(builder.getInputStream()).get();
224261
// Mark support is strictly needed for concatenated files only,
225262
// but it's simpler if it is always available.
226263
in = countingStream.markSupported() ? countingStream : new BufferedInputStream(countingStream);
227-
this.decompressConcatenated = decompressConcatenated;
228-
this.fileNameCharset = fileNameCharset;
264+
this.decompressConcatenated = builder.decompressConcatenated;
265+
this.fileNameCharset = builder.fileNameCharset;
266+
this.onMemberStart = builder.onMemberStart != null ? builder.onMemberStart : NOOP;
267+
this.onMemberEnd = builder.onMemberEnd != null ? builder.onMemberEnd : NOOP;
229268
init(true);
230269
}
231270

@@ -269,32 +308,28 @@ private boolean init(final boolean isFirstMember) throws IOException {
269308
if (!isFirstMember && !decompressConcatenated) { // at least one must be true
270309
throw new IllegalStateException("Unexpected: isFirstMember and decompressConcatenated are both false.");
271310
}
272-
parameters.setFileNameCharset(fileNameCharset);
273311
// Check the magic bytes without a possibility of EOFException.
274312
final int magic0 = in.read();
275-
276313
// If end of input was reached after decompressing at least
277314
// one .gz member, we have reached the end of the file successfully.
278315
if (magic0 == -1 && !isFirstMember) {
279316
return false;
280317
}
281-
282318
if (magic0 != GzipUtils.ID1 || in.read() != GzipUtils.ID2) {
283319
throw new IOException(isFirstMember ? "Input is not in the .gz format." : "Unexpected data after a valid .gz stream.");
284320
}
285-
321+
parameters = new GzipParameters();
322+
parameters.setFileNameCharset(fileNameCharset);
286323
// Parsing the rest of the header may throw EOFException.
287324
final DataInput inData = new DataInputStream(in);
288325
final int method = inData.readUnsignedByte();
289326
if (method != Deflater.DEFLATED) {
290327
throw new IOException("Unsupported compression method " + method + " in the .gz header");
291328
}
292-
293329
final int flg = inData.readUnsignedByte();
294330
if ((flg & GzipUtils.FRESERVED) != 0) {
295331
throw new IOException("Reserved flags are set in the .gz header.");
296332
}
297-
298333
parameters.setModificationTime(ByteUtils.fromLittleEndian(inData, 4));
299334
switch (inData.readUnsignedByte()) { // extra flags
300335
case GzipUtils.XFL_MAX_COMPRESSION:
@@ -308,7 +343,6 @@ private boolean init(final boolean isFirstMember) throws IOException {
308343
break;
309344
}
310345
parameters.setOperatingSystem(inData.readUnsignedByte());
311-
312346
// Extra field
313347
if ((flg & GzipUtils.FEXTRA) != 0) {
314348
int xlen = inData.readUnsignedByte();
@@ -317,17 +351,14 @@ private boolean init(final boolean isFirstMember) throws IOException {
317351
inData.readFully(extra);
318352
parameters.setExtraField(ExtraField.fromBytes(extra));
319353
}
320-
321354
// Original file name
322355
if ((flg & GzipUtils.FNAME) != 0) {
323356
parameters.setFileName(new String(readToNull(inData), parameters.getFileNameCharset()));
324357
}
325-
326358
// Comment
327359
if ((flg & GzipUtils.FCOMMENT) != 0) {
328360
parameters.setComment(new String(readToNull(inData), parameters.getFileNameCharset()));
329361
}
330-
331362
// Header "CRC16" which is actually a truncated CRC32 (which isn't
332363
// as good as real CRC16). I don't know if any encoder implementation
333364
// sets this, so it's not worth trying to verify it. GNU gzip 1.4
@@ -337,11 +368,10 @@ private boolean init(final boolean isFirstMember) throws IOException {
337368
parameters.setHeaderCRC(true);
338369
inData.readShort();
339370
}
340-
341371
// Reset
342372
inflater.reset();
343373
crc.reset();
344-
374+
onMemberStart.accept(this);
345375
return true;
346376
}
347377

@@ -397,30 +427,25 @@ public int read(final byte[] b, int off, int len) throws IOException {
397427
// We may have read too many bytes. Rewind the read
398428
// position to match the actual amount used.
399429
in.reset();
400-
401430
final int skipAmount = bufUsed - inflater.getRemaining();
402-
if (org.apache.commons.io.IOUtils.skip(in, skipAmount) != skipAmount) {
431+
if (IOUtils.skip(in, skipAmount) != skipAmount) {
403432
throw new IOException();
404433
}
405-
406434
bufUsed = 0;
407-
408435
final DataInput inData = new DataInputStream(in);
409-
410436
// CRC32
411-
final long crcStored = ByteUtils.fromLittleEndian(inData, 4);
412-
413-
if (crcStored != crc.getValue()) {
437+
final long trailerCrc = ByteUtils.fromLittleEndian(inData, 4);
438+
if (trailerCrc != crc.getValue()) {
414439
throw new IOException("Gzip-compressed data is corrupt (CRC32 error).");
415440
}
416-
417-
// Uncompressed size modulo 2^32 (ISIZE in the spec)
418-
final long isize = ByteUtils.fromLittleEndian(inData, 4);
419-
420-
if (isize != (inflater.getBytesWritten() & 0xffffffffL)) {
441+
// Uncompressed size modulo 2^32, ISIZE in the RFC.
442+
final long iSize = ByteUtils.fromLittleEndian(inData, 4);
443+
if (iSize != (inflater.getBytesWritten() & 0xffffffffL)) {
421444
throw new IOException("Gzip-compressed data is corrupt (uncompressed size mismatch).");
422445
}
423-
446+
parameters.setTrailerCrc(trailerCrc);
447+
parameters.setTrailerISize(iSize);
448+
onMemberEnd.accept(this);
424449
// See if this is the end of the file.
425450
if (!decompressConcatenated || !init(false)) {
426451
inflater.end();

src/main/java/org/apache/commons/compress/compressors/gzip/GzipCompressorOutputStream.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ public void finish() throws IOException {
105105
deflate();
106106
}
107107
writeMemberTrailer();
108+
deflater.reset();
108109
}
109110
}
110111

@@ -125,6 +126,7 @@ public void write(final byte[] buffer) throws IOException {
125126
*/
126127
@Override
127128
public void write(final byte[] buffer, final int offset, final int length) throws IOException {
129+
checkOpen();
128130
if (deflater.finished()) {
129131
throw new IOException("Cannot write more data, the end of the compressed data stream has been reached.");
130132
}

0 commit comments

Comments
 (0)