|
36 | 36 | import sys
|
37 | 37 | import time
|
38 | 38 | from typing import Optional, SupportsInt
|
39 |
| -import _compression # noqa: I201 # Not third-party |
40 | 39 |
|
41 | 40 | from . import igzip_lib, isal_zlib
|
| 41 | +from .isal_zlib import _GzipReader |
42 | 42 |
|
43 | 43 | __all__ = ["IGzipFile", "open", "compress", "decompress", "BadGzipFile",
|
44 | 44 | "READ_BUFFER_SIZE"]
|
|
47 | 47 | _COMPRESS_LEVEL_TRADEOFF = isal_zlib.ISAL_DEFAULT_COMPRESSION
|
48 | 48 | _COMPRESS_LEVEL_BEST = isal_zlib.ISAL_BEST_COMPRESSION
|
49 | 49 |
|
50 |
| -#: The amount of data that is read in at once when decompressing a file. |
51 |
| -#: Increasing this value may increase performance. |
52 |
| -#: 128K is also the size used by pigz and cat to read files from the |
53 |
| -# filesystem. |
54 |
| -READ_BUFFER_SIZE = 128 * 1024 |
| 50 | +# The amount of data that is read in at once when decompressing a file. |
| 51 | +# Increasing this value may increase performance. |
| 52 | +# After 512K the performance does not increase anymore on a Ryzen 5 3600 test |
| 53 | +# system. |
| 54 | +READ_BUFFER_SIZE = 512 * 1024 |
55 | 55 |
|
56 | 56 | FTEXT, FHCRC, FEXTRA, FNAME, FCOMMENT = 1, 2, 4, 8, 16
|
57 | 57 | READ, WRITE = 1, 2
|
58 | 58 |
|
59 |
| -try: |
60 |
| - BadGzipFile = gzip.BadGzipFile # type: ignore |
61 |
| -except AttributeError: # Versions lower than 3.8 do not have BadGzipFile |
62 |
| - BadGzipFile = OSError # type: ignore |
| 59 | +BadGzipFile = gzip.BadGzipFile # type: ignore |
63 | 60 |
|
64 | 61 |
|
65 | 62 | # The open method was copied from the CPython source with minor adjustments.
|
@@ -166,7 +163,7 @@ def __init__(self, filename=None, mode=None,
|
166 | 163 | isal_zlib.DEF_MEM_LEVEL,
|
167 | 164 | 0)
|
168 | 165 | if self.mode == READ:
|
169 |
| - raw = _IGzipReader(self.fileobj) |
| 166 | + raw = _GzipReader(self.fileobj, READ_BUFFER_SIZE) |
170 | 167 | self._buffer = io.BufferedReader(raw)
|
171 | 168 |
|
172 | 169 | def __repr__(self):
|
@@ -220,186 +217,9 @@ def write(self, data):
|
220 | 217 | return length
|
221 | 218 |
|
222 | 219 |
|
223 |
| -def _read_exact(fp, n): |
224 |
| - '''Read exactly *n* bytes from `fp` |
225 |
| -
|
226 |
| - This method is required because fp may be unbuffered, |
227 |
| - i.e. return short reads. |
228 |
| - ''' |
229 |
| - data = fp.read(n) |
230 |
| - while len(data) < n: |
231 |
| - b = fp.read(n - len(data)) |
232 |
| - if not b: |
233 |
| - raise EOFError("Compressed file ended before the " |
234 |
| - "end-of-stream marker was reached") |
235 |
| - data += b |
236 |
| - return data |
237 |
| - |
238 |
| - |
239 |
| -def _read_gzip_header(fp): |
240 |
| - '''Read a gzip header from `fp` and progress to the end of the header. |
241 |
| -
|
242 |
| - Returns None if header not present. Parses mtime from the header, looks |
243 |
| - for BGZF format blocks and parses the block size, setting it to None if |
244 |
| - not present. Returns a tuple of mtime, block_size if a header was present. |
245 |
| - ''' |
246 |
| - # Do not use read_exact because a header may not be present. Read twice |
247 |
| - # since fp might be unbuffered. |
248 |
| - magic = fp.read(1) + fp.read(1) |
249 |
| - if magic == b'': |
250 |
| - return None |
251 |
| - |
252 |
| - if magic != b'\037\213': |
253 |
| - raise BadGzipFile('Not a gzipped file (%r)' % magic) |
254 |
| - |
255 |
| - common_fields = _read_exact(fp, 8) |
256 |
| - (method, flag, last_mtime) = struct.unpack("<BBIxx", common_fields) |
257 |
| - if method != 8: |
258 |
| - raise BadGzipFile('Unknown compression method') |
259 |
| - block_size = None |
260 |
| - if not flag: # Likely when data compressed in memory |
261 |
| - return last_mtime, block_size |
262 |
| - header = magic + common_fields |
263 |
| - if flag & FEXTRA: |
264 |
| - # Read & discard the extra field, if present |
265 |
| - encoded_length = _read_exact(fp, 2) |
266 |
| - extra_len, = struct.unpack("<H", encoded_length) |
267 |
| - extra_field = _read_exact(fp, extra_len) |
268 |
| - # Bgzip file detection |
269 |
| - if extra_len == 6: |
270 |
| - s1, s2, slen, bsize = struct.unpack("<BBHH", extra_field) |
271 |
| - if s1 == 66 and s2 == 67 and slen == 2: |
272 |
| - # Bgzip magic and correct slen. |
273 |
| - block_size = bsize |
274 |
| - header = header + encoded_length + extra_field |
275 |
| - if flag & FNAME: |
276 |
| - # Read and discard a null-terminated string containing the filename |
277 |
| - while True: |
278 |
| - s = _read_exact(fp, 1) |
279 |
| - header += s |
280 |
| - if s == b'\000': |
281 |
| - break |
282 |
| - if flag & FCOMMENT: |
283 |
| - # Read and discard a null-terminated string containing a comment |
284 |
| - while True: |
285 |
| - s = _read_exact(fp, 1) |
286 |
| - header += s |
287 |
| - if s == b'\000': |
288 |
| - break |
289 |
| - if flag & FHCRC: |
290 |
| - header_crc_encoded = _read_exact(fp, 2) |
291 |
| - header_crc, = struct.unpack("<H", header_crc_encoded) |
292 |
| - crc = isal_zlib.crc32(header) & 0xFFFF |
293 |
| - if header_crc != crc: |
294 |
| - raise BadGzipFile(f"Corrupted gzip header. Checksums do not " |
295 |
| - f"match: {crc:04x} != {header_crc:04x}") |
296 |
| - return last_mtime, block_size |
297 |
| - |
298 |
| - |
299 |
| -class _PaddedFile(gzip._PaddedFile): |
300 |
| - # Overwrite _PaddedFile from gzip as its prepend method assumes that |
301 |
| - # the prepended data is always read from its _buffer. Unfortunately in |
302 |
| - # isal_zlib.decompressobj there is a bitbuffer as well which may be added. |
303 |
| - # So an extra check is added to prepend to ensure no extra data in front |
304 |
| - # of the buffer was present. (Negative self._read). |
305 |
| - def prepend(self, prepend=b''): |
306 |
| - if self._read is not None: |
307 |
| - # Assume data was read since the last prepend() call |
308 |
| - self._read -= len(prepend) |
309 |
| - if self._read >= 0: |
310 |
| - return |
311 |
| - # If self._read is negative the data was read further back and |
312 |
| - # the buffer needs to be reset. |
313 |
| - self._buffer = prepend |
314 |
| - self._length = len(self._buffer) |
315 |
| - self._read = 0 |
316 |
| - |
317 |
| - |
318 |
| -class _IGzipReader(gzip._GzipReader): |
319 |
| - def __init__(self, fp): |
320 |
| - # Call the init method of gzip._GzipReader's parent here. |
321 |
| - # It is not very invasive and allows us to override _PaddedFile |
322 |
| - _compression.DecompressReader.__init__( |
323 |
| - self, _PaddedFile(fp), igzip_lib.IgzipDecompressor, |
324 |
| - hist_bits=igzip_lib.MAX_HIST_BITS, flag=igzip_lib.DECOMP_DEFLATE) |
325 |
| - # Set flag indicating start of a new member |
326 |
| - self._new_member = True |
327 |
| - self._last_mtime = None |
328 |
| - self._read_buffer_size = READ_BUFFER_SIZE |
329 |
| - |
330 |
| - def _read_gzip_header(self): |
331 |
| - header_info = _read_gzip_header(self._fp) |
332 |
| - if header_info is None: |
333 |
| - return False |
334 |
| - # Get the BGZF block size from the header if present. If the read |
335 |
| - # buffer size is set to exactly the block size, there will be less |
336 |
| - # overhead as reading the file will stop right before the gzip trailer. |
337 |
| - # On normal gzip files nothing happens and this optimization is not |
338 |
| - # detrimental. |
339 |
| - last_mtime, block_size = header_info |
340 |
| - self._last_mtime = last_mtime |
341 |
| - self._read_buffer_size = (block_size if block_size is not None |
342 |
| - else READ_BUFFER_SIZE) |
343 |
| - return True |
344 |
| - |
345 |
| - def read(self, size=-1): |
346 |
| - if size < 0: |
347 |
| - return self.readall() |
348 |
| - # size=0 is special because decompress(max_length=0) is not supported |
349 |
| - if not size: |
350 |
| - return b"" |
351 |
| - |
352 |
| - # For certain input data, a single |
353 |
| - # call to decompress() may not return |
354 |
| - # any data. In this case, retry until we get some data or reach EOF. |
355 |
| - while True: |
356 |
| - if self._decompressor.eof: |
357 |
| - # Ending case: we've come to the end of a member in the file, |
358 |
| - # so finish up this member, and read a new gzip header. |
359 |
| - # Check the CRC and file size, and set the flag so we read |
360 |
| - # a new member |
361 |
| - self._read_eof() |
362 |
| - self._new_member = True |
363 |
| - self._decompressor = self._decomp_factory( |
364 |
| - **self._decomp_args) |
365 |
| - |
366 |
| - if self._new_member: |
367 |
| - # If the _new_member flag is set, we have to |
368 |
| - # jump to the next member, if there is one. |
369 |
| - self._crc = isal_zlib.crc32(b"") |
370 |
| - # Decompressed size of unconcatenated stream |
371 |
| - self._stream_size = 0 |
372 |
| - if not self._read_gzip_header(): |
373 |
| - self._size = self._pos |
374 |
| - return b"" |
375 |
| - self._new_member = False |
376 |
| - |
377 |
| - # Read a chunk of data from the file |
378 |
| - if self._decompressor.needs_input: |
379 |
| - buf = self._fp.read(self._read_buffer_size) |
380 |
| - uncompress = self._decompressor.decompress(buf, size) |
381 |
| - else: |
382 |
| - uncompress = self._decompressor.decompress(b"", size) |
383 |
| - if self._decompressor.unused_data != b"": |
384 |
| - # Prepend the already read bytes to the fileobj so they can |
385 |
| - # be seen by _read_eof() and _read_gzip_header() |
386 |
| - self._fp.prepend(self._decompressor.unused_data) |
387 |
| - |
388 |
| - if uncompress != b"": |
389 |
| - break |
390 |
| - if buf == b"": |
391 |
| - raise EOFError("Compressed file ended before the " |
392 |
| - "end-of-stream marker was reached") |
393 |
| - |
394 |
| - self._crc = isal_zlib.crc32(uncompress, self._crc) |
395 |
| - self._stream_size += len(uncompress) |
396 |
| - self._pos += len(uncompress) |
397 |
| - return uncompress |
398 |
| - |
399 |
| - |
400 | 220 | # Aliases for improved compatibility with CPython gzip module.
|
401 | 221 | GzipFile = IGzipFile
|
402 |
| -_GzipReader = _IGzipReader |
| 222 | +_IGzipReader = _GzipReader |
403 | 223 |
|
404 | 224 |
|
405 | 225 | def _create_simple_gzip_header(compresslevel: int,
|
@@ -440,28 +260,9 @@ def decompress(data):
|
440 | 260 | isal_zlib.decompress(data, wbits=31) is faster in cases where only one
|
441 | 261 | gzip member is guaranteed to be present.
|
442 | 262 | """
|
443 |
| - decompressed_members = [] |
444 |
| - while True: |
445 |
| - if not data: # Empty data returns empty bytestring |
446 |
| - return b"".join(decompressed_members) |
447 |
| - fp = io.BytesIO(data) |
448 |
| - if _read_gzip_header(fp) is None: |
449 |
| - return b"".join(decompressed_members) |
450 |
| - header_end = fp.tell() |
451 |
| - # Use a zlib raw deflate compressor |
452 |
| - do = isal_zlib.decompressobj(wbits=-isal_zlib.MAX_WBITS) |
453 |
| - # Read all the data except the header |
454 |
| - decompressed = do.decompress(data[header_end:]) |
455 |
| - if not do.eof or len(do.unused_data) < 8: |
456 |
| - raise EOFError("Compressed file ended before the end-of-stream " |
457 |
| - "marker was reached") |
458 |
| - crc, length = struct.unpack("<II", do.unused_data[:8]) |
459 |
| - if crc != isal_zlib.crc32(decompressed): |
460 |
| - raise BadGzipFile("CRC check failed") |
461 |
| - if length != (len(decompressed) & 0xffffffff): |
462 |
| - raise BadGzipFile("Incorrect length of data produced") |
463 |
| - decompressed_members.append(decompressed) |
464 |
| - data = do.unused_data[8:].lstrip(b"\x00") |
| 263 | + fp = io.BytesIO(data) |
| 264 | + reader = _GzipReader(fp, max(len(data), 16)) |
| 265 | + return reader.readall() |
465 | 266 |
|
466 | 267 |
|
467 | 268 | def _argument_parser():
|
@@ -563,8 +364,6 @@ def main():
|
563 | 364 | else:
|
564 | 365 | out_file = sys.stdout.buffer
|
565 | 366 |
|
566 |
| - global READ_BUFFER_SIZE |
567 |
| - READ_BUFFER_SIZE = args.buffer_size |
568 | 367 | try:
|
569 | 368 | while True:
|
570 | 369 | block = in_file.read(args.buffer_size)
|
|
0 commit comments