33
33
34
34
from . import igzip_lib , isal_zlib
35
35
36
- __all__ = ["IGzipFile" , "open" , "compress" , "decompress" , "BadGzipFile" ]
36
+ __all__ = ["IGzipFile" , "open" , "compress" , "decompress" , "BadGzipFile" ,
37
+ "READ_BUFFER_SIZE" ]
37
38
38
39
_COMPRESS_LEVEL_FAST = isal_zlib .ISAL_BEST_SPEED
39
40
_COMPRESS_LEVEL_TRADEOFF = isal_zlib .ISAL_DEFAULT_COMPRESSION
40
41
_COMPRESS_LEVEL_BEST = isal_zlib .ISAL_BEST_COMPRESSION
41
42
43
+ #: The amount of data that is read in at once when decompressing a file.
44
+ #: Increasing this value may increase performance.
45
+ READ_BUFFER_SIZE = io .DEFAULT_BUFFER_SIZE
46
+
42
47
FTEXT , FHCRC , FEXTRA , FNAME , FCOMMENT = 1 , 2 , 4 , 8 , 16
43
48
44
49
try :
@@ -229,8 +234,8 @@ def __init__(self, fp):
229
234
# Call the init method of gzip._GzipReader's parent here.
230
235
# It is not very invasive and allows us to override _PaddedFile
231
236
_compression .DecompressReader .__init__ (
232
- self , _PaddedFile (fp ), isal_zlib . decompressobj ,
233
- wbits = - isal_zlib . MAX_WBITS )
237
+ self , _PaddedFile (fp ), igzip_lib . IgzipDecompressor ,
238
+ hist_bits = igzip_lib . MAX_HIST_BITS , flag = igzip_lib . DECOMP_DEFLATE )
234
239
# Set flag indicating start of a new member
235
240
self ._new_member = True
236
241
self ._last_mtime = None
@@ -241,6 +246,57 @@ def _add_read_data(self, data):
241
246
self ._crc = isal_zlib .crc32 (data , self ._crc )
242
247
self ._stream_size += len (data )
243
248
249
def read(self, size=-1):
    """Read and return up to *size* bytes of decompressed data.

    A negative *size* delegates to ``readall()``; a *size* of zero returns
    ``b""`` without touching the decompressor. Otherwise the method keeps
    feeding the decompressor until it yields at least one byte, moving on
    to the next gzip member whenever the current one is finished.

    Returns ``b""`` when no further gzip header can be read.

    Raises:
        EOFError: if the underlying file is exhausted before the
            decompressor produced any data or reported end-of-stream.
    """
    if size < 0:
        return self.readall()
    # size=0 is special because decompress(max_length=0) is not supported
    if not size:
        return b""

    # ``buf`` is only assigned when the decompressor asks for new input.
    # Bind it up front so that the end-of-file check at the bottom of the
    # loop cannot raise UnboundLocalError when this call starts with
    # ``needs_input`` false and the buffered input yields no output (for
    # instance when only the end-of-stream marker is left to consume).
    buf = None

    # For certain input data, a single call to decompress() may not return
    # any data. In this case, retry until we get some data or reach EOF.
    while True:
        if self._decompressor.eof:
            # Ending case: we've come to the end of a member in the file,
            # so finish up this member, and read a new gzip header.
            # Check the CRC and file size, and set the flag so we read
            # a new member
            self._read_eof()
            self._new_member = True
            self._decompressor = self._decomp_factory(
                **self._decomp_args)

        if self._new_member:
            # If the _new_member flag is set, we have to
            # jump to the next member, if there is one.
            self._init_read()
            if not self._read_gzip_header():
                # No further member: record the final size and signal EOF.
                self._size = self._pos
                return b""
            self._new_member = False

        # Read a chunk of data from the file
        if self._decompressor.needs_input:
            buf = self._fp.read(READ_BUFFER_SIZE)
            uncompress = self._decompressor.decompress(buf, size)
        else:
            # The decompressor still holds input; just drain it.
            uncompress = self._decompressor.decompress(b"", size)
        if self._decompressor.unused_data != b"":
            # Prepend the already read bytes to the fileobj so they can
            # be seen by _read_eof() and _read_gzip_header()
            self._fp.prepend(self._decompressor.unused_data)

        if uncompress != b"":
            break
        if buf == b"":
            raise EOFError("Compressed file ended before the "
                           "end-of-stream marker was reached")

    self._add_read_data(uncompress)
    self._pos += len(uncompress)
    return uncompress
299
+
244
300
245
301
# Aliases for improved compatibility with CPython gzip module.
246
302
GzipFile = IGzipFile
@@ -376,13 +432,18 @@ def _argument_parser():
376
432
dest = "compress" ,
377
433
const = False ,
378
434
help = "Decompress the file instead of compressing." )
379
- parser .add_argument ("-c" , "--stdout" , action = "store_true" ,
380
- help = "write on standard output" )
435
+ output_group = parser .add_mutually_exclusive_group ()
436
+ output_group .add_argument ("-c" , "--stdout" , action = "store_true" ,
437
+ help = "write on standard output" )
438
+ output_group .add_argument ("-o" , "--output" ,
439
+ help = "Write to this output file" )
440
+ parser .add_argument ("-f" , "--force" , action = "store_true" ,
441
+ help = "Overwrite output without prompting" )
381
442
# -b flag not taken by either gzip or igzip. Hidden attribute. Above 32K
382
443
# diminishing returns hit. _compression.BUFFER_SIZE = 8k. But 32K is about
383
444
# ~6% faster.
384
445
parser .add_argument ("-b" , "--buffer-size" ,
385
- default = 32 * 1024 , type = int ,
446
+ default = 128 * 1024 , type = int ,
386
447
help = argparse .SUPPRESS )
387
448
return parser
388
449
@@ -392,32 +453,49 @@ def main():
392
453
393
454
compresslevel = args .compresslevel or _COMPRESS_LEVEL_TRADEOFF
394
455
395
- # Determine input file
396
- if args .compress and args .file is None :
397
- in_file = sys .stdin .buffer
398
- elif args .compress and args .file is not None :
399
- in_file = io .open (args .file , mode = "rb" )
400
- elif not args .compress and args .file is None :
401
- in_file = IGzipFile (mode = "rb" , fileobj = sys .stdin .buffer )
402
- elif not args .compress and args .file is not None :
403
- base , extension = os .path .splitext (args .file )
404
- if extension != ".gz" and not args .stdout :
405
- sys .exit (f"filename doesn't end in .gz: { args .file !r} . "
406
- f"Cannot determine output filename." )
407
- in_file = open (args .file , "rb" )
408
-
409
- # Determine output file
410
- if args .compress and (args .file is None or args .stdout ):
411
- out_file = IGzipFile (mode = "wb" , compresslevel = compresslevel ,
412
- fileobj = sys .stdout .buffer )
413
- elif args .compress and args .file is not None :
414
- out_file = open (args .file + ".gz" , mode = "wb" ,
415
- compresslevel = compresslevel )
416
- elif not args .compress and (args .file is None or args .stdout ):
417
- out_file = sys .stdout .buffer
418
- elif not args .compress and args .file is not None :
419
- out_file = io .open (base , "wb" )
456
+ if args .output :
457
+ out_filepath = args .output
458
+ elif args .stdout :
459
+ out_filepath = None # to stdout
460
+ elif args .file is None :
461
+ out_filepath = None # to stdout
462
+ else :
463
+ if args .compress :
464
+ out_filepath = args .file + ".gz"
465
+ else :
466
+ out_filepath , extension = os .path .splitext (args .file )
467
+ if extension != ".gz" and not args .stdout :
468
+ sys .exit (f"filename doesn't end in .gz: { args .file !r} . "
469
+ f"Cannot determine output filename." )
470
+ if out_filepath is not None and not args .force :
471
+ if os .path .exists (out_filepath ):
472
+ yes_or_no = input (f"{ out_filepath } already exists; "
473
+ f"do you wish to overwrite (y/n)?" )
474
+ if yes_or_no not in {"y" , "Y" , "yes" }:
475
+ sys .exit ("not overwritten" )
476
+
477
+ if args .compress :
478
+ if args .file is None :
479
+ in_file = sys .stdin .buffer
480
+ else :
481
+ in_file = io .open (args .file , mode = "rb" )
482
+ if out_filepath is not None :
483
+ out_file = open (out_filepath , "wb" , compresslevel = compresslevel )
484
+ else :
485
+ out_file = IGzipFile (mode = "wb" , fileobj = sys .stdout .buffer ,
486
+ compresslevel = compresslevel )
487
+ else :
488
+ if args .file :
489
+ in_file = open (args .file , mode = "rb" )
490
+ else :
491
+ in_file = IGzipFile (mode = "rb" , fileobj = sys .stdin .buffer )
492
+ if out_filepath is not None :
493
+ out_file = io .open (out_filepath , mode = "wb" )
494
+ else :
495
+ out_file = sys .stdout .buffer
420
496
497
+ global READ_BUFFER_SIZE
498
+ READ_BUFFER_SIZE = args .buffer_size
421
499
try :
422
500
while True :
423
501
block = in_file .read (args .buffer_size )
0 commit comments