Skip to content

Commit 44b28ed

Browse files
committed
gh-75707: tarfile: Add optional open() argument "reproducible"
This makes it possible to create reproducible .tar.gz files without overriding time.time(), by setting the gzip header field mtime to 0.
1 parent 96b7a2e commit 44b28ed

File tree

3 files changed

+31
-7
lines changed

3 files changed

+31
-7
lines changed

Doc/library/tarfile.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,11 @@ Some facts and figures:
137137
a Zstandard dictionary used to improve compression of smaller amounts of
138138
data.
139139

140+
For modes ``'w:gz'`` and ``'w|gz'``, :func:`tarfile.open` accepts the
141+
keyword argument *reproducible* (default ``False``) to create a gzip archive
142+
whose gzip header mtime field is 0. By default, that field is set to the time of creation of
143+
the archive.
144+
140145
For special purposes, there is a second format for *mode*:
141146
``'filemode|[compression]'``. :func:`tarfile.open` will return a :class:`TarFile`
142147
object that processes its data as a stream of blocks. No random seeking will

Lib/tarfile.py

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -339,7 +339,7 @@ class _Stream:
339339
"""
340340

341341
def __init__(self, name, mode, comptype, fileobj, bufsize,
342-
compresslevel, preset):
342+
compresslevel, preset, reproducible):
343343
"""Construct a _Stream object.
344344
"""
345345
self._extfileobj = True
@@ -374,7 +374,7 @@ def __init__(self, name, mode, comptype, fileobj, bufsize,
374374
self.exception = zlib.error
375375
self._init_read_gz()
376376
else:
377-
self._init_write_gz(compresslevel)
377+
self._init_write_gz(compresslevel, reproducible)
378378

379379
elif comptype == "bz2":
380380
try:
@@ -423,15 +423,19 @@ def __del__(self):
423423
if hasattr(self, "closed") and not self.closed:
424424
self.close()
425425

426-
def _init_write_gz(self, compresslevel):
426+
def _init_write_gz(self, compresslevel, reproducible):
427427
"""Initialize for writing with gzip compression.
428428
"""
429429
self.cmp = self.zlib.compressobj(compresslevel,
430430
self.zlib.DEFLATED,
431431
-self.zlib.MAX_WBITS,
432432
self.zlib.DEF_MEM_LEVEL,
433433
0)
434-
timestamp = struct.pack("<L", int(time.time()))
434+
if reproducible:
435+
timestamp = 0
436+
else:
437+
timestamp = int(time.time())
438+
timestamp = struct.pack("<L", timestamp)
435439
self.__write(b"\037\213\010\010" + timestamp + b"\002\377")
436440
if self.name.endswith(".gz"):
437441
self.name = self.name[:-3]
@@ -1726,7 +1730,7 @@ class TarFile(object):
17261730
def __init__(self, name=None, mode="r", fileobj=None, format=None,
17271731
tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
17281732
errors="surrogateescape", pax_headers=None, debug=None,
1729-
errorlevel=None, copybufsize=None, stream=False):
1733+
errorlevel=None, copybufsize=None, stream=False, reproducible=False):
17301734
"""Open an (uncompressed) tar archive 'name'. 'mode' is either 'r' to
17311735
read from an existing archive, 'a' to append data to an existing
17321736
file or 'w' to create a new file overwriting an existing one. 'mode'
@@ -1932,8 +1936,9 @@ def not_compressed(comptype):
19321936

19331937
compresslevel = kwargs.pop("compresslevel", 6)
19341938
preset = kwargs.pop("preset", None)
1939+
reproducible = kwargs.pop("reproducible", False)
19351940
stream = _Stream(name, filemode, comptype, fileobj, bufsize,
1936-
compresslevel, preset)
1941+
compresslevel, preset, reproducible)
19371942
try:
19381943
t = cls(name, filemode, stream, **kwargs)
19391944
except:
@@ -1969,7 +1974,11 @@ def gzopen(cls, name, mode="r", fileobj=None, compresslevel=6, **kwargs):
19691974
raise CompressionError("gzip module is not available") from None
19701975

19711976
try:
1972-
fileobj = GzipFile(name, mode + "b", compresslevel, fileobj)
1977+
if kwargs.pop('reproducible', False):
1978+
mtime = 0
1979+
else:
1980+
mtime = None
1981+
fileobj = GzipFile(name, mode + "b", compresslevel, fileobj, mtime=mtime)
19731982
except OSError as e:
19741983
if fileobj is not None and mode == 'r':
19751984
raise ReadError("not a gzip file") from e

Lib/test/test_tarfile.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1809,6 +1809,11 @@ def test_source_directory_not_leaked(self):
18091809
payload = pathlib.Path(tmpname).read_text(encoding='latin-1')
18101810
assert os.path.dirname(tmpname) not in payload
18111811

1812+
def test_create_reproducible(self):
1813+
tarfile.open(tmpname, self.mode, reproducible=True).close()
1814+
with self.open(tmpname, 'r') as fobj:
1815+
fobj.read()
1816+
self.assertEqual(fobj.mtime, 0)
18121817

18131818
class Bz2StreamWriteTest(Bz2Test, StreamWriteTest):
18141819
decompressor = bz2.BZ2Decompressor if bz2 else None
@@ -2115,6 +2120,11 @@ def test_create_with_compresslevel(self):
21152120
with tarfile.open(tmpname, 'r:gz', compresslevel=1) as tobj:
21162121
pass
21172122

2123+
def test_create_reproducible(self):
2124+
tarfile.open(tmpname, self.mode, reproducible=True).close()
2125+
with self.open(tmpname, 'rb') as fobj:
2126+
fobj.read()
2127+
self.assertEqual(fobj.mtime, 0)
21182128

21192129
class Bz2CreateTest(Bz2Test, CreateTest):
21202130

0 commit comments

Comments
 (0)