Skip to content

Commit 926338c

Browse files
committed
Support repack(removed)
1 parent 4c35eb2 commit 926338c

File tree

3 files changed

+171
-22
lines changed

3 files changed

+171
-22
lines changed

Doc/library/zipfile.rst

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -527,20 +527,33 @@ ZipFile Objects
527527
a path is provided.
528528

529529
This does not physically remove the local file entry from the archive;
530-
the ZIP file size remains unchanged. Use :meth:`ZipFile.repack` afterwards
530+
the ZIP file size remains unchanged. Call :meth:`ZipFile.repack` afterwards
531531
to reclaim space.
532532

533533
The archive must be opened with mode ``'w'``, ``'x'`` or ``'a'``.
534534

535+
Returns the removed :class:`ZipInfo` instance.
536+
535537
Calling :meth:`remove` on a closed ZipFile will raise a :exc:`ValueError`.
536538

537539
.. versionadded:: next
538540

539541

540-
.. method:: ZipFile.repack(*, strict_descriptor=False[, chunk_size])
542+
.. method:: ZipFile.repack(removed=None, *, \
543+
strict_descriptor=False[, chunk_size])
544+
545+
Rewrites the archive to remove stale local file entries, shrinking the ZIP
546+
file size.
547+
548+
If *removed* is provided, it must be a sequence of :class:`ZipInfo` objects
549+
representing removed entries (as returned by :meth:`remove`); only their
550+
corresponding local file entries will be removed.
541551

542-
Rewrites the archive to remove local file entries that are no longer
543-
referenced, shrinking the ZIP file size.
552+
If *removed* is not provided, local file entries no longer referenced in the
553+
central directory will be removed. The algorithm assumes that local file
554+
entries are stored consecutively. Extra bytes between entries will also be
555+
removed. Data before the first referenced entry is preserved unless it
556+
appears to be a sequence of consecutive local file entries.
544557

545558
``strict_descriptor=True`` can be provided to skip the slower scan for an
546559
unsigned data descriptor (deprecated in the latest ZIP specification and is

Lib/test/test_zipfile/test_core.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2092,6 +2092,118 @@ def test_repack_overlapping_blocks(self):
20922092
with self.assertRaises(zipfile.BadZipFile):
20932093
zh.repack()
20942094

2095+
def test_repack_removed_basic(self):
2096+
"""Should remove local file entries for provided deleted files."""
2097+
ln = len(self.test_files)
2098+
iii = (ii for n in range(1, ln + 1) for ii in itertools.combinations(range(ln), n))
2099+
for ii in iii:
2100+
with self.subTest(remove=ii):
2101+
# calculate the expected results
2102+
test_files = [data for j, data in enumerate(self.test_files) if j not in ii]
2103+
expected_zinfos = self._prepare_zip_from_test_files(TESTFN, test_files)
2104+
expected_size = os.path.getsize(TESTFN)
2105+
2106+
# do the removal and check the result
2107+
zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files)
2108+
with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
2109+
zinfos = [zh.remove(self.test_files[i][0]) for i in ii]
2110+
zh.repack(zinfos)
2111+
2112+
# check infolist
2113+
self.assertEqual(
2114+
[ComparableZipInfo(zi) for zi in zh.infolist()],
2115+
expected_zinfos,
2116+
)
2117+
2118+
# check file size
2119+
self.assertEqual(os.path.getsize(TESTFN), expected_size)
2120+
2121+
# make sure the zip file is still valid
2122+
with zipfile.ZipFile(TESTFN) as zh:
2123+
self.assertIsNone(zh.testzip())
2124+
2125+
def test_repack_removed_partial(self):
2126+
"""Should remove local file entries only for provided deleted files."""
2127+
ln = len(self.test_files)
2128+
iii = (ii for n in range(1, ln + 1) for ii in itertools.combinations(range(ln), n))
2129+
for ii in iii:
2130+
with self.subTest(removed=ii):
2131+
# calculate the expected results
2132+
test_files = [data for j, data in enumerate(self.test_files) if j not in ii]
2133+
expected_zinfos = self._prepare_zip_from_test_files(TESTFN, test_files)
2134+
with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
2135+
for zi in zh.infolist().copy():
2136+
zh.remove(zi)
2137+
expected_size = os.path.getsize(TESTFN)
2138+
2139+
# do the removal and check the result
2140+
zinfos = self._prepare_zip_from_test_files(TESTFN, self.test_files)
2141+
with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
2142+
zinfos = [zh.remove(self.test_files[i][0]) for i, _ in enumerate(self.test_files)]
2143+
zh.repack([zinfos[i] for i in ii])
2144+
2145+
# check infolist
2146+
self.assertEqual(
2147+
[ComparableZipInfo(zi) for zi in zh.infolist()],
2148+
[],
2149+
)
2150+
2151+
# check file size
2152+
self.assertEqual(os.path.getsize(TESTFN), expected_size)
2153+
2154+
# make sure the zip file is still valid
2155+
with zipfile.ZipFile(TESTFN) as zh:
2156+
self.assertIsNone(zh.testzip())
2157+
2158+
def test_repack_removed_bytes_between_files(self):
2159+
"""Should not remove bytes between local file entries."""
2160+
# calculate the expected results
2161+
for ii in ([0], [1], [2]):
2162+
with self.subTest(removed=ii):
2163+
expected_zinfos = []
2164+
with open(TESTFN, 'wb') as fh:
2165+
with zipfile.ZipFile(fh, 'w', self.compression) as zh:
2166+
for j, (file, data) in enumerate(self.test_files):
2167+
if j not in ii:
2168+
zh.writestr(file, data)
2169+
expected_zinfos.append(ComparableZipInfo(zh.getinfo(file)))
2170+
fh.write(b' dummy bytes ')
2171+
expected_size = os.path.getsize(TESTFN)
2172+
2173+
# do the removal and check the result
2174+
with open(TESTFN, 'wb') as fh:
2175+
with zipfile.ZipFile(fh, 'w', self.compression) as zh:
2176+
for i, (file, data) in enumerate(self.test_files):
2177+
zh.writestr(file, data)
2178+
fh.write(b' dummy bytes ')
2179+
with zipfile.ZipFile(TESTFN, 'a', self.compression) as zh:
2180+
zinfos = [zh.remove(self.test_files[i][0]) for i in ii]
2181+
zh.repack(zinfos)
2182+
2183+
# check infolist
2184+
self.assertEqual(
2185+
[ComparableZipInfo(zi) for zi in zh.infolist()],
2186+
expected_zinfos,
2187+
)
2188+
2189+
# check file size
2190+
self.assertEqual(os.path.getsize(TESTFN), expected_size)
2191+
2192+
# make sure the zip file is still valid
2193+
with zipfile.ZipFile(TESTFN) as zh:
2194+
self.assertIsNone(zh.testzip())
2195+
2196+
def test_repack_removed_bad_removed_zinfos(self):
2197+
"""Should raise when providing non-removed zinfos."""
2198+
# entries still listed in the archive must be rejected when passed as removed
2199+
for ii in ([0], [1], [2]):
2200+
with self.subTest(removed=ii):
2201+
self._prepare_zip_from_test_files(TESTFN, self.test_files)
2202+
with zipfile.ZipFile(TESTFN, 'a') as zh:
2203+
zinfos = [zh.getinfo(self.test_files[i][0]) for i in ii]
2204+
with self.assertRaises(zipfile.BadZipFile):
2205+
zh.repack(zinfos)
2206+
20952207
@mock.patch('zipfile._ZipRepacker')
20962208
def test_repack_closed(self, m_repack):
20972209
self._prepare_zip_from_test_files(TESTFN, self.test_files)

Lib/zipfile/__init__.py

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1378,7 +1378,7 @@ def _debug(self, level, *msg):
13781378
if self.debug >= level:
13791379
print(*msg)
13801380

1381-
def repack(self, zfile):
1381+
def repack(self, zfile, removed=None):
13821382
"""
13831383
Repack the ZIP file, removing unrecorded local file entries and random
13841384
bytes not listed in the central directory.
@@ -1442,11 +1442,14 @@ def repack(self, zfile):
14421442
[recorded local file entry 1]
14431443
...
14441444
"""
1445+
removed_zinfos = set(removed or ())
1446+
14451447
fp = zfile.fp
14461448

14471449
# get a sorted filelist by header offset, in case the dir order
14481450
# doesn't match the actual entry order
1449-
filelist = sorted(zfile.filelist, key=lambda x: x.header_offset)
1451+
filelist = (*zfile.filelist, *removed_zinfos)
1452+
filelist = sorted(filelist, key=lambda x: x.header_offset)
14501453

14511454
# calculate each entry size and validate
14521455
entry_size_list = []
@@ -1464,36 +1467,55 @@ def repack(self, zfile):
14641467
self._debug(3, i, zinfo.orig_filename, entry_size, used_entry_size)
14651468
if used_entry_size > entry_size:
14661469
raise BadZipFile(
1467-
f"Overlapped entries: {zinfo.orig_filename!r} "
1468-
f"(possible zip bomb)")
1470+
f"Overlapped entries: {zinfo.orig_filename!r} ")
14691471

14701472
entry_size_list.append(entry_size)
14711473
used_entry_size_list.append(used_entry_size)
14721474

14731475
# calculate the starting entry offset (bytes to skip)
1474-
try:
1475-
data_offset = filelist[0].header_offset
1476-
except IndexError:
1477-
data_offset = zfile.start_dir
1478-
entry_offset = self._calc_initial_entry_offset(fp, data_offset)
1476+
if removed is None:
1477+
try:
1478+
data_offset = filelist[0].header_offset
1479+
except IndexError:
1480+
data_offset = zfile.start_dir
1481+
entry_offset = self._calc_initial_entry_offset(fp, data_offset)
1482+
else:
1483+
entry_offset = 0
14791484

14801485
# move file entries
14811486
for i, zinfo in enumerate(filelist):
14821487
entry_size = entry_size_list[i]
14831488
used_entry_size = used_entry_size_list[i]
14841489

14851490
# update the header and move entry data to the new position
1486-
if entry_offset > 0:
1491+
if zinfo in removed_zinfos:
14871492
old_header_offset = zinfo.header_offset
14881493
zinfo.header_offset -= entry_offset
1489-
self._move_entry_data(fp, old_header_offset, zinfo.header_offset, used_entry_size)
1494+
self._move_entry_data(
1495+
fp,
1496+
old_header_offset + used_entry_size,
1497+
zinfo.header_offset,
1498+
entry_size - used_entry_size
1499+
)
14901500

1491-
if zinfo._end_offset is not None:
1492-
zinfo._end_offset = zinfo.header_offset + used_entry_size
1501+
if zinfo._end_offset is not None:
1502+
zinfo._end_offset = zinfo.header_offset
14931503

1494-
# update entry_offset for subsequent files to follow
1495-
if used_entry_size < entry_size:
1496-
entry_offset += entry_size - used_entry_size
1504+
# update entry_offset for subsequent files to follow
1505+
entry_offset += used_entry_size
1506+
1507+
else:
1508+
if entry_offset > 0:
1509+
old_header_offset = zinfo.header_offset
1510+
zinfo.header_offset -= entry_offset
1511+
self._move_entry_data(fp, old_header_offset, zinfo.header_offset, used_entry_size)
1512+
1513+
if zinfo._end_offset is not None:
1514+
zinfo._end_offset = zinfo.header_offset + used_entry_size
1515+
1516+
# update entry_offset for subsequent files to follow
1517+
if used_entry_size < entry_size:
1518+
entry_offset += entry_size - used_entry_size
14971519

14981520
# update state
14991521
zfile.start_dir -= entry_offset
@@ -2250,7 +2272,9 @@ def remove(self, zinfo_or_arcname):
22502272

22512273
self._didModify = True
22522274

2253-
def repack(self, **opts):
2275+
return zinfo
2276+
2277+
def repack(self, removed=None, **opts):
22542278
"""Repack a zip file, removing non-referenced file entries.
22552279
22562280
The archive must be opened with mode 'a', as mode 'w'/'x' do not
@@ -2270,7 +2294,7 @@ def repack(self, **opts):
22702294
with self._lock:
22712295
self._writing = True
22722296
try:
2273-
_ZipRepacker(**opts).repack(self)
2297+
_ZipRepacker(**opts).repack(self, removed)
22742298
finally:
22752299
self._writing = False
22762300

0 commit comments

Comments
 (0)