Skip to content

Commit 9376c2d

Browse files
committed
gh-51067: Default ZipFile.repack() to strict_descriptor=True
Locating an unsigned data descriptor for a STORED or encrypted entry requires a byte-by-byte scan that can be ~100-1000x slower and is a potential denial-of-service vector on untrusted input. Default strict_descriptor to True so repack() detects only the signed data descriptors written by Python and other modern tools, leaving the slow scan as an opt-in. Also explain data descriptors and the signed/unsigned distinction in the docs, and add coverage for the new default (unit, encrypted, and an end-to-end repack test across all compression methods).
1 parent 59d7b0d commit 9376c2d

4 files changed

Lines changed: 94 additions & 13 deletions

File tree

Doc/library/zipfile.rst

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -576,7 +576,7 @@ ZipFile objects
576576

577577

578578
.. method:: ZipFile.repack(removed=None, *, \
579-
strict_descriptor=False[, chunk_size])
579+
strict_descriptor=True[, chunk_size])
580580

581581
Rewrites the archive to remove unreferenced local file entries, shrinking
582582
its file size. The archive must be opened with mode ``'a'``.
@@ -587,12 +587,22 @@ ZipFile objects
587587
locate and remove local file entries that are no longer referenced in the
588588
central directory.
589589

590-
When scanning, setting ``strict_descriptor=True`` disables detection of any
591-
entry using an unsigned data descriptor (a format deprecated by the ZIP
592-
specification since version 6.3.0, released on 2006-09-29, and used only by
593-
some legacy tools), which is significantly slower to scan—around 100 to
594-
1000 times in the worst case. This does not affect performance on entries
595-
without such feature.
590+
When scanning, *strict_descriptor* controls how entries written with an
591+
unsigned *data descriptor* are handled. A data descriptor is an optional
592+
record holding an entry's CRC and sizes, stored just after the entry's data;
593+
it is used when the archive is written to a non-seekable stream, and is
594+
*signed* when it begins with a marker signature or *unsigned* otherwise.
595+
Unsigned descriptors have been deprecated by the `PKZIP Application Note`_
596+
since version 6.3.0 (released in 2006) and are written only by some legacy
597+
tools; signed descriptors—written by Python and other modern tools—are always
598+
detected. When *strict_descriptor* is true (the default), only signed data
599+
descriptors are detected, so an unreferenced entry written with an unsigned
600+
descriptor is not located and its space is not reclaimed by the scan.
601+
Setting ``strict_descriptor=False`` additionally detects unsigned
602+
descriptors, at the cost of a significantly slower scan—around 100 to 1000
603+
times in the worst case—which may be exploitable as a denial-of-service
604+
vector on untrusted input. This does not affect entries without a data
605+
descriptor, and is not needed when *removed* is provided.
596606

597607
*chunk_size* may be specified to control the buffer size when moving
598608
entry data (default is 1 MiB).

Lib/test/test_zipfile/test_core.py

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2134,6 +2134,52 @@ def test_repack_overlapping_blocks(self):
21342134
with self.assertRaisesRegex(zipfile.BadZipFile, 'Overlapped entries'):
21352135
zh.repack()
21362136

2137+
def test_repack_scan_unsigned_data_descriptor(self):
2138+
"""By default (strict_descriptor=True) the scan does not reclaim an
2139+
unreferenced entry written with an unsigned data descriptor, but keeps
2140+
the archive valid; strict_descriptor=False reclaims it."""
2141+
removed_name, removed_data = self.test_files[1]
2142+
remaining = [n for n, _ in self.test_files if n != removed_name]
2143+
2144+
# Build an archive whose entries use *unsigned* data descriptors by
2145+
# writing to an unseekable stream with the descriptor signature stripped.
2146+
buf = io.BytesIO()
2147+
with mock.patch.object(struct, 'pack', side_effect=struct_pack_no_dd_sig):
2148+
with zipfile.ZipFile(Unseekable(buf), 'w', self.compression) as zh:
2149+
for file, data in self.test_files:
2150+
with zh.open(file, 'w') as fh:
2151+
fh.write(data)
2152+
archive = buf.getvalue()
2153+
2154+
# sanity: the removed entry really uses a data descriptor (flag bit 3);
2155+
# it is unsigned by construction above
2156+
with zipfile.ZipFile(io.BytesIO(archive)) as zh:
2157+
self.assertTrue(zh.getinfo(removed_name).flag_bits & 0x08)
2158+
2159+
# default repack(): strict_descriptor=True does not locate the unsigned
2160+
# data descriptor, so the local data is preserved (not reclaimed).
2161+
fz = io.BytesIO(archive)
2162+
with zipfile.ZipFile(fz, 'a', self.compression) as zh:
2163+
zh.remove(removed_name)
2164+
zh.repack()
2165+
default_size = len(fz.getvalue())
2166+
with zipfile.ZipFile(fz) as zh:
2167+
self.assertEqual(zh.namelist(), remaining)
2168+
self.assertIsNone(zh.testzip())
2169+
2170+
# strict_descriptor=False: the unsigned data descriptor is detected, so
2171+
# the local data is reclaimed and the archive shrinks.
2172+
fz = io.BytesIO(archive)
2173+
with zipfile.ZipFile(fz, 'a', self.compression) as zh:
2174+
zh.remove(removed_name)
2175+
zh.repack(strict_descriptor=False)
2176+
strict_false_size = len(fz.getvalue())
2177+
with zipfile.ZipFile(fz) as zh:
2178+
self.assertEqual(zh.namelist(), remaining)
2179+
self.assertIsNone(zh.testzip())
2180+
2181+
self.assertLess(strict_false_size, default_size)
2182+
21372183
def test_repack_removed_basic(self):
21382184
"""Should remove local file entries for provided deleted files."""
21392185
ln = len(self.test_files)
@@ -2628,7 +2674,9 @@ def test_validate_local_file_entry_zstd(self):
26282674
self._test_validate_local_file_entry(method=zipfile.ZIP_ZSTANDARD)
26292675

26302676
def _test_validate_local_file_entry(self, method):
2631-
repacker = zipfile._ZipRepacker()
2677+
# strict_descriptor=False to exercise unsigned data descriptor scanning
2678+
# (the default is strict_descriptor=True, tested separately below)
2679+
repacker = zipfile._ZipRepacker(strict_descriptor=False)
26322680

26332681
# basic
26342682
bytes_ = self._generate_local_file_entry(
@@ -2799,7 +2847,9 @@ def test_validate_local_file_entry_zip64_zstd(self):
27992847
self._test_validate_local_file_entry_zip64(method=zipfile.ZIP_ZSTANDARD)
28002848

28012849
def _test_validate_local_file_entry_zip64(self, method):
2802-
repacker = zipfile._ZipRepacker()
2850+
# strict_descriptor=False to exercise unsigned data descriptor scanning
2851+
# (the default is strict_descriptor=True, tested separately below)
2852+
repacker = zipfile._ZipRepacker(strict_descriptor=False)
28032853

28042854
# zip64
28052855
bytes_ = self._generate_local_file_entry(
@@ -2870,7 +2920,9 @@ def _test_validate_local_file_entry_zip64(self, method):
28702920
m_sddns.assert_not_called()
28712921

28722922
def test_validate_local_file_entry_encrypted(self):
2873-
repacker = zipfile._ZipRepacker()
2923+
# strict_descriptor=False to exercise unsigned data descriptor scanning
2924+
# of an encrypted entry (the default strict_descriptor=True is tested below)
2925+
repacker = zipfile._ZipRepacker(strict_descriptor=False)
28742926

28752927
bytes_ = (
28762928
b'PK\x03\x04'
@@ -2903,6 +2955,21 @@ def test_validate_local_file_entry_encrypted(self):
29032955
m_sddnsbd.assert_not_called()
29042956
m_sddns.assert_called_once_with(fz, 38, len(bytes_), False)
29052957

2958+
# return None for the unsigned data descriptor if `strict_descriptor=True`
2959+
repacker = zipfile._ZipRepacker(strict_descriptor=True)
2960+
fz = io.BytesIO(bytes_)
2961+
with mock.patch.object(repacker, '_scan_data_descriptor',
2962+
wraps=repacker._scan_data_descriptor) as m_sdd, \
2963+
mock.patch.object(repacker, '_scan_data_descriptor_no_sig_by_decompression',
2964+
wraps=repacker._scan_data_descriptor_no_sig_by_decompression) as m_sddnsbd, \
2965+
mock.patch.object(repacker, '_scan_data_descriptor_no_sig',
2966+
wraps=repacker._scan_data_descriptor_no_sig) as m_sddns:
2967+
result = repacker._validate_local_file_entry(fz, 0, len(bytes_))
2968+
self.assertEqual(result, None)
2969+
m_sdd.assert_called_once_with(fz, 38, len(bytes_), False)
2970+
m_sddnsbd.assert_not_called()
2971+
m_sddns.assert_not_called()
2972+
29062973
def test_iter_scan_signature(self):
29072974
bytes_ = b'sig__sig__sig__sig'
29082975
ln = len(bytes_)

Lib/test/test_zipfile64.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,9 @@ def _test_strip_removed_large_file_with_dd_no_sig(self, f):
225225

226226
with zipfile.ZipFile(f, 'a') as zh:
227227
zh.remove(file1)
228-
zh.repack()
228+
# strict_descriptor=False to scan the unsigned data descriptor
229+
# (scanning is disabled under the strict_descriptor=True default)
230+
zh.repack(strict_descriptor=False)
229231
self.assertIsNone(zh.testzip())
230232

231233
@requires_zlib()
@@ -255,7 +257,9 @@ def _test_strip_removed_large_file_with_dd_no_sig_by_decompression(self, f, meth
255257

256258
with zipfile.ZipFile(f, 'a') as zh:
257259
zh.remove(file1)
258-
zh.repack()
260+
# strict_descriptor=False to detect the unsigned data descriptor
261+
# (scanning is disabled under the strict_descriptor=True default)
262+
zh.repack(strict_descriptor=False)
259263
self.assertIsNone(zh.testzip())
260264

261265

Lib/zipfile/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1396,7 +1396,7 @@ def close(self):
13961396

13971397
class _ZipRepacker:
13981398
"""Class for ZipFile repacking."""
1399-
def __init__(self, *, strict_descriptor=False, chunk_size=2**20, debug=0):
1399+
def __init__(self, *, strict_descriptor=True, chunk_size=2**20, debug=0):
14001400
self.debug = debug # Level of printing: 0 through 3
14011401
self.chunk_size = chunk_size
14021402
self.strict_descriptor = strict_descriptor

0 commit comments

Comments
 (0)