Skip to content

Commit 8f56b7d

Browse files
serhiy-storchaka authored and mcepl committed
pythongh-133890: Handle UnicodeEncodeError in tarfile (pythonGH-134147)
UnicodeEncodeError is now handled the same way as OSError during TarFile member extraction. (cherry picked from commit 9983c7d) Co-authored-by: Serhiy Storchaka <[email protected]>
1 parent ad97d93 commit 8f56b7d

File tree

3 files changed

+51
-8
lines changed

3 files changed

+51
-8
lines changed

Lib/tarfile.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2281,7 +2281,7 @@ def _get_extract_tarinfo(self, member, filter_function, path):
22812281
unfiltered = tarinfo
22822282
try:
22832283
tarinfo = filter_function(tarinfo, path)
2284-
except (OSError, FilterError) as e:
2284+
except (OSError, UnicodeEncodeError, FilterError) as e:
22852285
self._handle_fatal_error(e)
22862286
except ExtractError as e:
22872287
self._handle_nonfatal_error(e)
@@ -2302,7 +2302,7 @@ def _extract_one(self, tarinfo, path, set_attrs, numeric_owner):
23022302
self._extract_member(tarinfo, os.path.join(path, tarinfo.name),
23032303
set_attrs=set_attrs,
23042304
numeric_owner=numeric_owner)
2305-
except OSError as e:
2305+
except (OSError, UnicodeEncodeError) as e:
23062306
self._handle_fatal_error(e)
23072307
except ExtractError as e:
23082308
self._handle_nonfatal_error(e)

Lib/test/test_tarfile.py

Lines changed: 47 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3014,11 +3014,12 @@ class ArchiveMaker:
30143014
with t.open() as tar:
30153015
... # `tar` is now a TarFile with 'filename' in it!
30163016
"""
3017-
def __init__(self):
3017+
def __init__(self, **kwargs):
30183018
self.bio = io.BytesIO()
3019+
self.tar_kwargs = dict(kwargs)
30193020

30203021
def __enter__(self):
3021-
self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio)
3022+
self.tar_w = tarfile.TarFile(mode='w', fileobj=self.bio, **self.tar_kwargs)
30223023
return self
30233024

30243025
def __exit__(self, *exc):
@@ -3425,7 +3426,10 @@ def test_tar_filter(self):
34253426
# that in the test archive.)
34263427
with tarfile.TarFile.open(tarname) as tar:
34273428
for tarinfo in tar.getmembers():
3428-
filtered = tarfile.tar_filter(tarinfo, '')
3429+
try:
3430+
filtered = tarfile.tar_filter(tarinfo, '')
3431+
except UnicodeEncodeError:
3432+
continue
34293433
self.assertIs(filtered.name, tarinfo.name)
34303434
self.assertIs(filtered.type, tarinfo.type)
34313435

@@ -3436,13 +3440,50 @@ def test_data_filter(self):
34363440
for tarinfo in tar.getmembers():
34373441
try:
34383442
filtered = tarfile.data_filter(tarinfo, '')
3439-
except tarfile.FilterError:
3443+
except (tarfile.FilterError, UnicodeEncodeError):
34403444
continue
34413445
self.assertIs(filtered.name, tarinfo.name)
34423446
self.assertIs(filtered.type, tarinfo.type)
34433447

3444-
def test_default_filter_warns_not(self):
3445-
"""Ensure the default filter does not warn (like in 3.12)"""
3448+
@unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths')
3449+
def test_filter_unencodable(self):
3450+
# Sanity check using a valid path.
3451+
tarinfo = tarfile.TarInfo(os_helper.TESTFN)
3452+
filtered = tarfile.tar_filter(tarinfo, '')
3453+
self.assertIs(filtered.name, tarinfo.name)
3454+
filtered = tarfile.data_filter(tarinfo, '')
3455+
self.assertIs(filtered.name, tarinfo.name)
3456+
3457+
tarinfo = tarfile.TarInfo('test\x00')
3458+
self.assertRaises(ValueError, tarfile.tar_filter, tarinfo, '')
3459+
self.assertRaises(ValueError, tarfile.data_filter, tarinfo, '')
3460+
tarinfo = tarfile.TarInfo('\ud800')
3461+
self.assertRaises(UnicodeEncodeError, tarfile.tar_filter, tarinfo, '')
3462+
self.assertRaises(UnicodeEncodeError, tarfile.data_filter, tarinfo, '')
3463+
3464+
@unittest.skipIf(sys.platform == 'win32', 'requires native bytes paths')
3465+
def test_extract_unencodable(self):
3466+
# Create a member with name \xed\xa0\x80 which is UTF-8 encoded
3467+
# lone surrogate \ud800.
3468+
with ArchiveMaker(encoding='ascii', errors='surrogateescape') as arc:
3469+
arc.add('\udced\udca0\udc80')
3470+
with os_helper.temp_cwd() as tmp:
3471+
tar = arc.open(encoding='utf-8', errors='surrogatepass',
3472+
errorlevel=1)
3473+
self.assertEqual(tar.getnames(), ['\ud800'])
3474+
with self.assertRaises(UnicodeEncodeError):
3475+
tar.extractall()
3476+
self.assertEqual(os.listdir(), [])
3477+
3478+
tar = arc.open(encoding='utf-8', errors='surrogatepass',
3479+
errorlevel=0, debug=1)
3480+
with support.captured_stderr() as stderr:
3481+
tar.extractall()
3482+
self.assertEqual(os.listdir(), [])
3483+
self.assertIn('tarfile: UnicodeEncodeError ', stderr.getvalue())
3484+
3485+
def test_default_filter_warns(self):
3486+
"""Ensure the default filter warns"""
34463487
with ArchiveMaker() as arc:
34473488
arc.add('foo')
34483489
# Replicate warnings_helper.check_no_warnings
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
The :mod:`tarfile` module now handles :exc:`UnicodeEncodeError` in the same
2+
way as :exc:`OSError` when it cannot extract a member.

0 commit comments

Comments
 (0)