Skip to content

Commit b2d12df

Browse files
committed
#390 Improve handling for archives with unicode names.
* Add extract_with_fallback wrapper for 7zip extraction * New tests for archives with unicode names. Signed-off-by: Philippe Ombredanne <[email protected]>
1 parent fd7be76 commit b2d12df

File tree

5 files changed

+108
-43
lines changed

5 files changed

+108
-43
lines changed

src/extractcode/archive.py

Lines changed: 65 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright (c) 2015 nexB Inc. and others. All rights reserved.
2+
# Copyright (c) 2016 nexB Inc. and others. All rights reserved.
33
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
44
# The ScanCode software is licensed under the Apache License version 2.0.
55
# Data generated with ScanCode require an acknowledgment.
@@ -85,29 +85,6 @@
8585
- http://en.wikipedia.org/wiki/List_of_file_formats#Archive_and_compressed
8686
"""
8787

88-
# high level aliases to lower level extraction functions
89-
extract_tar = libarchive2.extract
90-
extract_patch = patch.extract
91-
92-
extract_deb = libarchive2.extract
93-
extract_ar = libarchive2.extract
94-
extract_msi = sevenzip.extract
95-
extract_cpio = libarchive2.extract
96-
extract_7z = libarchive2.extract
97-
extract_zip = libarchive2.extract
98-
99-
extract_iso = sevenzip.extract
100-
extract_rar = sevenzip.extract
101-
extract_rpm = sevenzip.extract
102-
extract_xz = sevenzip.extract
103-
extract_lzma = sevenzip.extract
104-
extract_squashfs = sevenzip.extract
105-
extract_cab = sevenzip.extract
106-
extract_nsis = sevenzip.extract
107-
extract_ishield = sevenzip.extract
108-
extract_Z = sevenzip.extract
109-
110-
11188
# if strict, all of a handler's criteria must be matched for it to be selected
11289
Handler = namedtuple('Handler', ['name', 'filetypes', 'mimetypes', 'extensions', 'kind', 'extractors', 'strict'])
11390

@@ -319,9 +296,9 @@ def extract_twice(location, target_dir, extractor1, extractor2):
319296
covers most common cases.
320297
"""
321298
abs_location = os.path.abspath(os.path.expanduser(location))
322-
abs_target_dir = os.path.abspath(os.path.expanduser(target_dir))
299+
abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
323300
# extract first the intermediate payload to a temp dir
324-
temp_target = fileutils.get_temp_dir('extract')
301+
temp_target = unicode(fileutils.get_temp_dir('extract'))
325302
warnings = extractor1(abs_location, temp_target)
326303
if DEBUG:
327304
logger.debug('extract_twice: temp_target: %(temp_target)r' % locals())
@@ -335,16 +312,73 @@ def extract_twice(location, target_dir, extractor1, extractor2):
335312
for extracted1_loc in inner_archives:
336313
if DEBUG:
337314
logger.debug('extract_twice: extractor2: %(extracted1_loc)r' % locals())
338-
warnings.extend(extractor2(extracted1_loc, target_dir))
315+
warnings.extend(extractor2(extracted1_loc, abs_target_dir))
339316
finally:
340317
# cleanup the temporary output from extractor1
341318
fileutils.delete(temp_target)
342319
return warnings
343320

344321

345-
"""
346-
List of archive handlers.
347-
"""
322+
def extract_with_fallback(location, target_dir, extractor1, extractor2):
    """
    Extract the archive at `location` to `target_dir`, trying the `extractor1`
    function first. If that attempt fails, attempt the extraction again with
    the `extractor2` function.
    Return the list of warning messages of the extractor that succeeded.
    Raise exceptions on errors: if both attempts fail, the exception raised by
    the `extractor2` attempt is the one that propagates.

    Both extractors are called as `extractor(location, target_dir)`.

    Note: there are a few cases where the primary extractor for a type may
    fail and a secondary extractor will succeed.
    """
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
    # Each attempt extracts to its own temp dir first; the content is copied
    # to the target dir only after the extractor has returned.
    temp_target1 = unicode(fileutils.get_temp_dir('extract1'))
    try:
        warnings = extractor1(abs_location, temp_target1)
        if DEBUG:
            logger.debug('extract_with_fallback: temp_target1: %(temp_target1)r' % locals())
        fileutils.copytree(temp_target1, abs_target_dir)
    except Exception:
        # Fall back only on regular errors: KeyboardInterrupt and SystemExit
        # do not derive from Exception and must propagate instead of starting
        # a second extraction.
        # The temp dir is created before entering the try block so that the
        # cleanup below never references an unbound name (which would raise a
        # NameError hiding the real error).
        temp_target2 = unicode(fileutils.get_temp_dir('extract2'))
        try:
            warnings = extractor2(abs_location, temp_target2)
            if DEBUG:
                logger.debug('extract_with_fallback: temp_target2: %(temp_target2)r' % locals())
            fileutils.copytree(temp_target2, abs_target_dir)
        finally:
            # cleanup the temporary output from extractor2
            fileutils.delete(temp_target2)
    finally:
        # cleanup the temporary output from extractor1
        fileutils.delete(temp_target1)
    return warnings
352+
353+
354+
# High level aliases to lower level extraction functions
355+
########################################################
356+
extract_tar = libarchive2.extract
357+
extract_patch = patch.extract
358+
359+
extract_deb = libarchive2.extract
360+
extract_ar = libarchive2.extract
361+
extract_msi = sevenzip.extract
362+
extract_cpio = libarchive2.extract
363+
364+
# sevenzip should in theory be the best at extracting 7zip archives, but in practice libarchive usually does better: try libarchive first and fall back to sevenzip
365+
extract_7z = functools.partial(extract_with_fallback, extractor1=libarchive2.extract, extractor2=sevenzip.extract)
366+
367+
extract_zip = libarchive2.extract
368+
extract_iso = sevenzip.extract
369+
extract_rar = sevenzip.extract
370+
extract_rpm = sevenzip.extract
371+
extract_xz = sevenzip.extract
372+
extract_lzma = sevenzip.extract
373+
extract_squashfs = sevenzip.extract
374+
extract_cab = sevenzip.extract
375+
extract_nsis = sevenzip.extract
376+
extract_ishield = sevenzip.extract
377+
extract_Z = sevenzip.extract
378+
379+
380+
# Archive handlers.
381+
####################
348382

349383
TarHandler = Handler(
350384
name='Tar',
@@ -795,6 +829,7 @@ def extract_twice(location, target_dir, extractor1, extractor2):
795829
strict=True
796830
)
797831

832+
# Actual list of handlers
798833

799834
archive_handlers = [
800835
TarHandler,
192 Bytes
Binary file not shown.
425 Bytes
Binary file not shown.
330 Bytes
Binary file not shown.

tests/extractcode/test_archive.py

Lines changed: 43 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -96,14 +96,7 @@ def test_get_extractors(self):
9696
for test_file, expected in test_data:
9797
test_loc = self.get_test_loc(test_file)
9898
extractors = archive.get_extractors(test_loc)
99-
ft = typecode.contenttype.get_type(test_loc).filetype_file
100-
mt = typecode.contenttype.get_type(test_loc).mimetype_file
101-
fe = fileutils.file_extension(test_loc).lower()
102-
msg = (expected[0].__module__ + '.' + expected[0].__name__
103-
+ '!='
104-
+ extractors[0].__module__ + '.' + extractors[0].__name__
105-
+ ' for %(test_file)r') % locals()
106-
assert expected == extractors, msg
99+
assert expected == extractors
107100

108101
def test_get_extractors_with_kinds(self):
109102
test_data = [
@@ -394,6 +387,15 @@ def test_extract_targz_from_apache_should_not_return_errors(self):
394387
assert [] == result
395388
assert os.listdir(test_dir)
396389

390+
def test_extract_targz_with_unicode_path_should_extract_without_error(self):
391+
test_file = self.get_test_loc('archive/tgz/tgz_unicode.tgz')
392+
test_dir = self.get_temp_dir()
393+
extractor = archive.get_extractor(test_file)
394+
assert archive.extract_tar == extractor
395+
result = archive.extract_tar(test_file, test_dir)
396+
assert [] == result
397+
assert os.listdir(test_dir)
398+
397399

398400
class TestGzip(BaseArchiveTestCase):
399401
def test_uncompress_gzip_basic(self):
@@ -841,6 +843,13 @@ def test_extract_zip_can_extract_windows_media_player_skins(self):
841843
expected = ['32px.png', 'go.js', 'go.wms']
842844
check_files(test_dir, expected)
843845

846+
def test_extract_zip_with_unicode_path_should_extract_without_error(self):
847+
test_file = self.get_test_loc('archive/zip/zip_unicode.zip')
848+
test_dir = self.get_temp_dir()
849+
result = archive.extract_zip(test_file, test_dir)
850+
assert [] == result
851+
assert os.listdir(test_dir)
852+
844853

845854
class TestLibarch(BaseArchiveTestCase):
846855
def test_extract_zip_with_relative_path_libarchive(self):
@@ -1588,12 +1597,11 @@ def test_extract_7z_with_broken_archive_with7z(self):
15881597
msg = 'No error returned'
15891598
self.assertRaisesInstance(ExtractErrorFailedToExtract(msg), sevenzip.extract, test_file, test_dir)
15901599

1591-
def test_extract_7z_with_broken_archive(self):
1600+
def test_extract_7z_with_broken_archive_does_not_fail_when_using_fallback(self):
15921601
test_file = self.get_test_loc('archive/7z/corrupted7z.7z')
15931602
test_dir = self.get_temp_dir()
1594-
expected = libarchive2.ArchiveError()
1595-
expected.msg = 'Damaged 7-Zip archive'
1596-
self.assertRaisesInstance(expected, archive.extract_7z, test_file, test_dir)
1603+
msg = 'No error returned'
1604+
self.assertRaisesInstance(ExtractErrorFailedToExtract(msg), archive.extract_7z, test_file, test_dir)
15971605

15981606
def test_extract_7z_with_non_existing_archive(self):
15991607
test_file = 'archive/7z/I_DO_NOT_EXIST.zip'
@@ -1647,9 +1655,31 @@ def test_extract_7z_with_password_with_7z(self):
16471655
def test_extract_7z_with_password(self):
16481656
test_file = self.get_test_loc('archive/7z/7zip_password.7z')
16491657
test_dir = self.get_temp_dir()
1650-
expected = Exception("'The file content is encrypted, but currently not supported'")
1658+
expected = Exception('Password protected archive, unable to extract')
16511659
self.assertRaisesInstance(expected, archive.extract_7z, test_file, test_dir)
16521660

1661+
def test_extract_7zip_native_with_unicode_path_should_extract_without_error(self):
1662+
test_file = self.get_test_loc('archive/7z/7zip_unicode.7z')
1663+
test_dir = self.get_temp_dir()
1664+
result = sevenzip.extract(test_file, test_dir)
1665+
assert [] == result
1666+
assert 2 == len(os.listdir(os.path.join(test_dir, 'zip')))
1667+
1668+
def test_extract_7zip_with_fallback_with_unicode_path_should_extract_without_error(self):
1669+
test_file = self.get_test_loc('archive/7z/7zip_unicode.7z')
1670+
test_dir = self.get_temp_dir()
1671+
result = archive.extract_7z(test_file, test_dir)
1672+
assert [] == result
1673+
assert 2 == len(os.listdir(os.path.join(test_dir, 'zip')))
1674+
1675+
def test_extract_7zip_libarchive_with_unicode_path_extracts_with_errors(self):
1676+
test_file = self.get_test_loc('archive/7z/7zip_unicode.7z')
1677+
test_dir = self.get_temp_dir()
1678+
try:
1679+
archive.extract_7z(test_file, test_dir)
1680+
except libarchive2.ArchiveError, e:
1681+
assert 'Damaged 7-Zip archive' in e.msg
1682+
16531683

16541684
class TestIso(BaseArchiveTestCase):
16551685
def test_extract_iso_basic(self):

0 commit comments

Comments
 (0)