11#
2- # Copyright (c) 2015 nexB Inc. and others. All rights reserved.
2+ # Copyright (c) 2016 nexB Inc. and others. All rights reserved.
33# http://nexb.com and https://github.com/nexB/scancode-toolkit/
44# The ScanCode software is licensed under the Apache License version 2.0.
55# Data generated with ScanCode require an acknowledgment.
8585 - http://en.wikipedia.org/wiki/List_of_file_formats#Archive_and_compressed
8686"""
8787
88- # high level aliases to lower level extraction functions
89- extract_tar = libarchive2 .extract
90- extract_patch = patch .extract
91-
92- extract_deb = libarchive2 .extract
93- extract_ar = libarchive2 .extract
94- extract_msi = sevenzip .extract
95- extract_cpio = libarchive2 .extract
96- extract_7z = libarchive2 .extract
97- extract_zip = libarchive2 .extract
98-
99- extract_iso = sevenzip .extract
100- extract_rar = sevenzip .extract
101- extract_rpm = sevenzip .extract
102- extract_xz = sevenzip .extract
103- extract_lzma = sevenzip .extract
104- extract_squashfs = sevenzip .extract
105- extract_cab = sevenzip .extract
106- extract_nsis = sevenzip .extract
107- extract_ishield = sevenzip .extract
108- extract_Z = sevenzip .extract
109-
110-
11188# if strict, all of the handler's criteria must be matched for it to be selected
11289Handler = namedtuple ('Handler' , ['name' , 'filetypes' , 'mimetypes' , 'extensions' , 'kind' , 'extractors' , 'strict' ])
11390
@@ -319,9 +296,9 @@ def extract_twice(location, target_dir, extractor1, extractor2):
319296 covers most common cases.
320297 """
321298 abs_location = os .path .abspath (os .path .expanduser (location ))
322- abs_target_dir = os .path .abspath (os .path .expanduser (target_dir ))
299+ abs_target_dir = unicode ( os .path .abspath (os .path .expanduser (target_dir ) ))
323300 # extract first the intermediate payload to a temp dir
324- temp_target = fileutils .get_temp_dir ('extract' )
301+ temp_target = unicode ( fileutils .get_temp_dir ('extract' ) )
325302 warnings = extractor1 (abs_location , temp_target )
326303 if DEBUG :
327304 logger .debug ('extract_twice: temp_target: %(temp_target)r' % locals ())
@@ -335,16 +312,73 @@ def extract_twice(location, target_dir, extractor1, extractor2):
335312 for extracted1_loc in inner_archives :
336313 if DEBUG :
337314 logger .debug ('extract_twice: extractor2: %(extracted1_loc)r' % locals ())
338- warnings .extend (extractor2 (extracted1_loc , target_dir ))
315+ warnings .extend (extractor2 (extracted1_loc , abs_target_dir ))
339316 finally :
340317 # cleanup the temporary output from extractor1
341318 fileutils .delete (temp_target )
342319 return warnings
343320
344321
345- """
346- List of archive handlers.
347- """
def extract_with_fallback(location, target_dir, extractor1, extractor2):
    """
    Extract archive at `location` to `target_dir` trying first the `extractor1`
    function. If that attempt fails, attempt extraction again with the
    `extractor2` function.
    Return a list of warning messages. Raise exceptions on errors.

    `extractor1` and `extractor2` are callables accepting a source location and
    a target directory and returning a list of warning messages.

    Each attempt extracts to its own temporary directory that is then copied to
    `target_dir`. A failure of either the first extraction or of the copy of its
    output triggers the fallback. If the fallback fails too, its exception is
    raised to the caller. Temporary directories are always deleted.

    Note: there are a few cases where the primary extractor for a type may fail and
    a secondary extractor will succeed.
    """
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
    # attempt extract first to a temp dir
    temp_target1 = unicode(fileutils.get_temp_dir('extract1'))
    try:
        warnings = extractor1(abs_location, temp_target1)
        if DEBUG:
            logger.debug('extract_with_fallback: temp_target1: %(temp_target1)r' % locals())
        fileutils.copytree(temp_target1, abs_target_dir)
    except Exception:
        # Only fall back on regular errors: KeyboardInterrupt and SystemExit
        # must propagate and not silently start a second extraction.
        #
        # Create the second temp dir before entering the try block so that the
        # cleanup below never references an unassigned name if creation fails.
        temp_target2 = unicode(fileutils.get_temp_dir('extract2'))
        try:
            warnings = extractor2(abs_location, temp_target2)
            if DEBUG:
                logger.debug('extract_with_fallback: temp_target2: %(temp_target2)r' % locals())
            fileutils.copytree(temp_target2, abs_target_dir)
        finally:
            # cleanup the temporary output from extractor2
            fileutils.delete(temp_target2)
    finally:
        # cleanup the temporary output from extractor1
        fileutils.delete(temp_target1)
    return warnings
352+
353+
354+ # High level aliases to lower level extraction functions
355+ ########################################################
356+ extract_tar = libarchive2 .extract
357+ extract_patch = patch .extract
358+
359+ extract_deb = libarchive2 .extract
360+ extract_ar = libarchive2 .extract
361+ extract_msi = sevenzip .extract
362+ extract_cpio = libarchive2 .extract
363+
364+ # sevenzip should in theory be the best at extracting 7zip archives, but in practice libarchive works more often: try libarchive first and fall back to sevenzip
365+ extract_7z = functools .partial (extract_with_fallback , extractor1 = libarchive2 .extract , extractor2 = sevenzip .extract )
366+
367+ extract_zip = libarchive2 .extract
368+ extract_iso = sevenzip .extract
369+ extract_rar = sevenzip .extract
370+ extract_rpm = sevenzip .extract
371+ extract_xz = sevenzip .extract
372+ extract_lzma = sevenzip .extract
373+ extract_squashfs = sevenzip .extract
374+ extract_cab = sevenzip .extract
375+ extract_nsis = sevenzip .extract
376+ extract_ishield = sevenzip .extract
377+ extract_Z = sevenzip .extract
378+
379+
380+ # Archive handlers.
381+ ####################
348382
349383TarHandler = Handler (
350384 name = 'Tar' ,
@@ -795,6 +829,7 @@ def extract_twice(location, target_dir, extractor1, extractor2):
795829 strict = True
796830)
797831
832+ # Actual list of handlers
798833
799834archive_handlers = [
800835 TarHandler ,
0 commit comments