Skip to content

Commit 1c44ad7

Browse files
committed
respect file extensions in archive filenames of filetools.make_archive()
1 parent ad47cac commit 1c44ad7

File tree

2 files changed

+147
-59
lines changed

2 files changed

+147
-59
lines changed

easybuild/tools/filetools.py

Lines changed: 82 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -1404,7 +1404,7 @@ def get_local_dirs_purged():
14041404
return new_dir
14051405

14061406

1407-
def find_extension(filename, required=True):
1407+
def find_extension(filename):
14081408
"""Find best match for filename extension."""
14091409
# sort by length, so longest file extensions get preference
14101410
suffixes = sorted(EXTRACT_CMDS.keys(), key=len, reverse=True)
@@ -1413,12 +1413,9 @@ def find_extension(filename, required=True):
14131413

14141414
if res:
14151415
return res.group('ext')
1416-
1417-
if required:
1416+
else:
14181417
raise EasyBuildError("%s has unknown file extension", filename)
14191418

1420-
return None
1421-
14221419

14231420
def extract_cmd(filepath, overwrite=False):
14241421
"""
@@ -2648,7 +2645,7 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
26482645
"""
26492646
Downloads a git repository, at a specific tag or commit, recursively or not, and make an archive with it
26502647
2651-
:param filename: name of the archive to save the code to (must be extensionless)
2648+
:param filename: name of the archive file to save the code to (including extension)
26522649
:param target_dir: target directory where to save the archive to
26532650
:param git_config: dictionary containing url, repo_name, recursive, and one of tag or commit
26542651
"""
@@ -2684,11 +2681,6 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
26842681
if not url:
26852682
raise EasyBuildError("url not specified in git_config parameter")
26862683

2687-
file_ext = find_extension(filename, required=False)
2688-
if file_ext:
2689-
print_warning(f"Ignoring extension of filename '{filename}' set in git_config parameter")
2690-
filename = filename[:-len(file_ext)]
2691-
26922684
# prepare target directory and clone repository
26932685
mkdir(target_dir, parents=True)
26942686

@@ -2776,27 +2768,27 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
27762768
# Create archive
27772769
repo_path = os.path.join(tmpdir, repo_name)
27782770
reproducible = not keep_git_dir # presence of .git directory renders repo unreproducible
2779-
archive_path = make_archive(repo_path, archive_name=filename, archive_dir=target_dir, reproducible=reproducible)
2771+
archive_path = make_archive(repo_path, archive_file=filename, archive_dir=target_dir, reproducible=reproducible)
27802772

27812773
# cleanup (repo_name dir does not exist in dry run mode)
27822774
remove(tmpdir)
27832775

27842776
return archive_path
27852777

27862778

2787-
def make_archive(dir_path, archive_name=None, archive_dir=None, reproducible=False):
2779+
def make_archive(source_dir, archive_file=None, archive_dir=None, reproducible=True):
27882780
"""
2789-
Create a compressed tar archive in XZ format.
2781+
Create an archive file of the given directory
27902782
2791-
:dir_path: string with path to directory to be archived
2792-
:archive_name: string with extensionless filename of archive
2783+
:source_dir: string with path to directory to be archived
2784+
:archive_file: string with filename of archive
27932785
:archive_dir: string with path to directory to place the archive
2794-
:reproducuble: make a tarball that is reproducible accross systems
2795-
see https://reproducible-builds.org/docs/archives/
2786+
:reproducible: make a tarball that is reproducible across systems
2787+
- see https://reproducible-builds.org/docs/archives/
2788+
- requires an uncompressed or LZMA-compressed archive; other formats like .gz are not reproducible
2789+
due to arbitrary strings and timestamps added into their metadata.
27962790
2797-
Archive is compressed with LZMA into a .xz because that is compatible with
2798-
a reproducible archive. Other formats like .gz are not reproducible due to
2799-
arbitrary strings and timestamps getting added into their metadata.
2791+
Default behaviour: reproducible tarball in .tar.xz
28002792
"""
28012793
def reproducible_filter(tarinfo):
28022794
"Filter out system-dependent data from tarball"
@@ -2815,37 +2807,87 @@ def reproducible_filter(tarinfo):
28152807
tarinfo.uname = tarinfo.gname = ""
28162808
return tarinfo
28172809

2818-
if archive_name is None:
2819-
archive_name = os.path.basename(dir_path)
2810+
compression = {
2811+
# taken from EXTRACT_CMDS
2812+
'.gtgz': 'gz',
2813+
'.tar.gz': 'gz',
2814+
'.tgz': 'gz',
2815+
'.tar.bz2': 'bz2',
2816+
'.tb2': 'bz2',
2817+
'.tbz': 'bz2',
2818+
'.tbz2': 'bz2',
2819+
'.tar.xz': 'xz',
2820+
'.txz': 'xz',
2821+
'.tar': '',
2822+
}
2823+
reproducible_compression = ["", "xz"]
2824+
default_ext = ".tar.xz"
28202825

2821-
archive_ext = ".tar.xz"
2822-
archive_filename = archive_name + archive_ext
2823-
archive_path = archive_filename if archive_dir is None else os.path.join(archive_dir, archive_filename)
2826+
if archive_file is None:
2827+
archive_file = os.path.basename(source_dir) + default_ext
2828+
2829+
try:
2830+
archive_ext = find_extension(archive_file)
2831+
except EasyBuildError:
2832+
if "." in archive_file:
2833+
# archive filename has an unknown extension: leave it empty so the check below raises
2834+
archive_ext = ""
2835+
else:
2836+
# archive filename has no extension, use default one
2837+
archive_ext = default_ext
2838+
archive_file += archive_ext
2839+
2840+
if archive_ext not in compression:
2841+
# archive filename has unsupported extension
2842+
raise EasyBuildError(
2843+
f"Unsupported archive format: {archive_file}. Supported tarball extensions: {', '.join(compression)}"
2844+
)
2845+
_log.debug(f"Archive extension and compression: {archive_ext} in {compression[archive_ext]}")
2846+
2847+
archive_path = archive_file if archive_dir is None else os.path.join(archive_dir, archive_file)
2848+
2849+
archive = {
2850+
'name': archive_path,
2851+
'mode': f"w:{compression[archive_ext]}",
2852+
'format': tarfile.GNU_FORMAT,
2853+
'encoding': "utf-8",
2854+
}
2855+
2856+
if reproducible:
2857+
if compression[archive_ext] == "xz":
2858+
# ensure a consistent compression level in reproducible tarballs with XZ
2859+
archive["preset"] = 6
2860+
elif compression[archive_ext] not in reproducible_compression:
2861+
# requested archive compression cannot be made reproducible
2862+
print_warning(
2863+
f"Requested reproducible archive with unsupported file compression ({compression[archive_ext]}). "
2864+
"Please use XZ instead."
2865+
)
2866+
reproducible = False
2867+
2868+
archive_filter = reproducible_filter if reproducible else None
28242869

28252870
if build_option('extended_dry_run'):
28262871
# early return in dry run mode
2827-
dry_run_msg("Archiving '%s' into '%s'...", dir_path, archive_path)
2872+
dry_run_msg("Archiving '%s' into '%s'...", source_dir, archive_path)
28282873
return archive_path
2874+
_log.info("Archiving '%s' into '%s'...", source_dir, archive_path)
28292875

28302876
# TODO: replace with TarFile.add(recursive=True) once support for Python 3.6 is dropped
28312877
# since Python v3.7 tarfile automatically orders the list of files added to the archive
2832-
dir_files = [dir_path]
2878+
source_files = [source_dir]
28332879
# pathlib's glob includes hidden files
2834-
dir_files.extend([str(filepath) for filepath in pathlib.Path(dir_path).glob("**/*")])
2835-
dir_files.sort() # independent of locale
2836-
2837-
dir_path_prefix = os.path.dirname(dir_path)
2838-
archive_filter = reproducible_filter if reproducible else None
2880+
source_files.extend([str(filepath) for filepath in pathlib.Path(source_dir).glob("**/*")])
2881+
source_files.sort() # independent of locale
28392882

2840-
_log.info("Archiving '%s' into '%s'...", dir_path, archive_path)
2841-
with tarfile.open(archive_path, "w:xz", format=tarfile.GNU_FORMAT, encoding="utf-8", preset=6) as archive:
2842-
for filepath in dir_files:
2883+
with tarfile.open(**archive) as tar_archive:
2884+
for filepath in source_files:
28432885
# archive with target directory in its top level, remove any prefix in path
2844-
file_name = os.path.relpath(filepath, start=dir_path_prefix)
2845-
archive.add(filepath, arcname=file_name, recursive=False, filter=archive_filter)
2846-
_log.debug("File/folder added to archive '%s': %s", archive_filename, filepath)
2886+
file_name = os.path.relpath(filepath, start=os.path.dirname(source_dir))
2887+
tar_archive.add(filepath, arcname=file_name, recursive=False, filter=archive_filter)
2888+
_log.debug("File/folder added to archive '%s': %s", archive_file, filepath)
28472889

2848-
_log.info("Archive '%s' created successfully", archive_filename)
2890+
_log.info("Archive '%s' created successfully", archive_file)
28492891

28502892
return archive_path
28512893

test/framework/filetools.py

Lines changed: 65 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3145,32 +3145,78 @@ def test_make_archive(self):
31453145
ft.write_file(os.path.join(tardir, 'lib', 'lib.so'), 'Dummy library')
31463146
ft.write_file(os.path.join(tardir, 'include', 'header.h'), 'Dummy header')
31473147

3148-
unreprod_tar = ft.make_archive(tardir, reproducible=False)
3149-
unreprod_tar_chksum = ft.compute_checksum(unreprod_tar, checksum_type="sha256")
3150-
self.assertEqual(unreprod_tar, "test_archive.tar.xz")
3151-
os.remove(unreprod_tar)
3152-
reprod_tar = ft.make_archive(tardir, reproducible=True)
3148+
# default behaviour
3149+
unreprod_txz = ft.make_archive(tardir, reproducible=False)
3150+
unreprod_txz_chksum = ft.compute_checksum(unreprod_txz, checksum_type="sha256")
3151+
self.assertEqual(unreprod_txz, "test_archive.tar.xz")
3152+
self.assertExists(unreprod_txz)
3153+
os.remove(unreprod_txz)
3154+
reprod_txz = ft.make_archive(tardir, reproducible=True)
3155+
reprod_txz_chksum = ft.compute_checksum(reprod_txz, checksum_type="sha256")
3156+
self.assertEqual(reprod_txz, "test_archive.tar.xz")
3157+
self.assertExists(reprod_txz)
3158+
os.remove(reprod_txz)
3159+
# custom filenames
3160+
custom_txz = ft.make_archive(tardir, archive_file="custom_name", reproducible=True)
3161+
custom_txz_chksum = ft.compute_checksum(custom_txz, checksum_type="sha256")
3162+
self.assertEqual(custom_txz, "custom_name.tar.xz")
3163+
self.assertExists(custom_txz)
3164+
os.remove(custom_txz)
3165+
customdir_txz = ft.make_archive(tardir, archive_file="custom_name", archive_dir=tmpdir, reproducible=True)
3166+
customdir_txz_chksum = ft.compute_checksum(customdir_txz, checksum_type="sha256")
3167+
self.assertEqual(customdir_txz, os.path.join(tmpdir, "custom_name.tar.xz"))
3168+
self.assertExists(customdir_txz)
3169+
os.remove(customdir_txz)
3170+
# custom .tar
3171+
reprod_tar = ft.make_archive(tardir, archive_file="custom_name.tar", reproducible=True)
31533172
reprod_tar_chksum = ft.compute_checksum(reprod_tar, checksum_type="sha256")
3154-
self.assertEqual(reprod_tar, "test_archive.tar.xz")
3173+
self.assertEqual(reprod_tar, "custom_name.tar")
3174+
self.assertExists(reprod_tar)
31553175
os.remove(reprod_tar)
3156-
custom_tar = ft.make_archive(tardir, archive_name="custom_name", reproducible=True)
3157-
custom_tar_chksum = ft.compute_checksum(custom_tar, checksum_type="sha256")
3158-
self.assertEqual(custom_tar, "custom_name.tar.xz")
3159-
os.remove(custom_tar)
3160-
customdir_tar = ft.make_archive(tardir, archive_name="custom_name", archive_dir=tmpdir, reproducible=True)
3161-
customdir_tar_chksum = ft.compute_checksum(customdir_tar, checksum_type="sha256")
3162-
self.assertEqual(customdir_tar, os.path.join(tmpdir, "custom_name.tar.xz"))
3163-
os.remove(customdir_tar)
3176+
unreprod_tar = ft.make_archive(tardir, archive_file="custom_name.tar", reproducible=False)
3177+
unreprod_tar_chksum = ft.compute_checksum(unreprod_tar, checksum_type="sha256")
3178+
self.assertEqual(unreprod_tar, "custom_name.tar")
3179+
self.assertExists(unreprod_tar)
3180+
os.remove(unreprod_tar)
3181+
# custom .tar.gz
3182+
self.mock_stdout(True)
3183+
self.mock_stderr(True)
3184+
custom_tgz = ft.make_archive(tardir, archive_file="custom_name.tar.gz", reproducible=True)
3185+
stderr = self.get_stderr()
3186+
self.mock_stdout(False)
3187+
self.mock_stderr(False)
3188+
self.assertIn("WARNING: Requested reproducible archive with unsupported file compression (gz)", stderr)
3189+
custom_tgz_chksum = ft.compute_checksum(custom_tgz, checksum_type="sha256")
3190+
self.assertEqual(custom_tgz, "custom_name.tar.gz")
3191+
self.assertExists(custom_tgz)
3192+
os.remove(custom_tgz)
3193+
self.mock_stdout(True)
3194+
self.mock_stderr(True)
3195+
custom_tgz = ft.make_archive(tardir, archive_file="custom_name.tar.gz", reproducible=False)
3196+
stderr = self.get_stderr()
3197+
self.mock_stdout(False)
3198+
self.mock_stderr(False)
3199+
self.assertNotIn("WARNING: Requested reproducible archive with unsupported file compression (gz)", stderr)
3200+
custom_tgz_chksum = ft.compute_checksum(custom_tgz, checksum_type="sha256")
3201+
self.assertEqual(custom_tgz, "custom_name.tar.gz")
3202+
self.assertExists(custom_tgz)
3203+
os.remove(custom_tgz)
3204+
3205+
self.assertErrorRegex(EasyBuildError, "Unsupported archive format.*", ft.make_archive, tardir, "unknown.ext")
31643206

3165-
reference_checksum = "ec0f91a462c2743b19b428f4c177d7109d2ccc018dcdedc12570d9d735d6fb1b"
3207+
reference_checksum_txz = "ec0f91a462c2743b19b428f4c177d7109d2ccc018dcdedc12570d9d735d6fb1b"
3208+
reference_checksum_tar = "6e902e77925ab2faeef8377722434d4482f1fcc74af958c984c3f22509ae5084"
31663209

31673210
if sys.version_info[0] >= 3 and sys.version_info[1] >= 9:
31683211
# checksums of tarballs made by EB cannot be reliably checked prior to Python 3.9
31693212
# due to changes introduced in python/cpython#90021
3170-
self.assertNotEqual(unreprod_tar_chksum, reference_checksum)
3171-
self.assertEqual(reprod_tar_chksum, reference_checksum)
3172-
self.assertEqual(custom_tar_chksum, reference_checksum)
3173-
self.assertEqual(customdir_tar_chksum, reference_checksum)
3213+
self.assertNotEqual(unreprod_txz_chksum, reference_checksum_txz)
3214+
self.assertEqual(reprod_txz_chksum, reference_checksum_txz)
3215+
self.assertEqual(custom_txz_chksum, reference_checksum_txz)
3216+
self.assertEqual(customdir_txz_chksum, reference_checksum_txz)
3217+
self.assertNotEqual(unreprod_tar_chksum, reference_checksum_tar)
3218+
self.assertEqual(reprod_tar_chksum, reference_checksum_tar)
3219+
self.assertNotEqual(custom_tgz_chksum, reference_checksum_txz)
31743220

31753221
def test_is_sha256_checksum(self):
31763222
"""Test for is_sha256_checksum function."""

0 commit comments

Comments
 (0)