Skip to content

Commit d7195c7

Browse files
committed
use tarfile module instead of executing external shell commands to create archives of git repos
1 parent d0a55ba commit d7195c7

File tree

1 file changed

+64
-38
lines changed

1 file changed

+64
-38
lines changed

easybuild/tools/filetools.py

Lines changed: 64 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,15 @@
4848
import inspect
4949
import itertools
5050
import os
51+
import pathlib
5152
import platform
5253
import re
5354
import shutil
5455
import signal
5556
import stat
5657
import ssl
5758
import sys
59+
import tarfile
5860
import tempfile
5961
import time
6062
import zlib
@@ -2644,7 +2646,7 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
26442646
"""
26452647
Downloads a git repository, at a specific tag or commit, recursively or not, and make an archive with it
26462648
2647-
:param filename: name of the archive to save the code to (must be .tar.gz)
2649+
:param filename: name of the archive to save the code to (must be extensionless)
26482650
:param target_dir: target directory where to save the archive to
26492651
:param git_config: dictionary containing url, repo_name, recursive, and one of tag or commit
26502652
"""
@@ -2680,8 +2682,10 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
26802682
if not url:
26812683
raise EasyBuildError("url not specified in git_config parameter")
26822684

2683-
if not filename.endswith('.tar.gz'):
2684-
raise EasyBuildError("git_config currently only supports filename ending in .tar.gz")
2685+
file_ext = find_extension(filename)
2686+
if file_ext:
2687+
print_warning(f"Ignoring extension of filename '{filename}' set in git_config parameter")
2688+
filename = filename[:-len(file_ext)]
26852689

26862690
# prepare target directory and clone repository
26872691
mkdir(target_dir, parents=True)
@@ -2768,50 +2772,72 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
27682772
run_shell_cmd(cmd, work_dir=work_dir, hidden=True, verbose_dry_run=True)
27692773

27702774
# Create archive
2771-
archive_path = os.path.join(target_dir, filename)
2772-
2773-
if keep_git_dir:
2774-
# create archive of git repo including .git directory
2775-
tar_cmd = f"tar cfvz {archive_path} {repo_name}"
2776-
else:
2777-
# create reproducible archive
2778-
tar_cmd = reproducible_archive_cmd(repo_name, archive_path)
2779-
2780-
run_shell_cmd(tar_cmd, work_dir=tmpdir, hidden=True, verbose_dry_run=True)
2775+
repo_path = os.path.join(tmpdir, repo_name)
2776+
archive_path = make_archive(repo_path, archive_name=filename, archive_dir=target_dir, reproducible=not keep_git_dir)
27812777

27822778
# cleanup (repo_name dir does not exist in dry run mode)
27832779
remove(tmpdir)
27842780

27852781
return archive_path
27862782

27872783

2788-
def reproducible_archive_cmd(dir_name, archive_name):
2784+
def make_archive(dir_path, archive_name=None, archive_dir=None, reproducible=False):
27892785
"""
2790-
Return string with command to make reproducible archive from a given directory
2786+
Create a compressed tar archive in XZ format.
2787+
2788+
:dir_path: string with path to directory to be archived
2789+
:archive_name: string with extensionless filename of archive
2790+
:archive_dir: string with path to directory to place the archive
2791+
:reproducible: make a tarball that is reproducible across systems
27912792
see https://reproducible-builds.org/docs/archives/
2792-
"""
2793-
try:
2794-
cmd_pipe = [
2795-
# stop on failure of any command in the pipe
2796-
'set', '-eo pipefail', ';',
2797-
# print names of all files and folders excluding .git directory
2798-
'find', str(dir_name), '-name ".git"', '-prune', '-o', '-print0',
2799-
# reset access and modification timestamps to epoch 0
2800-
'-exec', 'touch', '--date=1970-01-01T00:00:00.00Z', '{}', r'\;',
2801-
# reset file permissions of cloned repo (equivalent to --mode in GNU tar)
2802-
'-exec', 'chmod', '"go+u,go-w"', '{}', r'\;', '|',
2803-
# sort file list (equivalent to --sort in GNU tar)
2804-
'LC_ALL=C', 'sort', '--zero-terminated', '|',
2805-
# create tarball in GNU format with ownership and permissions reset
2806-
'tar', '--create', '--no-recursion', '--owner=0', '--group=0', '--numeric-owner',
2807-
'--format=gnu', '--null', '--files-from', '-', '|',
2808-
# compress tarball with gzip without original file name and timestamp
2809-
'gzip', '--no-name', '>', str(archive_name)
2810-
]
2811-
except TypeError as err:
2812-
raise EasyBuildError("reproducible_archive_cmd: wrong directory or archive name given") from err
2813-
2814-
return " ".join(cmd_pipe)
2793+
2794+
Archive is compressed with LZMA into a .xz because that is compatible with
2795+
a reproducible archive. Other formats like .gz are not reproducible due to
2796+
arbitrary strings and timestamps getting added into their metadata.
2797+
"""
2798+
def reproducible_filter(tarinfo):
    """
    Filter out system-dependent data from tarball

    :param tarinfo: tarfile.TarInfo instance of the member about to be added
    :return: the same TarInfo with normalized metadata, or None to leave the member out of the archive
    """
    # contents of '.git' subdir are inherently system dependent
    if "/.git/" in tarinfo.name or tarinfo.name.endswith("/.git"):
        return None
    # set timestamp to epoch 0
    tarinfo.mtime = 0
    # reset file permissions by applying go+u,go-w
    user_mode = tarinfo.mode & stat.S_IRWXU
    # go+u: copy the permission bits of user onto group and other
    mode_with_user_bits = tarinfo.mode | (user_mode >> 3) | (user_mode >> 6)
    # go-w: the write bits must be cleared on the complete mode, otherwise group/other
    # write permissions that were already set (e.g. due to a permissive umask) survive
    tarinfo.mode = mode_with_user_bits & ~(stat.S_IWGRP | stat.S_IWOTH)
    # reset ownership numeric UID/GID 0
    tarinfo.uid = tarinfo.gid = 0
    tarinfo.uname = tarinfo.gname = ""
    return tarinfo
2812+
2813+
if archive_name is None:
2814+
archive_name = os.path.basename(dir_path)
2815+
2816+
archive_ext = ".tar.xz"
2817+
archive_filename = archive_name + archive_ext
2818+
archive_path = archive_filename if archive_dir is None else os.path.join(archive_dir, archive_filename)
2819+
2820+
# TODO: replace with TarFile.add(recursive=True) when support for Python 3.6 drops
2821+
# since Python v3.7 tarfile automatically orders the list of files added to the archive
2822+
dir_files = [dir_path]
2823+
# pathlib's glob includes hidden files
2824+
dir_files.extend([str(filepath) for filepath in pathlib.Path(dir_path).glob("**/*")])
2825+
dir_files.sort() # independent of locale
2826+
2827+
dir_path_prefix = os.path.dirname(dir_path)
2828+
archive_filter = reproducible_filter if reproducible else None
2829+
2830+
_log.info("Archiving '%s' into '%s'...", dir_path, archive_path)
2831+
with tarfile.open(archive_path, "w:xz", format=tarfile.GNU_FORMAT, encoding="utf-8", preset=6) as archive:
2832+
for filepath in dir_files:
2833+
# archive with target directory in its top level, remove any prefix in path
2834+
file_name = os.path.relpath(filepath, start=dir_path_prefix)
2835+
archive.add(filepath, arcname=file_name, recursive=False, filter=archive_filter)
2836+
_log.debug("File/folder added to archive '%s': %s", archive_filename, filepath)
2837+
2838+
_log.info("Archive '%s' created successfully", archive_filename)
2839+
2840+
return archive_path
28152841

28162842

28172843
def move_file(path, target_path, force_in_dry_run=False):

0 commit comments

Comments
 (0)