Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
263 changes: 262 additions & 1 deletion src/fromager/gitutils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,17 @@
import logging
import os
import pathlib
import re
import tarfile
import tempfile
import typing
from urllib.parse import urlparse

from packaging.requirements import Requirement
from packaging.utils import canonicalize_name
from packaging.version import Version

from fromager import context, external_commands
from . import context, external_commands, tarballs

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -61,3 +68,257 @@ def git_clone(
)

return output_dir


class BeforeSubmoduleCallback(typing.Protocol):
    """Call signature of the hook that runs before ``git submodule update``.

    Implementations receive keyword-only arguments: the clone directory,
    the submodule name, and the submodule's remote URL. The return value
    is ignored.
    """

    def __call__(
        self,
        *,
        clonedir: pathlib.Path,
        name: str,
        remote: str,
    ) -> None: ...


def git_clone_and_tarball(
    *,
    destdir: pathlib.Path,
    prefix: tuple[Requirement, Version] | str,
    repo_url: str,
    tag: str | None = None,
    ref: str | None = None,
    before_submodule_update: BeforeSubmoduleCallback | None = None,
    git_archival_tag_match: str | None = None,
) -> pathlib.Path:
    """Clone a git repository and generate a tar ball

    This function creates a tar ball from a remote URL, with all submodules
    (non-recursive), and includes a ``.git_archival.txt`` for setuptools-scm.

    :param destdir: directory where the tar ball is stored
    :param prefix: prefix of the tar ball and first level directory. Either
        a string or a ``(Requirement, Version)`` tuple, which is turned into
        ``{canonical name}-{version}``.
    :param repo_url: git clone url
    :param tag: tag name to clone
    :param ref: git ref to clone (mutually exclusive with *tag*)
    :param before_submodule_update: callback that runs before
        ``git submodule update``. The callback is executed for each submodule.
    :param git_archival_tag_match: git describe tag pattern for ``.git_archival.txt``
    :returns: path of the generated tar ball
    :raises TypeError: if a tuple *prefix* does not hold a ``Requirement``
        and a ``Version``

    This example code creates a ``xformers-0.0.31.post1.tar.gz`` tar ball:

    .. code-block::

        def cb(*, clonedir: pathlib.Path, name: str, remote: str) -> None:
            subprocess.check_call(
                ["git", "config", f"submodule.{name}.url", mirror(remote)],
                cwd=str(clonedir)
            )

        req = Requirement("xformers")
        tag = "v0.0.31.post1"
        version = Version(tag)
        repo_url = "https://github.com/facebookresearch/xformers.git"
        destdir = pathlib.Path("destdir").absolute()
        tarball = git_clone_and_tarball(
            prefix=(req, version),
            destdir=destdir,
            repo_url=repo_url,
            tag=tag,
            before_submodule_update=cb,
        )
    """
    if isinstance(prefix, tuple):
        req = prefix[0]
        version = prefix[1]
        # Explicit checks instead of ``assert``: assert statements are
        # removed when Python runs with -O, which would skip the validation.
        if not isinstance(req, Requirement):
            raise TypeError(
                f"prefix[0] must be a Requirement, got {type(req).__name__}"
            )
        if not isinstance(version, Version):
            raise TypeError(
                f"prefix[1] must be a Version, got {type(version).__name__}"
            )
        canon_name = canonicalize_name(req.name)
        prefix = f"{canon_name}-{version}"

    # The clone only lives in a temporary directory; the tar ball written to
    # destdir is the sole result.
    with tempfile.TemporaryDirectory() as tmpdir:
        clonedir = pathlib.Path(tmpdir).absolute()
        _git_clone(
            clonedir=clonedir,
            repo_url=repo_url,
            tag=tag,
            ref=ref,
        )
        # Submodules are initialized but not fetched yet, so the callback
        # can still adjust their configuration.
        submodules = _git_submodule_list(clonedir=clonedir)
        if before_submodule_update is not None:
            for name, remote in submodules.items():
                before_submodule_update(clonedir=clonedir, name=name, remote=remote)
        _get_submodule_update(clonedir=clonedir)
        _make_git_archival_txt(
            clonedir=clonedir,
            tag_match=git_archival_tag_match,
        )
        tarball = _create_tarball(
            clonedir=clonedir,
            destdir=destdir,
            prefix=prefix,
        )

    return tarball


def _git_clone(
*,
clonedir: pathlib.Path,
repo_url: str,
tag: str | None,
ref: str | None,
) -> None:
"""Clone a git repository into *clonedir*

Initializes submodules
"""
if not bool(tag) ^ bool(ref):
raise ValueError("tag and ref are mutually exclusive")

# Create a clean URL without any credentials for logging
parsed_url = urlparse(repo_url)
clean_url = parsed_url._replace(netloc=parsed_url.hostname or "").geturl()
logger.info(f"cloning {clean_url}, tag {tag}, ref {ref}, into {clonedir}")

cmd: list[str] = ["git", "clone"]
if tag is not None:
# --branch works with branches and tags, but not with commits
cmd.extend(["--branch", tag, "--depth", "1"])
cmd.extend([repo_url, str(clonedir)])
external_commands.run(cmd, network_isolation=False)

# --branch only works with names, so we have to checkout the reference we
# actually want if it is not a name
if ref is not None:
external_commands.run(
["git", "checkout", "--force", ref],
cwd=str(clonedir),
network_isolation=False,
)

# initialize submodule but do not fetch them, yet, to allow customization.
external_commands.run(
["git", "submodule", "init"],
cwd=str(clonedir),
network_isolation=False,
)


# Matches ``submodule.NAME.url=REMOTE`` lines of ``git config --list``.
# The name group is greedy because submodule names may contain dots.
_SUBMODULE_RE = re.compile(r"^submodule\.(.*)\.url=(.*)$")


def _git_submodule_list(*, clonedir: pathlib.Path) -> dict[str, str]:
    """Get submodule mapping of name -> remote

    Reads the repository-local git configuration of *clonedir* and collects
    every ``submodule.NAME.url`` entry.

    Submodule must be initialized

    :param clonedir: directory of the git checkout
    :returns: mapping of submodule name to its remote URL
    """
    # ``--list`` is the option spelling that older git releases understand;
    # the ``git config list`` subcommand only exists in recent git.
    out = external_commands.run(
        ["git", "config", "--list", "--local"],
        cwd=str(clonedir),
        network_isolation=False,
    )
    submodules: dict[str, str] = {}
    for line in out.splitlines():
        if mo := _SUBMODULE_RE.match(line):
            name, remote = mo.groups()
            submodules[name] = remote
    logger.debug(f"found submodules: {submodules}")
    return submodules


def _get_submodule_update(*, clonedir: pathlib.Path) -> None:
    """Update and fetch submodules

    Runs ``git submodule update --force --depth 1`` inside *clonedir*.

    :param clonedir: directory of the git checkout
    """
    external_commands.run(
        ["git", "submodule", "update", "--force", "--depth", "1"],
        cwd=str(clonedir),
        network_isolation=False,
    )


def _make_git_archival_txt(
    clonedir: pathlib.Path,
    *,
    tag_match: str | None = None,
) -> str:
    """Generate a .git_archival.txt file for setuptools-scm

    The file is written to the top level of *clonedir* and holds the commit
    hash, the commit date, and a ``git describe`` style name of the latest
    commit.

    https://setuptools-scm.readthedocs.io/en/latest/usage/#git-archives

    :param clonedir: directory of the git checkout
    :param tag_match: tag pattern passed to the ``describe`` placeholder,
        defaults to ``*[0-9]*``
    :returns: the content written to ``.git_archival.txt``
    """
    if not tag_match:
        tag_match = "*[0-9]*"
    # ignore existing .git_archival.txt template
    # TODO: Figure out how to use an existing file and replace its template variables.
    archival = clonedir / ".git_archival.txt"
    parts = [
        "node: %H",  # commit hash
        "node-date: %cI",  # commit date
        f"describe-name: %(describe:tags=true,match={tag_match})",  # tag + commits since tags
    ]
    sep = "\n"  # cannot use backslash in f-strings on Python 3.11
    out = external_commands.run(
        [
            "git",
            "log",
            f"--pretty=tformat:{sep.join(parts)}",
            "-1",
        ],
        cwd=str(clonedir),
        network_isolation=False,
    )
    # Explicit encoding, so the file content does not depend on the locale's
    # default encoding.
    archival.write_text(out, encoding="utf-8")
    logger.debug(f"Generated {archival} with content: \n{out}")
    return out


def _create_tarball(
    *,
    clonedir: pathlib.Path,
    destdir: pathlib.Path,
    prefix: str,
) -> pathlib.Path:
    """Pack the checkout in *clonedir* into a gzip-compressed tar ball.

    The archive is stored in *destdir* under the name ``{prefix}.tar.gz``;
    version control files are left out. An existing file of the same name
    is replaced.

    :returns: path of the archive
    :raises ValueError: if *prefix* contains the path separator
    """
    # the prefix is used as a plain file name, it must not have path parts
    if os.sep in prefix:
        raise ValueError(f"{prefix=} cannot contain {os.sep}")

    archive_path = destdir.joinpath(f"{prefix}.tar.gz")
    # mode "x" refuses to overwrite, so drop a leftover file first
    if archive_path.is_file():
        logger.debug(f"removing stale tar ball {archive_path}")
        archive_path.unlink()

    archive = tarfile.open(archive_path, "x:gz", format=tarfile.PAX_FORMAT)
    with archive:
        tarballs.tar_reproducible_with_prefix(
            tar=archive,
            basedir=clonedir,
            prefix=prefix,
            exclude_vcs=True,
        )
    return archive_path


def test():
    """Manual smoke test: build an xformers tar ball in the current directory

    Clones from GitHub, prints every submodule's name and remote, and
    finally prints the path of the generated tar ball.
    """
    logging.basicConfig(level=logging.DEBUG)

    def cb(*, clonedir: pathlib.Path, name: str, remote: str) -> None:
        print(name, remote)

    tag = "v0.0.31.post1"
    version = Version(tag)
    req = Requirement("xformers")
    repo_url = "https://github.com/facebookresearch/xformers.git"
    destdir = pathlib.Path(".").absolute()
    tarball = git_clone_and_tarball(
        destdir=destdir,
        prefix=(req, version),
        repo_url=repo_url,
        tag=tag,
        before_submodule_update=cb,
    )
    print(tarball)


if __name__ == "__main__":
    test()
61 changes: 48 additions & 13 deletions src/fromager/tarballs.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pathlib
import stat
import tarfile
import typing

VCS_DIRS = {".bzr", ".git", ".hg", ".svn"}

Expand All @@ -24,6 +25,24 @@ def _tar_reset(tarinfo: tarfile.TarInfo) -> tarfile.TarInfo:
return tarinfo


def _tar_content(
    *, basedir: pathlib.Path, exclude_vcs: bool = False
) -> typing.Iterable[str]:
    """Return the sorted paths of *basedir* and everything below it.

    With *exclude_vcs*, entries whose name is listed in ``VCS_DIRS`` are
    left out together with everything underneath them.
    """
    entries: list[str] = [str(basedir)]  # the root itself is part of the result
    for parent, subdirs, filenames in os.walk(basedir):
        if exclude_vcs:
            # Slice assignment mutates the lists owned by os.walk, which
            # keeps it from descending into the dropped directories. The
            # file list is filtered as well, because a git submodule has a
            # `.git` file rather than a directory.
            subdirs[:] = [name for name in subdirs if name not in VCS_DIRS]
            filenames[:] = [name for name in filenames if name not in VCS_DIRS]
        entries.extend(os.path.join(parent, name) for name in subdirs)
        entries.extend(os.path.join(parent, name) for name in filenames)
    return sorted(entries)


def tar_reproducible(
tar: tarfile.TarFile,
basedir: pathlib.Path,
Expand All @@ -39,21 +58,37 @@ def tar_reproducible(
If ``exclude_vcs`` is True, then Bazaar, git, Mercurial, and subversion
directories and files are excluded.
"""
content = [str(basedir)] # convert from pathlib.Path, if that's what we have
for root, dirs, files in os.walk(basedir):
if exclude_vcs:
# modify lists in-place, so os.walk does not descent into the
# excluded entries. git submodules have a `.git` file.
dirs[:] = [directory for directory in dirs if directory not in VCS_DIRS]
files[:] = [filename for filename in files if filename not in VCS_DIRS]
for directory in dirs:
content.append(os.path.join(root, directory))
for filename in files:
content.append(os.path.join(root, filename))
content.sort()

content = _tar_content(basedir=basedir, exclude_vcs=exclude_vcs)
for fn in content:
# Ensure that the paths in the tarfile are rooted at the prefix
# directory, if we have one.
arcname = fn if prefix is None else os.path.relpath(fn, prefix)
tar.add(fn, filter=_tar_reset, recursive=False, arcname=arcname)


def tar_reproducible_with_prefix(
    tar: tarfile.TarFile,
    basedir: pathlib.Path,
    prefix: str,
    *,
    exclude_vcs: bool = False,
) -> None:
    """Create reproducible tar file with a prefix

    Add content from basedir to already opened tar. All archive names are
    relative to ``basedir`` and with ``prefix`` prepended. The ``prefix``
    must be relative and can be ``.``. This is equivalent to
    ``tar -czf $tarfile -C $basedir --transform 's,^,${prefix}/' .`` or
    ``git archive --prefix ${prefix}/``.

    If ``exclude_vcs`` is True, then Bazaar, git, Mercurial, and subversion
    directories and files are excluded.

    :raises ValueError: if ``prefix`` contains the path separator
    """
    if os.sep in prefix:
        # f-string, so the message shows the offending values
        raise ValueError(f"prefix {prefix} cannot contain {os.sep}")
    content = _tar_content(basedir=basedir, exclude_vcs=exclude_vcs)
    for fn in content:
        # archive names are relative to basedir
        # prefix is prepended and path is normalized
        arcname = os.path.normpath(os.path.join(prefix, os.path.relpath(fn, basedir)))
        tar.add(fn, filter=_tar_reset, recursive=False, arcname=arcname)
16 changes: 16 additions & 0 deletions tests/test_tarballs.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,19 @@ def test_vcs_exclude(tmp_path: pathlib.Path) -> None:
with tarfile.open(t1, "r") as tf:
names = tf.getnames()
assert names == [str(p).lstrip(os.sep) for p in [root, root / "a"]]


def test_tar_reproducible_with_prefix(tmp_path: pathlib.Path) -> None:
    """Member names are relative to basedir with the prefix prepended."""
    basedir = tmp_path / "root"
    basedir.mkdir()
    nested = basedir / "subdir"
    nested.mkdir()
    (nested / "a").write_text("this is file a")

    archive = tmp_path / "out1.tar"
    with tarfile.open(archive, "w") as writer:
        tarballs.tar_reproducible_with_prefix(
            tar=writer,
            basedir=basedir,
            prefix="someprefix",
        )

    with tarfile.open(archive, "r") as reader:
        member_names = reader.getnames()

    expected_names = ["someprefix", "someprefix/subdir", "someprefix/subdir/a"]
    assert member_names == expected_names
Loading