From b60115feb3103a5db93fe070cccaad4292ac76c5 Mon Sep 17 00:00:00 2001 From: Dusty Mabe Date: Wed, 5 Nov 2025 14:38:44 -0500 Subject: [PATCH 1/3] cmd-diff: move gc; conditionalize diff code In the case of `cosa diff --gc` there will be no active differs so let's just move the gc code earlier and conditionalize on active_differs being non-empty for the rest of it. --- src/cmd-diff | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/src/cmd-diff b/src/cmd-diff index adf4256dc6..8083e33a49 100755 --- a/src/cmd-diff +++ b/src/cmd-diff @@ -162,27 +162,27 @@ def main(): diff_from = DiffBuildTarget.from_build(builds, args.diff_from, args.arch) diff_to = DiffBuildTarget.from_build(builds, args.diff_to, args.arch) + if args.gc: + # some of the dirs in the rootfs are dumb and have "private" bits + runcmd(['find', DIFF_CACHE, '-type', 'd', '-exec', 'chmod', 'u+rwx', '{}', '+']) + shutil.rmtree(DIFF_CACHE) + # get activated differs active_differs = [] for differ in DIFFERS: if getattr(args, differ.name.replace('-', '_')): active_differs += [differ] - # ensure commits are imported if we know we'll need them - ostree_import = max([d.needs_ostree for d in active_differs]) - if ostree_import > OSTreeImport.NO: - for target in [diff_from, diff_to]: - import_ostree_commit('.', target.dir, target.meta, extract_json=False, - partial_import=(ostree_import == OSTreeImport.PARTIAL)) - - # start diff'ing - for differ in active_differs: - differ.function(diff_from, diff_to) - - if args.gc: - # some of the dirs in the rootfs are dumb and have "private" bits - runcmd(['find', DIFF_CACHE, '-type', 'd', '-exec', 'chmod', 'u+rwx', '{}', '+']) - shutil.rmtree(DIFF_CACHE) + if active_differs: + # ensure commits are imported if we know we'll need them + ostree_import = max([d.needs_ostree for d in active_differs]) + if ostree_import > OSTreeImport.NO: + for target in [diff_from, diff_to]: + import_ostree_commit('.', target.dir, target.meta, extract_json=False, + partial_import=(ostree_import == OSTreeImport.PARTIAL)) + # start diff'ing + for differ in active_differs: + differ.function(diff_from, diff_to) def parse_args(): From 95f7dc688e82df7a6654f7dc5d20d97109e7f5e7 Mon Sep 17 00:00:00 2001 From: Dusty Mabe Date: Wed, 5 Nov 2025 14:49:14 -0500 Subject: [PATCH 2/3] cmd-diff: extract files instead of using FUSE This approach use more disk space but disk access for the diff will be faster. Files will also survive under after git diff returns in case manual inspection is desired. Co-authored-by: Jean-Baptiste Trystram --- src/cmd-diff | 112 ++++++++++++++++++++++++--------------------------- 1 file changed, 53 insertions(+), 59 deletions(-) diff --git a/src/cmd-diff b/src/cmd-diff index 8083e33a49..4022b946c7 100755 --- a/src/cmd-diff +++ b/src/cmd-diff @@ -7,9 +7,7 @@ import json import subprocess import sys import tempfile -import time import rpm -from multiprocessing import Process from dataclasses import dataclass from enum import IntEnum @@ -531,72 +529,68 @@ def run_guestfs_mount(image_path, mount_target): # the paths to be used for analysis and then clean up once given back # control. def diff_metal_helper(diff_from, diff_to): - metal_from = get_metal_path(diff_from) - metal_to = get_metal_path(diff_to) - - mount_dir_from = os.path.join(cache_dir("metal"), diff_from.id) - mount_dir_to = os.path.join(cache_dir("metal"), diff_to.id) - - for d in [mount_dir_from, mount_dir_to]: - if os.path.exists(d): - shutil.rmtree(d) - os.makedirs(d) - - # As the libreguest mount call is blocking until unmounted, let's - # do that in a separate thread - p_from = Process(target=run_guestfs_mount, args=(metal_from, mount_dir_from)) - p_to = Process(target=run_guestfs_mount, args=(metal_to, mount_dir_to)) - - try: - p_from.start() - p_to.start() - # Wait for the FUSE mounts to be ready. We'll check for a known file. - for i, d in enumerate([mount_dir_from, mount_dir_to]): - p = p_from if i == 0 else p_to - timeout = 60 # seconds - start_time = time.time() - check_file = os.path.join(d, 'ostree') - while not os.path.exists(check_file): - time.sleep(1) - if time.time() - start_time > timeout: - raise Exception(f"Timeout waiting for mount in {d}") - if not p.is_alive(): - raise Exception(f"A guestfs process for {os.path.basename(d)} died unexpectedly.") - - # Allow the caller to operate on these values - yield mount_dir_from, mount_dir_to - - finally: - # Unmount the FUSE binds, this will make the guestfs mount calls return - runcmd(['fusermount', '-u', mount_dir_from], check=False) - runcmd(['fusermount', '-u', mount_dir_to], check=False) - - # Ensure the background processes are terminated - def shutdown_process(process): - process.join(timeout=5) - if process.is_alive(): - process.terminate() - process.join() - - shutdown_process(p_from) - shutdown_process(p_to) + metal_image_from = get_metal_path(diff_from) + metal_image_to = get_metal_path(diff_to) + + diff_dir_from = os.path.join(cache_dir("metal"), diff_from.id) + diff_dir_to = os.path.join(cache_dir("metal"), diff_to.id) + + for image_path, diff_dir in [(metal_image_from, diff_dir_from), + (metal_image_to, diff_dir_to)]: + if os.path.exists(diff_dir): + # If it exists assume it's cached already and we don't + # need to do anything. If it's stale for whatever reason + # the user can `cosa diff --gc`. + continue + + os.makedirs(diff_dir) + + g = None + try: + g = guestfs.GuestFS(python_return_dict=True) + g.set_backend("direct") + g.add_drive_opts(image_path, readonly=1) + g.launch() + + # Mount the disks in the guestfs VM + root = g.findfs_label("root") + g.mount_ro(root, "/") + boot = g.findfs_label("boot") + g.mount_ro(boot, "/boot") + efi = g.findfs_label("EFI-SYSTEM") + g.mount_ro(efi, "/boot/efi") + + with tempfile.NamedTemporaryFile(suffix=".tar", delete=True) as tmp_tar: + g.tar_out("/", tmp_tar.name, xattrs=True, selinux=True, excludes=excludes) + # Extract the tarball. + runcmd(['tar', '-xf', tmp_tar.name, '-C', diff_dir]) + + except Exception as e: + print(f"Error in guestfs process for {image_path}: {e}", file=sys.stderr) + raise + finally: + if g: + g.close() + + # Allow the caller to operate on these values + return diff_dir_from, diff_dir_to def diff_metal(diff_from, diff_to): - for mount_dir_from, mount_dir_to in diff_metal_helper(diff_from, diff_to): - git_diff(mount_dir_from, mount_dir_to) + mount_dir_from, mount_dir_to = diff_metal_helper(diff_from, diff_to) + git_diff(mount_dir_from, mount_dir_to) def diff_metal_du(diff_from, diff_to): - for mount_dir_from, mount_dir_to in diff_metal_helper(diff_from, diff_to): - cmd = ['find', '.', '-type', 'd', '-exec', 'du', '-sh', '{}', ';'] - diff_cmd_outputs(cmd, mount_dir_from, mount_dir_to, strategy=DiffCmdOutputStrategy.CD) + mount_dir_from, mount_dir_to = diff_metal_helper(diff_from, diff_to) + cmd = ['find', '.', '-type', 'd', '-exec', 'du', '-sh', '{}', ';'] + diff_cmd_outputs(cmd, mount_dir_from, mount_dir_to, strategy=DiffCmdOutputStrategy.CD) def diff_metal_ls(diff_from, diff_to): - for mount_dir_from, mount_dir_to in diff_metal_helper(diff_from, diff_to): - cmd = ['find', '.'] - diff_cmd_outputs(cmd, mount_dir_from, mount_dir_to, strategy=DiffCmdOutputStrategy.CD) + mount_dir_from, mount_dir_to = diff_metal_helper(diff_from, diff_to) + cmd = ['find', '.'] + diff_cmd_outputs(cmd, mount_dir_from, mount_dir_to, strategy=DiffCmdOutputStrategy.CD) def diff_cmd_outputs(cmd, path_from, path_to, strategy: DiffCmdOutputStrategy = DiffCmdOutputStrategy.TEMPLATE): From 34a255d941aed709f67ba1f349b7be516c258e72 Mon Sep 17 00:00:00 2001 From: Dusty Mabe Date: Wed, 5 Nov 2025 14:51:11 -0500 Subject: [PATCH 3/3] cmd-diff: attempt to normalize metal diffs The diffs contain a bunch of files with sha256 checksums in them, which are unique on every build. Let's attempt to normalize the output directories so the diffs will me less noisy and more meaningful. Also fixup some permissions on some files because they can't be diffed otherwise. --- src/cmd-diff | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/cmd-diff b/src/cmd-diff index 4022b946c7..86029ca057 100755 --- a/src/cmd-diff +++ b/src/cmd-diff @@ -561,9 +561,14 @@ def diff_metal_helper(diff_from, diff_to): g.mount_ro(efi, "/boot/efi") with tempfile.NamedTemporaryFile(suffix=".tar", delete=True) as tmp_tar: + # Exclude ostree/repo/objects. It just adds noise to the diff + excludes = ['*ostree/repo/objects/*'] g.tar_out("/", tmp_tar.name, xattrs=True, selinux=True, excludes=excludes) - # Extract the tarball. - runcmd(['tar', '-xf', tmp_tar.name, '-C', diff_dir]) + # Extract the tarball. Normalize the output by replacing sha256sum hashes + # in filenames with XXXXXXXXXXXXXXXX so that we can get a real diff between + # two of the same files in different builds. + runcmd(['tar', '-xf', tmp_tar.name, '-C', diff_dir, + '--transform', 's|[[:xdigit:]]{64}|XXXXXXXXXXXXXXXX|gx']) except Exception as e: print(f"Error in guestfs process for {image_path}: {e}", file=sys.stderr) @@ -572,6 +577,13 @@ def diff_metal_helper(diff_from, diff_to): if g: g.close() + # Some files like /etc/shadow and sudo have no read permissions so let's + # open it up so the difftool can access it. + runcmd(['find', diff_dir, '-type', 'f', '-perm', '000', + '-exec', 'chmod', '--verbose', '444', '{}', ';']) + runcmd(['find', diff_dir, '-type', 'f', '-perm', '111', + '-exec', 'chmod', '--verbose', '555', '{}', ';']) + # Allow the caller to operate on these values return diff_dir_from, diff_dir_to