Skip to content

Commit f104fe6

Browse files
titu1994hemildesai
andauthored
Refactor tar packaging logic to work for submodule and extra repo (#347)
* Refactor tar packaging logic for improved performance and simplicity Signed-off-by: smajumdar <[email protected]> * Clarify tar repacking logic to avoid issues with concatenating tar files Signed-off-by: smajumdar <[email protected]> * Remove redundant test for concatenating tar files on Linux Signed-off-by: smajumdar <[email protected]> * spell check fix Signed-off-by: Hemil Desai <[email protected]> --------- Signed-off-by: smajumdar <[email protected]> Signed-off-by: Hemil Desai <[email protected]> Co-authored-by: Hemil Desai <[email protected]>
1 parent 6bc4319 commit f104fe6

File tree

3 files changed

+11
-59
lines changed

3 files changed

+11
-59
lines changed

nemo_run/core/packaging/git.py

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -88,22 +88,15 @@ def _concatenate_tar_files(
8888
quoted_files = [shlex.quote(f) for f in files_to_concatenate]
8989
quoted_output_file = shlex.quote(output_file)
9090

91-
if os.uname().sysname == "Linux":
92-
# Start from the first archive then append the rest, to avoid self-append issues
93-
first_file, *rest_files = quoted_files
94-
ctx.run(f"cp {first_file} {quoted_output_file}")
95-
if rest_files:
96-
ctx.run(f"tar Af {quoted_output_file} {' '.join(rest_files)}")
97-
# Remove all input fragments
98-
ctx.run(f"rm {' '.join(quoted_files)}")
99-
else:
100-
# Extract all fragments and repack once (faster than iterative extract/append)
101-
temp_dir = f"temp_extract_{uuid.uuid4()}"
102-
ctx.run(f"mkdir -p {temp_dir}")
103-
for file in quoted_files:
104-
ctx.run(f"tar xf {file} -C {temp_dir}")
105-
ctx.run(f"tar cf {quoted_output_file} -C {temp_dir} .")
106-
ctx.run(f"rm -r {temp_dir} {' '.join(quoted_files)}")
91+
# Extract all fragments and repack once (faster than iterative extract/append)
92+
# Note: Avoid using tar Af based solution as it does not properly concatenate
93+
# tar files for additional filepaths and submodules.
94+
temp_dir = f"temp_extract_{uuid.uuid4()}"
95+
ctx.run(f"mkdir -p {temp_dir}")
96+
for file in quoted_files:
97+
ctx.run(f"tar xf {file} -C {temp_dir}")
98+
ctx.run(f"tar cf {quoted_output_file} -C {temp_dir} .")
99+
ctx.run(f"rm -r {temp_dir} {' '.join(quoted_files)}")
107100

108101
def package(self, path: Path, job_dir: str, name: str) -> str:
109102
output_file = os.path.join(job_dir, f"{name}.tar.gz")

nemo_run/run/experiment.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,8 @@
5555
from nemo_run.core.execution.lepton import LeptonExecutor
5656
from nemo_run.core.execution.local import LocalExecutor
5757
from nemo_run.core.execution.skypilot import SkypilotExecutor
58-
from nemo_run.core.execution.slurm import SlurmExecutor
5958
from nemo_run.core.execution.skypilot_jobs import SkypilotJobsExecutor
59+
from nemo_run.core.execution.slurm import SlurmExecutor
6060
from nemo_run.core.frontend.console.api import CONSOLE, configure_logging, deconfigure_logging
6161
from nemo_run.core.serialization.zlib_json import ZlibJSONSerializer
6262
from nemo_run.core.tunnel.client import SSHTunnel, Tunnel
@@ -639,7 +639,7 @@ def run(
639639
If sequential=True, all tasks will be run one after the other.
640640
The order is based on the order in which they were added.
641641
642-
Parallel mode only works if all exectuors in the experiment support it.
642+
Parallel mode only works if all executors in the experiment support it.
643643
Currently, all executors support parallel mode.
644644
645645
In sequential mode, if all executor supports dependencies, then all tasks will be scheduled at once

test/core/packaging/test_git.py

Lines changed: 0 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -463,47 +463,6 @@ def test_concatenate_tar_files_non_linux_integration(tmp_path, monkeypatch):
463463
assert names == ["./fileA.txt", "./fileB.txt"]
464464

465465

466-
def test_concatenate_tar_files_linux_emits_expected_commands(monkeypatch, tmp_path):
467-
# Simulate Linux branch; use a dummy Context that records commands instead of executing
468-
monkeypatch.setattr(os, "uname", lambda: SimpleNamespace(sysname="Linux"))
469-
470-
class DummyContext:
471-
def __init__(self):
472-
self.commands: list[str] = []
473-
474-
def run(self, cmd: str, **_kwargs):
475-
self.commands.append(cmd)
476-
477-
# Support ctx.cd(...) context manager API
478-
def cd(self, _path: Path):
479-
class _CD:
480-
def __enter__(self_nonlocal):
481-
return self
482-
483-
def __exit__(self_nonlocal, exc_type, exc, tb):
484-
return False
485-
486-
return _CD()
487-
488-
# Fake inputs (do not need to exist since we don't execute)
489-
tar1 = str(tmp_path / "one.tar")
490-
tar2 = str(tmp_path / "two.tar")
491-
tar3 = str(tmp_path / "three.tar")
492-
out_tar = str(tmp_path / "out.tar")
493-
494-
ctx = DummyContext()
495-
packager = GitArchivePackager()
496-
packager._concatenate_tar_files(ctx, out_tar, [tar1, tar2, tar3])
497-
498-
# Expected sequence: cp first -> tar Af rest -> rm all inputs
499-
assert len(ctx.commands) == 3
500-
assert ctx.commands[0] == f"cp {shlex.quote(tar1)} {shlex.quote(out_tar)}"
501-
assert (
502-
ctx.commands[1] == f"tar Af {shlex.quote(out_tar)} {shlex.quote(tar2)} {shlex.quote(tar3)}"
503-
)
504-
assert ctx.commands[2] == f"rm {shlex.quote(tar1)} {shlex.quote(tar2)} {shlex.quote(tar3)}"
505-
506-
507466
@patch("nemo_run.core.packaging.git.Context", MockContext)
508467
def test_include_pattern_length_mismatch_raises(packager, temp_repo):
509468
# Mismatch between include_pattern and include_pattern_relative_path should raise

0 commit comments

Comments
 (0)