Skip to content

Commit a61734b

Browse files
authored
Use thread pool for status, run methods inside experiment + other fixes (#295)
* Add thread pool to get status of jobs inside experiment — Signed-off-by: Hemil Desai <[email protected]>
* Add thread pools to experiment run — Signed-off-by: Hemil Desai <[email protected]>
* Fixes — Signed-off-by: Hemil Desai <[email protected]>
* Fix — Signed-off-by: Hemil Desai <[email protected]>
* Fix — Signed-off-by: Hemil Desai <[email protected]>
* Fix — Signed-off-by: Hemil Desai <[email protected]>
* Fix — Signed-off-by: Hemil Desai <[email protected]>
---------
Signed-off-by: Hemil Desai <[email protected]>
1 parent 6b01546 commit a61734b

File tree

9 files changed

+316
-34
lines changed

9 files changed

+316
-34
lines changed

nemo_run/core/execution/slurm.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -590,7 +590,10 @@ def package_configs(self, *cfgs: tuple[str, str]) -> list[str]:
590590
def package(self, packager: Packager, job_name: str):
591591
assert self.experiment_id, "Executor not assigned to an experiment."
592592

593-
if job_name in self.tunnel.packaging_jobs and not packager.symlink_from_remote_dir:
593+
if (
594+
get_packaging_job_key(self.experiment_id, job_name) in self.tunnel.packaging_jobs
595+
and not packager.symlink_from_remote_dir
596+
):
594597
logger.info(
595598
f"Packaging for job {job_name} in tunnel {self.tunnel.key} already done. Skipping subsequent packagings.\n"
596599
"This may cause issues if you have multiple tasks with the same name but different packagers, as only the first packager will be used."

nemo_run/run/experiment.py

Lines changed: 82 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,8 @@
3535
from rich.console import Group
3636
from rich.live import Live
3737
from rich.panel import Panel
38-
from rich.progress import BarColumn, Progress, SpinnerColumn
38+
from rich.progress import BarColumn, Progress, SpinnerColumn, TaskID, TimeElapsedColumn
3939
from rich.progress import Task as RichTask
40-
from rich.progress import TaskID, TimeElapsedColumn
4140
from rich.syntax import Syntax
4241
from torchx.specs.api import AppState
4342

@@ -303,6 +302,9 @@ def __init__(
303302
base_dir: str | None = None,
304303
clean_mode: bool = False,
305304
enable_goodbye_message: bool = True,
305+
threadpool_workers: int = 16,
306+
skip_status_at_exit: bool = False,
307+
serialize_metadata_for_scripts: bool = True,
306308
) -> None:
307309
"""
308310
Initializes an experiment run by creating its metadata directory and saving the experiment config.
@@ -328,6 +330,9 @@ def __init__(
328330
self._title = title
329331
self._id = id or f"{title}_{int(time.time())}"
330332
self._enable_goodbye_message = enable_goodbye_message
333+
self._threadpool_workers = threadpool_workers
334+
self._skip_status_at_exit = skip_status_at_exit
335+
self._serialize_metadata_for_scripts = serialize_metadata_for_scripts
331336

332337
base_dir = str(base_dir or get_nemorun_home())
333338
self._exp_dir = os.path.join(base_dir, "experiments", title, self._id)
@@ -359,6 +364,10 @@ def to_config(self) -> Config:
359364
executor=self.executor.to_config(),
360365
log_level=self.log_level,
361366
clean_mode=self.clean_mode,
367+
threadpool_workers=self._threadpool_workers,
368+
enable_goodbye_message=self._enable_goodbye_message,
369+
skip_status_at_exit=self._skip_status_at_exit,
370+
serialize_metadata_for_scripts=self._serialize_metadata_for_scripts,
362371
)
363372

364373
def _save_experiment(self, exist_ok: bool = False):
@@ -422,8 +431,9 @@ def _load_jobs(self) -> list[Job | JobGroup]:
422431

423432
def _prepare(self, exist_ok: bool = False):
424433
self._save_experiment(exist_ok=exist_ok)
434+
425435
for job in self.jobs:
426-
job.prepare()
436+
job.prepare(serialize_metadata_for_scripts=self._serialize_metadata_for_scripts)
427437

428438
self._save_jobs()
429439

@@ -769,7 +779,15 @@ def _run_dag(self, detach: bool, tail_logs: bool, executors: set[Executor]):
769779
self.detach = detach
770780

771781
for level in order:
772-
for _, node in enumerate(level):
782+
# Launch jobs in this level concurrently since they are independent
783+
784+
def _set_context(ctx: contextvars.Context):
785+
for var, value in ctx.items():
786+
var.set(value)
787+
788+
ctx = contextvars.copy_context()
789+
790+
def _launch(node: str):
773791
job: Job | JobGroup = job_map[node]
774792
self.console.log(f"[bold cyan]Launching job {job.id} for experiment {self._title}")
775793
if tail_logs:
@@ -787,14 +805,24 @@ def _run_dag(self, detach: bool, tail_logs: bool, executors: set[Executor]):
787805
deps.append(handle)
788806

789807
job.executor.dependencies = deps # type: ignore
808+
790809
job.launch(wait=False, runner=self._runner)
810+
return job
791811

792812
except Exception as e:
793813
self.console.log(f"Error running job {job.id}: {e}")
794814
raise e
795815

816+
launched_jobs: list[Job | JobGroup] = []
817+
with ThreadPoolExecutor(
818+
initializer=_set_context, initargs=(ctx,), max_workers=self._threadpool_workers
819+
) as pool:
820+
futures = [pool.submit(_launch, node) for node in level]
821+
for future in as_completed(futures):
822+
launched_jobs.append(future.result())
823+
796824
if wait:
797-
self._wait_for_jobs(jobs=[job_map[node] for node in level])
825+
self._wait_for_jobs(jobs=launched_jobs)
798826

799827
self._save_jobs()
800828
self._launched = any(map(lambda job: job.launched, self.jobs))
@@ -840,7 +868,21 @@ def set_context(context: contextvars.Context):
840868
finally:
841869
job.cleanup()
842870

843-
def status(self, return_dict: bool = False) -> Optional[dict[str, str]]:
871+
def _initialize_tunnels(self, extract_from_executors: bool = False):
872+
if extract_from_executors:
873+
for job in self.jobs:
874+
if (
875+
isinstance(job.executor, SlurmExecutor)
876+
and job.executor.tunnel.key not in self.tunnels
877+
):
878+
self.tunnels[job.executor.tunnel.key] = job.executor.tunnel
879+
880+
for tunnel in self.tunnels.values():
881+
if isinstance(tunnel, SSHTunnel):
882+
tunnel.connect()
883+
assert tunnel.session, f"SSH tunnel {tunnel.key} failed to connect."
884+
885+
def status(self, return_dict: bool = False) -> Optional[dict[str, dict[str, str]]]:
844886
"""
845887
Prints a table specifying the status of all tasks.
846888
@@ -880,6 +922,7 @@ def _get_job_info_and_dict(
880922
"status": job.status(runner=self._runner),
881923
"executor": job.executor.info(),
882924
"job_id": app_id,
925+
"handle": job.handle,
883926
"local_dir": job.executor.job_dir,
884927
}
885928

@@ -902,13 +945,34 @@ def _get_job_info_and_dict(
902945
job_info.extend(directory_info)
903946
return job_info, job_dict
904947

948+
self._initialize_tunnels(extract_from_executors=True)
905949
try:
906950
result_dict = {}
907-
job_infos = []
908-
for i, job in enumerate(self.jobs):
909-
job_info, job_dict = _get_job_info_and_dict(i, job)
910-
job_infos.append(Group(*job_info))
911-
result_dict[job.id] = job_dict
951+
job_infos: list[Group | None] = [None] * len(self.jobs)
952+
953+
# Parallelize IO-bound status retrieval across jobs
954+
def _collect(arg):
955+
idx, job = arg
956+
job_info, job_dict = _get_job_info_and_dict(idx, job)
957+
return idx, job.id, job_info, job_dict
958+
959+
# Propagate context variables to worker threads so helpers that rely on them keep working
960+
def _set_context(ctx: contextvars.Context):
961+
for var, value in ctx.items():
962+
var.set(value)
963+
964+
ctx = contextvars.copy_context()
965+
with ThreadPoolExecutor(
966+
initializer=_set_context, initargs=(ctx,), max_workers=self._threadpool_workers
967+
) as pool:
968+
futures = [pool.submit(_collect, (idx, job)) for idx, job in enumerate(self.jobs)]
969+
for future in as_completed(futures):
970+
idx, job_id, job_info, job_dict = future.result()
971+
job_infos[idx] = Group(*job_info)
972+
result_dict[job_id] = job_dict
973+
974+
# Remove potential None slots (should not occur)
975+
job_infos = [ji for ji in job_infos if ji is not None]
912976

913977
if return_dict:
914978
return result_dict
@@ -1142,7 +1206,7 @@ def __exit__(self, exc_type, exc_value, tb):
11421206
"Ephemeral logs and artifacts may be lost.",
11431207
)
11441208

1145-
if self._launched:
1209+
if self._launched and not self._skip_status_at_exit:
11461210
self.status()
11471211
return
11481212

@@ -1151,20 +1215,23 @@ def __exit__(self, exc_type, exc_value, tb):
11511215
self.console.rule(
11521216
f"[bold magenta]Direct run Experiment {self._id}",
11531217
)
1154-
self.status()
1218+
if not self._skip_status_at_exit:
1219+
self.status()
11551220
return
11561221

11571222
if hasattr(self, "_waited") and self._waited:
11581223
self.console.rule(
11591224
f"[bold magenta]Done waiting for Experiment {self._id}",
11601225
)
1161-
self.status()
1226+
if not self._skip_status_at_exit:
1227+
self.status()
11621228
return
11631229

11641230
self.console.rule(
11651231
f"[bold magenta]Waiting for Experiment {self._id} to finish",
11661232
)
1167-
self.status()
1233+
if not self._skip_status_at_exit:
1234+
self.status()
11681235

11691236
self._wait_for_jobs(jobs=self.jobs)
11701237
finally:

nemo_run/run/job.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from dataclasses import dataclass, field
44
from typing import Optional, Union, cast
55

6-
from torchx.specs.api import AppDef, AppState, is_terminal
6+
from torchx.specs.api import AppDef, AppDryRunInfo, AppState, is_terminal
77

88
import nemo_run.exceptions
99
from nemo_run.config import Config, ConfigurableMixin, Partial, Script
@@ -62,6 +62,10 @@ class Job(ConfigurableMixin):
6262
plugins: Optional[list[ExperimentPlugin]] = None
6363
tail_logs: bool = False
6464
dependencies: list[str] = field(default_factory=list)
65+
name: str = ""
66+
67+
def __post_init__(self):
68+
self._dryrun_info: Optional[AppDryRunInfo] = None
6569

6670
def serialize(self) -> tuple[str, str]:
6771
cfg = self.to_config()
@@ -92,10 +96,14 @@ def logs(self, runner: Runner, regex: str | None = None):
9296
regex=regex,
9397
)
9498

95-
def prepare(self):
99+
def prepare(self, serialize_metadata_for_scripts: bool = True):
96100
self.executor.create_job_dir()
97101
self._executable = package(
98-
self.id, self.task, executor=self.executor, serialize_to_file=True
102+
self.id,
103+
self.task,
104+
executor=self.executor,
105+
serialize_to_file=True,
106+
serialize_metadata_for_scripts=serialize_metadata_for_scripts,
99107
)
100108

101109
def launch(
@@ -120,7 +128,7 @@ def launch(
120128
return
121129

122130
if dryrun:
123-
launch(
131+
_, dryrun_info = launch(
124132
executable=self._executable,
125133
executor_name=executor_str,
126134
executor=self.executor,
@@ -130,6 +138,7 @@ def launch(
130138
log=self.tail_logs,
131139
runner=runner,
132140
)
141+
self._dryrun_info = dryrun_info
133142
return
134143

135144
self.handle, status = launch(
@@ -140,6 +149,7 @@ def launch(
140149
wait=wait,
141150
log=self.tail_logs,
142151
runner=runner,
152+
dryrun_info=self._dryrun_info,
143153
)
144154
self.state = status.state if status else AppState.UNKNOWN
145155
self.launched = True
@@ -223,6 +233,7 @@ class JobGroup(ConfigurableMixin):
223233
plugins: Optional[list[ExperimentPlugin]] = None
224234
tail_logs: bool = False
225235
dependencies: list[str] = field(default_factory=list)
236+
name: str = ""
226237

227238
def __post_init__(self):
228239
executors = [self.executors] if isinstance(self.executors, Executor) else self.executors
@@ -252,6 +263,8 @@ def __post_init__(self):
252263
if len(executors) == 1:
253264
self.executors = executors * len(self.tasks)
254265

266+
self._dryrun_info: Optional[AppDryRunInfo] = None
267+
255268
@property
256269
def state(self) -> AppState:
257270
if not self.launched or not self.handles:
@@ -307,7 +320,7 @@ def logs(self, runner: Runner, regex: str | None = None):
307320
regex=regex,
308321
)
309322

310-
def prepare(self):
323+
def prepare(self, serialize_metadata_for_scripts: bool = True):
311324
self.executor.create_job_dir()
312325
self._executables: list[tuple[AppDef, Executor]] = []
313326
for i, task in enumerate(self.tasks):
@@ -318,6 +331,7 @@ def prepare(self):
318331
task,
319332
executor=executor,
320333
serialize_to_file=True,
334+
serialize_metadata_for_scripts=serialize_metadata_for_scripts,
321335
)
322336
self._executables.append((executable, executor))
323337

@@ -346,7 +360,7 @@ def launch(
346360
executor_str = get_executor_str(executor)
347361

348362
if dryrun:
349-
launch(
363+
_, dryrun_info = launch(
350364
executable=executable,
351365
executor_name=executor_str,
352366
executor=executor,
@@ -356,6 +370,7 @@ def launch(
356370
log=self.tail_logs,
357371
runner=runner,
358372
)
373+
self._dryrun_info = dryrun_info
359374
else:
360375
handle, status = launch(
361376
executable=executable,
@@ -365,6 +380,7 @@ def launch(
365380
wait=wait,
366381
log=self.tail_logs,
367382
runner=runner,
383+
dryrun_info=self._dryrun_info,
368384
)
369385
self.handles.append(handle)
370386
self.states.append(status.state if status else AppState.UNKNOWN)

nemo_run/run/torchx_backend/launcher.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def launch(
4545
parent_run_id: Optional[str] = None,
4646
runner: Runner | None = None,
4747
log_dryrun: bool = ...,
48+
dryrun_info: specs.AppDryRunInfo | None = None,
4849
) -> tuple[None, None]: ...
4950

5051

@@ -59,6 +60,7 @@ def launch(
5960
parent_run_id: Optional[str] = None,
6061
runner: Runner | None = None,
6162
log_dryrun: bool = ...,
63+
dryrun_info: specs.AppDryRunInfo | None = None,
6264
) -> tuple[str, specs.AppStatus]: ...
6365

6466

@@ -73,6 +75,7 @@ def launch(
7375
parent_run_id: Optional[str] = None,
7476
runner: Runner | None = None,
7577
log_dryrun: bool = False,
78+
dryrun_info: specs.AppDryRunInfo | None = None,
7679
) -> tuple[str | None, specs.AppStatus | None]: ...
7780

7881

@@ -86,7 +89,8 @@ def launch(
8689
parent_run_id: Optional[str] = None,
8790
runner: Runner | None = None,
8891
log_dryrun: bool = False,
89-
) -> tuple[str | None, specs.AppStatus | None]:
92+
dryrun_info: specs.AppDryRunInfo | None = None,
93+
) -> tuple[str | None, specs.AppStatus | specs.AppDryRunInfo | None]:
9094
runner = runner or get_runner()
9195

9296
if dryrun:
@@ -100,13 +104,14 @@ def launch(
100104
CONSOLE.log("\n=== APPLICATION ===\n")
101105
CONSOLE.log(dryrun_info)
102106

103-
return None, None
107+
return None, dryrun_info
104108
else:
105109
app_handle = runner.run(
106110
executable,
107111
executor_name,
108112
cfg=executor, # type: ignore
109113
parent_run_id=parent_run_id,
114+
dryrun_info=dryrun_info,
110115
)
111116
logger.info(f"Launched app: {app_handle}")
112117
app_status = specs.AppStatus(state=specs.AppState.SUBMITTED)

0 commit comments

Comments (0)