diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py
index 234f0d360..cf382a075 100755
--- a/launcher_scripts/nemo_launcher/core/stages.py
+++ b/launcher_scripts/nemo_launcher/core/stages.py
@@ -210,19 +210,47 @@ def _make_nemo_path_command(self) -> List[str]:
         ]
 
     def _make_git_log_command(self, stage_cfg_path: Path):
-        """log last 5 commits for repos- NeMo, megatron-lm, NeMo-Framework-Launcher or NeMo-Megatron-Launcher
-        'NeMo-Megatron-Launcher' was renamed to 'NeMo-Framework-Launcher'. We run git log for both for
-        backwards compatibility.
         """
-        append_to_file = f"{stage_cfg_path.parent}/git_log.txt"
-        return [
-            f"(echo PYT$\"NVIDIA_PYTORCH_VERSION\" && \
-                git --git-dir=/opt/NeMo/.git log -n 5 --format='NeMo;%h;%aD;%s' && \
-                git --git-dir=/opt/megatron-lm/.git log -n 5 --format='megatron-lm;%h;%aD;%s' && \
-                git --git-dir=/opt/NeMo-Framework-Launcher/.git log -n 5 --format='NeMo-Framework-Launcher;%h;%aD;%s' && \
-                git --git-dir=/opt/NeMo-Megatron-Launcher/.git log -n 5 --format='NeMo-Megatron-Launcher;%h;%aD;%s') > {append_to_file}"
+        Build the shell command that logs repo HEAD commits and container versions.
+
+        For every repo in `git_repos` (looked up under /opt), one line is logged as
+        `<repo>;<abbrev-hash>;<author-date>;<subject>`, followed by the configured
+        NeMo container and the PyTorch container version.
+
+        Args:
+            stage_cfg_path: path whose parent directory receives `git-info.log`.
+
+        Returns:
+            A one-element list containing the combined shell command.
+        """
+        filepath = os.path.join(f"{stage_cfg_path.parent}", "git-info.log")
+
+        git_repos = [
+            "NeMo",
+            "megatron-lm",
+            "TransformerEngine",
+            "NeMo-Framework-Launcher",
+            "apex",
+            "NeMo-Aligner",
+            "NeMo-Curator",
+        ]
+
+        git_log_cmd = [
+            f"git --git-dir=/opt/{repo}/.git log -n 1 --format='{repo};%h;%aD;%s'"
+            for repo in git_repos
         ]
 
+        container_info_cmd = [
+            # "\\;" puts a literal "\;" in the command so the shell prints ';' instead of ending the echo
+            f"echo NeMo-Container-Version\\;{self.cfg.get('container', '')}",
+            'echo PyTorch-Container-Version\\;PYT$"NVIDIA_PYTORCH_VERSION"',
+        ]
+
+        # A ';' separator (rather than '&&') keeps running the remaining commands
+        # even after one of them fails. The parentheses group the commands so that
+        # a single redirect captures the output of all of them in one file.
+        return [f"({';'.join(git_log_cmd + container_info_cmd)}) > {filepath}"]
+
     def _make_k8s_spec_file(
         self, template_root: str, cluster_parameters: Dict, job_path: JobPaths
     ):
@@ -622,7 +650,8 @@ def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]:
         command_groups = [[]]
         command_groups[0] += self._make_wandb_login_command()
         command_groups[0] += self._make_nemo_path_command()
-        command_groups[0] += self._make_git_log_command(stage_cfg_path)
+        if self.cluster == "bcm":
+            command_groups[0] += self._make_git_log_command(stage_cfg_path)
         # command_groups[0] += self._make_numa_mapping_command()
 
         # _cuda_device_max_connections and _cuda_visible_devices cannot be used as command prefix on BCP