Skip to content

Commit 7b13fb3

Browse files
authored
Fix job name prefix for slurm het jobs (#147)
* Fix job name prefix for slurm het jobs
  Signed-off-by: Hemil Desai <[email protected]>
* Fix ruff
  Signed-off-by: Hemil Desai <[email protected]>
* Fix
  Signed-off-by: Hemil Desai <[email protected]>
---------
Signed-off-by: Hemil Desai <[email protected]>
1 parent: 463d6e1 · commit: 7b13fb3

File tree

2 files changed

+47
-11
lines changed

2 files changed

+47
-11
lines changed

src/nemo_run/core/execution/slurm.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -816,7 +816,7 @@ def materialize(self) -> str:
816816
)
817817
het_parameters.update(
818818
{
819-
"job_name": f"{self.slurm_config.account}-{self.slurm_config.account.split('_')[-1]}.{self.jobs[i]}",
819+
"job_name": f"{job_details.job_name[:-2] if job_details.job_name.endswith('-0') else job_details.job_name}-{i}",
820820
"nodes": resource_req.nodes,
821821
"ntasks_per_node": resource_req.ntasks_per_node,
822822
"gpus_per_node": resource_req.gpus_per_node,
@@ -995,7 +995,7 @@ def get_container_flags(
995995
return sbatch_script
996996

997997
def __repr__(self) -> str:
998-
return f"""{' '.join(self.cmd + ['$SBATCH_SCRIPT'])}
998+
return f"""{" ".join(self.cmd + ["$SBATCH_SCRIPT"])}
999999
10001000
#----------------
10011001
# SBATCH_SCRIPT

test/core/execution/test_slurm.py

Lines changed: 45 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -473,16 +473,16 @@ class CustomJobDetails(SlurmJobDetails):
473473
@property
474474
def stdout(self) -> Path:
475475
assert self.folder
476-
return Path(self.folder / "sbatch_job.out")
476+
return Path(self.folder) / "sbatch_job.out"
477477

478478
@property
479479
def srun_stdout(self) -> Path:
480480
assert self.folder
481-
return Path(self.folder / "log_job.out")
481+
return Path(self.folder) / "log_job.out"
482482

483483
dummy_slurm_request, _ = dummy_slurm_request_with_artifact
484484
dummy_slurm_request.slurm_config.job_details = CustomJobDetails(
485-
job_name="custom_sample_job", folder=Path("/custom_folder")
485+
job_name="custom_sample_job", folder="/custom_folder"
486486
)
487487
sbatch_script = dummy_slurm_request.materialize()
488488
assert "#SBATCH --job-name=custom_sample_job" in sbatch_script
@@ -633,25 +633,23 @@ class CustomJobDetails(SlurmJobDetails):
633633
@property
634634
def stdout(self) -> Path:
635635
assert self.folder
636-
return Path(self.folder / "sbatch_job.out")
636+
return Path(self.folder) / "sbatch_job.out"
637637

638638
@property
639639
def srun_stdout(self) -> Path:
640640
assert self.folder
641-
return Path(self.folder / f"log_{self.job_name}.out")
641+
return Path(self.folder) / f"log_{self.job_name}.out"
642642

643643
group_resource_req_slurm_request, _ = group_resource_req_slurm_request_with_artifact
644644
group_resource_req_slurm_request.slurm_config.job_details = CustomJobDetails(
645-
job_name="custom_sample_job", folder=Path("/custom_folder")
645+
job_name="custom_sample_job", folder="/custom_folder"
646646
)
647647
group_resource_req_slurm_request.slurm_config.resource_group[0].job_details = copy.deepcopy(
648648
group_resource_req_slurm_request.slurm_config.job_details
649649
)
650650
group_resource_req_slurm_request.slurm_config.resource_group[
651651
1
652-
].job_details = CustomJobDetails(
653-
job_name="custom_sample_job_2", folder=Path("/custom_folder_2")
654-
)
652+
].job_details = CustomJobDetails(job_name="custom_sample_job_2", folder="/custom_folder_2")
655653

656654
sbatch_script = group_resource_req_slurm_request.materialize()
657655
assert "#SBATCH --job-name=custom_sample_job" in sbatch_script
@@ -680,3 +678,41 @@ def test_ft_het_slurm_request_materialize(
680678
sbatch_script = re.sub(r"--rdzv-id \d+", "--rdzv-id 1", sbatch_script)
681679
expected = re.sub(r"--rdzv-id \d+", "--rdzv-id 1", expected)
682680
assert sbatch_script.strip() == expected.strip()
681+
682+
def test_het_job_name_prefix(self, het_slurm_request_with_artifact):
683+
# Set the job_name_prefix to a custom value
684+
het_request, _ = het_slurm_request_with_artifact
685+
het_request.slurm_config.job_name_prefix = "prefix_"
686+
687+
# Materialize the batch request script
688+
sbatch_script = het_request.materialize()
689+
690+
# For each job in the heterogeneous request, verify the job name uses the prefix
691+
for job in het_request.jobs:
692+
expected = f"prefix_{job}"
693+
assert expected in sbatch_script, f"Expected job name '{expected}' not found in script"
694+
695+
def test_het_job_custom_details_job_name(self, het_slurm_request_with_artifact):
696+
# Test that the job name from CustomJobDetails is used for heterogeneous slurm requests
697+
from nemo_run.core.execution.slurm import SlurmJobDetails
698+
699+
het_request, _ = het_slurm_request_with_artifact
700+
701+
class CustomJobDetails(SlurmJobDetails):
702+
@property
703+
def stdout(self):
704+
assert self.folder
705+
return Path(self.folder) / "sbatch_job.out"
706+
707+
@property
708+
def srun_stdout(self):
709+
assert self.folder
710+
return Path(self.folder) / "log_job.out"
711+
712+
custom_name = "custom_het_job"
713+
het_request.slurm_config.job_details = CustomJobDetails(
714+
job_name=custom_name, folder="/custom_folder"
715+
)
716+
sbatch_script = het_request.materialize()
717+
for i in range(len(het_request.jobs)):
718+
assert f"#SBATCH --job-name={custom_name}-{i}" in sbatch_script

0 commit comments

Comments
 (0)