Skip to content

Commit 7af3936

Browse files
committed
Fix job name prefix for slurm het jobs
Signed-off-by: Hemil Desai <[email protected]>
1 parent 463d6e1 commit 7af3936

File tree

2 files changed: 22 additions and 11 deletions

src/nemo_run/core/execution/slurm.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -816,7 +816,7 @@ def materialize(self) -> str:
816816
)
817817
het_parameters.update(
818818
{
819-
"job_name": f"{self.slurm_config.account}-{self.slurm_config.account.split('_')[-1]}.{self.jobs[i]}",
819+
"job_name": f"{job_name_prefix}{self.jobs[i]}",
820820
"nodes": resource_req.nodes,
821821
"ntasks_per_node": resource_req.ntasks_per_node,
822822
"gpus_per_node": resource_req.gpus_per_node,

test/core/execution/test_slurm.py

Lines changed: 21 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -473,16 +473,16 @@ class CustomJobDetails(SlurmJobDetails):
473473
@property
474474
def stdout(self) -> Path:
475475
assert self.folder
476-
return Path(self.folder / "sbatch_job.out")
476+
return Path(self.folder) / "sbatch_job.out"
477477

478478
@property
479479
def srun_stdout(self) -> Path:
480480
assert self.folder
481-
return Path(self.folder / "log_job.out")
481+
return Path(self.folder) / "log_job.out"
482482

483483
dummy_slurm_request, _ = dummy_slurm_request_with_artifact
484484
dummy_slurm_request.slurm_config.job_details = CustomJobDetails(
485-
job_name="custom_sample_job", folder=Path("/custom_folder")
485+
job_name="custom_sample_job", folder="/custom_folder"
486486
)
487487
sbatch_script = dummy_slurm_request.materialize()
488488
assert "#SBATCH --job-name=custom_sample_job" in sbatch_script
@@ -633,24 +633,22 @@ class CustomJobDetails(SlurmJobDetails):
633633
@property
634634
def stdout(self) -> Path:
635635
assert self.folder
636-
return Path(self.folder / "sbatch_job.out")
636+
return Path(self.folder) / "sbatch_job.out"
637637

638638
@property
639639
def srun_stdout(self) -> Path:
640640
assert self.folder
641-
return Path(self.folder / f"log_{self.job_name}.out")
641+
return Path(self.folder) / f"log_{self.job_name}.out"
642642

643643
group_resource_req_slurm_request, _ = group_resource_req_slurm_request_with_artifact
644644
group_resource_req_slurm_request.slurm_config.job_details = CustomJobDetails(
645-
job_name="custom_sample_job", folder=Path("/custom_folder")
645+
job_name="custom_sample_job", folder="/custom_folder"
646646
)
647647
group_resource_req_slurm_request.slurm_config.resource_group[0].job_details = copy.deepcopy(
648648
group_resource_req_slurm_request.slurm_config.job_details
649649
)
650-
group_resource_req_slurm_request.slurm_config.resource_group[
651-
1
652-
].job_details = CustomJobDetails(
653-
job_name="custom_sample_job_2", folder=Path("/custom_folder_2")
650+
group_resource_req_slurm_request.slurm_config.resource_group[1].job_details = CustomJobDetails(
651+
job_name="custom_sample_job_2", folder="/custom_folder_2"
654652
)
655653

656654
sbatch_script = group_resource_req_slurm_request.materialize()
@@ -680,3 +678,16 @@ def test_ft_het_slurm_request_materialize(
680678
sbatch_script = re.sub(r"--rdzv-id \d+", "--rdzv-id 1", sbatch_script)
681679
expected = re.sub(r"--rdzv-id \d+", "--rdzv-id 1", expected)
682680
assert sbatch_script.strip() == expected.strip()
681+
682+
def test_het_job_name_prefix(self, het_slurm_request_with_artifact):
683+
# Set the job_name_prefix to a custom value
684+
het_request, _ = het_slurm_request_with_artifact
685+
het_request.slurm_config.job_name_prefix = "prefix_"
686+
687+
# Materialize the batch request script
688+
sbatch_script = het_request.materialize()
689+
690+
# For each job in the heterogeneous request, verify the job name uses the prefix
691+
for job in het_request.jobs:
692+
expected = f"prefix_{job}"
693+
assert expected in sbatch_script, f"Expected job name '{expected}' not found in script"

0 commit comments

Comments
 (0)