From 89bee8288dd072321f8c3a9488bd6a1bb95e6389 Mon Sep 17 00:00:00 2001
From: Keita Watanabe
Date: Fri, 7 Feb 2025 09:11:18 +0900
Subject: [PATCH] Add auto-resume flag in slurm.yaml

So that HyperPod can auto-resume from H/W failures
---
 recipes_collection/cluster/slurm.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/recipes_collection/cluster/slurm.yaml b/recipes_collection/cluster/slurm.yaml
index 79b8e35..64e8c1f 100755
--- a/recipes_collection/cluster/slurm.yaml
+++ b/recipes_collection/cluster/slurm.yaml
@@ -6,6 +6,7 @@ job_name_prefix: 'sagemaker-'
 slurm_create_submission_file_only: False # Setting to True if just want to create submission file
 stderr_to_stdout: True # Setting to False to split the stderr and stdout logs
 srun_args:
+  - --auto-resume=1
   # - "--no-container-mount-home"
 slurm_docker_cfg:
   docker_args: