Skip to content

Commit 3d024a0

Browse files
fix: pipeline hangs when submitting from compute nodes
When running snakemake from within a SLURM job (e.g., an interactive session on a compute node), the pipeline would submit jobs but never detect their completion, hanging forever. The RemoteExecutor base class starts a status-checking daemon thread in __init__ before __post_init__ is called. The SLURM plugin's warn_on_jobcontext() in __post_init__ would sleep 5 seconds and then delete SLURM environment variables, but by then the daemon thread had already started and would silently die after its first polling cycle. Fix: move the SLURM environment detection and cleanup into __init__, before super().__init__() starts the daemon thread. Remove the now unnecessary warn_on_jobcontext() method and its 5-second sleep. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 7fa975f commit 3d024a0

File tree

2 files changed

+22
-24
lines changed

2 files changed

+22
-24
lines changed

snakemake_executor_plugin_slurm/__init__.py

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -428,9 +428,22 @@ def _select_logdir(workflow):
428428
# Required:
429429
# Implementation of your executor
430430
class Executor(RemoteExecutor):
431+
def __init__(self, workflow, logger):
432+
# Must clean SLURM environment BEFORE super().__init__() starts
433+
# the wait_thread, otherwise the thread races with __post_init__
434+
# and silently dies, causing the pipeline to hang forever.
435+
if "SLURM_JOB_ID" in os.environ:
436+
logger.warning(
437+
"You are running snakemake in a SLURM job context. "
438+
"This is not recommended, as it may lead to unexpected "
439+
"behavior. "
440+
"If possible, please run Snakemake directly on the "
441+
"login node."
442+
)
443+
delete_slurm_environment()
444+
super().__init__(workflow, logger)
445+
431446
def __post_init__(self, test_mode: bool = False):
432-
# run check whether we are running in a SLURM job context
433-
self.warn_on_jobcontext()
434447
self.test_mode = test_mode
435448

436449
self.run_uuid = str(uuid.uuid4())
@@ -617,19 +630,6 @@ def clean_old_logs(self) -> None:
617630
f"Could not delete empty directories in {self.slurm_logdir}: {e}"
618631
)
619632

620-
def warn_on_jobcontext(self, done=None):
621-
if not done:
622-
if "SLURM_JOB_ID" in os.environ:
623-
self.logger.warning(
624-
"You are running snakemake in a SLURM job context. "
625-
"This is not recommended, as it may lead to unexpected "
626-
"behavior. "
627-
"If possible, please run Snakemake directly on the "
628-
"login node."
629-
)
630-
time.sleep(5)
631-
delete_slurm_environment()
632-
done = True
633633

634634
def additional_general_args(self):
635635
"""

tests/test_cli.py

Lines changed: 7 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,19 +34,17 @@ def _make_executor(jobname_prefix: str):
3434
def test_jobname_prefix_applied():
3535
executor = _make_executor("testprefix")
3636

37-
with patch.object(Executor, "warn_on_jobcontext", return_value=None):
38-
with patch(
39-
"snakemake_executor_plugin_slurm.uuid.uuid4",
40-
return_value=uuid.UUID("00000000-0000-0000-0000-000000000000"),
41-
):
42-
executor.__post_init__(test_mode=True)
37+
with patch(
38+
"snakemake_executor_plugin_slurm.uuid.uuid4",
39+
return_value=uuid.UUID("00000000-0000-0000-0000-000000000000"),
40+
):
41+
executor.__post_init__(test_mode=True)
4342

4443
assert executor.run_uuid == "testprefix_00000000-0000-0000-0000-000000000000"
4544

4645

4746
def test_jobname_prefix_validation():
4847
executor = _make_executor("bad!prefix")
4948

50-
with patch.object(Executor, "warn_on_jobcontext", return_value=None):
51-
with pytest.raises(WorkflowError, match="jobname_prefix"):
52-
executor.__post_init__(test_mode=True)
49+
with pytest.raises(WorkflowError, match="jobname_prefix"):
50+
executor.__post_init__(test_mode=True)

0 commit comments

Comments
 (0)