Skip to content

Commit 2f6a256

Browse files
batchtools_slurm(): Add more troubleshooting information to the infamous 'Error: Future of class BatchtoolsSlurmFuture expired' problem that some Slurm users experience
1 parent d5cb91a commit 2f6a256

File tree

2 files changed

+46
-12
lines changed

2 files changed

+46
-12
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Package: future.batchtools
2-
Version: 0.20.0-9000
2+
Version: 0.20.0-9001
33
Depends:
44
R (>= 3.2.0),
55
parallelly,

R/BatchtoolsFutureBackend-class.R

Lines changed: 45 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -667,7 +667,7 @@ resolved.BatchtoolsFuture <- function(x, ...) {
667667
## Assert that the process that created the future is
668668
## also the one that evaluates/resolves/queries it.
669669
assertOwner(x)
670-
670+
671671
## If not, checks the batchtools registry status
672672
resolved <- finished(x)
673673
if (is.na(resolved)) return(FALSE)
@@ -845,25 +845,59 @@ await <- function(future, cleanup = TRUE, ...) {
845845
## how we can distinguish the two right now, but I'll assume that
846846
## started jobs have a 'submitted' or 'started' status flag too,
847847
## whereas jobs that failed to launch won't. /HB 2025-07-15
848+
hints <- NULL
849+
850+
state <- future[["state"]]
851+
info <- sprintf("Future state: %s", sQuote(state))
852+
hints <- c(hints, info)
853+
info <- sprintf("Batchtools status: %s", commaq(stat))
854+
hints <- c(hints, info)
855+
856+
## SPECIAL CASE: Some Slurm users report 'expired' jobs even though the jobs never started.
857+
## Output more breadcrumbs to help narrow down what causes this. /HB 2025-09-07
858+
if (inherits(future, "BatchtoolsSlurmFuture")) {
859+
## Get _all_ of the user's jobs, including those not submitted via future.batchtools
860+
slurm_job_ids <- unique(c(
861+
reg$cluster.functions$listJobsQueued(reg),
862+
reg$cluster.functions$listJobsRunning(reg)
863+
))
864+
if (length(slurm_job_ids) > 0) {
865+
info <- sprintf("Slurm job ID: [n=%d] %s", length(slurm_job_ids), commaq(slurm_job_ids))
866+
args <- c("--noheader", "--format='job_id=%i,state=%T,submitted_on=%V,time_used=%M'", "-j", paste(slurm_job_ids, collapse = ","))
867+
res <- system2("squeue", args = args, stdout = TRUE, stderr = TRUE)
868+
res <- paste(res, collapse = "; ") ## should only be a single line, but ...
869+
info <- c(info, sprintf("Slurm job status: %s", res))
870+
} else {
871+
info <- "Slurm job ID: <not found>"
872+
info <- c(info, sprintf("Slurm job status: <unknown>"))
873+
}
874+
hints <- c(hints, info)
875+
}
848876

849-
hint <- tryCatch({
877+
## TROUBLESHOOTING: Logged output
878+
info <- tryCatch({
850879
output <- loggedOutput(future, timeout = 0.0)
851-
hint <- unlist(strsplit(output, split = "\n", fixed = TRUE))
852-
hint <- hint[nzchar(hint)]
853-
hint <- tail(hint, n = getOption("future.batchtools.expiration.tail", 48L))
880+
info <- unlist(strsplit(output, split = "\n", fixed = TRUE))
881+
info <- info[nzchar(info)]
882+
info <- tail(info, n = getOption("future.batchtools.expiration.tail", 48L))
854883
}, error = function(e) NULL)
855-
if (length(hint) > 0) {
856-
hint <- c("The last few lines of the logged output:", hint)
857-
hint <- paste(hint, collapse = "\n")
884+
885+
if (length(info) > 0) {
886+
info <- c("The last few lines of the logged output:", info)
858887
} else {
859-
hint <- "No logged output file exist (at the moment)"
888+
info <- "No logged output file exist (at the moment)"
860889
}
890+
hints <- c(hints, info)
861891

892+
if (length(hints) > 0) {
893+
hints <- c("\nPost-mortem details:", hints)
894+
hints <- paste(hints, collapse = "\n")
895+
}
862896
if (any(c("submitted", "started") %in% stat)) {
863-
msg <- sprintf("Future (%s) of class %s expired, which indicates that it crashed or was killed. %s", label, class(future)[1], hint)
897+
msg <- sprintf("Future (%s) of class %s expired, which indicates that it crashed or was killed.%s", label, class(future)[1], hints)
864898
result <- FutureInterruptError(msg, future = future)
865899
} else {
866-
msg <- sprintf("Future (%s) of class %s failed to launch. %s", label, class(future)[1], hint)
900+
msg <- sprintf("Future (%s) of class %s failed to launch.%s", label, class(future)[1], hints)
867901
result <- FutureLaunchError(msg, future = future)
868902
}
869903
} else if (future[["state"]] %in% c("canceled", "interrupted")) {

0 commit comments

Comments
 (0)