Skip to content

Commit b3f8038

Browse files
GallVpjealous
authored and committed
[PER-44] NoAvailableHost exit code 143
1 parent bbba0ca commit b3f8038

File tree

6 files changed

+107
-20
lines changed

6 files changed

+107
-20
lines changed

README.md

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ Otherwise, the worker nodes won't be able to see the task files.
2626
- [Configure with environment variables](#configure-with-environment-variables)
2727
- [Configure with Nextflow secrets](#configure-with-nextflow-secrets)
2828
- [Configuration best practices](#configuration-best-practices)
29+
- [S3](#s3)
30+
- [errorStrategy](#errorstrategy)
2931
- [Configure s3 work directory](#configure-s3-work-directory)
3032
- [Configure s3fs work directory](#configure-s3fs-work-directory)
3133
- [Configure fusion FS over s3](#configure-fusion-fs-over-s3)
@@ -214,6 +216,8 @@ Unknown config secret 'MMC_USERNAME'
214216

215217
### Configuration best practices
216218

219+
#### S3
220+
217221
When you are using s3, it's recommended to update the aws client configurations
218222
based on your environment and the workload. Here is an example:
219223
```groovy
@@ -239,17 +243,34 @@ aws {
239243
}
240244
```
241245

246+
#### errorStrategy
247+
242248
If you are sure that the workflow file is properly composed, it's recommended to
243249
set a proper error strategy and retry limit in the process scope to make sure
244250
the workflow can be completed.
245-
Here is an example:
251+
252+
Here is the recommended error strategy:
246253
```groovy
247254
process {
248-
errorStrategy='retry'
249-
maxRetries=5
255+
errorStrategy = {
256+
if ( task.exitStatus == 143 ) { // NoAvailableHost
257+
sleep(Math.pow(2, task.attempt) * 2 * 60000 as long)
258+
return 'retry'
259+
}
260+
261+
task.exitStatus in ((130..142) + 145 + 104 + 175)
262+
? 'retry'
263+
: 'finish'
264+
}
265+
maxRetries = 5
250266
}
251267
```
252268

269+
This strategy increases max retries to 5 and generally adheres to the nf-core strategy.
270+
The addition is that for exit code 143, which represents NoAvailableHost resulting from no or
271+
low availability of spot instances, Nextflow waits 4, 8, 16, 32, and 64 minutes before
272+
consecutive retries of a task.
273+
253274
### Configure s3 work directory
254275

255276
To enable s3 as the work directory, users need to set the work directory to an s3 bucket.

plugins/nf-float/src/main/com/memverge/nextflow/FloatGridExecutor.groovy

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,7 @@ fi
643643
STATUS_MAP.put(FloatStatus.PENDING, QueueStatus.PENDING)
644644
STATUS_MAP.put(FloatStatus.RUNNING, QueueStatus.RUNNING)
645645
STATUS_MAP.put(FloatStatus.DONE, QueueStatus.DONE)
646+
STATUS_MAP.put(FloatStatus.NOAVAILABLEHOST, QueueStatus.ERROR)
646647
STATUS_MAP.put(FloatStatus.ERROR, QueueStatus.ERROR)
647648
STATUS_MAP.put(FloatStatus.UNKNOWN, QueueStatus.UNKNOWN)
648649
}

plugins/nf-float/src/main/com/memverge/nextflow/FloatJob.groovy

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ enum FloatStatus {
2424
PENDING,
2525
RUNNING,
2626
DONE,
27+
NOAVAILABLEHOST,
2728
ERROR,
2829
UNKNOWN,
2930

@@ -46,7 +47,7 @@ enum FloatStatus {
4647
'CheckpointFailed' : ERROR,
4748
'WaitingForLicense': ERROR,
4849
'Timedout' : ERROR,
49-
'NoAvailableHost' : ERROR,
50+
'NoAvailableHost' : NOAVAILABLEHOST,
5051
'Unknown' : UNKNOWN,
5152
]
5253

@@ -58,12 +59,8 @@ enum FloatStatus {
5859
return this == PENDING || this == RUNNING
5960
}
6061

61-
boolean isFinished() {
62-
return this == ERROR || this == DONE
63-
}
64-
65-
boolean isError() {
66-
return this == ERROR
62+
boolean isDoneOrFailed() {
63+
return this == DONE || this == ERROR || this == NOAVAILABLEHOST
6764
}
6865
}
6966

@@ -125,8 +122,8 @@ class FloatJob {
125122
return status ? status.isRunning() : false
126123
}
127124

128-
boolean isFinished() {
129-
return status ? status.isFinished() : false
125+
boolean isDoneOrFailed() {
126+
return status ? status.isDoneOrFailed() : false
130127
}
131128

132129
static List<FloatJob> parseJobMap(String input) {

plugins/nf-float/src/main/com/memverge/nextflow/FloatJobs.groovy

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ class FloatJobs {
4444
}
4545

4646
TaskId getTaskID(String floatJobID) {
47-
// go through the all float jobs to find the task id
47+
// go through all float jobs to find the task id
4848
for (FloatJob job : nfJobID2FloatJob.values()) {
4949
if (job.floatJobID == floatJobID) {
5050
// extract the task id from the nfJobID
@@ -74,7 +74,7 @@ class FloatJobs {
7474
log.error "[FLOAT] job nf Job ID is null or empty for job ${job.floatJobID}"
7575
} else {
7676
FloatJob existingJob = nfJobID2FloatJob.get(job.nfJobID)
77-
if (existingJob != null && existingJob.finished) {
77+
if (existingJob != null && existingJob.doneOrFailed) {
7878
log.info "[FLOAT] job ${job.nfJobID} already finished, no need to update"
7979
job = existingJob
8080
} else {
@@ -83,7 +83,7 @@ class FloatJobs {
8383
}
8484
nfJobID2FloatJob.put(job.nfJobID, job)
8585
}
86-
if (job.finished) {
86+
if (job.doneOrFailed) {
8787
refreshWorkDir(job.nfJobID)
8888
}
8989
}

plugins/nf-float/src/main/com/memverge/nextflow/FloatTaskHandler.groovy

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,20 +73,25 @@ class FloatTaskHandler extends GridTaskHandler {
7373
boolean checkIfCompleted() {
7474
final FloatStatus st = floatExecutor.getJobStatus(task)
7575
log.debug "got status ${st} for ${task.id} from float executor"
76-
if (st.finished) {
77-
status = COMPLETED
76+
if (st.doneOrFailed) {
77+
this.status = COMPLETED
7878
task.exitStatus = readExitStatus()
7979
if (task.exitStatus == null) {
8080
log.debug "can't get ${task.id} exit status from file system"
8181
task.exitStatus = floatExecutor.getJobRC(task.id)
8282
}
8383
log.debug "set ${task.id} exit status to ${task.exitStatus}"
8484
if (task.exitStatus == null) {
85-
if (st.isError()) {
85+
task.exitStatus = 0
86+
87+
if (st == FloatStatus.ERROR) {
8688
task.exitStatus = 1
87-
} else {
88-
task.exitStatus = 0
8989
}
90+
91+
if (st == FloatStatus.NOAVAILABLEHOST) {
92+
task.exitStatus = 143
93+
}
94+
9095
log.info "both .exitcode and rc are empty for ${task.id}," +
9196
"set exit to ${task.exitStatus}"
9297
}

plugins/nf-float/src/test/com/memverge/nextflow/FloatJobTest.groovy

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,69 @@ class FloatJobTest extends Specification {
127127
job.status == FloatStatus.PENDING
128128
}
129129

130+
def "check job failed with NoAvailableHost"() {
131+
given:
132+
final out = """
133+
id: uqaj8l6a407u88v0106ol
134+
name: subread-3nyw14-r6a.2xlarge
135+
workingHost: ""
136+
category: failed
137+
status: NoAvailableHost
138+
cpu: 8
139+
duration: 1h5m49s
140+
queueTime: 0s
141+
submitTime: "2025-06-20T09:09:34Z"
142+
execTime: "2025-06-20T09:09:34Z"
143+
endTime: "2025-06-20T10:15:23Z"
144+
lastUpdate: "2025-06-20T10:15:23Z"
145+
memGB: 64
146+
scheduler: mmcloud
147+
imageID: quay.io/biocontainers/subread:2.0.6--he4a0461_2
148+
stdout: stdout.autosave
149+
stderr: stderr.autosave
150+
memUsed: 0.00 B
151+
cpuMax: 24
152+
memGBMax: 144
153+
instanceType: r6a.2xlarge
154+
coreHours: 0.6784628924888889
155+
""".stripIndent().trim()
156+
157+
when:
158+
final job = FloatJob.parse(out)
159+
160+
then:
161+
! job.isRunning()
162+
job.status == FloatStatus.NOAVAILABLEHOST
163+
}
164+
165+
def "check job failed with FailToExecute"() {
166+
given:
167+
final out = """
168+
id: 3cfmkg882qn09q9o8jfkf
169+
name: rseqc-2609uc-m5zn.3xlarge
170+
workingHost: ""
171+
category: failed
172+
status: FailToExecute
173+
cpu: 6
174+
duration: 0s
175+
queueTime: 0s
176+
submitTime: "2025-06-20T09:13:54Z"
177+
execTime: "2025-06-20T09:13:54Z"
178+
endTime: "2025-06-20T09:13:55Z"
179+
lastUpdate: "2025-06-20T09:13:55Z"
180+
memGB: 36
181+
cpuMax: 24
182+
memGBMax: 144
183+
""".stripIndent().trim()
184+
185+
when:
186+
final job = FloatJob.parse(out)
187+
188+
then:
189+
! job.isRunning()
190+
job.status == FloatStatus.ERROR
191+
}
192+
130193
def "get queue status"() {
131194
given:
132195
final out = """

0 commit comments

Comments
 (0)