Skip to content

Commit 57234a2

Browse files
authored
[cuebot] Improve nimby retry logic (AcademySoftwareFoundation#2169)
Frames that were killed with SIGTERM because their executing host became nimby locked should always be retried, regardless of their layer maxretries limit. Also increase FRAME_TIME_NO_RETRY (from 3600 * 8 to 3600 * 12) so that long-running frames are still retried.
1 parent b8396bb commit 57234a2

File tree

2 files changed

+6
-2
lines changed

2 files changed

+6
-2
lines changed

cuebot/src/main/java/com/imageworks/spcue/dispatcher/Dispatcher.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ public interface Dispatcher {
8181
public static final int DOCKER_EXIT_STATUS_MEMORY_FAILURE = 137;
8282

8383
// max retry time
84-
public static final int FRAME_TIME_NO_RETRY = 3600 * 8;
84+
public static final int FRAME_TIME_NO_RETRY = 3600 * 12;
8585

8686
// The maximum amount of virtual memory a frame can be using
8787
// without being penalized for it.

cuebot/src/main/java/com/imageworks/spcue/dispatcher/FrameCompleteHandler.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -609,13 +609,17 @@ else if (frame.state.equals(FrameState.DEAD)) {
609609
report = FrameCompleteReport.newBuilder(report)
610610
.setExitStatus(FrameExitStatus.SKIP_RETRY_VALUE).build();
611611
newState = FrameState.WAITING;
612-
// exemption code 256
613612
} else if ((report.getExitStatus() == FrameExitStatus.FAILED_LAUNCH_VALUE
614613
|| report.getExitSignal() == FrameExitStatus.FAILED_LAUNCH_VALUE)
615614
&& (frame.retries < job.maxRetries)) {
615+
// exemption code 256
616616
report = FrameCompleteReport.newBuilder(report)
617617
.setExitStatus(report.getExitStatus()).build();
618618
newState = FrameState.WAITING;
619+
} else if (report.getHost().getNimbyLocked() && report.getExitSignal() == 15) {
620+
// If frame got killed because the host was nimby locked,
621+
// retry even if retry count is higher than maxretrycount
622+
newState = FrameState.WAITING;
619623
} else if (job.autoEat) {
620624
newState = FrameState.EATEN;
621625
// ETC Time out and LLU timeout

0 commit comments

Comments
 (0)