Skip to content

Commit 2e260eb

Browse files
authored
add retry for LSF (#209)
1 parent 4b72f11 commit 2e260eb

File tree

1 file changed

+26
-5
lines changed

1 file changed

+26
-5
lines changed

dpdispatcher/lsf.py

Lines changed: 26 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,5 @@
1+
import time
2+
13
from dpdispatcher.machine import Machine
24
from dpdispatcher import dlog
35
from dpdispatcher.JobStatus import JobStatus
@@ -62,14 +64,25 @@ def gen_script_header(self, job):
6264

6365
return lsf_script_header
6466

65-
def do_submit(self, job):
67+
def do_submit(self, job, retry=0, max_retry=3):
6668
script_file_name = job.script_file_name
6769
script_str = self.gen_script(job)
6870
job_id_name = job.job_hash + '_job_id'
6971
self.context.write_file(fname=script_file_name, write_str=script_str)
70-
stdin, stdout, stderr = self.context.block_checkcall(
71-
'cd %s && %s %s' % (self.context.remote_root, 'bsub < ', script_file_name)
72-
)
72+
73+
try:
74+
stdin, stdout, stderr = self.context.block_checkcall(
75+
'cd %s && %s %s' % (self.context.remote_root, 'bsub < ', script_file_name)
76+
)
77+
except RuntimeError as err:
78+
if retry < max_retry:
79+
dlog.warning(err)
80+
dlog.warning("Sleep 60 s and retry submitting...")
81+
# rest 60s
82+
time.sleep(60)
83+
return self.do_submit(job, retry=retry+1, max_retry=max_retry)
84+
raise
85+
7386
subret = (stdout.readlines())
7487
job_id = subret[0].split()[1][1:-1]
7588
self.context.write_file(job_id_name, job_id)
@@ -85,7 +98,7 @@ def sub_script_cmd(self, res):
8598
def sub_script_head(self, res):
8699
pass
87100

88-
def check_status(self, job):
101+
def check_status(self, job, retry=0, max_retry=3):
89102
try:
90103
job_id = job.job_id
91104
except AttributeError:
@@ -101,6 +114,14 @@ def check_status(self, job):
101114
else:
102115
return JobStatus.terminated
103116
elif ret != 0:
117+
# just retry when any unknown error raised.
118+
if retry < max_retry:
119+
dlog.warning("Get error code %d in checking status through ssh with job: %s . message: %s" %
120+
(ret, job.job_hash, err_str))
121+
dlog.warning("Sleep 60 s and retry checking...")
122+
# rest 60s
123+
time.sleep(60)
124+
return self.check_status(job, retry=retry + 1, max_retry=max_retry)
104125
raise RuntimeError("status command bjobs fails to execute.\n error info: %s \nreturn code %d\n"
105126
% (err_str, ret))
106127
status_out = stdout.read().decode('utf-8').split('\n')

0 commit comments

Comments
 (0)