Skip to content

Commit fc5b7ab

Browse files
garlick authored and chu11 committed
job-manager: make some replay errors non-fatal
Problem: if a few jobs get messed up in the KVS due to an improper shutdown, recovery is a tedious process involving starting flux in --recovery mode, fixing one job, and starting again.

When a job cannot be replayed from the KVS and the reason is that the directory is incomplete, log the failure at LOG_ERR level but let replay continue and ultimately the flux restart be successful.

If a job has more serious problems like incorrect content in the eventlog, treat that as a fatal error as before. This avoids breaking the 'valid' tests that check backwards compatibility with older kvs dumps, which might use an older eventlog format.

Update t2219-job-manager-restart.t to expect warnings rather than failure when such jobs are encountered during replay.

Fixes #5147
1 parent d8f2f10 commit fc5b7ab

File tree

4 files changed

+45
-4
lines changed

4 files changed

+45
-4
lines changed

src/modules/job-manager/restart.c

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,10 @@ int restart_count_char (const char *s, char c)
4646
return count;
4747
}
4848

49-
static struct job *lookup_job (flux_t *h, flux_jobid_t id, flux_error_t *error)
49+
static struct job *lookup_job (flux_t *h,
50+
flux_jobid_t id,
51+
flux_error_t *error,
52+
bool *fatal)
5053
{
5154
flux_future_t *f1 = NULL;
5255
flux_future_t *f2 = NULL;
@@ -63,24 +66,33 @@ static struct job *lookup_job (flux_t *h, flux_jobid_t id, flux_error_t *error)
6366
"cannot send lookup requests for job %ju: %s",
6467
(uintmax_t)id,
6568
strerror (errno));
69+
*fatal = true;
6670
goto done;
6771
}
6872
if (flux_kvs_lookup_get (f1, &eventlog) < 0) {
6973
errprintf (error, "lookup %s: %s", k1, strerror (errno));
74+
*fatal = false;
7075
goto done;
7176
}
7277
if (flux_kvs_lookup_get (f2, &jobspec) < 0) {
7378
errprintf (error, "lookup %s: %s", k2, strerror (errno));
79+
*fatal = false;
7480
goto done;
7581
}
76-
if (!(job = job_create_from_eventlog (id, eventlog, jobspec, &e)))
82+
if (!(job = job_create_from_eventlog (id, eventlog, jobspec, &e))) {
7783
errprintf (error, "replay %s: %s", k1, e.text);
84+
*fatal = true;
85+
}
7886
done:
7987
flux_future_destroy (f1);
8088
flux_future_destroy (f2);
8189
return job;
8290
}
8391

92+
/* Create a 'struct job' from the KVS, using synchronous KVS RPCs.
93+
* Return 1 on success, 0 on non-fatal error, or -1 on a fatal error,
94+
* where a fatal error will prevent flux from starting.
95+
*/
8496
static int depthfirst_map_one (flux_t *h,
8597
const char *key,
8698
int dirskip,
@@ -101,8 +113,22 @@ static int depthfirst_map_one (flux_t *h,
101113
errprintf (error, "could not decode %s to job ID", key + dirskip + 1);
102114
return -1;
103115
}
104-
if (!(job = lookup_job (h, id, error)))
105-
return -1;
116+
117+
flux_error_t lookup_error;
118+
bool fatal = false;
119+
if (!(job = lookup_job (h, id, &lookup_error, &fatal))) {
120+
if (fatal) {
121+
errprintf (error, "%s", lookup_error.text);
122+
return -1;
123+
}
124+
flux_log (h,
125+
LOG_ERR,
126+
"job %ju not replayed: %s",
127+
(uintmax_t)id,
128+
lookup_error.text);
129+
return 0;
130+
}
131+
106132
if (cb (job, arg, error) < 0)
107133
goto done;
108134
rc = 1;
File renamed without changes.
File renamed without changes.

t/t2219-job-manager-restart.t

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,16 @@ restart_flux() {
1919
flux module stats job-manager
2020
}
2121

22+
# Returns 0 if dump file is replayed successfully AND one or more
23+
# "not replayed" warnings were logged
24+
restart_with_job_warning() {
25+
local out=$(basename $1).dmesg
26+
flux start -o,-Scontent.restore=$1 /bin/true 2>$out
27+
result=$?
28+
cat $out
29+
test $result -eq 0 && grep -q "not replayed:" $out
30+
}
31+
2232
test_expect_success 'verify that job manager can restart with current dump' '
2333
restart_flux dump.tar >stats.out
2434
'
@@ -193,6 +203,11 @@ for dump in ${DUMPS}/valid/*.tar.bz2; do
193203
test_expect_success 'successfully started from '$testname "restart_flux $dump"
194204
done
195205

206+
for dump in ${DUMPS}/warn/*.tar.bz2; do
207+
testname=$(basename $dump)
208+
test_expect_success 'successfully started with warning from '$testname "restart_with_job_warning $dump"
209+
done
210+
196211
for dump in ${DUMPS}/invalid/*.tar.bz2; do
197212
testname=$(basename $dump)
198213
test_expect_success 'failed on '$testname "test_must_fail restart_flux $dump"

0 commit comments

Comments
 (0)