Skip to content

Commit b0baf4e

Browse files
authored
Merge pull request #5150 from garlick/issue#5147
job-manager: make some replay errors non-fatal
2 parents ebd4459 + fc5b7ab commit b0baf4e

File tree

5 files changed

+48
-7
lines changed

5 files changed

+48
-7
lines changed

src/modules/job-list/job_state.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -673,7 +673,7 @@ static int depthfirst_map_one (struct job_state_ctx *jsctx,
673673
flux_future_t *f3 = NULL;
674674
const char *eventlog, *jobspec, *R;
675675
char path[64];
676-
int rc = -1;
676+
int rc = 0; // default to 0 (non-fatal error); set to 1 on success
677677

678678
if (strlen (key) <= dirskip) {
679679
errno = EINVAL;
@@ -714,7 +714,7 @@ static int depthfirst_map_one (struct job_state_ctx *jsctx,
714714
if (job->states_mask & FLUX_JOB_STATE_RUN) {
715715
if (flux_job_kvs_key (path, sizeof (path), id, "R") < 0) {
716716
errno = EINVAL;
717-
return -1;
717+
goto done;
718718
}
719719
if (!(f3 = flux_kvs_lookup (jsctx->h, NULL, 0, path)))
720720
goto done;
@@ -736,7 +736,7 @@ static int depthfirst_map_one (struct job_state_ctx *jsctx,
736736

737737
rc = 1;
738738
done:
739-
if (rc < 0)
739+
if (rc == 0)
740740
job_destroy (job);
741741
flux_future_destroy (f1);
742742
flux_future_destroy (f2);

src/modules/job-manager/restart.c

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,10 @@ int restart_count_char (const char *s, char c)
4646
return count;
4747
}
4848

49-
static struct job *lookup_job (flux_t *h, flux_jobid_t id, flux_error_t *error)
49+
static struct job *lookup_job (flux_t *h,
50+
flux_jobid_t id,
51+
flux_error_t *error,
52+
bool *fatal)
5053
{
5154
flux_future_t *f1 = NULL;
5255
flux_future_t *f2 = NULL;
@@ -63,24 +66,33 @@ static struct job *lookup_job (flux_t *h, flux_jobid_t id, flux_error_t *error)
6366
"cannot send lookup requests for job %ju: %s",
6467
(uintmax_t)id,
6568
strerror (errno));
69+
*fatal = true;
6670
goto done;
6771
}
6872
if (flux_kvs_lookup_get (f1, &eventlog) < 0) {
6973
errprintf (error, "lookup %s: %s", k1, strerror (errno));
74+
*fatal = false;
7075
goto done;
7176
}
7277
if (flux_kvs_lookup_get (f2, &jobspec) < 0) {
7378
errprintf (error, "lookup %s: %s", k2, strerror (errno));
79+
*fatal = false;
7480
goto done;
7581
}
76-
if (!(job = job_create_from_eventlog (id, eventlog, jobspec, &e)))
82+
if (!(job = job_create_from_eventlog (id, eventlog, jobspec, &e))) {
7783
errprintf (error, "replay %s: %s", k1, e.text);
84+
*fatal = true;
85+
}
7886
done:
7987
flux_future_destroy (f1);
8088
flux_future_destroy (f2);
8189
return job;
8290
}
8391

92+
/* Create a 'struct job' from the KVS, using synchronous KVS RPCs.
93+
* Return 1 on success, 0 on non-fatal error (logged as "not replayed"), or -1 on a fatal error,
94+
* where a fatal error will prevent flux from starting.
95+
*/
8496
static int depthfirst_map_one (flux_t *h,
8597
const char *key,
8698
int dirskip,
@@ -101,8 +113,22 @@ static int depthfirst_map_one (flux_t *h,
101113
errprintf (error, "could not decode %s to job ID", key + dirskip + 1);
102114
return -1;
103115
}
104-
if (!(job = lookup_job (h, id, error)))
105-
return -1;
116+
117+
flux_error_t lookup_error;
118+
bool fatal = false;
119+
if (!(job = lookup_job (h, id, &lookup_error, &fatal))) {
120+
if (fatal) {
121+
errprintf (error, "%s", lookup_error.text);
122+
return -1;
123+
}
124+
flux_log (h,
125+
LOG_ERR,
126+
"job %ju not replayed: %s",
127+
(uintmax_t)id,
128+
lookup_error.text);
129+
return 0;
130+
}
131+
106132
if (cb (job, arg, error) < 0)
107133
goto done;
108134
rc = 1;
File renamed without changes.
File renamed without changes.

t/t2219-job-manager-restart.t

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,16 @@ restart_flux() {
1919
flux module stats job-manager
2020
}
2121

22+
# Returns 0 if flux starts successfully from the dump file ($1) AND one or more
23+
# "not replayed" warnings were logged
24+
restart_with_job_warning() {
25+
local out=$(basename $1).dmesg
26+
flux start -o,-Scontent.restore=$1 /bin/true 2>$out
27+
result=$?
28+
cat $out
29+
test $result -eq 0 && grep -q "not replayed:" $out
30+
}
31+
2232
test_expect_success 'verify that job manager can restart with current dump' '
2333
restart_flux dump.tar >stats.out
2434
'
@@ -193,6 +203,11 @@ for dump in ${DUMPS}/valid/*.tar.bz2; do
193203
test_expect_success 'successfully started from '$testname "restart_flux $dump"
194204
done
195205

206+
for dump in ${DUMPS}/warn/*.tar.bz2; do
207+
testname=$(basename $dump)
208+
test_expect_success 'successfully started with warning from '$testname "restart_with_job_warning $dump"
209+
done
210+
196211
for dump in ${DUMPS}/invalid/*.tar.bz2; do
197212
testname=$(basename $dump)
198213
test_expect_success 'failed on '$testname "test_must_fail restart_flux $dump"

0 commit comments

Comments
 (0)