Skip to content

Commit a443ca7

Browse files
committed
job-manager: do not send purged events
Problem: After a job has been purged from the job-manager, events for that job may still exist in the journal, and those stale events can be sent to clients that request the journal. Solution: When a new journal request is made, check that the job each journal event belongs to still exists (i.e. is still in the active or inactive job table) before sending the event. Fixes #4331
1 parent 152f82f commit a443ca7

File tree

3 files changed

+69
-1
lines changed

3 files changed

+69
-1
lines changed

src/modules/job-manager/journal.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,15 +185,22 @@ static void journal_handle_request (flux_t *h,
185185
wrapped_entry = zlist_first (journal->events);
186186
while (wrapped_entry) {
187187
const char *name;
188+
flux_jobid_t id;
188189

189190
if (json_unpack (wrapped_entry,
190-
"{s:{s:s}}",
191+
"{s:I s:{s:s}}",
192+
"id", &id,
191193
"entry",
192194
"name", &name) < 0) {
193195
flux_log (h, LOG_ERR, "invalid wrapped entry");
194196
goto error;
195197
}
196198

199+
/* ensure job has not been purged */
200+
if (!zhashx_lookup (ctx->active_jobs, &id)
201+
&& !zhashx_lookup (ctx->inactive_jobs, &id))
202+
goto next;
203+
197204
if (allow_deny_check (msg, name)) {
198205
if (!a) {
199206
if (!(a = json_array ()))
@@ -202,6 +209,7 @@ static void journal_handle_request (flux_t *h,
202209
if (json_array_append (a, wrapped_entry) < 0)
203210
goto nomem;
204211
}
212+
next:
205213
wrapped_entry = zlist_next (journal->events);
206214
}
207215

t/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,7 @@ dist_check_SCRIPTS = \
290290
issues/t4182-resource-rerank.sh \
291291
issues/t4184-sched-simple-restart.sh \
292292
issues/t4222-kvs-assume-empty-dir.sh \
293+
issues/t4331-job-manager-purged-events.sh \
293294
python/__init__.py \
294295
python/subflux.py \
295296
python/tap \
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
#!/bin/bash -e
2+
3+
# test-prereqs: HAVE_JQ
4+
5+
# Helper program from the build tree.  Below it is fed a JSON request
# on stdin, its stdout is captured to a file, and it is stopped with
# SIGUSR1.  NOTE(review): presumably it streams job-manager journal
# events to stdout -- confirm against t/job-manager/events_journal_stream.
EVENTS_JOURNAL_STREAM=${FLUX_BUILD_DIR}/t/job-manager/events_journal_stream
6+
7+
# Wait for a jobid to show up in an events file, to avoid racing with
# the background process that is still writing that file.
#   arg1 - file to search
#   arg2 - jobid (used as the grep pattern)
# Checks the file up to 30 times, sleeping 1 second after each miss.
# Returns 0 as soon as the jobid is found, 1 if it never appears.
wait_jobid_event() {
    local file=$1
    local jobid=$2
    local i
    for i in $(seq 1 30)
    do
        # Return from inside the loop: deciding success by inspecting
        # the loop counter afterwards would misreport a match found on
        # the final attempt as a timeout.
        if grep -q "${jobid}" "${file}"
        then
            return 0
        fi
        sleep 1
    done
    return 1
}
27+
28+
# Submit two jobs with --wait, one after the other, and record each
# jobid in decimal form, which is what gets searched for in the
# captured journal output.
jobid1=$(flux mini submit --wait hostname)
jobid2=$(flux mini submit --wait hostname)
jobid1dec=$(flux job id --to=dec $jobid1)
jobid2dec=$(flux job id --to=dec $jobid2)

# First journal request: capture the stream in the background.
jq -j -c -n "{}" | $EVENTS_JOURNAL_STREAM > events1.out &
pid1=$!

# Block until the second job is visible in the capture, then stop
# the background stream.
wait_jobid_event events1.out $jobid2dec
kill -s SIGUSR1 $pid1

# jobid1 completed before jobid2, so once jobid2 is in the journal
# output jobid1 must be there as well.
grep $jobid1dec events1.out

# Purge down to a single job; jobid1 is expected to be the one removed.
flux job purge --force --num-limit=1

# Second journal request, made after the purge.
jq -j -c -n "{}" | $EVENTS_JOURNAL_STREAM > events2.out &
pid2=$!

wait_jobid_event events2.out $jobid2dec
kill -s SIGUSR1 $pid2

# The purged jobid1 must no longer appear in the journal output.
if grep $jobid1dec events2.out
then
    exit 1
fi

0 commit comments

Comments
 (0)