Skip to content

Commit 2527c83

Browse files
authored
Merge pull request #5712 from chu11/issue5688_job_list_missing_queue_stats
job-list: initialize queue stats
2 parents e144432 + 7958552 commit 2527c83

File tree

7 files changed

+162
-15
lines changed

7 files changed

+162
-15
lines changed

src/bindings/python/flux/kvs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -795,6 +795,7 @@ def __del__(self):
795795
try:
796796
super().__del__()
797797
except AttributeError:
798+
# not an error if super did not implement
798799
pass
799800

800801
def __init__(self, future_handle):

src/modules/job-list/job-list.c

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,30 @@ static void disconnect_cb (flux_t *h,
107107
job_stats_disconnect (ctx->jsctx->statsctx, msg);
108108
}
109109

110+
static void config_reload_cb (flux_t *h,
111+
flux_msg_handler_t *mh,
112+
const flux_msg_t *msg,
113+
void *arg)
114+
{
115+
struct list_ctx *ctx = arg;
116+
const flux_conf_t *conf;
117+
flux_error_t error;
118+
const char *errstr = NULL;
119+
120+
if (flux_conf_reload_decode (msg, &conf) < 0)
121+
goto error;
122+
if (job_state_config_reload (ctx->jsctx, conf, &error) < 0) {
123+
errstr = error.text;
124+
goto error;
125+
}
126+
if (flux_respond (h, msg, NULL) < 0)
127+
flux_log_error (h, "error responding to config-reload request");
128+
return;
129+
error:
130+
if (flux_respond_error (h, msg, errno, errstr) < 0)
131+
flux_log_error (h, "error responding to config-reload request");
132+
}
133+
110134
static const struct flux_msg_handler_spec htab[] = {
111135
{ .typemask = FLUX_MSGTYPE_REQUEST,
112136
.topic_glob = "job-list.list",
@@ -148,6 +172,12 @@ static const struct flux_msg_handler_spec htab[] = {
148172
.cb = disconnect_cb,
149173
.rolemask = FLUX_ROLE_USER,
150174
},
175+
{
176+
.typemask = FLUX_MSGTYPE_REQUEST,
177+
.topic_glob = "job-list.config-reload",
178+
.cb = config_reload_cb,
179+
.rolemask = 0
180+
},
151181
FLUX_MSGHANDLER_TABLE_END,
152182
};
153183

src/modules/job-list/job_state.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1798,6 +1798,13 @@ void job_state_destroy (void *data)
17981798
}
17991799
}
18001800

1801+
int job_state_config_reload (struct job_state_ctx *jsctx,
1802+
const flux_conf_t *conf,
1803+
flux_error_t *errp)
1804+
{
1805+
return job_stats_config_reload (jsctx->statsctx, conf, errp);
1806+
}
1807+
18011808
/*
18021809
* vi:tabstop=4 shiftwidth=4 expandtab
18031810
*/

src/modules/job-list/job_state.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ void job_state_unpause_cb (flux_t *h, flux_msg_handler_t *mh,
7272

7373
int job_state_init_from_kvs (struct job_state_ctx *jsctx);
7474

75+
int job_state_config_reload (struct job_state_ctx *jsctx,
76+
const flux_conf_t *conf,
77+
flux_error_t *errp);
78+
7579
#endif /* ! _FLUX_JOB_LIST_JOB_STATE_H */
7680

7781
/*

src/modules/job-list/stats.c

Lines changed: 64 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -45,18 +45,19 @@ static void free_wrapper (void **item)
4545
}
4646

4747
static struct job_stats *queue_stats_lookup (struct job_stats_ctx *statsctx,
48-
struct job *job)
48+
const char *name,
49+
bool create_if_missing)
4950
{
5051
struct job_stats *stats = NULL;
5152

52-
if (!job->queue)
53+
if (!name)
5354
return NULL;
5455

55-
stats = zhashx_lookup (statsctx->queue_stats, job->queue);
56-
if (!stats) {
56+
stats = zhashx_lookup (statsctx->queue_stats, name);
57+
if (!stats && create_if_missing) {
5758
if (!(stats = calloc (1, sizeof (*stats))))
5859
return NULL;
59-
(void)zhashx_insert (statsctx->queue_stats, job->queue, stats);
60+
(void)zhashx_insert (statsctx->queue_stats, name, stats);
6061
}
6162
return stats;
6263
}
@@ -127,7 +128,7 @@ void job_stats_update (struct job_stats_ctx *statsctx,
127128

128129
stats_update (&statsctx->all, job, newstate);
129130

130-
if ((stats = queue_stats_lookup (statsctx, job)))
131+
if ((stats = queue_stats_lookup (statsctx, job->queue, true)))
131132
stats_update (stats, job, newstate);
132133

133134
arm_timer (statsctx);
@@ -138,7 +139,7 @@ void job_stats_add_queue (struct job_stats_ctx *statsctx,
138139
{
139140
struct job_stats *stats;
140141

141-
if ((stats = queue_stats_lookup (statsctx, job)))
142+
if ((stats = queue_stats_lookup (statsctx, job->queue, true)))
142143
stats_add (stats, job, job->state);
143144

144145
arm_timer (statsctx);
@@ -174,9 +175,16 @@ void job_stats_remove_queue (struct job_stats_ctx *statsctx,
174175
{
175176
struct job_stats *stats;
176177

177-
if ((stats = queue_stats_lookup (statsctx, job)))
178-
stats_remove (stats, job);
178+
if (!(stats = queue_stats_lookup (statsctx, job->queue, false))) {
179+
if (job->queue)
180+
flux_log (statsctx->h,
181+
LOG_DEBUG,
182+
"no queue stats for %s",
183+
job->queue);
184+
return;
185+
}
179186

187+
stats_remove (stats, job);
180188
arm_timer (statsctx);
181189
}
182190

@@ -211,9 +219,16 @@ void job_stats_purge (struct job_stats_ctx *statsctx, struct job *job)
211219

212220
stats_purge (&statsctx->all, job);
213221

214-
if ((stats = queue_stats_lookup (statsctx, job)))
215-
stats_purge (stats, job);
222+
if (!(stats = queue_stats_lookup (statsctx, job->queue, false))) {
223+
if (job->queue)
224+
flux_log (statsctx->h,
225+
LOG_DEBUG,
226+
"no queue stats for %s",
227+
job->queue);
228+
return;
229+
}
216230

231+
stats_purge (stats, job);
217232
arm_timer (statsctx);
218233
}
219234

@@ -408,6 +423,36 @@ static void job_stats_cb (flux_t *h,
408423
flux_log_error (h, "error responding to job-stats request");
409424
}
410425

426+
static int config_parse_queues (struct job_stats_ctx *statsctx,
427+
const flux_conf_t *conf,
428+
flux_error_t *errp)
429+
{
430+
json_t *queues;
431+
432+
if (flux_conf_unpack (conf, NULL, "{s:o}", "queues", &queues) == 0
433+
&& json_object_size (queues) > 0) {
434+
const char *name;
435+
json_t *value;
436+
json_object_foreach (queues, name, value) {
437+
/* setup initial queue stats, so that user gets initial
438+
* stats before first job is submitted to the queue */
439+
if (!queue_stats_lookup (statsctx, name, true)) {
440+
flux_log_error (statsctx->h, "queue_stats_lookup");
441+
return -1;
442+
}
443+
}
444+
}
445+
446+
return 0;
447+
}
448+
449+
int job_stats_config_reload (struct job_stats_ctx *statsctx,
450+
const flux_conf_t *conf,
451+
flux_error_t *errp)
452+
{
453+
return config_parse_queues (statsctx, conf, errp);
454+
}
455+
411456
static const struct flux_msg_handler_spec htab[] = {
412457
{ .typemask = FLUX_MSGTYPE_REQUEST,
413458
.topic_glob = "job-list.job-stats",
@@ -420,6 +465,7 @@ static const struct flux_msg_handler_spec htab[] = {
420465
struct job_stats_ctx *job_stats_ctx_create (flux_t *h)
421466
{
422467
struct job_stats_ctx *statsctx = NULL;
468+
flux_error_t error;
423469

424470
if (!(statsctx = calloc (1, sizeof (*statsctx))))
425471
return NULL;
@@ -441,6 +487,13 @@ struct job_stats_ctx *job_stats_ctx_create (flux_t *h)
441487
statsctx)))
442488
goto error;
443489

490+
if (config_parse_queues (statsctx,
491+
flux_get_conf (statsctx->h),
492+
&error) < 0) {
493+
flux_log (statsctx->h, LOG_ERR, "%s", error.text);
494+
goto error;
495+
}
496+
444497
return statsctx;
445498

446499
error:

src/modules/job-list/stats.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,10 @@ void job_stats_disconnect (struct job_stats_ctx *statsctx,
5151
*/
5252
int job_stats_watchers (struct job_stats_ctx *statsctx);
5353

54+
int job_stats_config_reload (struct job_stats_ctx *statsctx,
55+
const flux_conf_t *conf,
56+
flux_error_t *errp);
57+
5458
#endif /* ! _FLUX_JOB_LIST_JOB_STATS_H */
5559

5660
// vi: ts=4 sw=4 expandtab

t/t2260-job-list.t

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2132,10 +2132,14 @@ test_expect_success 'job-list parses flux-restart events' '
21322132
# jobspec-update event testing
21332133
#
21342134

2135+
# N.B. "defaultqueue" will have jobs "submitted" to it, but updates
2136+
# make it appear that jobs were not submitted to it. "nosubmitqueue"
2137+
# will never have jobs submitted to it.
21352138
test_expect_success 'configure update queues' '
21362139
flux config load <<-EOT &&
21372140
[queues.defaultqueue]
21382141
[queues.updatequeue]
2142+
[queues.nosubmitqueue]
21392143
EOT
21402144
flux queue start --all
21412145
'
@@ -2173,9 +2177,15 @@ test_expect_success 'job-list returns expected jobspec changes' '
21732177
flux job list -s inactive | grep $(cat update1.id) | jq -e ".duration == 1000.0"
21742178
'
21752179

2180+
#
2181+
# "nosubmitqueue" never has a job submitted to it; make sure stats
2182+
# string is non-empty and filled with zeros.
2183+
#
21762184
test_expect_success 'job stats lists jobs in correct state in each queue' '
21772185
defaultq=`flux job stats | jq ".queues[] | select( .name == \"defaultqueue\" )"` &&
21782186
updateq=`flux job stats | jq ".queues[] | select( .name == \"updatequeue\" )"` &&
2187+
nosubmitq=`flux job stats | jq ".queues[] | select( .name == \"nosubmitqueue\" )"` &&
2188+
test -n "$nosubmitq" &&
21792189
echo $defaultq | jq -e ".job_states.depend == 0" &&
21802190
echo $defaultq | jq -e ".job_states.priority == 0" &&
21812191
echo $defaultq | jq -e ".job_states.sched == 0" &&
@@ -2199,7 +2209,19 @@ test_expect_success 'job stats lists jobs in correct state in each queue' '
21992209
echo $updateq | jq -e ".failed == 0" &&
22002210
echo $updateq | jq -e ".canceled == 0" &&
22012211
echo $updateq | jq -e ".timeout == 0" &&
2202-
echo $updateq | jq -e ".inactive_purged == 0"
2212+
echo $updateq | jq -e ".inactive_purged == 0" &&
2213+
echo $nosubmitq | jq -e ".job_states.depend == 0" &&
2214+
echo $nosubmitq | jq -e ".job_states.priority == 0" &&
2215+
echo $nosubmitq | jq -e ".job_states.sched == 0" &&
2216+
echo $nosubmitq | jq -e ".job_states.run == 0" &&
2217+
echo $nosubmitq | jq -e ".job_states.cleanup == 0" &&
2218+
echo $nosubmitq | jq -e ".job_states.inactive == 0" &&
2219+
echo $nosubmitq | jq -e ".job_states.total == 0" &&
2220+
echo $nosubmitq | jq -e ".successful == 0" &&
2221+
echo $nosubmitq | jq -e ".failed == 0" &&
2222+
echo $nosubmitq | jq -e ".canceled == 0" &&
2223+
echo $nosubmitq | jq -e ".timeout == 0" &&
2224+
echo $nosubmitq | jq -e ".inactive_purged == 0"
22032225
'
22042226

22052227
test_expect_success 'reload the job-list module' '
@@ -2215,12 +2237,26 @@ test_expect_success 'job-list returns expected jobspec changes after reload' '
22152237

22162238
#
22172239
# After reload, job-list will not have ever seen any jobs submitted to
2218-
# "defaultqueue" therefore the stats object is empty.
2240+
# "defaultqueue", need to check that stats string is non-empty and all
2241+
# stats are 0.
22192242
#
22202243
test_expect_success 'job stats in each queue correct after reload' '
22212244
defaultq=`flux job stats | jq ".queues[] | select( .name == \"defaultqueue\" )"` &&
22222245
updateq=`flux job stats | jq ".queues[] | select( .name == \"updatequeue\" )"` &&
2223-
test -z "$defaultq" &&
2246+
test -n "$defaultq" &&
2247+
test -n "$nosubmitq" &&
2248+
echo $defaultq | jq -e ".job_states.depend == 0" &&
2249+
echo $defaultq | jq -e ".job_states.priority == 0" &&
2250+
echo $defaultq | jq -e ".job_states.sched == 0" &&
2251+
echo $defaultq | jq -e ".job_states.run == 0" &&
2252+
echo $defaultq | jq -e ".job_states.cleanup == 0" &&
2253+
echo $defaultq | jq -e ".job_states.inactive == 0" &&
2254+
echo $defaultq | jq -e ".job_states.total == 0" &&
2255+
echo $defaultq | jq -e ".successful == 0" &&
2256+
echo $defaultq | jq -e ".failed == 0" &&
2257+
echo $defaultq | jq -e ".canceled == 0" &&
2258+
echo $defaultq | jq -e ".timeout == 0" &&
2259+
echo $defaultq | jq -e ".inactive_purged == 0" &&
22242260
echo $updateq | jq -e ".job_states.depend == 0" &&
22252261
echo $updateq | jq -e ".job_states.priority == 0" &&
22262262
echo $updateq | jq -e ".job_states.sched == 0" &&
@@ -2232,7 +2268,19 @@ test_expect_success 'job stats in each queue correct after reload' '
22322268
echo $updateq | jq -e ".failed == 0" &&
22332269
echo $updateq | jq -e ".canceled == 0" &&
22342270
echo $updateq | jq -e ".timeout == 0" &&
2235-
echo $updateq | jq -e ".inactive_purged == 0"
2271+
echo $updateq | jq -e ".inactive_purged == 0" &&
2272+
echo $nosubmitq | jq -e ".job_states.depend == 0" &&
2273+
echo $nosubmitq | jq -e ".job_states.priority == 0" &&
2274+
echo $nosubmitq | jq -e ".job_states.sched == 0" &&
2275+
echo $nosubmitq | jq -e ".job_states.run == 0" &&
2276+
echo $nosubmitq | jq -e ".job_states.cleanup == 0" &&
2277+
echo $nosubmitq | jq -e ".job_states.inactive == 0" &&
2278+
echo $nosubmitq | jq -e ".job_states.total == 0" &&
2279+
echo $nosubmitq | jq -e ".successful == 0" &&
2280+
echo $nosubmitq | jq -e ".failed == 0" &&
2281+
echo $nosubmitq | jq -e ".canceled == 0" &&
2282+
echo $nosubmitq | jq -e ".timeout == 0" &&
2283+
echo $nosubmitq | jq -e ".inactive_purged == 0"
22362284
'
22372285

22382286
test_expect_success 'remove jobtap plugins and remove queue config' '

0 commit comments

Comments
 (0)