Skip to content

Commit 665d6ee

Browse files
authored
Merge pull request #5048 from chu11/issue5029_job_list_stats_consistency
job-list: make job stats consistent to job results
2 parents 920edbb + a77fcb3 commit 665d6ee

File tree

5 files changed

+30
-8
lines changed

5 files changed

+30
-8
lines changed

doc/man1/flux-jobs.rst

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,9 @@ OPTIONS
104104
will display a summary of statistics along with the top 25
105105
running jobs, updated every 2 seconds.
106106

107+
Note that all job failures, including canceled and timed-out jobs,
108+
are collectively counted as "failed" in ``--stats``.
109+
107110
**--stats-only**
108111
Output a summary of job statistics and exit. By default shows
109112
global statistics. If ``--queue`` is specified, shows statistics
@@ -116,6 +119,9 @@ OPTIONS
116119
All options other than ``--queue`` are ignored when
117120
``--stats-only`` is used.
118121

122+
Note that all job failures, including canceled and timed-out jobs,
123+
are collectively counted as "failed" in ``--stats-only``.
124+
119125
**-R, --recursive**
120126
List jobs recursively. Each child job which is also an instance of
121127
Flux is prefixed by its jobid "path" followed by the list of jobs,
@@ -156,7 +162,7 @@ states also exist: "pending", an alias for DEPEND,SCHED; "running", an
156162
alias for RUN,CLEANUP; "active", an alias for "pending,running".
157163

158164
After a job has finished and is in the INACTIVE state, it can be
159-
marked with one of three possible results: COMPLETED, FAILED,
165+
marked with one of the possible results: COMPLETED, FAILED,
160166
CANCELED, TIMEOUT. Under the *result_abbrev* field name, these are
161167
abbreviated as CD, F, CA, and TO respectively.
162168

src/bindings/python/flux/job/stats.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,12 @@ def _update_cb(self, rpc):
7878
self.running = self.run + self.cleanup
7979
self.active = self.total - self.inactive
8080

81+
# This class reports the total number of unsuccessful jobs in
82+
# the 'failed' attribute, not just the count of jobs that ran
83+
# to completion with a nonzero exit code
84+
self.failed += self.timeout
85+
self.failed += self.canceled
86+
8187
if self.callback:
8288
self.callback(self, **self.cb_kwargs)
8389

src/cmd/top/summary_pane.c

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,11 @@ static void draw_stats (struct summary_pane *sum)
152152
}
153153

154154
if (sum->show_details) {
155-
int failed = sum->stats.failed;
155+
/* flux-top reports the total number of unsuccessful jobs in
156+
* the 'failed' display, not just the count of jobs that ran
157+
* to completion with a nonzero exit code
158+
*/
159+
int failed = sum->stats.failed + sum->stats.timeout + sum->stats.canceled;
156160
int complete = sum->stats.successful;
157161

158162
if (complete)

src/modules/job-list/stats.c

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,13 +112,16 @@ static void stats_add (struct job_stats *stats,
112112

113113
if (state == FLUX_JOB_STATE_INACTIVE) {
114114
if (!job->success) {
115-
stats->failed++;
116115
if (job->exception_occurred) {
117116
if (streq (job->exception_type, "cancel"))
118117
stats->canceled++;
119118
else if (streq (job->exception_type, "timeout"))
120119
stats->timeout++;
120+
else
121+
stats->failed++;
121122
}
123+
else
124+
stats->failed++;
122125
}
123126
else
124127
stats->successful++;
@@ -162,13 +165,16 @@ static void stats_purge (struct job_stats *stats, struct job *job)
162165
stats->state_count[state_index (job->state)]--;
163166

164167
if (!job->success) {
165-
stats->failed--;
166168
if (job->exception_occurred) {
167169
if (streq (job->exception_type, "cancel"))
168170
stats->canceled--;
169171
else if (streq (job->exception_type, "timeout"))
170172
stats->timeout--;
173+
else
174+
stats->failed--;
171175
}
176+
else
177+
stats->failed--;
172178
}
173179
else
174180
stats->successful--;

t/t2260-job-list.t

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -352,7 +352,7 @@ test_expect_success 'job stats lists jobs in correct state (mix)' '
352352
flux job stats | jq -e ".job_states.inactive == $(job_list_state_count inactive)" &&
353353
flux job stats | jq -e ".job_states.total == $(job_list_state_count all)" &&
354354
flux job stats | jq -e ".successful == $(job_list_state_count completed)" &&
355-
flux job stats | jq -e ".failed == $(job_list_state_count failed canceled timeout)" &&
355+
flux job stats | jq -e ".failed == $(job_list_state_count failed)" &&
356356
flux job stats | jq -e ".canceled == $(job_list_state_count canceled)" &&
357357
flux job stats | jq -e ".timeout == $(job_list_state_count timeout)" &&
358358
flux job stats | jq -e ".inactive_purged == 0" &&
@@ -395,8 +395,8 @@ test_expect_success 'job-list: list successfully reconstructed' '
395395
test_cmp before_reload.out after_reload.out
396396
'
397397

398-
# the failed and canceled checks may look confusing. We canceled all active jobs
399-
# right above here, so all those active jobs became failed / canceled as a result
398+
# The canceled checks may look confusing. We canceled all active jobs
399+
# right above here, so all those active jobs became canceled as a result
400400
test_expect_success 'job stats lists jobs in correct state (all inactive)' '
401401
flux job stats | jq -e ".job_states.depend == 0" &&
402402
flux job stats | jq -e ".job_states.priority == 0" &&
@@ -406,7 +406,7 @@ test_expect_success 'job stats lists jobs in correct state (all inactive)' '
406406
flux job stats | jq -e ".job_states.inactive == $(job_list_state_count all)" &&
407407
flux job stats | jq -e ".job_states.total == $(job_list_state_count all)" &&
408408
flux job stats | jq -e ".successful == $(job_list_state_count completed)" &&
409-
flux job stats | jq -e ".failed == $(job_list_state_count active failed canceled timeout)" &&
409+
flux job stats | jq -e ".failed == $(job_list_state_count failed)" &&
410410
flux job stats | jq -e ".canceled == $(job_list_state_count active canceled)" &&
411411
flux job stats | jq -e ".timeout == $(job_list_state_count timeout)" &&
412412
flux job stats | jq -e ".inactive_purged == 0" &&

0 commit comments

Comments
 (0)