Skip to content

Commit 99ecf1d

Browse files
authored
Merge pull request #4617 from chu11/job_list_duration
job-list: support retrieval of job duration
2 parents 086372b + 3ff24d3 commit 99ecf1d

File tree

10 files changed

+146
-32
lines changed

10 files changed

+146
-32
lines changed

doc/man1/flux-jobs.rst

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -186,12 +186,14 @@ the following conversion flags are supported by *flux-jobs*:
186186
datetime of epoch if timestamp field does not exist.
187187

188188
**!F**
189-
convert a duration in floating point seconds to Flux Standard Duration (FSD).
190-
string. Defaults to empty string if duration field does not exist.
189+
convert a time duration in floating point seconds to Flux Standard
190+
Duration (FSD) string (e.g. *{runtime!F}*). Defaults to empty string if
191+
field does not exist.
191192

192193
**!H**
193-
convert a duration to hours:minutes:seconds form (e.g. *{runtime!H}*).
194-
Defaults to empty string if duration field does not exist.
194+
convert a time duration in floating point seconds to
195+
hours:minutes:seconds form (e.g. *{runtime!H}*). Defaults to empty
196+
string if time duration field does not exist.
195197

196198
**!P**
197199
convert a floating point number into a percentage fitting in 5 characters
@@ -264,6 +266,9 @@ The field names that can be specified are:
264266
**ntasks**
265267
job task count
266268

269+
**duration**
270+
job duration (time limit requested at submission) in seconds
271+
267272
**nnodes**
268273
job node count (if job ran / is running), empty string otherwise
269274

src/bindings/python/flux/job/info.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,7 @@ class JobInfo:
200200
"t_run": 0.0,
201201
"t_cleanup": 0.0,
202202
"t_inactive": 0.0,
203+
"duration": 0.0,
203204
"expiration": 0.0,
204205
"name": "",
205206
"queue": "",
@@ -506,6 +507,7 @@ def get_field(self, field_name, args, kwargs):
506507
"name": "NAME",
507508
"queue": "QUEUE",
508509
"ntasks": "NTASKS",
510+
"duration": "DURATION",
509511
"nnodes": "NNODES",
510512
"expiration": "EXPIRATION",
511513
"t_remaining": "T_REMAINING",

src/cmd/flux-jobs.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,7 @@ def fetch_jobs_flux(args, fields, flux_handle=None):
155155
"name": ("name",),
156156
"queue": ("queue",),
157157
"ntasks": ("ntasks",),
158+
"duration": ("duration",),
158159
"nnodes": ("nnodes",),
159160
"ranks": ("ranks",),
160161
"nodelist": ("nodelist",),

src/modules/job-list/job-list.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
static const char *attrs[] = {
2525
"userid", "urgency", "priority", "t_submit",
2626
"t_depend", "t_run", "t_cleanup", "t_inactive",
27-
"state", "name", "queue", "ntasks", "nnodes",
27+
"state", "name", "queue", "ntasks", "duration", "nnodes",
2828
"ranks", "nodelist", "success", "exception_occurred",
2929
"exception_type", "exception_severity",
3030
"exception_note", "result", "expiration",

src/modules/job-list/job_data.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ struct job *job_create (struct list_ctx *ctx, flux_jobid_t id)
5656
job->priority = FLUX_JOB_PRIORITY_MIN;
5757
job->state = FLUX_JOB_STATE_NEW;
5858
job->ntasks = -1;
59+
job->duration = -1.0;
5960
job->nnodes = -1;
6061
job->expiration = -1.0;
6162
job->wait_status = -1;
@@ -153,6 +154,15 @@ static int parse_jobspec_nnodes (struct job *job, struct jj_counts *jj)
153154
return 0;
154155
}
155156

157+
static int parse_jobspec_duration (struct job *job, struct jj_counts *jj)
158+
{
159+
/* N.B. Jobspec V1 requires duration to be set, so duration will
160+
* always be >= 0 from libjj.
161+
*/
162+
job->duration = jj->duration;
163+
return 0;
164+
}
165+
156166
static int parse_per_resource (struct job *job,
157167
const char **type,
158168
int *count)
@@ -288,6 +298,9 @@ int job_parse_jobspec (struct job *job, const char *s)
288298
if (parse_jobspec_ntasks (job, &jj) < 0)
289299
goto nonfatal_error;
290300

301+
if (parse_jobspec_duration (job, &jj) < 0)
302+
goto nonfatal_error;
303+
291304
/* nonfatal error - jobspec illegal, but we'll continue on. job
292305
* listing will return whatever data is available */
293306
nonfatal_error:

src/modules/job-list/job_data.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ struct job {
4747
const char *name;
4848
const char *queue;
4949
int ntasks;
50+
double duration;
5051
int nnodes;
5152
char *ranks;
5253
char *nodelist;

src/modules/job-list/job_util.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,12 @@ static int store_attr (struct job *job,
9898
return 0;
9999
val = json_integer (job->ntasks);
100100
}
101+
else if (!strcmp (attr, "duration")) {
102+
/* job->duration potentially < 0 if jobspec invalid */
103+
if (job->duration < 0)
104+
return 0;
105+
val = json_real (job->duration);
106+
}
101107
else if (!strcmp (attr, "nnodes")) {
102108
/* job->nnodes < 0 if not set yet or R invalid, may be set in
103109
* DEPEND or RUN state */

t/python/t0010-job.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -475,6 +475,7 @@ def test_25_job_list_attrs(self):
475475
"name",
476476
"queue",
477477
"ntasks",
478+
"duration",
478479
"nnodes",
479480
"ranks",
480481
"nodelist",

t/t2260-job-list.t

Lines changed: 97 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -797,28 +797,75 @@ test_expect_success HAVE_JQ 'flux job list lists nnodes for pending jobs correct
797797
flux queue start
798798
'
799799

800+
test_expect_success 'reload the job-list module' '
801+
flux module reload job-list
802+
'
803+
804+
test_expect_success HAVE_JQ 'verify nnodes/ranks/nodelist preserved across restart' '
805+
jobid1=`cat nodecount1.id` &&
806+
jobid2=`cat nodecount2.id` &&
807+
jobid3=`cat nodecount3.id` &&
808+
jobid4=`cat nodecount4.id` &&
809+
obj=$(flux job list -s inactive | grep ${jobid1}) &&
810+
echo $obj | jq -e ".nnodes == 1" &&
811+
echo $obj | jq -e ".ranks == \"0\"" &&
812+
nodes=`flux job info ${jobid1} R | flux R decode --nodelist` &&
813+
echo $obj | jq -e ".nodelist == \"${nodes}\"" &&
814+
obj=$(flux job list -s inactive | grep ${jobid2}) &&
815+
echo $obj | jq -e ".nnodes == 1" &&
816+
echo $obj | jq -e ".ranks == \"0\"" &&
817+
nodes=`flux job info ${jobid2} R | flux R decode --nodelist` &&
818+
echo $obj | jq -e ".nodelist == \"${nodes}\"" &&
819+
obj=$(flux job list -s inactive | grep ${jobid3}) &&
820+
echo $obj | jq -e ".nnodes == 2" &&
821+
echo $obj | jq -e ".ranks == \"[0-1]\"" &&
822+
nodes=`flux job info ${jobid3} R | flux R decode --nodelist` &&
823+
echo $obj | jq -e ".nodelist == \"${nodes}\"" &&
824+
obj=$(flux job list -s inactive | grep ${jobid4}) &&
825+
echo $obj | jq -e ".nnodes == 3" &&
826+
echo $obj | jq -e ".ranks == \"[0-2]\"" &&
827+
nodes=`flux job info ${jobid4} R | flux R decode --nodelist` &&
828+
echo $obj | jq -e ".nodelist == \"${nodes}\""
829+
'
830+
800831
#
801832
# job success
802833
#
803834

804835
test_expect_success HAVE_JQ 'flux job list outputs success correctly (true)' '
805836
jobid=`flux mini submit --wait hostname | flux job id` &&
837+
echo $jobid > success1.id &&
806838
wait_jobid_state $jobid inactive &&
807839
obj=$(flux job list -s inactive | grep $jobid) &&
808840
echo $obj | jq -e ".success == true"
809841
'
810842

811843
test_expect_success HAVE_JQ 'flux job list outputs success correctly (false)' '
812844
jobid=`flux mini submit --wait nosuchcommand | flux job id` &&
845+
echo $jobid > success2.id &&
813846
wait_jobid_state $jobid inactive &&
814847
obj=$(flux job list -s inactive | grep $jobid) &&
815848
echo $obj | jq -e ".success == false"
816849
'
817850

851+
test_expect_success 'reload the job-list module' '
852+
flux module reload job-list
853+
'
854+
855+
test_expect_success HAVE_JQ 'verify success preserved across restart' '
856+
jobid1=`cat success1.id` &&
857+
jobid2=`cat success2.id` &&
858+
obj=$(flux job list -s inactive | grep ${jobid1}) &&
859+
echo $obj | jq -e ".success == true" &&
860+
obj=$(flux job list -s inactive | grep ${jobid2}) &&
861+
echo $obj | jq -e ".success == false"
862+
'
863+
818864
# job exceptions
819865

820866
test_expect_success HAVE_JQ 'flux job list outputs exceptions correctly (no exception)' '
821867
jobid=`flux mini submit --wait hostname | flux job id` &&
868+
echo $jobid > exceptions1.id &&
822869
wait_jobid_state $jobid inactive &&
823870
obj=$(flux job list -s inactive | grep $jobid) &&
824871
echo $obj | jq -e ".exception_occurred == false" &&
@@ -829,6 +876,7 @@ test_expect_success HAVE_JQ 'flux job list outputs exceptions correctly (no exce
829876

830877
test_expect_success HAVE_JQ 'flux job list outputs exceptions correctly (exception)' '
831878
jobid=`flux mini submit --wait nosuchcommand | flux job id` &&
879+
echo $jobid > exceptions2.id &&
832880
wait_jobid_state $jobid inactive &&
833881
obj=$(flux job list -s inactive | grep $jobid) &&
834882
echo $obj | jq -e ".exception_occurred == true" &&
@@ -837,10 +885,31 @@ test_expect_success HAVE_JQ 'flux job list outputs exceptions correctly (excepti
837885
echo $obj | jq .exception_note | grep "No such file or directory"
838886
'
839887

888+
test_expect_success 'reload the job-list module' '
889+
flux module reload job-list
890+
'
891+
892+
test_expect_success HAVE_JQ 'verify exception attributes preserved across restart' '
893+
jobid1=`cat exceptions1.id` &&
894+
jobid2=`cat exceptions2.id` &&
895+
obj=$(flux job list -s inactive | grep ${jobid1}) &&
896+
echo $obj | jq -e ".success == true" &&
897+
echo $obj | jq -e ".exception_occurred == false" &&
898+
echo $obj | jq -e ".exception_severity == null" &&
899+
echo $obj | jq -e ".exception_type == null" &&
900+
echo $obj | jq -e ".exception_note == null" &&
901+
obj=$(flux job list -s inactive | grep ${jobid2}) &&
902+
echo $obj | jq -e ".exception_occurred == true" &&
903+
echo $obj | jq -e ".exception_severity == 0" &&
904+
echo $obj | jq -e ".exception_type == \"exec\"" &&
905+
echo $obj | jq .exception_note | grep "No such file or directory"
906+
'
907+
840908
# expiration time
841909

842910
test_expect_success HAVE_JQ 'flux job list outputs expiration time when set' '
843-
jobid=$(flux mini submit -t 30s sleep 1000 | flux job id) &&
911+
jobid=$(flux mini submit -t 500s sleep 1000 | flux job id) &&
912+
echo $jobid > expiration.id &&
844913
fj_wait_event $jobid start &&
845914
flux job list | grep $jobid > expiration.json &&
846915
test_debug "cat expiration.json" &&
@@ -852,31 +921,32 @@ test_expect_success 'reload the job-list module' '
852921
flux module reload job-list
853922
'
854923

855-
test_expect_success HAVE_JQ 'verify nnodes/ranks/nodelist preserved across restart' '
856-
jobid1=`cat nodecount1.id` &&
857-
jobid2=`cat nodecount2.id` &&
858-
jobid3=`cat nodecount3.id` &&
859-
jobid4=`cat nodecount4.id` &&
860-
obj=$(flux job list -s inactive | grep ${jobid1}) &&
861-
echo $obj | jq -e ".nnodes == 1" &&
862-
echo $obj | jq -e ".ranks == \"0\"" &&
863-
nodes=`flux job info ${jobid1} R | flux R decode --nodelist` &&
864-
echo $obj | jq -e ".nodelist == \"${nodes}\"" &&
865-
obj=$(flux job list -s inactive | grep ${jobid2}) &&
866-
echo $obj | jq -e ".nnodes == 1" &&
867-
echo $obj | jq -e ".ranks == \"0\"" &&
868-
nodes=`flux job info ${jobid2} R | flux R decode --nodelist` &&
869-
echo $obj | jq -e ".nodelist == \"${nodes}\"" &&
870-
obj=$(flux job list -s inactive | grep ${jobid3}) &&
871-
echo $obj | jq -e ".nnodes == 2" &&
872-
echo $obj | jq -e ".ranks == \"[0-1]\"" &&
873-
nodes=`flux job info ${jobid3} R | flux R decode --nodelist` &&
874-
echo $obj | jq -e ".nodelist == \"${nodes}\"" &&
875-
obj=$(flux job list -s inactive | grep ${jobid4}) &&
876-
echo $obj | jq -e ".nnodes == 3" &&
877-
echo $obj | jq -e ".ranks == \"[0-2]\"" &&
878-
nodes=`flux job info ${jobid4} R | flux R decode --nodelist` &&
879-
echo $obj | jq -e ".nodelist == \"${nodes}\""
924+
test_expect_success HAVE_JQ 'verify expiration preserved across restart' '
925+
jobid=`cat expiration.id` &&
926+
flux job list -s inactive | grep ${jobid} > expiration2.json &&
927+
jq -e ".expiration > now" < expiration2.json
928+
'
929+
930+
# duration time
931+
932+
test_expect_success HAVE_JQ 'flux job list outputs duration time when set' '
933+
jobid=$(flux mini submit -t 60m sleep 1000 | flux job id) &&
934+
echo $jobid > duration.id &&
935+
fj_wait_event $jobid start &&
936+
flux job list | grep $jobid > duration.json &&
937+
test_debug "cat duration.json" &&
938+
jq -e ".duration == 3600.0" < duration.json &&
939+
flux job cancel $jobid
940+
'
941+
942+
test_expect_success 'reload the job-list module' '
943+
flux module reload job-list
944+
'
945+
946+
test_expect_success HAVE_JQ 'verify duration preserved across restart' '
947+
jobid=`cat duration.id` &&
948+
flux job list -s inactive | grep ${jobid} > duration2.json &&
949+
jq -e ".duration == 3600.0" < duration2.json
880950
'
881951

882952
# all job attributes
@@ -1004,6 +1074,7 @@ t_inactive \
10041074
state \
10051075
name \
10061076
ntasks \
1077+
duration \
10071078
nnodes \
10081079
ranks \
10091080
nodelist \

t/t2800-jobs-cmd.t

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,19 @@ test_expect_success 'flux-jobs --format={ntasks},{nnodes},{nnodes:h} works' '
537537
test_cmp nodecountI.exp nodecountI.out
538538
'
539539

540+
test_expect_success 'flux-jobs --format={duration},{duration:h},{duration!F},{duration!H},{duration!F:h},{duration!H:h} works' '
541+
fmt="{duration},{duration:h},{duration!F},{duration!H},{duration!F:h},{duration!H:h}" &&
542+
flux jobs --filter=pending,running -no "${fmt}" > durationPR.out &&
543+
for i in `seq 1 $(state_count sched run)`; do
544+
echo "300.0,300.0,5m,0:05:00,5m,0:05:00" >> durationPR.exp
545+
done &&
546+
test_cmp durationPR.exp durationPR.out &&
547+
flux jobs --filter=completed -no "${fmt}" > durationCD.out &&
548+
for i in `seq 1 $(state_count completed)`;
549+
do echo "0.0,-,0s,0:00:00,-,-" >> durationCD.exp
550+
done &&
551+
test_cmp durationCD.exp durationCD.out
552+
'
540553

541554
test_expect_success 'flux-jobs --format={runtime:0.3f} works' '
542555
flux jobs --filter=pending -no "{runtime:0.3f}" > runtime-dotP.out &&
@@ -657,7 +670,7 @@ test_expect_success 'flux jobs --format={t_cleanup/{in}active} works' '
657670
test $count -eq $(state_count inactive)
658671
'
659672

660-
test_expect_success 'flux-jobs --format={runtime},{runtime!F},{runtime!F:h},{runtime!H},{runtime!H:h} works' '
673+
test_expect_success 'flux-jobs --format={runtime},{runtime!F},{runtime!H},{runtime!F:h},{runtime!H:h} works' '
661674
fmt="{runtime},{runtime!F},{runtime!H},{runtime!F:h},{runtime!H:h}" &&
662675
flux jobs --filter=pending -no "${fmt}" > runtimeP.out &&
663676
for i in `seq 1 $(state_count sched)`; do
@@ -917,6 +930,7 @@ test_expect_success 'flux-jobs: header included with all custom formats' '
917930
name==NAME
918931
queue==QUEUE
919932
ntasks==NTASKS
933+
duration==DURATION
920934
nnodes==NNODES
921935
ranks==RANKS
922936
nodelist==NODELIST

0 commit comments

Comments
 (0)