Skip to content

Commit 6462184

Browse files
authored
Merge pull request #5656 from chu11/issue4186_flux_jobs_filter_nodes
job-list: support "hostlist" constraint to allow jobs to be filtered by nodes
2 parents 67fc412 + d381bf3 commit 6462184

File tree

8 files changed

+647
-24
lines changed

8 files changed

+647
-24
lines changed

src/modules/job-list/job_data.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ void job_destroy (void *data)
3535
int save_errno = errno;
3636
free (job->ranks);
3737
free (job->nodelist);
38+
hostlist_destroy (job->nodelist_hl);
3839
json_decref (job->annotations);
3940
grudgeset_destroy (job->dependencies);
4041
json_decref (job->jobspec);

src/modules/job-list/job_data.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <flux/core.h>
1515
#include <jansson.h>
1616

17+
#include "src/common/libhostlist/hostlist.h"
1718
#include "src/common/libutil/grudgeset.h"
1819
#include "src/common/libczmqcontainers/czmq_containers.h"
1920

@@ -54,6 +55,7 @@ struct job {
5455
int nnodes;
5556
char *ranks;
5657
char *nodelist;
58+
struct hostlist *nodelist_hl; /* cache of nodelist in hl form */
5759
double expiration;
5860
int wait_status;
5961
bool success;

src/modules/job-list/match.c

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@ typedef enum {
5151
MATCH_LESS_THAN = 4,
5252
} match_comparison_t;
5353

54+
#define MIN_MATCH_HOSTLIST 1024
55+
5456
struct timestamp_value {
5557
double t_value;
5658
match_timestamp_type_t t_type;
@@ -428,6 +430,91 @@ static struct list_constraint *create_results_constraint (struct match_ctx *mctx
428430
errp);
429431
}
430432

433+
static int match_hostlist (struct list_constraint *c,
434+
const struct job *job,
435+
unsigned int *comparisons,
436+
flux_error_t *errp)
437+
{
438+
struct hostlist *hl = zlistx_first (c->values);
439+
const char *host;
440+
441+
/* nodelist may not exist if job never ran */
442+
if (!job->nodelist)
443+
return 0;
444+
if (!job->nodelist_hl) {
445+
/* hack to remove const */
446+
struct job *jobtmp = (struct job *)job;
447+
if (!(jobtmp->nodelist_hl = hostlist_decode (job->nodelist)))
448+
return 0;
449+
}
450+
host = hostlist_first (hl);
451+
while (host) {
452+
if (inc_check_comparison (c->mctx, comparisons, errp) < 0)
453+
return -1;
454+
if (hostlist_find (job->nodelist_hl, host) >= 0)
455+
return 1;
456+
host = hostlist_next (hl);
457+
}
458+
return 0;
459+
}
460+
461+
/* Item destructor in the form used with zlistx_set_destructor():
 * destroy the hostlist stored in *item and clear the slot.
 * A NULL item pointer is ignored.
 */
static void wrap_hostlist_destroy (void **item)
{
    if (!item)
        return;
    hostlist_destroy (*item);
    *item = NULL;
}
470+
471+
static struct list_constraint *create_hostlist_constraint (
472+
struct match_ctx *mctx,
473+
json_t *values,
474+
flux_error_t *errp)
475+
{
476+
struct list_constraint *c;
477+
struct hostlist *hl = NULL;
478+
json_t *entry;
479+
size_t index;
480+
481+
if (!(c = list_constraint_new (mctx,
482+
match_hostlist,
483+
wrap_hostlist_destroy,
484+
errp)))
485+
return NULL;
486+
/* Create a single hostlist if user specifies multiple nodes or
487+
* RFC29 hostlist range */
488+
if (!(hl = hostlist_create ())) {
489+
errprintf (errp, "failed to create hostlist structure");
490+
goto error;
491+
}
492+
json_array_foreach (values, index, entry) {
493+
if (!json_is_string (entry)) {
494+
errprintf (errp, "host value must be a string");
495+
goto error;
496+
}
497+
if (hostlist_append (hl, json_string_value (entry)) <= 0) {
498+
errprintf (errp, "host value not in valid Hostlist format");
499+
goto error;
500+
}
501+
}
502+
if (hostlist_count (hl) > mctx->max_hostlist) {
503+
errprintf (errp, "too many hosts specified");
504+
goto error;
505+
}
506+
if (!zlistx_add_end (c->values, hl)) {
507+
errprintf (errp, "failed to append hostlist structure");
508+
hostlist_destroy (hl);
509+
goto error;
510+
}
511+
return c;
512+
error:
513+
hostlist_destroy (hl);
514+
list_constraint_destroy (c);
515+
return NULL;
516+
}
517+
431518
static int match_timestamp (struct list_constraint *c,
432519
const struct job *job,
433520
unsigned int *comparisons,
@@ -665,6 +752,8 @@ struct list_constraint *list_constraint_create (struct match_ctx *mctx,
665752
return create_states_constraint (mctx, values, errp);
666753
else if (streq (op, "results"))
667754
return create_results_constraint (mctx, values, errp);
755+
else if (streq (op, "hostlist"))
756+
return create_hostlist_constraint (mctx, values, errp);
668757
else if (streq (op, "t_submit")
669758
|| streq (op, "t_depend")
670759
|| streq (op, "t_run")
@@ -743,6 +832,30 @@ struct match_ctx *match_ctx_create (flux_t *h)
743832
goto error;
744833
}
745834

835+
if (flux_get_size (mctx->h, &mctx->max_hostlist) < 0) {
836+
flux_log_error (h, "failed to get instance size");
837+
goto error;
838+
}
839+
840+
/* Notes:
841+
*
842+
* We do not want a hostlist constraint match to DoS this module.
843+
* So we want to configure a "max" amount of hosts that can exist
844+
* within a hostlist constraint.
845+
*
846+
* Under normal operating conditions, the number of brokers should
847+
* represent the most likely maximum. But there are some corner
848+
* cases. For example, the instance gets reconfigured to be
849+
* smaller, which is not an uncommon thing to do towards a
850+
* cluster's end of life, when hardware is beginning to die.
851+
*
852+
* So we configure the following compromise. If the number of
853+
* brokers is below our defined minimum MIN_MATCH_HOSTLIST, we'll
854+
* allow max_hostlist to be increased to this number.
855+
*/
856+
if (mctx->max_hostlist < MIN_MATCH_HOSTLIST)
857+
mctx->max_hostlist = MIN_MATCH_HOSTLIST;
858+
746859
return mctx;
747860

748861
error:

src/modules/job-list/match.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
/* Context consulted when list constraints are created and evaluated. */
struct match_ctx {
2424
flux_t *h; /* flux handle; match_ctx_create() passes it to flux_get_size() */
2525
uint64_t max_comparisons; /* NOTE(review): presumably the limit checked by inc_check_comparison() - confirm */
26+
uint32_t max_hostlist; /* most hosts allowed in one "hostlist" constraint */
2627
};
2728

2829
struct match_ctx *match_ctx_create (flux_t *h);

src/modules/job-list/state_match.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,8 @@ struct state_constraint *state_constraint_create (json_t *constraint, flux_error
366366
}
367367
if (streq (op, "userid")
368368
|| streq (op, "name")
369-
|| streq (op, "queue"))
369+
|| streq (op, "queue")
370+
|| streq (op, "hostlist"))
370371
return state_constraint_new (match_maybe, NULL, errp);
371372
else if (streq (op, "results"))
372373
return state_constraint_new (match_result, NULL, errp);

0 commit comments

Comments
 (0)