Skip to content

Commit 01b4d87

Browse files
authored
Merge pull request #6541 from grondo/issue#6418
add `resource.rediscover` config key to force rediscovery of subinstance resources
2 parents e6b108f + 41dc3a3 commit 01b4d87

File tree

9 files changed

+114
-76
lines changed

9 files changed

+114
-76
lines changed

doc/man5/flux-config-resource.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,10 @@ noverify
7575
(optional) If true, disable the draining of nodes when there is a
7676
discrepancy between configured resources and HWLOC-probed resources.
7777

78+
rediscover
79+
(optional) If true, force rediscovery of resources using HWLOC, rather
80+
than using the R and HWLOC XML from the enclosing instance.
81+
7882
Note that updates to the resource table are ignored until the next Flux
7983
restart.
8084

src/modules/resource/inventory.c

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -519,7 +519,8 @@ static int lookup_R_fallback (struct inventory *inv, flux_jobid_t id)
519519
return rc;
520520
}
521521

522-
static int start_resource_watch (struct inventory *inv, bool no_resource_watch)
522+
static int start_resource_watch (struct inventory *inv,
523+
struct resource_config *config)
523524
{
524525
flux_t *h = inv->ctx->h;
525526
const char *jobid;
@@ -534,7 +535,7 @@ static int start_resource_watch (struct inventory *inv, bool no_resource_watch)
534535
* simulate start under an older instance that does not support this
535536
* RPC.
536537
*/
537-
if (no_resource_watch)
538+
if (config->no_update_watch)
538539
service = "job-info.update-watch-fake";
539540

540541
if (!(jobid = flux_attr_get (h, "jobid")))
@@ -594,7 +595,11 @@ static int start_resource_watch (struct inventory *inv, bool no_resource_watch)
594595
}
595596
flux_future_reset (f);
596597
inv->R_watch_f = f;
597-
if (R) { // R = NULL if no conversion possible (fall through to discovery)
598+
599+
/* If R == NULL (no conversion possible) or rediscover == true, then
600+
* we will fall through to dynamic discovery.
601+
*/
602+
if (R && !config->rediscover) {
598603
if (inventory_put (inv, R, "job-info") < 0)
599604
goto done;
600605
if (flux_future_then (f,
@@ -898,8 +903,7 @@ void inventory_destroy (struct inventory *inv)
898903
}
899904

900905
struct inventory *inventory_create (struct resource_ctx *ctx,
901-
json_t *conf_R,
902-
bool no_update_watch)
906+
struct resource_config *config)
903907
{
904908
struct inventory *inv;
905909
json_t *R = NULL;
@@ -909,14 +913,14 @@ struct inventory *inventory_create (struct resource_ctx *ctx,
909913
inv->ctx = ctx;
910914
if (flux_msg_handler_addvec (ctx->h, htab, inv, &inv->handlers) < 0)
911915
goto error;
912-
if (conf_R && convert_R_conf (ctx->h, conf_R, &R) < 0)
916+
if (config->R && convert_R_conf (ctx->h, config->R, &R) < 0)
913917
goto error;
914918
if (ctx->rank == 0) {
915919
if (R && inventory_put (inv, R, "configuration") < 0)
916920
goto error;
917921
if (!inv->R && get_from_kvs (inv, "resource.R") < 0)
918922
goto error;
919-
if (!inv->R && start_resource_watch (inv, no_update_watch) < 0)
923+
if (!inv->R && start_resource_watch (inv, config) < 0)
920924
goto error;
921925
}
922926
else {

src/modules/resource/inventory.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,11 @@
1212
#define _FLUX_RESOURCE_INVENTORY_H
1313

1414
/* Create resource inventory.
15-
* R is configured resource object, if any (ref taken).
15+
* config->R is configured resource object, if any (ref taken).
1616
* R is obtained from enclosing Flux instance or probed dynamically otherwise.
1717
*/
1818
struct inventory *inventory_create (struct resource_ctx *ctx,
19-
json_t *R,
20-
bool no_update_watch);
19+
struct resource_config *config);
2120
void inventory_destroy (struct inventory *inv);
2221

2322
/* Get resource object.

src/modules/resource/resource.c

Lines changed: 33 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,14 @@
5757
* no-update-watch = false
5858
* For testing purposes, simulate missing job-info.update-watch service
5959
* in parent instance by sending to an invalid service name.
60+
*
61+
* rediscover = false
62+
* Force rediscovery of local resources via hwloc. Do not fetch R or hwloc
63+
* XML from the enclosing instance.
6064
*/
6165
static int parse_config (struct resource_ctx *ctx,
6266
const flux_conf_t *conf,
63-
const char **excludep,
64-
json_t **R,
65-
bool *noverifyp,
66-
bool *norestrictp,
67-
bool *no_update_watchp,
67+
struct resource_config *rconfig,
6868
flux_error_t *errp)
6969
{
7070
flux_error_t error;
@@ -74,20 +74,22 @@ static int parse_config (struct resource_ctx *ctx,
7474
int noverify = 0;
7575
int norestrict = 0;
7676
int no_update_watch = 0;
77+
int rediscover = 0;
7778
json_t *o = NULL;
7879
json_t *config = NULL;
7980

8081
if (flux_conf_unpack (conf,
8182
&error,
82-
"{s?{s?s s?s s?o s?s s?b s?b s?b !}}",
83+
"{s?{s?s s?s s?o s?s s?b s?b s?b s?b !}}",
8384
"resource",
8485
"path", &path,
8586
"scheduling", &scheduling_path,
8687
"config", &config,
8788
"exclude", &exclude,
8889
"norestrict", &norestrict,
8990
"noverify", &noverify,
90-
"no-update-watch", &no_update_watch) < 0) {
91+
"no-update-watch", &no_update_watch,
92+
"rediscover", &rediscover) < 0) {
9193
errprintf (errp,
9294
"error parsing [resource] configuration: %s",
9395
error.text);
@@ -143,16 +145,14 @@ static int parse_config (struct resource_ctx *ctx,
143145
return -1;
144146
}
145147
}
146-
if (excludep)
147-
*excludep = exclude;
148-
if (noverifyp)
149-
*noverifyp = noverify ? true : false;
150-
if (norestrictp)
151-
*norestrictp = norestrict ? true : false;
152-
if (no_update_watchp)
153-
*no_update_watchp = no_update_watch ? true : false;
154-
if (R)
155-
*R = o;
148+
if (rconfig) {
149+
rconfig->exclude_idset = exclude;
150+
rconfig->noverify = noverify ? true : false;
151+
rconfig->norestrict = norestrict ? true : false;
152+
rconfig->no_update_watch = no_update_watch ? true : false;
153+
rconfig->rediscover = rediscover ? true : false;
154+
rconfig->R = o;
155+
}
156156
else
157157
json_decref (o);
158158
return 0;
@@ -174,7 +174,7 @@ static void config_reload_cb (flux_t *h,
174174

175175
if (flux_conf_reload_decode (msg, &conf) < 0)
176176
goto error;
177-
if (parse_config (ctx, conf, NULL, NULL, NULL, NULL, NULL, &error) < 0) {
177+
if (parse_config (ctx, conf, NULL, &error) < 0) {
178178
errstr = error.text;
179179
goto error;
180180
}
@@ -305,22 +305,19 @@ static int reload_eventlog (flux_t *h, json_t **eventlog)
305305
return -1;
306306
}
307307

308-
int parse_args (flux_t *h,
309-
int argc,
308+
int parse_args (flux_t *h, int argc,
310309
char **argv,
311-
bool *monitor_force_up,
312-
bool *noverify,
313-
bool *no_update_watch)
310+
struct resource_config *config)
314311
{
315312
int i;
316313
for (i = 0; i < argc; i++) {
317314
/* Test option to force all ranks to be marked online in the initial
318315
* 'restart' event posted to resource.eventlog.
319316
*/
320317
if (streq (argv[i], "monitor-force-up"))
321-
*monitor_force_up = true;
318+
config->monitor_force_up = true;
322319
else if (streq (argv[i], "noverify"))
323-
*noverify = true;
320+
config->noverify = true;
324321
else {
325322
flux_log (h, LOG_ERR, "unknown option: %s", argv[i]);
326323
errno = EINVAL;
@@ -335,52 +332,31 @@ int mod_main (flux_t *h, int argc, char **argv)
335332
{
336333
struct resource_ctx *ctx;
337334
flux_error_t error;
338-
const char *exclude_idset;
339335
json_t *eventlog = NULL;
340-
bool monitor_force_up = false;
341-
bool noverify = false;
342-
bool norestrict = false;
343-
bool no_update_watch = false;
344-
json_t *R_from_config;
336+
struct resource_config config = {0};
345337

346338
if (!(ctx = resource_ctx_create (h)))
347339
goto error;
348340
if (flux_get_size (h, &ctx->size) < 0)
349341
goto error;
350342
if (flux_get_rank (h, &ctx->rank) < 0)
351343
goto error;
352-
if (parse_config (ctx,
353-
flux_get_conf (h),
354-
&exclude_idset,
355-
&R_from_config,
356-
&noverify,
357-
&norestrict,
358-
&no_update_watch,
359-
&error) < 0) {
344+
if (parse_config (ctx, flux_get_conf (h), &config, &error) < 0) {
360345
flux_log (h, LOG_ERR, "%s", error.text);
361346
goto error;
362347
}
363-
if (parse_args (h,
364-
argc,
365-
argv,
366-
&monitor_force_up,
367-
&noverify,
368-
&no_update_watch) < 0)
348+
if (parse_args (h, argc, argv, &config) < 0)
369349
goto error;
370350
if (flux_attr_get (ctx->h, "broker.recovery-mode"))
371-
noverify = true;
351+
config.noverify = true;
372352

373353
/* Note: Order of creation of resource subsystems is important.
374354
* Create inventory on all ranks first, since it is required by
375355
* the exclude and drain subsystems on rank 0.
376356
*/
377-
if (!(ctx->inventory = inventory_create (ctx,
378-
R_from_config,
379-
no_update_watch)))
357+
if (!(ctx->inventory = inventory_create (ctx, &config)))
380358
goto error;
381-
/* Done with R_from_config now, so free it.
382-
*/
383-
json_decref (R_from_config);
359+
384360
if (ctx->rank == 0) {
385361
/* Create reslog and reload eventlog before initializing
386362
* acquire, exclude, and drain subsystems, since these
@@ -403,19 +379,19 @@ int mod_main (flux_t *h, int argc, char **argv)
403379
* the exclude idset to ensure drained ranks that are now
404380
* excluded are ignored.
405381
*/
406-
if (!(ctx->exclude = exclude_create (ctx, exclude_idset)))
382+
if (!(ctx->exclude = exclude_create (ctx, config.exclude_idset)))
407383
goto error;
408384
if (!(ctx->drain = drain_create (ctx, eventlog)))
409385
goto error;
410386
}
411387
/* topology is initialized after exclude/drain etc since this
412388
* rank may attempt to drain itself due to a topology mismatch.
413389
*/
414-
if (!(ctx->topology = topo_create (ctx, noverify, norestrict)))
390+
if (!(ctx->topology = topo_create (ctx, &config)))
415391
goto error;
416392
if (!(ctx->monitor = monitor_create (ctx,
417393
inventory_get_size (ctx->inventory),
418-
monitor_force_up)))
394+
config.monitor_force_up)))
419395
goto error;
420396
if (!(ctx->status = status_create (ctx)))
421397
goto error;
@@ -427,10 +403,12 @@ int mod_main (flux_t *h, int argc, char **argv)
427403
}
428404
resource_ctx_destroy (ctx);
429405
json_decref (eventlog);
406+
json_decref (config.R);
430407
return 0;
431408
error:
432409
resource_ctx_destroy (ctx);
433410
ERRNO_SAFE_WRAP (json_decref, eventlog);
411+
ERRNO_SAFE_WRAP (json_decref, config.R);
434412
return -1;
435413
}
436414

src/modules/resource/resource.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,16 @@
1111
#ifndef _FLUX_RESOURCE_H
1212
#define _FLUX_RESOURCE_H
1313

14+
struct resource_config {
15+
json_t *R;
16+
const char *exclude_idset;
17+
bool rediscover;
18+
bool noverify;
19+
bool norestrict;
20+
bool no_update_watch;
21+
bool monitor_force_up;
22+
};
23+
1424
struct resource_ctx {
1525
flux_t *h;
1626
flux_msg_handler_t **handlers;

src/modules/resource/topo.c

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -275,22 +275,24 @@ void topo_destroy (struct topo *topo)
275275
}
276276
}
277277

278-
static char *topo_get_local_xml (struct resource_ctx *ctx, bool no_restrict)
278+
static char *topo_get_local_xml (struct resource_ctx *ctx,
279+
struct resource_config *config)
279280
{
280281
flux_t *parent_h;
281282
flux_future_t *f = NULL;
282283
char *result = NULL;
283284
const char *xml;
284285

285286
errno = 0;
286-
if (!(parent_h = resource_parent_handle_open (ctx))
287+
if (config->rediscover
288+
|| !(parent_h = resource_parent_handle_open (ctx))
287289
|| !(f = flux_rpc (parent_h,
288290
"resource.topo-get",
289291
NULL,
290292
FLUX_NODEID_ANY,
291293
0))
292294
|| flux_rpc_get (f, &xml) < 0) {
293-
rhwloc_flags_t flags = no_restrict ? RHWLOC_NO_RESTRICT : 0;
295+
rhwloc_flags_t flags = config->norestrict ? RHWLOC_NO_RESTRICT : 0;
294296
/* ENOENT just means there is no parent instance.
295297
* No need for an error.
296298
*/
@@ -305,8 +307,8 @@ static char *topo_get_local_xml (struct resource_ctx *ctx, bool no_restrict)
305307
flux_log (ctx->h,
306308
LOG_INFO,
307309
"retrieved local hwloc XML from parent (norestrict=%s)",
308-
no_restrict ? "true" : "false");
309-
if (no_restrict) {
310+
config->norestrict ? "true" : "false");
311+
if (config->norestrict) {
310312
result = strdup (xml);
311313
goto out;
312314
}
@@ -320,16 +322,15 @@ static char *topo_get_local_xml (struct resource_ctx *ctx, bool no_restrict)
320322
}
321323

322324
struct topo *topo_create (struct resource_ctx *ctx,
323-
bool no_verify,
324-
bool no_restrict)
325+
struct resource_config *config)
325326
{
326327
struct topo *topo;
327328
json_t *R;
328329

329330
if (!(topo = calloc (1, sizeof (*topo))))
330331
return NULL;
331332
topo->ctx = ctx;
332-
if (!(topo->xml = topo_get_local_xml (ctx, no_restrict))) {
333+
if (!(topo->xml = topo_get_local_xml (ctx, config))) {
333334
flux_log (ctx->h, LOG_ERR, "error loading hwloc topology");
334335
goto error;
335336
}
@@ -345,7 +346,7 @@ struct topo *topo_create (struct resource_ctx *ctx,
345346

346347
if (method && streq (method, "job-info"))
347348
nodrain = true;
348-
if (!no_verify && topo_verify (topo, R, nodrain) < 0)
349+
if (!config->noverify && topo_verify (topo, R, nodrain) < 0)
349350
goto error;
350351
}
351352
/* Reduce topo to rank 0 unconditionally in case it is needed.

src/modules/resource/topo.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,7 @@
1212
#define _FLUX_RESOURCE_TOPO_H
1313

1414
struct topo *topo_create (struct resource_ctx *ctx,
15-
bool no_verify,
16-
bool no_restrict);
15+
struct resource_config *config);
1716
void topo_destroy (struct topo *topo);
1817

1918

t/Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ TESTSCRIPTS = \
179179
t2313-resource-acquire.t \
180180
t2314-resource-monitor.t \
181181
t2315-resource-system.t \
182+
t2316-resource-rediscover.t \
182183
t2350-resource-list.t \
183184
t2351-resource-status-input.t \
184185
t2352-resource-cmd-config.t \

0 commit comments

Comments
 (0)