From 7ee4e69002b550e2b53f5a6a86f72bfb6b3fce39 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 18 May 2023 10:01:53 -0700 Subject: [PATCH 1/8] broker: ensure CURVE certificate has a name Problem: internally generated curve certs are not named, so overlay_cert_name() can return NULL, but a name is required when authorizing a cert. This API inconsistency results in extra code and confusion when implementing a new boot method. Use the rank as the name for internally generated certs. --- src/broker/overlay.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/broker/overlay.c b/src/broker/overlay.c index b53f419d1701..5fc1f6b8f7b0 100644 --- a/src/broker/overlay.c +++ b/src/broker/overlay.c @@ -297,6 +297,12 @@ int overlay_set_topology (struct overlay *ov, struct topology *topo) ov->size = topology_get_size (topo); ov->rank = topology_get_rank (topo); + if (!cert_meta_get (ov->cert, "name")) { + char val[16]; + snprintf (val, sizeof (val), "%lu", (unsigned long)ov->rank); + if (cert_meta_set (ov->cert, "name", val) < 0) + goto error; + } ov->child_count = child_count; if (ov->child_count > 0) { int i; From 2e9f9f19d00837c24ca195a31fe30bd29cba42ca Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Wed, 17 May 2023 08:50:52 -0700 Subject: [PATCH 2/8] broker: allow instance size > PMI bootstrap size Problem: there is no way to bootstrap a flux instance using PMI with ranks (initially) missing. Allow the 'size' broker attribute to be set on the command line. If set to a value greater than the PMI size, perform the PMI exchange as usual with the PMI size, but configure the overlay topology with the additional ranks. Since 'hostlist' is an immutable attribute that is expected to be set by the bootstrap implementation, set it to include placeholders for the ranks that haven't connected yet "extra[0-N]" so we get something other than "(null)" in the logs. --- src/broker/boot_pmi.c | 68 +++++++++++++++++++++++++++++++------------ src/broker/overlay.c | 1 + 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/src/broker/boot_pmi.c b/src/broker/boot_pmi.c index b9d77f8c4626..da6b6f03e412 100644 --- a/src/broker/boot_pmi.c +++ b/src/broker/boot_pmi.c @@ -75,17 +75,22 @@ static char *pmi_mapping_to_taskmap (const char *s) } /* Set broker.mapping attribute from enclosing instance taskmap. + * Skip setting the taskmap if extra_ranks is nonzero since the + * mapping of those ranks is unknown. N.B. when broker.mapping is missing, + * use_ipc() below will return false, a good thing since it is unknown + * whether ipc:// would work for the TBON wire-up of extra nodes. */ static int set_broker_mapping_attr (struct upmi *upmi, int size, - attr_t *attrs) + attr_t *attrs, + int extra_ranks) { char *val = NULL; int rc; if (size == 1) val = strdup ("[[0,1,1,1]]"); - else { + else if (extra_ranks == 0) { /* First attempt to get flux.taskmap, falling back to * PMI_process_mapping if this key is not available. */ @@ -163,7 +168,7 @@ static int format_bind_uri (char *buf, int bufsz, attr_t *attrs, int rank) return -1; } -static int set_hostlist_attr (attr_t *attrs, struct hostlist *hl) +static int set_hostlist_attr (attr_t *attrs,struct hostlist *hl) { const char *value; char *s; @@ -220,9 +225,10 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) int child_count; int *child_ranks = NULL; const char *uri; - int i; int upmi_flags = UPMI_LIBPMI_NOFLUX; const char *upmi_method; + const char *s; + int size; // N.B. overlay_create() sets the tbon.topo attribute if (attr_get (attrs, "tbon.topo", &topo_uri, NULL) < 0) { @@ -249,11 +255,23 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) log_err ("set_instance_level_attr"); goto error; } - if (set_broker_mapping_attr (upmi, info.size, attrs) < 0) { + /* Allow the PMI size to be overridden with a larger one so that + * additional ranks can be grafted on later. + */ + size = info.size; + if (attr_get (attrs, "size", &s, NULL) == 0) { + errno = 0; + size = strtoul (s, NULL, 10); + if (errno != 0 || size <= info.size) { + log_msg ("instance size may only be increased"); + goto error; + } + } + if (set_broker_mapping_attr (upmi, size, attrs, size - info.size) < 0) { log_err ("error setting broker.mapping attribute"); goto error; } - if (!(topo = topology_create (topo_uri, info.size, &error))) { + if (!(topo = topology_create (topo_uri, size, &error))) { log_msg ("error creating '%s' topology: %s", topo_uri, error.text); goto error; } @@ -269,16 +287,6 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) goto error; } - /* A size=1 instance has no peers, so skip the PMI exchange. - */ - if (info.size == 1) { - if (hostlist_append (hl, hostname) < 0) { - log_err ("hostlist_append"); - goto error; - } - goto done; - } - /* Enable ipv6 for maximum flexibility in address selection. */ overlay_set_ipv6 (overlay, 1); @@ -310,6 +318,16 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) goto error; } + /* If the PMI size is 1, then skip the PMI exchange entirely. + */ + if (info.size == 1) { + if (hostlist_append (hl, hostname) < 0) { + log_err ("hostlist_append"); + goto error; + } + goto done; + } + /* Each broker writes a "business card" consisting of hostname, * public key, and URI (empty string for leaf node). */ @@ -390,10 +408,13 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) /* Fetch the business card of children and inform overlay of public keys. */ - for (i = 0; i < child_count; i++) { + for (int i = 0; i < child_count; i++) { const char *peer_pubkey; int child_rank = child_ranks[i]; + if (child_rank >= info.size) + break; + if (snprintf (key, sizeof (key), "%d", child_rank) >= sizeof (key)) { log_msg ("pmi key string overflow"); goto error; @@ -428,7 +449,7 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) /* Fetch the business card of all ranks and build hostlist. * The hostlist is built independently (and in parallel) on all ranks. */ - for (i = 0; i < info.size; i++) { + for (int i = 0; i < info.size; i++) { const char *peer_hostname; if (snprintf (key, sizeof (key), "%d", i) >= sizeof (key)) { @@ -471,6 +492,17 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) } done: + /* If the instance size is greater than the PMI size, add placeholder + * names to the hostlist for the ranks that haven't joined yet. + */ + for (int i = info.size; i < size; i++) { + char buf[64]; + snprintf (buf, sizeof (buf), "extra%d", i - info.size); + if (hostlist_append (hl, buf) < 0) { + log_err ("hostlist_append"); + goto error; + } + } if (set_hostlist_attr (attrs, hl) < 0) { log_err ("setattr hostlist"); goto error; diff --git a/src/broker/overlay.c b/src/broker/overlay.c index 5fc1f6b8f7b0..0e4bf9ce0618 100644 --- a/src/broker/overlay.c +++ b/src/broker/overlay.c @@ -1590,6 +1590,7 @@ int overlay_register_attrs (struct overlay *overlay) overlay->rank, ATTR_IMMUTABLE) < 0) return -1; + (void)attr_delete (overlay->attrs, "size", true); if (attr_add_uint32 (overlay->attrs, "size", overlay->size, ATTR_IMMUTABLE) < 0) From b6328d90447535468066e34ce40a4bad7549fea8 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 11 May 2023 17:53:56 -0700 Subject: [PATCH 3/8] broker: refactor bootstrap block Problem: the code block that selects which boot method to use is not very clear. Simplify code block so that the default path is clear and adding a boot method won't increase complexity. --- src/broker/broker.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/broker/broker.c b/src/broker/broker.c index e417e4d6c659..ba762907c9d3 100644 --- a/src/broker/broker.c +++ b/src/broker/broker.c @@ -211,7 +211,6 @@ int main (int argc, char *argv[]) struct sigaction old_sigact_term; flux_msg_handler_t **handlers = NULL; const flux_conf_t *conf; - const char *method; flux_error_t error; setlocale (LC_ALL, ""); @@ -355,24 +354,23 @@ int main (int argc, char *argv[]) init_attrs_starttime (ctx.attrs, ctx.starttime); /* Execute broker network bootstrap. - * Default method is pmi. - * If [bootstrap] is defined in configuration, use static configuration. */ - if (attr_get (ctx.attrs, "broker.boot-method", &method, NULL) < 0) { + const char *boot_method; + if (attr_get (ctx.attrs, "broker.boot-method", &boot_method, NULL) < 0) { if (flux_conf_unpack (conf, NULL, "{s:{}}", "bootstrap") == 0) - method = "config"; + boot_method = "config"; else - method = NULL; + boot_method = NULL; } - if (!method || !streq (method, "config")) { - if (boot_pmi (ctx.overlay, ctx.attrs) < 0) { - log_msg ("bootstrap failed"); + if (boot_method && streq (boot_method, "config")) { + if (boot_config (ctx.h, ctx.overlay, ctx.attrs) < 0) { + log_msg ("boot-config failed"); goto cleanup; } } else { - if (boot_config (ctx.h, ctx.overlay, ctx.attrs) < 0) { - log_msg ("bootstrap failed"); + if (boot_pmi (ctx.overlay, ctx.attrs) < 0) { + log_msg ("boot-pmi failed"); goto cleanup; } } From 35ac7948d3332825cd3934f4858823edbaab835b Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 11 May 2023 17:58:21 -0700 Subject: [PATCH 4/8] broker: add flub bootstrap method Problem: there is no way to add brokers to an instance that has extra slots available. Add support for FLUB, the FLUx Bootstrap protocol, used when the broker is started with broker.boot-server= The bootstrap protocol consists of two RPCs: 1) overlay.flub-getinfo, which requests the allocation of an available rank from rank 0 of the instance that is being extended, and also retrieves the instance size and some broker attributes. 2) overlay.flub-kex, which exchanges public keys with the new rank's TBON parent and obtains the parent's TBON URI. Assumptions: - all ranks have the same topology configuration Limitations (for now): - hostnames will be logged as extra[0-N] - a broker rank cannot be re-allocated to a new broker - a broker cannot replace one that failed in a regular instance - dummy resources for the max size of the instance must be configured --- src/broker/Makefile.am | 2 + src/broker/boot_flub.c | 223 +++++++++++++++++++++++++++++++++++++++++ src/broker/boot_flub.h | 22 ++++ src/broker/broker.c | 9 ++ 4 files changed, 256 insertions(+) create mode 100644 src/broker/boot_flub.c create mode 100644 src/broker/boot_flub.h diff --git a/src/broker/Makefile.am b/src/broker/Makefile.am index 42c2aad866bc..9b21e120f2e9 100644 --- a/src/broker/Makefile.am +++ b/src/broker/Makefile.am @@ -53,6 +53,8 @@ libbroker_la_SOURCES = \ boot_config.c \ boot_pmi.h \ boot_pmi.c \ + boot_flub.h \ + boot_flub.c \ publisher.h \ publisher.c \ groups.h \ diff --git a/src/broker/boot_flub.c b/src/broker/boot_flub.c new file mode 100644 index 000000000000..1d716b06158d --- /dev/null +++ b/src/broker/boot_flub.c @@ -0,0 +1,223 @@ +/************************************************************\ + * Copyright 2023 Lawrence Livermore National Security, LLC + * (c.f. AUTHORS, NOTICE.LLNS, COPYING) + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * SPDX-License-Identifier: LGPL-3.0 +\************************************************************/ + +/* boot_flub.c - FLUx Boot protocol + * + * Add a broker to an existing Flux instance. + */ + +#if HAVE_CONFIG_H +#include "config.h" +#endif +#include +#include +#include + +#include "src/common/libidset/idset.h" +#include "src/common/libutil/errprintf.h" +#include "src/common/libutil/errno_safe.h" +#include "src/common/libutil/ipaddr.h" +#include "ccan/str/str.h" + +#include "attr.h" +#include "overlay.h" +#include "topology.h" + +#include "boot_flub.h" + +struct boot_info { + int size; + int rank; + json_t *attrs; +}; + +struct boot_parent { + char *pubkey; + int rank; + const char *uri; +}; + +static int wait_for_group_membership (flux_t *h, + const char *name, + int rank, + flux_error_t *error) +{ + flux_future_t *f; + bool online = false; + + if (!(f = flux_rpc_pack (h, + "groups.get", + 0, + FLUX_RPC_STREAMING, + "{s:s}", + "name", "broker.online"))) { + errprintf (error, "broker.online: %s", strerror (errno)); + return -1; + } + do { + const char *members; + struct idset *ids; + + if (flux_rpc_get_unpack (f, "{s:s}", "members", &members) < 0) { + errprintf (error, "broker.online: %s", future_strerror (f, errno)); + break; + } + if ((ids = idset_decode (members)) + && idset_test (ids, rank)) + online = true; + idset_destroy (ids); + flux_future_reset (f); + } while (!online); + flux_future_destroy (f); + + return online ? 0 : -1; +} + +static int set_attrs (attr_t *attrs, json_t *dict) +{ + const char *key; + json_t *val; + + json_object_foreach (dict, key, val) { + const char *s = json_string_value (val); + if (!s) { + errno = EPROTO; + return -1; + } + if (attr_add (attrs, key, s, ATTR_IMMUTABLE) < 0) + return -1; + } + return 0; +} + +int boot_flub (struct broker *ctx, flux_error_t *error) +{ + const char *uri = NULL; + flux_t *h; + flux_future_t *f = NULL; + flux_future_t *f2 = NULL; + struct topology *topo = NULL; + const char *topo_uri; + struct boot_info info; + struct boot_parent parent; + const char *bind_uri = NULL; + int rc = -1; + + /* Ask a Flux instance to allocate an available rank. + * N.B. the broker unsets FLUX_URI so either it is set on the + * command line via broker.boot-server, or the compiled-in path + * is used (system instance). + */ + (void)attr_get (ctx->attrs, "broker.boot-server", &uri, NULL); + if (!(h = flux_open_ex (uri, 0, error))) + return -1; + if (!(f = flux_rpc_pack (h, + "overlay.flub-getinfo", + 0, + 0, + "{}")) + || flux_rpc_get_unpack (f, + "{s:i s:i s:o}", + "rank", &info.rank, + "size", &info.size, + "attrs", &info.attrs) < 0) { + errprintf (error, "%s", future_strerror (f, errno)); + goto out; + } + /* Set instance attributes obtained from boot server. + */ + if (set_attrs (ctx->attrs, info.attrs) < 0) { + errprintf (error, "error setting attributes: %s", strerror (errno)); + goto out; + } + /* Create topology. All ranks are assumed to have the same topology. + * The tbon.topo attribute is set in overlay_create() if not provided + * on the command line. + */ + if (attr_get (ctx->attrs, "tbon.topo", &topo_uri, NULL) < 0) { + errprintf (error, "error fetching tbon.topo attribute"); + goto out; + } + if (!(topo = topology_create (topo_uri, info.size, NULL)) + || topology_set_rank (topo, info.rank) < 0 + || overlay_set_topology (ctx->overlay, topo) < 0) { + errprintf (error, "error creating topology: %s", strerror (errno)); + goto out; + } + if ((parent.rank = topology_get_parent (topo)) < 0) { + errprintf (error, + "rank %d has no parent in %s topology", + info.rank, + topo_uri); + goto out; + } + /* Exchange public keys with TBON parent and obtain its URI. + */ + if (wait_for_group_membership (h, "broker.online", parent.rank, error) < 0) + goto out; + if (!(f2 = flux_rpc_pack (h, + "overlay.flub-kex", + parent.rank, + 0, + "{s:s s:s}", + "name", overlay_cert_name (ctx->overlay), + "pubkey", overlay_cert_pubkey (ctx->overlay))) + || flux_rpc_get_unpack (f2, + "{s:s s:s}", + "pubkey", &parent.pubkey, + "uri", &parent.uri) < 0) { + errprintf (error, "%s", future_strerror (f, errno)); + goto out; + } + /* Inform overlay subsystem of parent info. + */ + if (overlay_set_parent_uri (ctx->overlay, parent.uri) < 0 + || overlay_set_parent_pubkey (ctx->overlay, parent.pubkey) < 0) { + errprintf (error, + "error setting up overlay parameters: %s", + strerror (errno)); + goto out; + } + /* If there are children, bind to zmq socket and update tbon.endpoint. + * Since we don't know if OUR children are co-located on the same node, + * always use the tcp transport. + */ + if (topology_get_child_ranks (topo, NULL, 0) > 0) { + char ipaddr[HOST_NAME_MAX + 1]; + char *wild = NULL; + + overlay_set_ipv6 (ctx->overlay, 1); + if (ipaddr_getprimary (ipaddr, + sizeof (ipaddr), + error->text, + sizeof (error->text)) < 0) + goto out; + if (asprintf (&wild, "tcp://%s:*", ipaddr) < 0 + || overlay_bind (ctx->overlay, wild) < 0) { + errprintf (error, "error binding to tcp://%s:*", ipaddr); + ERRNO_SAFE_WRAP (free, wild); + goto out; + } + bind_uri = overlay_get_bind_uri (ctx->overlay); + } + if (attr_add (ctx->attrs, "tbon.endpoint", bind_uri, ATTR_IMMUTABLE) < 0) { + errprintf (error, "setattr tbon.endpoint"); + goto out; + } + rc = 0; +out: + topology_decref (topo); + flux_future_destroy (f); + flux_future_destroy (f2); + flux_close (h); + return rc; +} + +// vi:ts=4 sw=4 expandtab diff --git a/src/broker/boot_flub.h b/src/broker/boot_flub.h new file mode 100644 index 000000000000..4bf2ee5963b3 --- /dev/null +++ b/src/broker/boot_flub.h @@ -0,0 +1,22 @@ +/************************************************************\ + * Copyright 2023 Lawrence Livermore National Security, LLC + * (c.f. AUTHORS, NOTICE.LLNS, COPYING) + * + * This file is part of the Flux resource manager framework. + * For details, see https://github.com/flux-framework. + * + * SPDX-License-Identifier: LGPL-3.0 +\************************************************************/ + +#ifndef BROKER_BOOT_FLUB_H +#define BROKER_BOOT_FLUB_H + +#include + +#include "broker.h" + +int boot_flub (struct broker *ctx, flux_error_t *error); + +#endif /* BROKER_BOOT_FLUB_H */ + +// vi:ts=4 sw=4 expandtab diff --git a/src/broker/broker.c b/src/broker/broker.c index ba762907c9d3..850d132936e6 100644 --- a/src/broker/broker.c +++ b/src/broker/broker.c @@ -75,6 +75,7 @@ #include "runat.h" #include "heaptrace.h" #include "exec.h" +#include "boot_flub.h" #include "boot_config.h" #include "boot_pmi.h" #include "publisher.h" @@ -359,6 +360,8 @@ int main (int argc, char *argv[]) if (attr_get (ctx.attrs, "broker.boot-method", &boot_method, NULL) < 0) { if (flux_conf_unpack (conf, NULL, "{s:{}}", "bootstrap") == 0) boot_method = "config"; + else if (attr_get (ctx.attrs, "broker.boot-server", NULL, NULL) == 0) + boot_method = "flub"; else boot_method = NULL; } @@ -368,6 +371,12 @@ int main (int argc, char *argv[]) goto cleanup; } } + else if (boot_method && streq (boot_method, "flub")) { + if (boot_flub (&ctx, &error) < 0) { + log_msg ("boot-flub: %s", error.text); + goto cleanup; + } + } else { if (boot_pmi (ctx.overlay, ctx.attrs) < 0) { log_msg ("boot-pmi failed"); From c03df3379def865acc39b56babff54641273c4c6 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 18 May 2023 07:12:22 -0700 Subject: [PATCH 5/8] broker: add flub RPC methods to overlay Problem: the flub bootstrap method requires broker services. Add the following services (instance owner only): overlay.flub-getinfo (rank 0 only) Allocate an unused rank from rank 0 and also return size and misc. broker attributes to be set in the new broker overlay.flub-kex (peer rank) Exchange public keys with the TBON parent and obtain its zeromq URI. Add overlay_flub_provision() which is called by boot_pmi.c when extra ranks are configured, making those ranks available for allocation. --- src/broker/boot_pmi.c | 6 ++ src/broker/overlay.c | 149 ++++++++++++++++++++++++++++++++++++++++++ src/broker/overlay.h | 7 ++ 3 files changed, 162 insertions(+) diff --git a/src/broker/boot_pmi.c b/src/broker/boot_pmi.c index da6b6f03e412..a0255960279d 100644 --- a/src/broker/boot_pmi.c +++ b/src/broker/boot_pmi.c @@ -278,6 +278,12 @@ int boot_pmi (struct overlay *overlay, attr_t *attrs) if (topology_set_rank (topo, info.rank) < 0 || overlay_set_topology (overlay, topo) < 0) goto error; + if (info.rank == 0 && size > info.size) { + if (overlay_flub_provision (overlay, info.size, size - 1, true) < 0) { + log_msg ("error provisioning flub allocator"); + goto error; + } + } if (gethostname (hostname, sizeof (hostname)) < 0) { log_err ("gethostname"); goto error; diff --git a/src/broker/overlay.c b/src/broker/overlay.c index 0e4bf9ce0618..0c19726f5617 100644 --- a/src/broker/overlay.c +++ b/src/broker/overlay.c @@ -38,6 +38,7 @@ #include "src/common/libutil/errprintf.h" #include "src/common/librouter/rpc_track.h" #include "ccan/str/str.h" +#include "ccan/array_size/array_size.h" #include "overlay.h" #include "attr.h" @@ -192,6 +193,7 @@ struct overlay { struct flux_msglist *health_requests; struct flux_msglist *trace_requests; + struct idset *flub_rankpool; }; static void overlay_mcast_child (struct overlay *ov, flux_msg_t *msg); @@ -297,6 +299,10 @@ int overlay_set_topology (struct overlay *ov, struct topology *topo) ov->size = topology_get_size (topo); ov->rank = topology_get_rank (topo); + if (ov->rank == 0) { + if (!(ov->flub_rankpool = idset_create (ov->size, 0))) + goto error; + } if (!cert_meta_get (ov->cert, "name")) { char val[16]; snprintf (val, sizeof (val), "%lu", (unsigned long)ov->rank); @@ -2326,11 +2332,142 @@ static int overlay_configure_topo (struct overlay *ov) return 0; } +static int overlay_flub_alloc (struct overlay *ov, int *rank) +{ + unsigned int id; + + if (!ov->flub_rankpool) { // created by overlay_set_topology() + errno = EINVAL; + return -1; + } + if ((id = idset_first (ov->flub_rankpool)) != IDSET_INVALID_ID) { + if (idset_clear (ov->flub_rankpool, id) < 0) + return -1; + *rank = (int)id; + return 0; + } + errno = ENOENT; + return -1; +} + +int overlay_flub_provision (struct overlay *ov, + uint32_t lo_rank, + uint32_t hi_rank, + bool available) +{ + if (!ov->flub_rankpool) { // created by overlay_set_topology() + errno = EINVAL; + return -1; + } + if (available) + return idset_range_set (ov->flub_rankpool, lo_rank, hi_rank); + return idset_range_clear (ov->flub_rankpool, lo_rank, hi_rank); +} + +static json_t *flub_dict_create (attr_t *attrs) +{ + const char *names[] = { "hostlist", "instance-level" }; + json_t *dict; + + if (!(dict = json_object ())) + goto nomem; + for (int i = 0; i < ARRAY_SIZE (names); i++) { + const char *val; + json_t *o; + if (attr_get (attrs, names[i], &val, NULL) < 0) + goto error; + if (!(o = json_string (val)) + || json_object_set_new (dict, names[i], o) < 0) { + json_decref (o); + goto nomem; + } + } + return dict; +nomem: + errno = ENOMEM; +error: + ERRNO_SAFE_WRAP (json_decref, dict); + return NULL; +} + +static void overlay_flub_getinfo_cb (flux_t *h, + flux_msg_handler_t *mh, + const flux_msg_t *msg, + void *arg) +{ + struct overlay *ov = arg; + const char *errmsg = NULL; + json_t *attrs = NULL; + int rank; + + if (flux_request_unpack (msg, NULL, "{}") < 0) + goto error; + if (overlay_flub_alloc (ov, &rank) < 0) { + errmsg = "there are no available ranks"; + goto error; + } + if (!(attrs = flub_dict_create (ov->attrs))) + goto error; + if (flux_respond_pack (h, + msg, + "{s:i s:i s:O}", + "rank", rank, + "size", ov->size, + "attrs", attrs) < 0) + flux_log_error (h, "error responding to overlay.flub-getinfo request"); + json_decref (attrs); + return; +error: + if (flux_respond_error (h, msg, errno, errmsg) < 0) + flux_log_error (h, "error responding to overlay.flub-getinfo request"); + json_decref (attrs); +} + +static void overlay_flub_kex_cb (flux_t *h, + flux_msg_handler_t *mh, + const flux_msg_t *msg, + void *arg) +{ + struct overlay *ov = arg; + const char *errmsg = NULL; + const char *name; + const char *pubkey; + + if (flux_request_unpack (msg, + NULL, + "{s:s s:s}", + "name", &name, + "pubkey", &pubkey) < 0) + goto error; + if (ov->child_count == 0) { + errmsg = "this broker cannot have children"; + errno = EINVAL; + goto error; + } + if (overlay_authorize (ov, name, pubkey) < 0) { + errmsg = "failed to authorize public key"; + goto error; + } + if (flux_respond_pack (h, + msg, + "{s:s s:s s:s}", + "pubkey", overlay_cert_pubkey (ov), + "name", overlay_cert_name (ov), + "uri", overlay_get_bind_uri (ov)) < 0) + flux_log_error (h, "error responding to overlay.flub-kex request"); + return; +error: + if (flux_respond_error (h, msg, errno, errmsg) < 0) + flux_log_error (h, "error responding to overlay.flub-kex request"); +} + void overlay_destroy (struct overlay *ov) { if (ov) { int saved_errno = errno; + idset_destroy (ov->flub_rankpool); + flux_msglist_destroy (ov->health_requests); cert_destroy (ov->cert); @@ -2384,6 +2521,18 @@ static const struct flux_msg_handler_spec htab[] = { overlay_trace_cb, 0, }, + { + FLUX_MSGTYPE_REQUEST, + "overlay.flub-kex", + overlay_flub_kex_cb, + 0, + }, + { + FLUX_MSGTYPE_REQUEST, + "overlay.flub-getinfo", + overlay_flub_getinfo_cb, + 0, + }, { FLUX_MSGTYPE_REQUEST, "overlay.stats-get", diff --git a/src/broker/overlay.h b/src/broker/overlay.h index 78be2de45a85..687f8454b4ad 100644 --- a/src/broker/overlay.h +++ b/src/broker/overlay.h @@ -145,6 +145,13 @@ int overlay_set_monitor_cb (struct overlay *ov, overlay_monitor_f cb, void *arg); +/* Make a range of ranks available/unavailable for flub bootstrap + */ +int overlay_flub_provision (struct overlay *ov, + uint32_t lo_rank, + uint32_t hi_rank, + bool available); + /* Register overlay-related broker attributes. */ int overlay_register_attrs (struct overlay *overlay); From 9ccf30c08a84859db7acc65a02570211a6758d5c Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 18 May 2023 15:02:27 -0700 Subject: [PATCH 6/8] testsuite: add coverage for instance size override Problem: there is no test coverage for broker bootstrap with a PMI size less than the actual size. Add some tests. --- t/Makefile.am | 1 + t/t0033-size-override.t | 52 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) create mode 100755 t/t0033-size-override.t diff --git a/t/Makefile.am b/t/Makefile.am index d36d56e0cadb..43705eef020e 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -95,6 +95,7 @@ TESTSCRIPTS = \ t0022-jj-reader.t \ t0023-jobspec1-validate.t \ t0026-flux-R.t \ + t0033-size-override.t \ t0090-content-enospc.t \ t1000-kvs.t \ t1001-kvs-internals.t \ diff --git a/t/t0033-size-override.t b/t/t0033-size-override.t new file mode 100755 index 000000000000..ba8c2ba904b4 --- /dev/null +++ b/t/t0033-size-override.t @@ -0,0 +1,52 @@ +#!/bin/sh +# + +test_description='Test instance size override' + +. `dirname $0`/sharness.sh + +test_expect_success 'size override fails if too small' ' + test_must_fail flux broker \ + -Sbroker.rc1_path= -Sbroker.rc3_path= \ + -Ssize=1,-Sbroker.quorum=1 /bin/true 2>toosmall.err && + grep "may only be increased" toosmall.err +' +test_expect_success 'an instance can be started with an extra rank' ' + flux broker \ + -Sbroker.rc1_path= -Sbroker.rc3_path= \ + -Ssize=2 -Sbroker.quorum=1 \ + flux overlay status --no-pretty >overlay.out +' +test_expect_success 'flux overlay status shows the extra rank offline' ' + grep "1 extra0: offline" overlay.out +' +test_expect_success 'an instance can be started with PMI plus extra' ' + flux start -s2 -o,-Stbon.topo=kary:0 \ + -o,-Sbroker.rc1_path=,-Sbroker.rc3_path= \ + -o,-Ssize=3,-Sbroker.quorum=2 \ + flux overlay status --no-pretty >overlay2.out +' +test_expect_success 'flux overlay status shows the extra rank offline' ' + grep "2 extra0: offline" overlay2.out +' +test_expect_success 'create config with fake resources' ' + cat >fake.toml <<-EOT + [resource] + noverify = true + [[resource.config]] + hosts = "a,b,c" + cores = "0" + EOT +' +test_expect_success 'start full 2-broker instance with one extra rank' ' + flux start -s2 -o,-Ssize=3,-Sbroker.quorum=2,-cfake.toml \ + flux resource status -s offline -no {nnodes} >offline.out +' +test_expect_success 'flux resource status reports one node offline' ' + cat >offline.exp <<-EOT && + 1 + EOT + test_cmp offline.exp offline.out +' + +test_done From 1a59ee7b959e8dcd191abe2f995bb1f3af891f68 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Thu, 18 May 2023 15:09:58 -0700 Subject: [PATCH 7/8] testsuite: cover flub bootstrap Problem: there is no test coverage for adding brokers to a flux instance. Add some tests. --- t/Makefile.am | 1 + t/t0034-flub.t | 193 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 194 insertions(+) create mode 100755 t/t0034-flub.t diff --git a/t/Makefile.am b/t/Makefile.am index 43705eef020e..c37aabbd3897 100644 --- a/t/Makefile.am +++ b/t/Makefile.am @@ -96,6 +96,7 @@ TESTSCRIPTS = \ t0023-jobspec1-validate.t \ t0026-flux-R.t \ t0033-size-override.t \ + t0034-flub.t \ t0090-content-enospc.t \ t1000-kvs.t \ t1001-kvs-internals.t \ diff --git a/t/t0034-flub.t b/t/t0034-flub.t new file mode 100755 index 000000000000..2a19f8bb1ecb --- /dev/null +++ b/t/t0034-flub.t @@ -0,0 +1,193 @@ +#!/bin/sh +# + +test_description='Test flub bootstrap method' + +. `dirname $0`/sharness.sh + +test_under_flux 8 full + +export FLUX_URI_RESOLVE_LOCAL=t + +# usage: get_job_uri id +get_job_uri() { + flux job wait-event -t10 $1 memo >/dev/null && flux uri $1 +} + +# usage: wait_for_service uri name +wait_for_service() { + flux proxy $1 bash -c \""while ! flux ping -c 1 $2 >/dev/null 2>&1; do sleep 0.5; done"\" +} + +test_expect_success 'broker fails with bad broker.boot-server' ' + test_must_fail flux broker \ + -Sbroker.rc1_path= -Sbroker.rc3_path= \ + -Sbroker.boot-server=local://noexist/path \ + /bin/true 2>server.err && + grep "was not found" server.err +' + +test_expect_success 'start a 1 node job with 0 extra ranks' ' + id=$(flux batch -N1 --wrap sleep inf) && + get_job_uri $id >test1.uri +' +test_expect_success 'job has size 1' ' + size=$(flux proxy $(cat test1.uri) flux getattr size) && + test $size -eq 1 +' +test_expect_success 'flub bootstrap fails with no available ranks' ' + test_must_fail flux broker \ + -Sbroker.boot-server=$(cat test1.uri) 2>noranks.err && + grep "no available ranks" noranks.err +' +test_expect_success 'clean up' ' + flux cancel --all +' + + +# +# Start 2 node batch job with one extra slot. +# Submit 1 node broker job that fills the slot. +# Run a parallel job across all three nodes in the batch job. +# This test is constrained so that all flubbed nodes are leaf nodes, +# and the flubbed nodes connect to rank 0 only. + +test_expect_success 'create config with 3 fake nodes' ' + cat >fake3.toml <<-EOT + [resource] + noverify = true + [[resource.config]] + hosts = "a,b,c" + cores = "0-3" + EOT +' +test_expect_success 'start a 2 node job with 1 extra rank' ' + id=$(flux batch -N2 \ + --broker-opts=--config-path=fake3.toml \ + --broker-opts=-Ssize=3 \ + --broker-opts=-Sbroker.quorum=2 \ + --broker-opts=-Stbon.topo=kary:0 \ + --wrap sleep inf) && + get_job_uri $id >test2.uri +' +test_expect_success 'job has size 3' ' + size=$(flux proxy $(cat test2.uri) flux getattr size) && + test $size -eq 3 +' +test_expect_success 'overlay status shows extra node offline' ' + flux proxy $(cat test2.uri) \ + flux overlay status --no-pretty >ov2.out && + grep "2 extra0: offline" ov2.out +' +test_expect_success 'run a 2 node job in the initial instance' ' + wait_for_service $(cat test2.uri) job-ingest && + run_timeout 30 flux proxy $(cat test2.uri) \ + flux run --label-io -N2 flux pmi barrier +' +test_expect_success 'submit a job that starts 1 extra broker' ' + id=$(flux submit -N1 flux broker \ + --config-path=fake3.toml \ + -Stbon.topo=kary:0 \ + -Sbroker.boot-server=$(cat test2.uri)) && + flux job wait-event -p guest.exec.eventlog $id shell.start +' +test_expect_success 'wait for overlay status to be full' ' + flux proxy $(cat test2.uri) \ + flux overlay status --summary --wait full --timeout 30s +' +test_expect_success 'run a 3 node job in the expanded instance' ' + run_timeout 30 flux proxy $(cat test2.uri) \ + flux run --label-io -N3 flux pmi barrier +' +test_expect_success 'clean up' ' + flux cancel --all +' + +test_expect_success 'create config with 7 fake nodes' ' + cat >fake7.toml <<-EOT + [resource] + noverify = true + [[resource.config]] + hosts = "a,b,c,d,e,f,g" + cores = "0-3" + EOT +' + +# +# Start 1 node batch job with 6 extra slots (kary:2). +# Submit 6 node broker job that fills all the slots. +# Run a 7 node parallel job. +# +test_expect_success 'start a 1 node job with 6 extra ranks' ' + id=$(flux batch -N1 \ + --broker-opts=--config-path=fake7.toml \ + --broker-opts=-Ssize=7 \ + --broker-opts=-Sbroker.quorum=1 \ + --broker-opts=-Stbon.topo=kary:2 \ + --wrap sleep inf) && + get_job_uri $id >test5.uri +' +test_expect_success 'run a 1 node job in the initial instance' ' + wait_for_service $(cat test5.uri) job-ingest && + run_timeout 30 flux proxy $(cat test5.uri) \ + flux run --label-io -N1 flux pmi barrier +' +test_expect_success 'job has size 7' ' + size=$(flux proxy $(cat test5.uri) flux getattr size) && + test $size -eq 7 +' +# N.B. include exit-timeout=none so we can safely disconnect one node later +test_expect_success 'submit a job that starts 6 extra brokers' ' + id=$(flux submit -N6 -o exit-timeout=none \ + flux broker \ + --config-path=fake7.toml \ + -Stbon.topo=kary:2 \ + -Sbroker.boot-server=$(cat test5.uri)) && + flux job wait-event -p guest.exec.eventlog $id shell.start && + echo $id >xtra_id +' +test_expect_success 'wait for overlay status to be full' ' + flux proxy $(cat test5.uri) \ + flux overlay status --summary --wait full --timeout 10s +' +test_expect_success 'run a 7 node job in the expanded instance' ' + run_timeout 30 flux proxy $(cat test5.uri) \ + flux run --label-io -N7 flux pmi barrier +' + +# +# Show that a node can be replaced + +test_expect_success 'disconnect rank 6' ' + flux proxy $(cat test5.uri) \ + flux overlay disconnect 6 +' +test_expect_success 'rank 6 cannot be pinged - trigger EHOSTUNREACH' ' + test_must_fail flux proxy $(cat test5.uri) \ + flux ping -c1 6 +' +test_expect_success 'wait for overlay status to be degraded' ' + flux proxy $(cat test5.uri) \ + flux overlay status --summary --wait degraded --timeout 10s +' +test_expect_success 'submit a job that starts 1 broker' ' + id=$(flux submit -N1 flux broker \ + --config-path=fake7.toml \ + -Stbon.topo=kary:2 \ + -Sbroker.boot-server=$(cat test5.uri)) && + flux job wait-event -p guest.exec.eventlog $id shell.start +' +test_expect_success 'wait for overlay status to be full' ' + flux proxy $(cat test5.uri) \ + flux overlay status --summary --wait full --timeout 10s +' +test_expect_success 'run a 7 node job in the expanded instance' ' + run_timeout 30 flux proxy $(cat test5.uri) \ + flux run --label-io -N7 flux pmi barrier +' + +test_expect_success 'clean up' ' + flux cancel --all +' + +test_done From 0ba34b1cc48dc967c709223468b9f0e12a60b4d7 Mon Sep 17 00:00:00 2001 From: Jim Garlick Date: Fri, 19 May 2023 12:18:11 -0700 Subject: [PATCH 8/8] broker: provision dead brokers for flub replacement Problem: there is no way to replace a node in Flux instance that goes down. Call overlay_flub_provision () when a rank goes offline so that the flub allocator can allocate its rank to a replacement. Unprovision ranks when they return to online. --- src/broker/state_machine.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/broker/state_machine.c b/src/broker/state_machine.c index f3c993ae5147..bfbb4b78e1cc 100644 --- a/src/broker/state_machine.c +++ b/src/broker/state_machine.c @@ -925,6 +925,24 @@ static void broker_online_cb (flux_future_t *f, void *arg) } idset_destroy (loss); } + /* A broker that drops out of s->quorum.online is provisioned + * for replacement via flub, and unprovisioned if it returns. + */ + if (previous_online) { + unsigned int id; + id = idset_first (previous_online); + while (id != IDSET_INVALID_ID) { // online -> offline + if (!idset_test (s->quorum.online, id)) + (void)overlay_flub_provision (s->ctx->overlay, id, id, true); + id = idset_next (previous_online, id); + } + id = idset_first (s->quorum.online); + while (id != IDSET_INVALID_ID) { // offline -> online + if (!idset_test (previous_online, id)) + (void)overlay_flub_provision (s->ctx->overlay, id, id, false); + id = idset_next (s->quorum.online, id); + } + } idset_destroy (previous_online); flux_future_reset (f);