|
27 | 27 | #include "src/common/libutil/errprintf.h" |
28 | 28 | #include "src/common/libsubprocess/server.h" |
29 | 29 | #include "ccan/array_size/array_size.h" |
| 30 | +#include "ccan/str/str.h" |
30 | 31 |
|
31 | 32 | #include "state_machine.h" |
32 | 33 |
|
|
38 | 39 | #include "shutdown.h" |
39 | 40 |
|
40 | 41 | struct quorum { |
41 | | - struct idset *want; |
42 | | - struct idset *have; // cumulative on rank 0, batch buffer on rank > 0 |
| 42 | + uint32_t size; |
| 43 | + struct idset *all; |
| 44 | + struct idset *online; // cumulative on rank 0, batch buffer on rank > 0 |
43 | 45 | flux_future_t *f; |
44 | 46 | double timeout; |
45 | 47 | bool warned; |
@@ -182,21 +184,6 @@ static broker_state_t state_next (broker_state_t current, const char *event) |
182 | 184 | return current; |
183 | 185 | } |
184 | 186 |
|
185 | | -/* return true if a is a subset of b */ |
186 | | -static bool is_subset_of (const struct idset *a, const struct idset *b) |
187 | | -{ |
188 | | - struct idset *ids; |
189 | | - int count; |
190 | | - |
191 | | - if (!(ids = idset_difference (a, b))) |
192 | | - return false; |
193 | | - count = idset_count (ids); |
194 | | - idset_destroy (ids); |
195 | | - if (count > 0) |
196 | | - return false; |
197 | | - return true; |
198 | | -} |
199 | | - |
200 | 187 | static void action_init (struct state_machine *s) |
201 | 188 | { |
202 | 189 | s->ctx->online = true; |
@@ -245,7 +232,7 @@ static void quorum_timer_cb (flux_reactor_t *r, |
245 | 232 | if (s->state != STATE_QUORUM) |
246 | 233 | return; |
247 | 234 |
|
248 | | - if (!(ids = idset_difference (s->quorum.want, s->quorum.have)) |
| 235 | + if (!(ids = idset_difference (s->quorum.all, s->quorum.online)) |
249 | 236 | || !(rankstr = idset_encode (ids, IDSET_FLAG_RANGE)) |
250 | 237 | || !(hl = hostlist_create ())) { |
251 | 238 | flux_log_error (h, "error computing slow brokers"); |
@@ -749,43 +736,54 @@ static void quorum_check_parent (struct state_machine *s) |
749 | 736 | } |
750 | 737 | } |
751 | 738 |
|
752 | | -/* Configure the set of broker ranks needed for quorum (default=all). |
| 739 | +/* For backwards compatibility, translate "0" and "0-<size-1>" to 1 and <size>, |
| 740 | + * respectively, but print a warning on stderr. |
| 741 | + */ |
| 742 | +static bool quorum_configure_deprecated (struct state_machine *s, |
| 743 | + const char *val) |
| 744 | +{ |
| 745 | + char all[64]; |
| 746 | + snprintf (all, sizeof (all), "0-%lu", (unsigned long)s->ctx->size - 1); |
| 747 | + if (streq (val, all)) |
| 748 | + s->quorum.size = s->ctx->size; |
| 749 | + else if (streq (val, "0")) |
| 750 | + s->quorum.size = 1; |
| 751 | + else |
| 752 | + return false; |
| 753 | + if (s->ctx->rank == 0) { |
| 754 | + log_msg ("warning: broker.quorum is now a size - assuming %lu", |
| 755 | + (unsigned long)s->quorum.size); |
| 756 | + } |
| 757 | + return true; |
| 758 | +} |
| 759 | + |
| 760 | +/* Configure the count of broker ranks needed for quorum (default=<size>). |
753 | 761 | */ |
754 | 762 | static int quorum_configure (struct state_machine *s) |
755 | 763 | { |
756 | 764 | const char *val; |
757 | | - char *tmp; |
758 | | - unsigned long id; |
759 | | - |
760 | 765 | if (attr_get (s->ctx->attrs, "broker.quorum", &val, NULL) == 0) { |
761 | | - if (!(s->quorum.want = idset_decode (val))) { |
762 | | - log_msg ("Error parsing broker.quorum attribute"); |
763 | | - return -1; |
764 | | - } |
765 | | - id = idset_last (s->quorum.want); |
766 | | - if (id != IDSET_INVALID_ID && id >= s->ctx->size) { |
767 | | - log_msg ("Error parsing broker.quorum attribute: exceeds size"); |
768 | | - return -1; |
| 766 | + if (!quorum_configure_deprecated (s, val)) { |
| 767 | + errno = 0; |
| 768 | + s->quorum.size = strtoul (val, NULL, 10); |
| 769 | + if (errno != 0 |
| 770 | + || s->quorum.size < 1 |
| 771 | + || s->quorum.size > s->ctx->size) { |
| 772 | + log_msg ("Error parsing broker.quorum attribute"); |
| 773 | + errno = EINVAL; |
| 774 | + return -1; |
| 775 | + } |
769 | 776 | } |
770 | | - if (attr_delete (s->ctx->attrs, "broker.quorum", true) < 0) |
| 777 | + if (attr_set_flags (s->ctx->attrs, "broker.quorum", ATTR_IMMUTABLE) < 0) |
771 | 778 | return -1; |
772 | 779 | } |
773 | 780 | else { |
774 | | - if (!(s->quorum.want = idset_create (s->ctx->size, 0))) |
| 781 | + s->quorum.size = s->ctx->size; |
| 782 | + char buf[16]; |
| 783 | + snprintf (buf, sizeof (buf), "%lu", (unsigned long)s->quorum.size); |
| 784 | + if (attr_add (s->ctx->attrs, "broker.quorum", buf, ATTR_IMMUTABLE) < 0) |
775 | 785 | return -1; |
776 | | - if (idset_range_set (s->quorum.want, 0, s->ctx->size - 1) < 0) |
777 | | - return -1; |
778 | | - } |
779 | | - if (!(tmp = idset_encode (s->quorum.want, IDSET_FLAG_RANGE))) |
780 | | - return -1; |
781 | | - if (attr_add (s->ctx->attrs, |
782 | | - "broker.quorum", |
783 | | - tmp, |
784 | | - ATTR_IMMUTABLE) < 0) { |
785 | | - ERRNO_SAFE_WRAP (free, tmp); |
786 | | - return -1; |
787 | 786 | } |
788 | | - free (tmp); |
789 | 787 | return 0; |
790 | 788 | } |
791 | 789 |
|
@@ -837,11 +835,10 @@ static void broker_online_cb (flux_future_t *f, void *arg) |
837 | 835 | return; |
838 | 836 | } |
839 | 837 |
|
840 | | - idset_destroy (s->quorum.have); |
841 | | - s->quorum.have = ids; |
842 | | - if (is_subset_of (s->quorum.want, s->quorum.have)) { |
| 838 | + idset_destroy (s->quorum.online); |
| 839 | + s->quorum.online = ids; |
| 840 | + if (idset_count (s->quorum.online) >= s->quorum.size) |
843 | 841 | quorum_reached = true; |
844 | | - } |
845 | 842 |
|
846 | 843 | if (strlen (members) > 0 |
847 | 844 | && (quorum_reached || now - last_update > 5)) { |
@@ -1150,8 +1147,8 @@ void state_machine_destroy (struct state_machine *s) |
1150 | 1147 | flux_msglist_destroy (s->wait_requests); |
1151 | 1148 | flux_future_destroy (s->monitor.f); |
1152 | 1149 | flux_msglist_destroy (s->monitor.requests); |
1153 | | - idset_destroy (s->quorum.want); |
1154 | | - idset_destroy (s->quorum.have); |
| 1150 | + idset_destroy (s->quorum.all); |
| 1151 | + idset_destroy (s->quorum.online); |
1155 | 1152 | flux_watcher_destroy (s->quorum.timer); |
1156 | 1153 | flux_future_destroy (s->quorum.f); |
1157 | 1154 | free (s); |
@@ -1190,8 +1187,12 @@ struct state_machine *state_machine_create (struct broker *ctx) |
1190 | 1187 | if (!(s->monitor.f = monitor_parent (ctx->h, s))) |
1191 | 1188 | goto error; |
1192 | 1189 | } |
1193 | | - if (!(s->quorum.have = idset_create (ctx->size, 0))) |
| 1190 | + if (!(s->quorum.online = idset_create (ctx->size, 0))) |
| 1191 | + goto error; |
| 1192 | + if (!(s->quorum.all = idset_create (s->ctx->size, 0)) |
| 1193 | + || idset_range_set (s->quorum.all, 0, s->ctx->size - 1) < 0) |
1194 | 1194 | goto error; |
| 1195 | + |
1195 | 1196 | if (quorum_configure (s) < 0 |
1196 | 1197 | || quorum_timeout_configure (s) < 0) { |
1197 | 1198 | log_err ("error configuring quorum attributes"); |
|
0 commit comments