Skip to content

Commit bdd6aec

Browse files
authored
Merge pull request #5762 from garlick/groups_cleanup
broker: catch an improper use of groups and handle it gracefully
2 parents 40d6f4c + 1f8d2a2 commit bdd6aec

File tree

3 files changed

+61
-4
lines changed

3 files changed

+61
-4
lines changed

src/broker/groups.c

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,18 @@
3030
* Optimization: collect contemporaneous JOIN/LEAVE requests at each
3131
* rank for a short time before applying them and sending them upstream.
3232
* During that time, JOINs/LEAVEs of the same key may be combined.
33+
*
34+
* broker.online use case:
35+
* Groups are used for instance quorum detection. The state machine calls
36+
* groups.join broker.online in the QUORUM state on all ranks. Rank 0 calls
37+
* groups.get broker.online which notifies the broker as membership evolves,
38+
* and when the quorum condition is satisfied, the state transitions
39+
* to RUN. The 'broker.online' group is also monitored by the resource module
40+
* so that it can inform the scheduler as execution targets go up/down.
41+
*
42+
* broker.torpid use case:
43+
* A broker.torpid group is maintained by the broker overlay (see overlay.c).
44+
* The resource module also monitors broker.torpid and drains torpid nodes.
3345
*/
3446

3547
#if HAVE_CONFIG_H
@@ -50,6 +62,10 @@
5062

5163
static const double batch_timeout = 0.1;
5264

65+
/* N.B. only one client can join a group per broker. That client's
66+
* join request is cached in group->join_request so that when the client
67+
* disconnects, we can identify its groups and force it to leave.
68+
*/
5369
struct group {
5470
char *name; // used directly as zhashx key
5571
struct idset *members;
@@ -441,6 +457,11 @@ static void join_request_cb (flux_t *h,
441457

442458
if (flux_request_unpack (msg, NULL, "{s:s}", "name", &name) < 0)
443459
goto error;
460+
if (!flux_msg_is_local (msg)) {
461+
errno = EPROTO;
462+
errmsg = "groups.join is restricted to the local broker";
463+
goto error;
464+
}
444465
if (!(group = group_lookup (g, name, true)))
445466
goto error;
446467
if (group->join_request) {
@@ -485,6 +506,11 @@ static void leave_request_cb (flux_t *h,
485506

486507
if (flux_request_unpack (msg, NULL, "{s:s}", "name", &name) < 0)
487508
goto error;
509+
if (!flux_msg_is_local (msg)) {
510+
errno = EPROTO;
511+
errmsg = "groups.leave is restricted to the local broker";
512+
goto error;
513+
}
488514
if (!(group = group_lookup (g, name, false))
489515
|| !group->join_request) {
490516
snprintf (errbuf,

t/scripts/groups.py

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,16 +120,25 @@ def join(args):
120120
"""
121121
h = flux.Flux()
122122

123-
h.rpc("groups.join", {"name": args.name}).get()
123+
h.rpc("groups.join", {"name": args.name}, nodeid=args.rank).get()
124124
if args.dubjoin:
125-
h.rpc("groups.join", {"name": args.name}).get()
125+
h.rpc("groups.join", {"name": args.name}, nodeid=args.rank).get()
126126

127127
if args.leave:
128-
h.rpc("groups.leave", {"name": args.name}).get()
128+
h.rpc("groups.leave", {"name": args.name}, nodeid=args.rank).get()
129129
if args.dubleave:
130130
h.rpc("groups.leave", {"name": args.name}).get()
131131

132132

133+
def leave(args):
134+
"""
135+
Leave the named group, sending groups.leave to the broker rank given by --rank.
136+
"""
137+
h = flux.Flux()
138+
139+
h.rpc("groups.leave", {"name": args.name}, nodeid=args.rank).get()
140+
141+
133142
LOGGER = logging.getLogger("groups")
134143

135144

@@ -183,15 +192,28 @@ def main():
183192
# join
184193
join_parser = subparsers.add_parser(
185194
"join",
186-
usage="groups join [--dubjoin] [--leave] [--dubleave] name",
195+
usage="groups join [--rank N] [--dubjoin] [--leave] [--dubleave] name",
187196
formatter_class=flux.util.help_formatter(),
188197
)
189198
join_parser.add_argument("--dubjoin", action="store_true")
190199
join_parser.add_argument("--leave", action="store_true")
191200
join_parser.add_argument("--dubleave", action="store_true")
201+
join_parser.add_argument("--rank", type=int, default=flux.constants.FLUX_NODEID_ANY)
192202
join_parser.add_argument("name")
193203
join_parser.set_defaults(func=join)
194204

205+
# leave
206+
leave_parser = subparsers.add_parser(
207+
"leave",
208+
usage="groups leave [--rank N] name",
209+
formatter_class=flux.util.help_formatter(),
210+
)
211+
leave_parser.add_argument(
212+
"--rank", type=int, default=flux.constants.FLUX_NODEID_ANY
213+
)
214+
leave_parser.add_argument("name")
215+
leave_parser.set_defaults(func=leave)
216+
195217
args = parser.parse_args()
196218
args.func(args)
197219

t/t0027-broker-groups.t

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,15 @@ test_expect_success 'groups.get on rank > 0 fails with reasonable error' '
3030
grep "only available on rank 0" test0.err
3131
'
3232

33+
test_expect_success 'nonlocal groups.join fails with appropriate error' '
34+
test_must_fail ${GROUPSCMD} join --rank 1 foo 2>rmtjoin.err &&
35+
grep "restricted to the local broker" rmtjoin.err
36+
'
37+
test_expect_success 'nonlocal groups.leave fails with appropriate error' '
38+
test_must_fail ${GROUPSCMD} leave --rank 1 foo 2>rmtleave.err &&
39+
grep "restricted to the local broker" rmtleave.err
40+
'
41+
3342
badjoin() {
3443
flux python -c "import flux; print(flux.Flux().rpc(\"groups.join\").get())"
3544
}

0 commit comments

Comments
 (0)