Skip to content

Commit 480a3ab

Browse files
authored
Merge pull request #5275 from garlick/issue#5273
broker: avoid spurious overlay peer warning
2 parents 7a64c5b + ab783b1 commit 480a3ab

File tree

3 files changed

+77
-36
lines changed

3 files changed

+77
-36
lines changed

src/broker/boot_config.c

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,9 @@
1313
#if HAVE_CONFIG_H
1414
#include "config.h"
1515
#endif
16+
#include <sys/types.h>
17+
#include <sys/socket.h>
18+
#include <netdb.h>
1619
#include <unistd.h>
1720
#include <stdbool.h>
1821
#include <assert.h>
@@ -22,6 +25,7 @@
2225
#include <flux/hostlist.h>
2326
#include <flux/taskmap.h>
2427

28+
#include "src/common/libyuarel/yuarel.h"
2529
#include "src/common/libutil/log.h"
2630
#include "src/common/libutil/errno_safe.h"
2731
#include "ccan/str/str.h"
@@ -495,6 +499,38 @@ static int set_broker_boot_method_attr (attr_t *attrs, const char *value)
495499
return 0;
496500
}
497501

502+
/* Zeromq treats failed hostname resolution as a transient error, and silently
503+
* retries in the background, which can make config problems hard to diagnose.
504+
* Parse the URI in advance, and if the host portion is invalid, log it.
505+
* Ref: flux-framework/flux-core#5009
506+
*/
507+
static void warn_of_invalid_host (flux_t *h, const char *uri)
508+
{
509+
char *cpy;
510+
struct yuarel u = { 0 };
511+
struct addrinfo *result;
512+
int e;
513+
514+
if (!(cpy = strdup (uri))
515+
|| yuarel_parse (&u, cpy) < 0
516+
|| !u.scheme
517+
|| !u.host
518+
|| !streq (u.scheme, "tcp"))
519+
goto done;
520+
/* N.B. this URI will be used for zmq_connect(), therefore it must
521+
* be a valid peer address, not an interface name or wildcard.
522+
*/
523+
if ((e = getaddrinfo (u.host, NULL, NULL, &result)) == 0) {
524+
freeaddrinfo (result);
525+
goto done;
526+
}
527+
log_msg ("Warning: unable to resolve upstream peer %s: %s",
528+
u.host,
529+
gai_strerror (e));
530+
done:
531+
free (cpy);
532+
}
533+
498534
int boot_config (flux_t *h, struct overlay *overlay, attr_t *attrs)
499535
{
500536
struct boot_conf conf;
@@ -623,6 +659,7 @@ int boot_config (flux_t *h, struct overlay *overlay, attr_t *attrs)
623659
parent_uri,
624660
sizeof (parent_uri)) < 0)
625661
goto error;
662+
warn_of_invalid_host (h, parent_uri);
626663
if (overlay_set_parent_uri (overlay, parent_uri) < 0) {
627664
log_err ("overlay_set_parent_uri %s", parent_uri);
628665
goto error;

src/broker/overlay.c

Lines changed: 0 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
#include "src/common/libutil/errprintf.h"
3333
#include "src/common/librouter/rpc_track.h"
3434
#include "src/common/libccan/ccan/ptrint/ptrint.h"
35-
#include "src/common/libyuarel/yuarel.h"
3635

3736
#include "overlay.h"
3837
#include "attr.h"
@@ -1264,48 +1263,13 @@ static void parent_monitor_cb (struct zmqutil_monitor *mon, void *arg)
12641263
}
12651264
}
12661265

1267-
/* Zeromq treats failed hostname resolution as a transient error, and silently
1268-
* retries in the background, which can make config problems hard to diagnose.
1269-
* Parse the URI in advance, and if the host portion is invalid, log it.
1270-
* Ref: flux-framework/flux-core#5009
1271-
*/
1272-
static void warn_of_invalid_host (flux_t *h, const char *uri)
1273-
{
1274-
char *cpy;
1275-
struct yuarel u = { 0 };
1276-
struct addrinfo *result;
1277-
int e;
1278-
1279-
if (!(cpy = strdup (uri))
1280-
|| yuarel_parse (&u, cpy) < 0
1281-
|| !u.scheme
1282-
|| !u.host
1283-
|| !streq (u.scheme, "tcp"))
1284-
goto done;
1285-
/* N.B. this URI will be used for zmq_connect(), therefore it must
1286-
* be a valid peer address, not an interface name or wildcard.
1287-
*/
1288-
if ((e = getaddrinfo (u.host, NULL, NULL, &result)) == 0) {
1289-
freeaddrinfo (result);
1290-
goto done;
1291-
}
1292-
flux_log (h,
1293-
LOG_ERR,
1294-
"Warning: unable to resolve upstream peer %s: %s",
1295-
u.host,
1296-
gai_strerror (e));
1297-
done:
1298-
free (cpy);
1299-
}
1300-
13011266
int overlay_connect (struct overlay *ov)
13021267
{
13031268
if (ov->rank > 0) {
13041269
if (!ov->h || ov->rank == FLUX_NODEID_ANY || !ov->parent.uri) {
13051270
errno = EINVAL;
13061271
return -1;
13071272
}
1308-
warn_of_invalid_host (ov->h, ov->parent.uri);
13091273
if (!(ov->parent.zsock = zsock_new_dealer (NULL)))
13101274
goto nomem;
13111275
/* The socket monitor is only used for logging.

t/t0013-config-file.t

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
#!/bin/sh
22
#
33

4+
# Append --logfile option if FLUX_TESTS_LOGFILE is set in environment:
5+
test -n "$FLUX_TESTS_LOGFILE" && set -- "$@" --logfile
6+
47
test_description='Test config file overlay bootstrap'
58

69
. `dirname $0`/sharness.sh
@@ -260,6 +263,43 @@ test_expect_success 'start size=3 instance with ipc:// and custom topology' '
260263
test_cmp conf8a.exp conf8a.out
261264
'
262265

266+
waitgrep() {
267+
local pattern=$1
268+
local file=$2
269+
local iter=$3
270+
while test $iter -gt 0; do
271+
grep "$pattern" $file 2>/dev/null && return 0
272+
sleep 0.3
273+
iter=$(($iter-1))
274+
done
275+
return 1
276+
}
277+
278+
# RFC 2606 reserves the .invalid domain for testing
279+
test_expect_success NO_CHAIN_LINT 'a warning is printed when upstream URI has unknown host' '
280+
mkdir conf8b &&
281+
cat <<-EOT >conf8b/bootstrap.toml &&
282+
[bootstrap]
283+
curve_cert = "testcert"
284+
[[bootstrap.hosts]]
285+
host = "fake0"
286+
connect = "tcp://foo.invalid:1234"
287+
[[bootstrap.hosts]]
288+
host = "fake1"
289+
EOT
290+
FLUX_FAKE_HOSTNAME=fake1 \
291+
flux broker -vv -Sbroker.rc1_path=,-Sbroker.rc3_path= \
292+
--config-path=conf8b 2>warn.err &
293+
echo $! >warn.pid &&
294+
waitgrep "unable to resolve upstream peer" warn.err 30
295+
'
296+
# In case warn.pid actually refers to a libtool wrapper, try pkill(1) -P
297+
# first to kill its children, then kill(1). See flux-framework/flux-core#5275.
298+
test_expect_success NO_CHAIN_LINT 'clean up broker from previous test' '
299+
warnpid=$(cat warn.pid) &&
300+
pkill -15 -P $warnpid || kill -15 $warnpid
301+
'
302+
263303
getport() {
264304
flux python -c \
265305
'from socket import socket; s=socket(); s.bind(("", 0)); print(s.getsockname()[1])'

0 commit comments

Comments
 (0)