Skip to content

Commit 588ea0a

Browse files
authored
Merge pull request #4560 from garlick/log_improvement
improve signal/noise ratio in systemd journal logs
2 parents 10d1639 + 5ab1d7d commit 588ea0a

File tree

2 files changed

+46
-34
lines changed

2 files changed

+46
-34
lines changed

src/broker/overlay.c

Lines changed: 33 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
#include "src/common/libutil/fsd.h"
3030
#include "src/common/libutil/errno_safe.h"
3131
#include "src/common/libutil/monotime.h"
32+
#include "src/common/libutil/errprintf.h"
3233
#include "src/common/librouter/rpc_track.h"
3334
#include "src/common/libccan/ccan/ptrint/ptrint.h"
3435

@@ -897,8 +898,13 @@ static void child_cb (flux_reactor_t *r, flux_watcher_t *w,
897898
* Send CONTROL_DISCONNECT to force subtree panic.
898899
*/
899900
else {
900-
if (overlay_control_child (ov, uuid, CONTROL_DISCONNECT, 0) < 0)
901-
flux_log_error (ov->h, "failed to send CONTROL_DISCONNECT");
901+
if (overlay_control_child (ov, uuid, CONTROL_DISCONNECT, 0) < 0) {
902+
// not LOG_ERR per flux-framework/flux-core#4464
903+
flux_log (ov->h,
904+
LOG_DEBUG,
905+
"failed to send CONTROL_DISCONNECT: %s",
906+
strerror (errno));
907+
}
902908
}
903909
goto done;
904910
}
@@ -1023,13 +1029,13 @@ static void parent_cb (flux_reactor_t *r, flux_watcher_t *w,
10231029
/* Check child flux-core version 'v1' against this broker's version 'v2'.
10241030
* For now we require an exact match of (major,minor,patch) and
10251031
* ignore any commit id appended to the version string.
1026-
* Return 0 on error, or -1 on failure with message for child in 'errubuf'.
1032+
* Return 0 on error, or -1 on failure with message for child in 'error'.
10271033
*/
1028-
static bool version_check (int v1, int v2, char *errbuf, int errbufsz)
1034+
static bool version_check (int v1, int v2, flux_error_t *error)
10291035
{
10301036
if (v1 != v2) {
1031-
snprintf (errbuf, errbufsz,
1032-
"flux-core v%u.%u.%u mismatched with parent v%u.%u.%u",
1037+
errprintf (error,
1038+
"client (%u.%u.%u) version mismatched with server (%u.%u.%u)",
10331039
(v1 >> 16) & 0xff,
10341040
(v1 >> 8) & 0xff,
10351041
v1 & 0xff,
@@ -1054,12 +1060,11 @@ static void hello_request_handler (struct overlay *ov, const flux_msg_t *msg)
10541060
json_int_t rank;
10551061
int version;
10561062
const char *errmsg = NULL;
1057-
char errbuf[128];
1058-
const char *reason = "";
1063+
flux_error_t error;
10591064
flux_msg_t *response;
10601065
const char *uuid;
10611066
int status;
1062-
int hello_log_level = LOG_INFO;
1067+
int hello_log_level = LOG_DEBUG;
10631068

10641069
if (flux_request_unpack (msg,
10651070
NULL,
@@ -1070,47 +1075,52 @@ static void hello_request_handler (struct overlay *ov, const flux_msg_t *msg)
10701075
"status", &status) < 0
10711076
|| flux_msg_authorize (msg, FLUX_USERID_UNKNOWN) < 0)
10721077
goto error; // EPROTO or EPERM (unlikely)
1073-
10741078
if (!(child = child_lookup_byrank (ov, rank))) {
1075-
snprintf (errbuf, sizeof (errbuf),
1079+
errprintf (&error,
10761080
"rank %lu is not a peer of parent %lu: mismatched config?",
10771081
(unsigned long)rank,
10781082
(unsigned long)ov->parent.rank);
1079-
errmsg = errbuf;
1083+
flux_log (ov->h, LOG_ERR,
1084+
"rejecting connection from %s (rank %lu): %s",
1085+
flux_get_hostbyrank (ov->h, rank),
1086+
(unsigned long)rank,
1087+
error.text);
1088+
errmsg = error.text;
10801089
errno = EINVAL;
1081-
reason = "not a peer of this broker - mismatched config?";
1082-
goto error_log;
1090+
goto error;
10831091
}
10841092
/* Oops, child was previously online, but is now saying hello.
10851093
* Update the (old) child's subtree status to LOST. If the hello
10861094
* request is successful, another update will immediately follow.
10871095
*/
10881096
if (subtree_is_online (child->status)) { // crash
10891097
flux_log (ov->h, LOG_ERR,
1090-
"%s (rank %lu) reconnected after crash, dropping old uuid %s",
1098+
"%s (rank %lu) reconnected after crash, dropping old connection state",
10911099
flux_get_hostbyrank (ov->h, child->rank),
1092-
(unsigned long)child->rank,
1093-
child->uuid);
1100+
(unsigned long)child->rank);
10941101
overlay_child_status_update (ov, child, SUBTREE_STATUS_LOST);
10951102
hello_log_level = LOG_ERR; // want hello log to stand out in this case
10961103
}
10971104

1098-
if (!version_check (version, ov->version, errbuf, sizeof (errbuf))) {
1099-
errmsg = errbuf;
1105+
if (!version_check (version, ov->version, &error)) {
1106+
flux_log (ov->h, LOG_ERR,
1107+
"rejecting connection from %s (rank %lu): %s",
1108+
flux_get_hostbyrank (ov->h, rank),
1109+
(unsigned long)rank,
1110+
error.text);
1111+
errmsg = error.text;
11001112
errno = EINVAL;
1101-
reason = "flux-core version mismatch - needs software update?";
1102-
goto error_log;
1113+
goto error;
11031114
}
11041115

11051116
snprintf (child->uuid, sizeof (child->uuid), "%s", uuid);
11061117
overlay_child_status_update (ov, child, status);
11071118

11081119
flux_log (ov->h,
11091120
hello_log_level,
1110-
"accepting connection from %s (rank %lu) uuid %s status %s",
1121+
"accepting connection from %s (rank %lu) status %s",
11111122
flux_get_hostbyrank (ov->h, child->rank),
11121123
(unsigned long)child->rank,
1113-
child->uuid,
11141124
subtree_status_str (child->status));
11151125

11161126
if (!(response = flux_response_derive (msg, 0))
@@ -1119,13 +1129,6 @@ static void hello_request_handler (struct overlay *ov, const flux_msg_t *msg)
11191129
flux_log_error (ov->h, "error responding to overlay.hello request");
11201130
flux_msg_destroy (response);
11211131
return;
1122-
error_log:
1123-
flux_log (ov->h, LOG_ERR,
1124-
"rejecting connection from %s (rank %lu) uuid %s: %s",
1125-
flux_get_hostbyrank (ov->h, rank),
1126-
(unsigned long)rank,
1127-
uuid,
1128-
reason);
11291132
error:
11301133
if (!(response = flux_response_derive (msg, errno))
11311134
|| (errmsg && flux_msg_set_string (response, errmsg) < 0)

src/broker/state_machine.c

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -712,6 +712,9 @@ static void broker_online_cb (flux_future_t *f, void *arg)
712712
struct state_machine *s = arg;
713713
const char *members;
714714
struct idset *ids;
715+
static double last_update = 0;
716+
double now = flux_reactor_now (flux_get_reactor (s->ctx->h));
717+
bool quorum_reached = false;
715718

716719
if (flux_rpc_get_unpack (f, "{s:s}", "members", &members) < 0
717720
|| !(ids = idset_decode (members))) {
@@ -720,15 +723,21 @@ static void broker_online_cb (flux_future_t *f, void *arg)
720723
return;
721724
}
722725

723-
if (strlen (members) > 0) {
726+
idset_destroy (s->quorum.have);
727+
s->quorum.have = ids;
728+
if (is_subset_of (s->quorum.want, s->quorum.have)) {
729+
quorum_reached = true;
730+
}
731+
732+
if (strlen (members) > 0
733+
&& (quorum_reached || now - last_update > 5)) {
724734
char *hosts = flux_hostmap_lookup (s->ctx->h, members, NULL);
725735
flux_log (s->ctx->h, LOG_INFO, "online: %s (ranks %s)", hosts, members);
726736
free (hosts);
737+
last_update = now;
727738
}
728739

729-
idset_destroy (s->quorum.have);
730-
s->quorum.have = ids;
731-
if (is_subset_of (s->quorum.want, s->quorum.have)) {
740+
if (quorum_reached) {
732741
if (s->state != STATE_RUN) {
733742
state_machine_post (s, "quorum-full");
734743
if (s->quorum.warned) {

0 commit comments

Comments
 (0)