@@ -449,19 +449,6 @@ static struct child *child_lookup_online (struct overlay *ov, const char *id)
449449 return ov -> child_hash ? zhashx_lookup (ov -> child_hash , id ) : NULL ;
450450}
451451
452- /* Slow path to look up child regardless of its status.
453- */
454- static struct child * child_lookup (struct overlay * ov , const char * id )
455- {
456- struct child * child ;
457-
458- foreach_overlay_child (ov , child ) {
459- if (streq (child -> uuid , id ))
460- return child ;
461- }
462- return NULL ;
463- }
464-
465452/* Lookup (direct) child peer by rank.
466453 * Returns NULL on lookup failure.
467454 */
@@ -903,44 +890,33 @@ static void child_cb (flux_reactor_t *r,
903890 goto done ;
904891 }
905892 if (!(child = child_lookup_online (ov , uuid ))) {
906- /* If child is not online but we know this uuid, message is from a
907- * child that has already transitioned to offline/lost:
908- * - message sent while control DISCONNECT is in flight
909- * - network partition disconnected child; now it's back
910- * Send a disconnect control message, which is required in the second
911- * case, and won't hurt in the first.
912- */
913- if ((child = child_lookup (ov , uuid ))) {
914- /* Don't log dropped messages as first case above is expected,
915- * it this can be noisey. See flux-framework/flux-core#4180
916- */
917- (void )overlay_control_child (ov , uuid , CONTROL_DISCONNECT , 0 );
918- }
919- /* Hello new peer!
920- * hello_request_handler() completes the handshake.
893+ /* This is a new peer trying to introduce itself by sending an
894+ * overlay.hello request.
895+ * N.B. the broker generates a new UUID on startup, and hello is only
896+ * sent once on startup, in overlay_connect(). Therefore, it is
897+ * assumed that a overlay.hello is always introducing a new UUID and
898+ * we don't bother checking if we've seen this UUID before, which can
899+ * be slow given current design. See flux-framework/flux-core#5864.
921900 */
922- else if (type == FLUX_MSGTYPE_REQUEST
901+ if (type == FLUX_MSGTYPE_REQUEST
923902 && flux_msg_get_topic (msg , & topic ) == 0
924903 && streq (topic , "overlay.hello" )) {
925904 hello_request_handler (ov , msg );
926905 }
927- /* New peer is trying to communicate without first saying hello.
928- * This likely means that *this broker* restarted without getting a
929- * message through to the child, and the child still thinks it is
930- * communicating with the same broker (since 0MQ hides reconnects).
931- * Send CONTROL_DISCONNECT to force subtree panic.
906+ /* Or one of the following cases occurred that requires (or at least
907+ * will not be harmed by) a DISCONNECT message sent to the peer:
908+ * 1) This is a known peer that has transitioned to offline/lost _here_
909+ * but the DISCONNECT is still in flight to the peer.
910+ * 2) This is a known peer that has transitioned to offline/lost
911+ * as a result of a network partition, but the child never received
912+ * the DISCONNECT and connectivity has been restored.
913+ * 3) This is a new-to-us peer because *we* restarted without getting
914+ * a message through (e.g. crash)
915+ * Control send failures may occur, see flux-framework/flux-core#4464.
916+ * Don't log here, see flux-framework/flux-core#4180.
932917 */
933918 else {
934- if (overlay_control_child (ov , uuid , CONTROL_DISCONNECT , 0 ) < 0 ) {
935- /* See flux-framework/flux-core#4464
936- * Failure to send CONTROL_DISCONNECT occurs naturally, e.g.
937- * when requests from a peer that fails overlay.hello are in
938- * the socket queue behind the hello request. The peer closes
939- * its end of the socket when hello fails, and meanwhile we
940- * are trying to send a CONTROL_DISCONNECT for each additional
941- * message and failing once the socket closes. Do not log.
942- */
943- }
919+ (void )overlay_control_child (ov , uuid , CONTROL_DISCONNECT , 0 );
944920 }
945921 goto done ;
946922 }
0 commit comments