From 286659a49b23524488b9876fb2f8b389c0d8c12e Mon Sep 17 00:00:00 2001 From: Zhijun Date: Fri, 7 Nov 2025 13:44:58 +0800 Subject: [PATCH 1/8] Cluster: Enhance debugging in logs Signed-off-by: Zhijun --- src/connection.h | 2 ++ tests/support/cluster_util.tcl | 3 +++ tests/support/server.tcl | 4 ++++ tests/test_helper.tcl | 1 + 4 files changed, 10 insertions(+) diff --git a/src/connection.h b/src/connection.h index 4f8fac88d6..9f1e8b33a1 100644 --- a/src/connection.h +++ b/src/connection.h @@ -159,6 +159,8 @@ struct connection { ConnectionState state; int last_errno; int fd; + char client_ip[NET_IP_STR_LEN]; + int client_port; short int flags; short int refs; unsigned short int iovcnt; diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index ee14c58648..1dd112d3e6 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -214,6 +214,9 @@ proc cluster_setup {masters replicas node_count slot_allocator replica_allocator for {set i 0} {$i < $node_count} {incr i} { R $i CLUSTER SET-CONFIG-EPOCH $config_epoch incr config_epoch + # Make it easier to understand how the server interacts with + # other nodes when reading the server logs. + R $i CONFIG SET cluster-announce-human-nodename "R$i" } # Have all nodes meet diff --git a/tests/support/server.tcl b/tests/support/server.tcl index cbe973c60b..f00876565a 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -464,6 +464,7 @@ proc start_server {options {code undefined}} { # Wait for the server to be ready and check for server liveness/client connectivity before starting the test. set wait_ready true + puts "~~running start_server with options: $options" # parse options foreach {option value} $options { switch $option { @@ -566,6 +567,7 @@ proc start_server {options {code undefined}} { } } + # use a different directory every time a server is started dict set config dir [tmpdir server] @@ -699,6 +701,8 @@ proc start_server {options {code undefined}} { dict set srv "pport" $pport } + puts "~~! config $config" + puts "~~! srv $srv" # if a block of code is supplied, we wait for the server to become # available, create a client object and kill the server afterwards if {$code ne "undefined"} { diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 509e2a96ec..91295ba21d 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -1043,6 +1043,7 @@ if {$::client} { set ::numclients 1 } + puts "running test_server_main" if {[catch { test_server_main } err]} { if {[string length $err] > 0} { # only display error when not generated by the test suite From 7197288b4c6cec9c7460b698cb75681dabdae8f6 Mon Sep 17 00:00:00 2001 From: Zhijun Date: Fri, 7 Nov 2025 14:07:30 +0800 Subject: [PATCH 2/8] Fix errors Signed-off-by: Zhijun --- tests/unit/cluster/manual-failover.tcl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl index 56952ff008..e8871a1b2e 100644 --- a/tests/unit/cluster/manual-failover.tcl +++ b/tests/unit/cluster/manual-failover.tcl @@ -478,21 +478,21 @@ start_cluster 3 1 {tags {external:skip cluster}} { # verify we print the logs. # Both importing slots and migrating slots are move to R3. - set pattern "*Failover occurred in migration source. Update importing source for slot 0 to node $R3_nodeid () in shard $R3_shardid*" + set pattern "*Failover occurred in migration source. Update importing source for slot 0 to node $R3_nodeid * in shard $R3_shardid*" verify_log_message -1 $pattern $loglines1 - set pattern "*Failover occurred in migration target. Slot 5462 is now being migrated to node $R3_nodeid () in shard $R3_shardid*" + set pattern "*Failover occurred in migration target. Slot 5462 is now being migrated to node $R3_nodeid * in shard $R3_shardid*" verify_log_message -1 $pattern $loglines1 # Both slots are move to R3. set R0_slots 5462 - set pattern "*A failover occurred in shard $R3_shardid; node $R0_nodeid () lost $R0_slots slot(s) and failed over to node $R3_nodeid*" + set pattern "*A failover occurred in shard $R3_shardid; node $R0_nodeid * lost $R0_slots slot(s) and failed over to node $R3_nodeid*" verify_log_message -1 $pattern $loglines1 verify_log_message -2 $pattern $loglines2 # Both importing slots and migrating slots are move to R3. - set pattern "*A failover occurred in migration source. Update importing source of 1 slot(s) to node $R3_nodeid () in shard $R3_shardid*" + set pattern "*A failover occurred in migration source. Update importing source of 1 slot(s) to node $R3_nodeid * in shard $R3_shardid*" verify_log_message -1 $pattern $loglines1 - set pattern "*A failover occurred in migration target. Update migrating target of 1 slot(s) to node $R3_nodeid () in shard $R3_shardid*" + set pattern "*A failover occurred in migration target. Update migrating target of 1 slot(s) to node $R3_nodeid * in shard $R3_shardid*" verify_log_message -1 $pattern $loglines1 R 1 debug disable-cluster-reconnection 0 From 85b13c7abe71e25ec5314de9bc6425f12308aa19 Mon Sep 17 00:00:00 2001 From: Zhijun Date: Fri, 7 Nov 2025 15:28:41 +0800 Subject: [PATCH 3/8] Clean up Signed-off-by: Zhijun --- tests/support/server.tcl | 4 ---- tests/test_helper.tcl | 1 - 2 files changed, 5 deletions(-) diff --git a/tests/support/server.tcl b/tests/support/server.tcl index f00876565a..cbe973c60b 100644 --- a/tests/support/server.tcl +++ b/tests/support/server.tcl @@ -464,7 +464,6 @@ proc start_server {options {code undefined}} { # Wait for the server to be ready and check for server liveness/client connectivity before starting the test. set wait_ready true - puts "~~running start_server with options: $options" # parse options foreach {option value} $options { switch $option { @@ -567,7 +566,6 @@ proc start_server {options {code undefined}} { } } - # use a different directory every time a server is started dict set config dir [tmpdir server] @@ -701,8 +699,6 @@ proc start_server {options {code undefined}} { dict set srv "pport" $pport } - puts "~~! config $config" - puts "~~! srv $srv" # if a block of code is supplied, we wait for the server to become # available, create a client object and kill the server afterwards if {$code ne "undefined"} { diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 91295ba21d..509e2a96ec 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -1043,7 +1043,6 @@ if {$::client} { set ::numclients 1 } - puts "running test_server_main" if {[catch { test_server_main } err]} { if {[string length $err] > 0} { # only display error when not generated by the test suite From 904f684f94b40982217cab45e56b8b3c45d21328 Mon Sep 17 00:00:00 2001 From: Zhijun Date: Sat, 15 Nov 2025 17:30:10 +0800 Subject: [PATCH 4/8] Use connAddr instead Signed-off-by: Zhijun --- .gitignore | 1 + src/connection.h | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index d85087c459..44c3e4f6c2 100644 --- a/.gitignore +++ b/.gitignore @@ -54,3 +54,4 @@ build-debug/ build-release/ cmake-build-debug/ cmake-build-release/ +ignore/ \ No newline at end of file diff --git a/src/connection.h b/src/connection.h index 9f1e8b33a1..4f8fac88d6 100644 --- a/src/connection.h +++ b/src/connection.h @@ -159,8 +159,6 @@ struct connection { ConnectionState state; int last_errno; int fd; - char client_ip[NET_IP_STR_LEN]; - int client_port; short int flags; short int refs; unsigned short int iovcnt; From e442181afd842db88c254b6b44a5b963da2d8540 Mon Sep 17 00:00:00 2001 From: Zhijun Date: Tue, 18 Nov 2025 11:23:15 +0800 Subject: [PATCH 5/8] Replace connAddr with connAddrPeerName; Wrap logging with verbosity check Signed-off-by: Zhijun --- src/cluster_legacy.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index b7df963b4a..ec0ccd188c 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -1708,6 +1708,17 @@ void setClusterNodeToInboundClusterLink(clusterNode *node, clusterLink *link) { serverAssert(!node->inbound_link); node->inbound_link = link; link->node = node; + if (server.verbosity <= LL_VERBOSE) { + char ip[NET_IP_STR_LEN]; + int port; + if (connAddrPeerName(link->conn, ip, sizeof(ip), &port) != -1) { + serverLog(LL_VERBOSE, "Bound cluster node %.40s (%s) to connection of client %s:%d", + node->name, node->human_nodename, ip, port); + } else { + serverLog(LL_VERBOSE, "Error resolving the inbound connection address of node %.40s (%s)", + node->name, node->human_nodename); + } + } } static void clusterConnAcceptHandler(connection *conn) { @@ -3833,11 +3844,23 @@ int clusterProcessPacket(clusterLink *link) { clusterSendPing(link, CLUSTERMSG_TYPE_PONG); } + if (server.verbosity <= LL_DEBUG) { + char ip[NET_IP_STR_LEN]; + int port; + if (connAddrPeerName(link->conn, ip, sizeof(ip), &port) != -1) { + serverLog(LL_DEBUG, "%s packet received from: %.40s (%s) from client: %s:%d", + clusterGetMessageTypeString(type), + link->node ? link->node->name : "NULL", + link->node ? link->node->human_nodename : "", + ip, port); + } else { + serverLog(LL_DEBUG, "Error resolving the address of packet sender %.40s (%s)", + link->node ? link->node->name : "NULL", + link->node ? link->node->human_nodename : ""); + } + } /* PING, PONG, MEET: process config information. */ if (type == CLUSTERMSG_TYPE_PING || type == CLUSTERMSG_TYPE_PONG || type == CLUSTERMSG_TYPE_MEET) { - serverLog(LL_DEBUG, "%s packet received: %.40s", clusterGetMessageTypeString(type), - clusterLinkGetNodeName(link)); - if (sender && nodeInMeetState(sender)) { /* Once we get a response for MEET from the sender, we can stop sending more MEET. */ sender->flags &= ~CLUSTER_NODE_MEET; From 018365bf88d914d8a62a86c14a48201acbae61a6 Mon Sep 17 00:00:00 2001 From: Zhijun Date: Tue, 18 Nov 2025 11:41:32 +0800 Subject: [PATCH 6/8] Resolve conflict leftovers Signed-off-by: Zhijun --- src/cluster_legacy.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index ec0ccd188c..a1bdbf9a58 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3850,13 +3850,11 @@ int clusterProcessPacket(clusterLink *link) { if (connAddrPeerName(link->conn, ip, sizeof(ip), &port) != -1) { serverLog(LL_DEBUG, "%s packet received from: %.40s (%s) from client: %s:%d", clusterGetMessageTypeString(type), - link->node ? link->node->name : "NULL", - link->node ? link->node->human_nodename : "", + clusterLinkGetNodeName(link), clusterLinkGetHumanNodeName(link), ip, port); } else { serverLog(LL_DEBUG, "Error resolving the address of packet sender %.40s (%s)", - link->node ? link->node->name : "NULL", - link->node ? link->node->human_nodename : ""); + clusterLinkGetNodeName(link), clusterLinkGetHumanNodeName(link)); } } /* PING, PONG, MEET: process config information. */ From 33962889112a90942f14115ec8819c8af8e89fa2 Mon Sep 17 00:00:00 2001 From: Zhijun Date: Wed, 19 Nov 2025 08:18:55 +0800 Subject: [PATCH 7/8] Remove ignore dir Signed-off-by: Zhijun --- .gitignore | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 44c3e4f6c2..98711fa11a 100644 --- a/.gitignore +++ b/.gitignore @@ -53,5 +53,4 @@ build/ build-debug/ build-release/ cmake-build-debug/ -cmake-build-release/ -ignore/ \ No newline at end of file +cmake-build-release/ \ No newline at end of file From 5e9d84bbfa96697290352ca088a36da38e92b33a Mon Sep 17 00:00:00 2001 From: Zhijun Date: Tue, 25 Nov 2025 21:39:57 +0800 Subject: [PATCH 8/8] Recover gitignore Signed-off-by: Zhijun --- .gitignore | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 98711fa11a..d85087c459 100644 --- a/.gitignore +++ b/.gitignore @@ -53,4 +53,4 @@ build/ build-debug/ build-release/ cmake-build-debug/ -cmake-build-release/ \ No newline at end of file +cmake-build-release/