@@ -492,6 +492,11 @@ def start_cluster(
492492 else :
493493 logging .info ("All TLS certificates successfully copied and verified" )
494494
495+ # Run TLS diagnostics if TLS is enabled
496+ if tls :
497+ self .diagnose_tls_issue (endpoints )
498+ self .test_cluster_discovery_tls (endpoints )
499+
495500 return endpoints
496501 else :
497502 logging .error ("Could not parse cluster endpoints from output" )
@@ -634,6 +639,120 @@ def _copy_file_to_remote(self, local_path: str, remote_path: str) -> bool:
634639 return False
635640
636641
def diagnose_tls_issue(self, endpoints: List[str]) -> None:
    """Log a best-effort TLS connectivity report for the cluster endpoints.

    Every check is executed through ``self._execute_remote_command`` and the
    outcome is only reported via ``logging``; nothing is returned or raised
    for a failed check. The checks are:

    1. ``CLUSTER NODES`` against the first usable endpoint, logging the
       address each node advertises.
    2. An ``openssl s_client`` handshake against every endpoint, verified
       against ``utils/tls_crts/ca.crt`` under ``self.remote_repo_path``.
    3. The Subject Alternative Name entries of ``tls_crts/server.crt``.
    4. A ``PING`` through ``valkey-cli`` to every endpoint, in order.

    Args:
        endpoints: ``host:port`` strings. The port is taken from the text
            after the last ``:`` so IPv6 hosts (optionally written as
            ``[addr]:port``) are accepted. Entries that cannot be parsed are
            logged and skipped instead of aborting the diagnostics.
    """
    # Local import so this block does not depend on the file's import list.
    import shlex

    logging.info("=== TLS DIAGNOSTICS ===")

    utils_dir = f"{self.remote_repo_path}/utils"
    cli_prefix = f"cd {utils_dir} && export PATH={self.engine_path}/src:$PATH && "
    cli_tls_args = (
        "--tls --cert tls_crts/server.crt "
        "--key tls_crts/server.key --cacert tls_crts/ca.crt"
    )

    # Parse every endpoint once, up front. rpartition splits on the LAST
    # colon, so a plain split(':') unpacking error on IPv6 or malformed
    # entries can no longer crash the caller.
    targets = []
    for endpoint in endpoints:
        host, sep, port = endpoint.rpartition(':')
        if not sep or not host or not port.isdigit():
            logging.warning(f"Skipping malformed endpoint (expected host:port): {endpoint}")
            continue
        if host.startswith('[') and host.endswith(']'):
            host = host[1:-1]  # valkey-cli -h expects the bare address
        # openssl -connect needs IPv6 literals wrapped in brackets.
        connect_host = f"[{host}]" if ':' in host else host
        targets.append((endpoint, host, connect_host, port))

    # 1. Check cluster topology and what nodes advertise
    if targets:
        _, host, _, port = targets[0]

        logging.info("1. Checking cluster topology...")
        cluster_nodes_cmd = (
            f"{cli_prefix}echo 'CLUSTER NODES' | "
            f"valkey-cli -h {shlex.quote(host)} -p {port} {cli_tls_args}"
        )
        returncode, stdout, stderr = self._execute_remote_command(cluster_nodes_cmd, timeout=15)

        if returncode == 0:
            logging.info("Cluster nodes output:")
            for line in stdout.splitlines():
                # Node line layout: node_id ip:port@cluster_port flags ...
                parts = line.split()
                if len(parts) >= 2:
                    node_addr = parts[1].split('@')[0]  # Remove cluster port
                    logging.info(f"  Node advertises: {node_addr}")
        else:
            logging.error(f"Failed to get cluster nodes: {stderr}")

    # 2. Test TLS handshake to each endpoint
    logging.info("2. Testing TLS handshake to each endpoint...")
    for i, (endpoint, host, connect_host, port) in enumerate(targets):
        logging.info(f"Testing endpoint {i + 1}/{len(targets)}: {endpoint}")

        connect_target = shlex.quote(f"{connect_host}:{port}")
        # stderr is folded into stdout (2>&1) so one stream holds everything.
        openssl_cmd = (
            f"echo 'QUIT' | openssl s_client -connect {connect_target} "
            f"-servername {shlex.quote(host)} -verify_return_error "
            f"-CAfile {utils_dir}/tls_crts/ca.crt 2>&1"
        )
        returncode, stdout, stderr = self._execute_remote_command(openssl_cmd, timeout=10)

        if "Verify return code: 0 (ok)" in stdout:
            logging.info(f"  ✓ TLS handshake OK for {endpoint}")
            continue

        logging.warning(f"  ✗ TLS handshake FAILED for {endpoint}")
        output_lines = [ln.strip() for ln in stdout.splitlines() if ln.strip()]
        details = [
            ln for ln in output_lines
            if 'verify error' in ln.lower() or 'certificate verify failed' in ln.lower()
        ]
        if not details and output_lines:
            # Not a verification problem (e.g. the connection was refused):
            # surface the first output line so the failure has a reason.
            details = output_lines[:1]
        for detail in details:
            logging.warning(f"    Error: {detail}")

    # 3. Check certificate details
    logging.info("3. Checking certificate SAN entries...")
    cert_cmd = (
        f"cd {utils_dir} && openssl x509 -in tls_crts/server.crt -text -noout "
        f"| grep -A1 'Subject Alternative Name'"
    )
    returncode, stdout, stderr = self._execute_remote_command(cert_cmd, timeout=5)

    if returncode == 0 and stdout.strip():
        logging.info(f"Certificate SAN: {stdout.strip()}")
    else:
        logging.warning("Could not extract certificate SAN entries")

    # 4. Test connection order dependency
    logging.info("4. Testing connection order dependency...")
    for i, (endpoint, host, _, port) in enumerate(targets):
        # `timeout` must wrap valkey-cli itself; placed before `echo` it only
        # limits the echo and leaves the client free to hang.
        ping_cmd = (
            f"{cli_prefix}echo 'PING' | timeout 5 "
            f"valkey-cli -h {shlex.quote(host)} -p {port} {cli_tls_args}"
        )
        returncode, stdout, stderr = self._execute_remote_command(ping_cmd, timeout=10)

        if returncode == 0 and 'PONG' in stdout:
            logging.info(f"  Connection {i + 1}: {endpoint} - OK")
        else:
            # Error replies arrive on stdout, so fall back to it when stderr is empty.
            reason = stderr.strip() or stdout.strip()
            logging.warning(f"  Connection {i + 1}: {endpoint} - FAILED: {reason}")

    logging.info("=== END TLS DIAGNOSTICS ===")
709+
def test_cluster_discovery_tls(self, endpoints: List[str]) -> None:
    """Compare the addresses the cluster advertises with the given endpoints.

    Runs ``CLUSTER NODES`` over TLS against the first endpoint (through
    ``self._execute_remote_command``), collects the ``ip:port`` part of each
    node line and logs whether that set equals ``set(endpoints)``. The
    comparison is an exact string match, so an endpoint given by host name
    and advertised by IP is reported as a difference.

    Results are only logged; nothing is returned, and a malformed first
    endpoint or a failed command is reported rather than raised.

    Args:
        endpoints: ``host:port`` strings. An empty list makes this a no-op.
            The port is the text after the last ``:``; a bracketed IPv6
            host (``[addr]:port``) is accepted.
    """
    if not endpoints:
        return

    # Local import so this block does not depend on the file's import list.
    import shlex

    logging.info("=== CLUSTER DISCOVERY TLS TEST ===")

    # Connect to first node and get full cluster topology
    first_endpoint = endpoints[0]
    # rpartition splits on the LAST colon; a plain split(':') unpacking
    # raises ValueError for IPv6 or malformed endpoints.
    host, sep, port = first_endpoint.rpartition(':')
    if not sep or not host or not port.isdigit():
        logging.error(
            f"Failed to get cluster topology: malformed endpoint "
            f"(expected host:port): {first_endpoint}"
        )
        logging.info("=== END CLUSTER DISCOVERY TLS TEST ===")
        return
    if host.startswith('[') and host.endswith(']'):
        host = host[1:-1]  # valkey-cli -h expects the bare address

    cluster_nodes_cmd = (
        f"cd {self.remote_repo_path}/utils && export PATH={self.engine_path}/src:$PATH && "
        f"echo 'CLUSTER NODES' | valkey-cli -h {shlex.quote(host)} -p {port} "
        f"--tls --cert tls_crts/server.crt --key tls_crts/server.key --cacert tls_crts/ca.crt"
    )
    returncode, stdout, stderr = self._execute_remote_command(cluster_nodes_cmd, timeout=15)

    if returncode == 0:
        discovered_nodes = []
        for line in stdout.splitlines():
            parts = line.split()
            if len(parts) < 2:
                continue
            node_addr = parts[1].split('@')[0]  # Remove cluster port
            # A node address always has the form ip:port; anything else is
            # other output (e.g. an error message), not a node line.
            if ':' in node_addr:
                discovered_nodes.append(node_addr)

        logging.info(f"Initial endpoints: {endpoints}")
        logging.info(f"Discovered nodes: {discovered_nodes}")

        if not discovered_nodes:
            logging.warning(f"No node addresses found in CLUSTER NODES output: {stdout.strip()}")

        # Check if discovered nodes match initial endpoints
        initial_set = set(endpoints)
        discovered_set = set(discovered_nodes)

        if initial_set == discovered_set:
            logging.info("✓ Discovered nodes match initial endpoints")
        else:
            logging.warning("✗ Discovered nodes differ from initial endpoints")
            # Sorted so the log output is stable from run to run.
            only_initial = sorted(initial_set - discovered_set)
            only_discovered = sorted(discovered_set - initial_set)
            if only_initial:
                logging.warning(f"  Only in initial: {only_initial}")
            if only_discovered:
                logging.warning(f"  Only in discovered: {only_discovered}")
    else:
        logging.error(f"Failed to get cluster topology: {stderr}")

    logging.info("=== END CLUSTER DISCOVERY TLS TEST ===")
754+
755+
637756def main ():
638757 logfile = "./cluster_manager.log"
639758 init_logger (logfile )
0 commit comments