Skip to content

Commit 2c4e354

Browse files
committed
Add TLS diagnostics
Signed-off-by: James Duong <[email protected]>
1 parent dab8235 commit 2c4e354

File tree

1 file changed

+119
-0
lines changed

1 file changed

+119
-0
lines changed

utils/remote_cluster_manager.py

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,11 @@ def start_cluster(
492492
else:
493493
logging.info("All TLS certificates successfully copied and verified")
494494

495+
# Run TLS diagnostics if TLS is enabled
496+
if tls:
497+
self.diagnose_tls_issue(endpoints)
498+
self.test_cluster_discovery_tls(endpoints)
499+
495500
return endpoints
496501
else:
497502
logging.error("Could not parse cluster endpoints from output")
@@ -634,6 +639,120 @@ def _copy_file_to_remote(self, local_path: str, remote_path: str) -> bool:
634639
return False
635640

636641

642+
def diagnose_tls_issue(self, endpoints: List[str]) -> None:
    """Log best-effort TLS connectivity diagnostics for the cluster endpoints.

    Every check is executed through ``self._execute_remote_command`` and its
    outcome is only logged; nothing is returned. The checks are:

    1. ``CLUSTER NODES`` against the first usable endpoint, logging the
       address each node advertises.
    2. An ``openssl s_client`` handshake against every endpoint, verified
       against ``tls_crts/ca.crt``.
    3. The Subject Alternative Name entries of ``tls_crts/server.crt``.
    4. A ``PING`` through ``valkey-cli`` against every endpoint, in order.

    Args:
        endpoints: Endpoints as ``host:port`` strings. Each entry is split
            on its last ``:`` so hosts that themselves contain colons do
            not raise; entries missing a host or a port are logged and
            skipped instead of aborting the caller.
    """
    # Local import: only needed to quote values placed into shell commands.
    import shlex

    logging.info("=== TLS DIAGNOSTICS ===")

    utils_dir = f"{self.remote_repo_path}/utils"
    cli_prefix = f"cd {utils_dir} && export PATH={self.engine_path}/src:$PATH && "
    tls_flags = (
        "--tls --cert tls_crts/server.crt --key tls_crts/server.key "
        "--cacert tls_crts/ca.crt"
    )

    # Parse once up front. Host and port are shell-quoted because they are
    # interpolated into command strings executed by a remote shell.
    targets = []
    for endpoint in endpoints:
        host, sep, port = endpoint.rpartition(':')
        if not (sep and host and port):
            logging.warning(f"Skipping malformed endpoint (expected host:port): {endpoint}")
            continue
        targets.append((endpoint, shlex.quote(host), shlex.quote(port)))

    # 1. Check cluster topology and what nodes advertise
    if targets:
        _, host, port = targets[0]

        logging.info("1. Checking cluster topology...")
        cluster_nodes_cmd = (
            f"{cli_prefix}echo 'CLUSTER NODES' | "
            f"valkey-cli -h {host} -p {port} {tls_flags}"
        )
        returncode, stdout, stderr = self._execute_remote_command(cluster_nodes_cmd, timeout=15)

        if returncode == 0:
            logging.info("Cluster nodes output:")
            for line in stdout.strip().split('\n'):
                # Node line format: node_id ip:port@cluster_port flags master/slave ...
                parts = line.split()
                if len(parts) >= 2:
                    node_addr = parts[1].split('@')[0]  # Remove cluster port
                    logging.info(f"  Node advertises: {node_addr}")
        else:
            logging.error(f"Failed to get cluster nodes: {stderr}")

    # 2. Test TLS handshake to each endpoint
    logging.info("2. Testing TLS handshake to each endpoint...")
    for i, (endpoint, host, port) in enumerate(targets):
        logging.info(f"Testing endpoint {i+1}/{len(targets)}: {endpoint}")

        # stderr is merged into stdout (2>&1), so only stdout is inspected.
        openssl_cmd = (
            f"echo 'QUIT' | openssl s_client -connect {host}:{port} "
            f"-servername {host} -verify_return_error "
            f"-CAfile {utils_dir}/tls_crts/ca.crt 2>&1"
        )
        _, stdout, _ = self._execute_remote_command(openssl_cmd, timeout=10)

        if "Verify return code: 0 (ok)" in stdout:
            logging.info(f"  ✓ TLS handshake OK for {endpoint}")
            continue

        logging.warning(f"  ✗ TLS handshake FAILED for {endpoint}")
        # Extract relevant error info
        for line in stdout.split('\n'):
            lowered = line.lower()
            if 'verify error' in lowered or 'certificate verify failed' in lowered:
                logging.warning(f"    Error: {line.strip()}")

    # 3. Check certificate details
    logging.info("3. Checking certificate SAN entries...")
    cert_cmd = (
        f"cd {utils_dir} && openssl x509 -in tls_crts/server.crt -text -noout "
        f"| grep -A1 'Subject Alternative Name'"
    )
    returncode, stdout, stderr = self._execute_remote_command(cert_cmd, timeout=5)

    if returncode == 0 and stdout.strip():
        logging.info(f"Certificate SAN: {stdout.strip()}")
    else:
        logging.warning("Could not extract certificate SAN entries")

    # 4. Test connection order dependency
    logging.info("4. Testing connection order dependency...")
    for i, (endpoint, host, port) in enumerate(targets):
        # `timeout` must wrap valkey-cli (the process that can hang on a
        # broken TLS connection), not the `echo` feeding its stdin.
        ping_cmd = (
            f"{cli_prefix}echo 'PING' | "
            f"timeout 5 valkey-cli -h {host} -p {port} {tls_flags}"
        )
        returncode, stdout, stderr = self._execute_remote_command(ping_cmd, timeout=10)

        if returncode == 0 and 'PONG' in stdout:
            logging.info(f"  Connection {i+1}: {endpoint} - OK")
        else:
            logging.warning(f"  Connection {i+1}: {endpoint} - FAILED: {stderr}")

    logging.info("=== END TLS DIAGNOSTICS ===")
709+
710+
def test_cluster_discovery_tls(self, endpoints: List[str]) -> None:
    """Log whether ``CLUSTER NODES`` advertises the same addresses as ``endpoints``.

    Runs ``CLUSTER NODES`` over TLS against the first endpoint through
    ``self._execute_remote_command``, collects the ``ip:port`` part of each
    returned node line, and compares that set with ``endpoints``. Results
    are only logged; nothing is returned.

    Args:
        endpoints: Endpoints as ``host:port`` strings. An empty list is a
            no-op. The first entry is split on its last ``:``; if it has no
            host or no port an error is logged instead of raising, so this
            diagnostic cannot abort the caller.
    """
    if not endpoints:
        return

    # Local import: only needed to quote values placed into the shell command.
    import shlex

    logging.info("=== CLUSTER DISCOVERY TLS TEST ===")

    # Connect to first node and get full cluster topology
    first_endpoint = endpoints[0]
    host, sep, port = first_endpoint.rpartition(':')
    if not (sep and host and port):
        logging.error(
            f"Cannot query cluster topology: malformed endpoint (expected host:port): {first_endpoint}"
        )
        logging.info("=== END CLUSTER DISCOVERY TLS TEST ===")
        return

    # Host and port are shell-quoted because the command string is executed
    # by a remote shell.
    cluster_nodes_cmd = (
        f"cd {self.remote_repo_path}/utils && export PATH={self.engine_path}/src:$PATH && "
        f"echo 'CLUSTER NODES' | valkey-cli -h {shlex.quote(host)} -p {shlex.quote(port)} "
        f"--tls --cert tls_crts/server.crt --key tls_crts/server.key --cacert tls_crts/ca.crt"
    )
    returncode, stdout, stderr = self._execute_remote_command(cluster_nodes_cmd, timeout=15)

    if returncode != 0:
        logging.error(f"Failed to get cluster topology: {stderr}")
        logging.info("=== END CLUSTER DISCOVERY TLS TEST ===")
        return

    # Second whitespace-separated field is ip:port@cluster_port; keep ip:port.
    discovered_nodes = [
        parts[1].split('@')[0]
        for parts in (line.split() for line in stdout.strip().split('\n'))
        if len(parts) >= 2
    ]

    logging.info(f"Initial endpoints: {endpoints}")
    logging.info(f"Discovered nodes: {discovered_nodes}")

    # Check if discovered nodes match initial endpoints
    initial_set = set(endpoints)
    discovered_set = set(discovered_nodes)

    if initial_set == discovered_set:
        logging.info("✓ Discovered nodes match initial endpoints")
    else:
        logging.warning("✗ Discovered nodes differ from initial endpoints")
        only_initial = initial_set - discovered_set
        only_discovered = discovered_set - initial_set
        if only_initial:
            logging.warning(f"  Only in initial: {only_initial}")
        if only_discovered:
            logging.warning(f"  Only in discovered: {only_discovered}")

    logging.info("=== END CLUSTER DISCOVERY TLS TEST ===")
754+
755+
637756
def main():
638757
logfile = "./cluster_manager.log"
639758
init_logger(logfile)

0 commit comments

Comments
 (0)