@@ -323,7 +323,9 @@ def run_detailed_checks(host):
323323 result = host .run ("sudo -u postgres env | grep POSTGRES" )
324324 logger .info (f"postgres environment:\n { result .stdout } \n { result .stderr } " )
325325
326- def is_healthy (host , instance_ip , ssh_identity_file ) -> bool :
326+ def is_healthy (host , instance_ip , ssh_identity_file ) -> tuple [bool , dict ]:
327+ service_status = {} # Track status of each service
328+
327329 health_checks = [
328330 (
329331 "postgres" ,
@@ -358,10 +360,9 @@ def is_healthy(host, instance_ip, ssh_identity_file) -> bool:
358360 if service == "postgres" :
359361 pg_isready = check (host )
360362
361- # Always read and log the PostgreSQL logs first
363+ # Always read and log the PostgreSQL logs
362364 logger .warning ("PostgreSQL status check:" )
363365 try :
364- # Read both .log and .csv files
365366 log_files = [
366367 "/var/log/postgresql/*.log" ,
367368 "/var/log/postgresql/*.csv"
@@ -379,37 +380,48 @@ def is_healthy(host, instance_ip, ssh_identity_file) -> bool:
379380 except Exception as e :
380381 logger .error (f"Error reading PostgreSQL logs: { str (e )} " )
381382
382- # Then check the status and return
383- if not pg_isready .failed :
384- continue
385- # Wait before next attempt
386- sleep (5 )
387- return False
383+ service_status [service ] = not pg_isready .failed
384+
388385 else :
389386 cmd = check (host )
390- if cmd .failed is True :
387+ service_status [service ] = not cmd .failed
388+ if cmd .failed :
391389 logger .warning (f"{ service } not ready" )
392390 logger .error (f"{ service } command failed with rc={ cmd .rc } " )
393391 logger .error (f"{ service } stdout: { cmd .stdout } " )
394392 logger .error (f"{ service } stderr: { cmd .stderr } " )
395- return False
393+
396394 except Exception as e :
397- logger .warning (
398- f"Connection failed during { service } check, attempting reconnect..."
399- )
395+ logger .warning (f"Connection failed during { service } check, attempting reconnect..." )
400396 logger .error (f"Error details: { str (e )} " )
401397 host = get_ssh_connection (instance_ip , ssh_identity_file )
402- return False
398+ service_status [service ] = False
399+
400+ # Log overall status of all services
401+ logger .info ("Service health status:" )
402+ for service , healthy in service_status .items ():
403+ logger .info (f"{ service } : { 'healthy' if healthy else 'unhealthy' } " )
403404
404- return True
405+ # If any service is unhealthy, return False with the status (the 5s wait below applies only when postgres is up)
406+ if not all (service_status .values ()):
407+ if service_status .get ("postgres" , False ): # If postgres is healthy but others aren't
408+ sleep (5 ) # Only wait if postgres is up but other services aren't
409+ logger .warning ("Some services are not healthy, will retry..." )
410+ return False , service_status
411+
412+ logger .info ("All services are healthy, proceeding to tests..." )
413+ return True , service_status
405414
406415 while True :
407- if is_healthy (
416+ healthy , status = is_healthy (
408417 host = host ,
409418 instance_ip = instance .public_ip_address ,
410419 ssh_identity_file = temp_key .get_priv_key_file (),
411- ):
420+ )
421+ if healthy :
422+ logger .info ("Health check passed, starting tests..." )
412423 break
424+ logger .warning (f"Health check failed, service status: { status } " )
413425 sleep (1 )
414426
415427 # return a testinfra connection to the instance
0 commit comments