@@ -323,7 +323,9 @@ def run_detailed_checks(host):
323
323
result = host .run ("sudo -u postgres env | grep POSTGRES" )
324
324
logger .info (f"postgres environment:\n { result .stdout } \n { result .stderr } " )
325
325
326
- def is_healthy (host , instance_ip , ssh_identity_file ) -> bool :
326
+ def is_healthy (host , instance_ip , ssh_identity_file ) -> tuple [bool , dict ]:
327
+ service_status = {} # Track status of each service
328
+
327
329
health_checks = [
328
330
(
329
331
"postgres" ,
@@ -358,10 +360,9 @@ def is_healthy(host, instance_ip, ssh_identity_file) -> bool:
358
360
if service == "postgres" :
359
361
pg_isready = check (host )
360
362
361
- # Always read and log the PostgreSQL logs first
363
+ # Always read and log the PostgreSQL logs
362
364
logger .warning ("PostgreSQL status check:" )
363
365
try :
364
- # Read both .log and .csv files
365
366
log_files = [
366
367
"/var/log/postgresql/*.log" ,
367
368
"/var/log/postgresql/*.csv"
@@ -379,37 +380,48 @@ def is_healthy(host, instance_ip, ssh_identity_file) -> bool:
379
380
except Exception as e :
380
381
logger .error (f"Error reading PostgreSQL logs: { str (e )} " )
381
382
382
- # Then check the status and return
383
- if not pg_isready .failed :
384
- continue
385
- # Wait before next attempt
386
- sleep (5 )
387
- return False
383
+ service_status [service ] = not pg_isready .failed
384
+
388
385
else :
389
386
cmd = check (host )
390
- if cmd .failed is True :
387
+ service_status [service ] = not cmd .failed
388
+ if cmd .failed :
391
389
logger .warning (f"{ service } not ready" )
392
390
logger .error (f"{ service } command failed with rc={ cmd .rc } " )
393
391
logger .error (f"{ service } stdout: { cmd .stdout } " )
394
392
logger .error (f"{ service } stderr: { cmd .stderr } " )
395
- return False
393
+
396
394
except Exception as e :
397
- logger .warning (
398
- f"Connection failed during { service } check, attempting reconnect..."
399
- )
395
+ logger .warning (f"Connection failed during { service } check, attempting reconnect..." )
400
396
logger .error (f"Error details: { str (e )} " )
401
397
host = get_ssh_connection (instance_ip , ssh_identity_file )
402
- return False
398
+ service_status [service ] = False
399
+
400
+ # Log overall status of all services
401
+ logger .info ("Service health status:" )
402
+ for service , healthy in service_status .items ():
403
+ logger .info (f"{ service } : { 'healthy' if healthy else 'unhealthy' } " )
403
404
404
- return True
405
+ # If any service is unhealthy, wait and return False with status
406
+ if not all (service_status .values ()):
407
+ if service_status .get ("postgres" , False ): # If postgres is healthy but others aren't
408
+ sleep (5 ) # Only wait if postgres is up but other services aren't
409
+ logger .warning ("Some services are not healthy, will retry..." )
410
+ return False , service_status
411
+
412
+ logger .info ("All services are healthy, proceeding to tests..." )
413
+ return True , service_status
405
414
406
415
while True :
407
- if is_healthy (
416
+ healthy , status = is_healthy (
408
417
host = host ,
409
418
instance_ip = instance .public_ip_address ,
410
419
ssh_identity_file = temp_key .get_priv_key_file (),
411
- ):
420
+ )
421
+ if healthy :
422
+ logger .info ("Health check passed, starting tests..." )
412
423
break
424
+ logger .warning (f"Health check failed, service status: { status } " )
413
425
sleep (1 )
414
426
415
427
# return a testinfra connection to the instance
0 commit comments