test: restructure health checks to probe every service each pass and avoid a race

samrose · samrose · commit 164752bd8be3 · 2025-04-29T12:15:43.000-04:00
diff --git a/testinfra/test_ami_nix.py b/testinfra/test_ami_nix.py
@@ -323,7 +323,9 @@ def run_detailed_checks(host):
         result = host.run("sudo -u postgres env | grep POSTGRES")
         logger.info(f"postgres environment:\n{result.stdout}\n{result.stderr}")
 
-    def is_healthy(host, instance_ip, ssh_identity_file) -> bool:
+    def is_healthy(host, instance_ip, ssh_identity_file) -> tuple[bool, dict]:
+        service_status = {}  # Track status of each service
+
         health_checks = [
             (
                 "postgres",
@@ -358,10 +360,9 @@ def is_healthy(host, instance_ip, ssh_identity_file) -> bool:
                 if service == "postgres":
                     pg_isready = check(host)
                     
-                    # Always read and log the PostgreSQL logs first
+                    # Always read and log the PostgreSQL logs
                     logger.warning("PostgreSQL status check:")
                     try:
-                        # Read both .log and .csv files
                         log_files = [
                             "/var/log/postgresql/*.log",
                             "/var/log/postgresql/*.csv"
@@ -379,37 +380,48 @@ def is_healthy(host, instance_ip, ssh_identity_file) -> bool:
                     except Exception as e:
                         logger.error(f"Error reading PostgreSQL logs: {str(e)}")
 
-                    # Then check the status and return
-                    if not pg_isready.failed:
-                        continue
-                    # Wait before next attempt
-                    sleep(5)
-                    return False
+                    service_status[service] = not pg_isready.failed
+
                 else:
                     cmd = check(host)
-                    if cmd.failed is True:
+                    service_status[service] = not cmd.failed
+                    if cmd.failed:
                         logger.warning(f"{service} not ready")
                         logger.error(f"{service} command failed with rc={cmd.rc}")
                         logger.error(f"{service} stdout: {cmd.stdout}")
                         logger.error(f"{service} stderr: {cmd.stderr}")
-                        return False
+
             except Exception as e:
-                logger.warning(
-                    f"Connection failed during {service} check, attempting reconnect..."
-                )
+                logger.warning(f"Connection failed during {service} check, attempting reconnect...")
                 logger.error(f"Error details: {str(e)}")
                 host = get_ssh_connection(instance_ip, ssh_identity_file)
-                return False
+                service_status[service] = False
+
+        # Log overall status of all services
+        logger.info("Service health status:")
+        for service, healthy in service_status.items():
+            logger.info(f"{service}: {'healthy' if healthy else 'unhealthy'}")
 
-        return True
+        # If any service is unhealthy, return False with status (waiting first only when postgres is up)
+        if not all(service_status.values()):
+            if service_status.get("postgres", False):  # If postgres is healthy but others aren't
+                sleep(5)  # Only wait if postgres is up but other services aren't
+            logger.warning("Some services are not healthy, will retry...")
+            return False, service_status
+
+        logger.info("All services are healthy, proceeding to tests...")
+        return True, service_status
 
     while True:
-        if is_healthy(
+        healthy, status = is_healthy(
             host=host,
             instance_ip=instance.public_ip_address,
             ssh_identity_file=temp_key.get_priv_key_file(),
-        ):
+        )
+        if healthy:
+            logger.info("Health check passed, starting tests...")
             break
+        logger.warning(f"Health check failed, service status: {status}")
         sleep(1)
 
     # return a testinfra connection to the instance