Skip to content

Commit ff67b36

Browse files
committed
Add comprehensive logging for debugging polling issues
Added detailed logging to diagnose two issues:

1. **Unknown status detection**: When the deployment status shows as "unknown", we now log the actual stage value received from HuggingFace and list all known stages. This helps identify whether HuggingFace has introduced new stages that we don't recognize yet.
2. **Health check failures**: Override `_check_deployment_health` with better logging to show:
   - The exact health check URL being tested
   - Whether the health check passed or failed
   - Specific error messages when the health check fails
   - Status codes returned from the endpoint

These logs will help diagnose why deployments continue polling even when the Space appears to be running and the health endpoint is working. The logs now show stage transitions and health check results at INFO level for easier debugging without requiring DEBUG logging.
1 parent 0584dcf commit ff67b36

File tree

1 file changed

+76
-1
lines changed

1 file changed

+76
-1
lines changed

src/zenml/integrations/huggingface/deployers/huggingface_deployer.py

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -548,6 +548,11 @@ def do_get_deployment_state(
548548
status = DeploymentStatus.ABSENT
549549
else:
550550
# Unknown/future stages
551+
logger.warning(
552+
f"Space {space_id} has unrecognized stage: '{runtime.stage}'. "
553+
f"This might be a new HuggingFace stage. Treating as UNKNOWN. "
554+
f"Known stages: {[s.value for s in SpaceStage]}"
555+
)
551556
status = DeploymentStatus.UNKNOWN
552557

553558
# Get deployment URL from Space domains (only when fully ready)
@@ -560,7 +565,8 @@ def do_get_deployment_state(
560565
):
561566
url = f"https://{domains[0]['domain']}"
562567
logger.info(
563-
f"Space {space_id} deployment URL: {url} (status={status})"
568+
f"Space {space_id} deployment URL: {url} "
569+
f"(status={status}, stage={runtime.stage}, domain_stage={domain_stage})"
564570
)
565571

566572
return DeploymentOperationalState(
@@ -578,6 +584,75 @@ def do_get_deployment_state(
578584
f"Space {space_id} not found: {e}"
579585
) from e
580586

587+
def _check_deployment_health(
    self,
    deployment: DeploymentResponse,
) -> bool:
    """Probe the deployment's health endpoint and report the outcome.

    Overrides the base method to log each step of the probe, so that
    polling problems can be diagnosed from the logs.

    The probe is best-effort: any exception raised while contacting the
    endpoint is logged as a warning and reported as unhealthy rather than
    propagated to the caller.

    Args:
        deployment: The deployment to check.

    Returns:
        True if the health endpoint is disabled in the deployment settings
        or answers with HTTP 200. False if the deployment has no URL, the
        endpoint answers with any other status code, or the request fails.
    """
    import requests

    from zenml.enums import DeploymentDefaultEndpoints

    assert deployment.snapshot, "Deployment snapshot not found"

    cfg = deployment.snapshot.pipeline_configuration.deployment_settings

    # With the health endpoint switched off there is nothing to probe, so
    # the deployment is treated as healthy.
    if DeploymentDefaultEndpoints.HEALTH not in cfg.include_default_endpoints:
        logger.debug(
            f"Health check disabled for deployment {deployment.name}"
        )
        return True

    # Without a URL there is nothing to connect to.
    if not deployment.url:
        logger.debug(
            f"No URL available for deployment {deployment.name}, health check fails"
        )
        return False

    # Build the endpoint as <url><root path><api path><health path>.
    probe_path = (
        f"{cfg.root_url_path}{cfg.api_url_path}{cfg.health_url_path}"
    )
    probe_url = f"{deployment.url}{probe_path}"

    logger.info(
        f"Checking health endpoint for deployment {deployment.name}: {probe_url}"
    )

    # The logging calls stay inside the try block so that any failure
    # while handling the response is also reported as "not reachable".
    try:
        reply = requests.get(probe_url, timeout=3)
        if reply.status_code != 200:
            logger.warning(
                f"Health check endpoint for deployment '{deployment.name}' "
                f"at '{probe_url}' returned status code "
                f"{reply.status_code}"
            )
            return False

        logger.info(
            f"Health check passed for deployment {deployment.name}"
        )
        return True
    except Exception as exc:
        logger.warning(
            f"Health check endpoint for deployment '{deployment.name}' "
            f"at '{probe_url}' is not reachable: {exc}"
        )
        return False
655+
581656
def do_get_deployment_state_logs(
582657
self,
583658
deployment: DeploymentResponse,

0 commit comments

Comments
 (0)