Skip to content

Commit d895d03

Browse files
committed
Address comprehensive PR review feedback from @stefannica
Implements all requested improvements from second round of code review: 1. Use specific HfHubHTTPError exception instead of broad Exception - Check for 404 status code when space_info fails - Raise DeploymentProvisionError for other HTTP errors - Provides better error messages and debugging 2. Fail early on environment variable failures - Changed from logger.warning to raise DeploymentProvisionError - Deployment will fail immediately if env vars can't be set - Prevents deployment from starting without required configuration 3. Fail early on secret failures - Changed from logger.warning to raise DeploymentProvisionError - Deployment will fail immediately if secrets can't be set - Critical since secrets are needed for ZenML server connection 4. Fix RUNNING_BUILDING state mapping - Moved RUNNING_BUILDING from RUNNING to PENDING status - Only fully running Spaces return RUNNING status - Health endpoint not available during RUNNING_BUILDING 5. Store external state in metadata for debugging - Added runtime.stage to DeploymentOperationalState metadata - Helps with debugging deployment status issues - Provides visibility into HF Space's internal state 6. Use correct exception type in deprovision - Changed from DeploymentNotFoundError to DeploymentDeprovisionError - Signals backend errors vs successful deletion - Updated docstring to document both exception types 7. Document API behavior with clarifying comments - add_space_variable/secret are upsert operations (add or update) - request_space_hardware/storage replace the current tier - Clarifies behavior for redeployments 8. Handle space_id mismatch corner case - Detect when space_id changes (renamed deployment/changed prefix) - Automatically deprovision old Space before creating new one - Prevents orphaned Spaces from accumulating All changes improve error handling, debuggability, and resource cleanup.
1 parent 270f8e2 commit d895d03

File tree

1 file changed

+43
-9
lines changed

1 file changed

+43
-9
lines changed

src/zenml/integrations/huggingface/deployers/huggingface_deployer.py

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -298,12 +298,28 @@ def do_provision_deployment(
298298
space_id = self._get_space_id(deployment)
299299
image = self.get_image(deployment.snapshot)
300300

301+
# Handle space_id mismatch (e.g., renamed deployment or changed prefix)
302+
old_space_id = deployment.deployment_metadata.get("space_id")
303+
if old_space_id and old_space_id != space_id:
304+
logger.info(
305+
f"Space ID changed from {old_space_id} to {space_id}. "
306+
f"Cleaning up old Space..."
307+
)
308+
try:
309+
self.do_deprovision_deployment(deployment, timeout=0)
310+
except Exception as e:
311+
logger.warning(
312+
f"Failed to clean up old Space {old_space_id}: {e}"
313+
)
314+
301315
logger.info(
302316
f"Deploying image {image} to Hugging Face Space. "
303317
"Ensure the image is publicly accessible."
304318
)
305319

306320
try:
321+
from huggingface_hub.utils import HfHubHTTPError
322+
307323
# Create Space if it doesn't exist, or update visibility if needed
308324
try:
309325
space_info = api.space_info(space_id)
@@ -320,7 +336,11 @@ def do_provision_deployment(
320336
private=settings.private,
321337
repo_type="space",
322338
)
323-
except Exception:
339+
except HfHubHTTPError as e:
340+
if e.response.status_code != 404:
341+
raise DeploymentProvisionError(
342+
f"Failed to check Space {space_id}: {e}"
343+
) from e
324344
logger.info(f"Creating new Space: {space_id}")
325345
api.create_repo(
326346
repo_id=space_id,
@@ -366,6 +386,7 @@ def do_provision_deployment(
366386

367387
# Set environment variables using Space variables API
368388
# This is secure - variables are not exposed in the Dockerfile
389+
# Note: add_space_variable is an upsert operation (adds or updates)
369390
logger.info(f"Setting {len(environment)} environment variables...")
370391
for key, value in environment.items():
371392
try:
@@ -375,12 +396,13 @@ def do_provision_deployment(
375396
value=value,
376397
)
377398
except Exception as e:
378-
logger.warning(
399+
raise DeploymentProvisionError(
379400
f"Failed to set environment variable {key}: {e}"
380-
)
401+
) from e
381402

382403
# Set secrets using Space secrets API
383404
# This is secure - secrets are encrypted and not exposed
405+
# Note: add_space_secret is an upsert operation (adds or updates)
384406
logger.info(f"Setting {len(secrets)} secrets...")
385407
for key, value in secrets.items():
386408
try:
@@ -390,9 +412,12 @@ def do_provision_deployment(
390412
value=value,
391413
)
392414
except Exception as e:
393-
logger.warning(f"Failed to set secret {key}: {e}")
415+
raise DeploymentProvisionError(
416+
f"Failed to set secret {key}: {e}"
417+
) from e
394418

395419
# Set hardware if specified (fail if this doesn't work)
420+
# Note: request_space_hardware replaces the current hardware tier
396421
hardware = settings.space_hardware or self.config.space_hardware
397422
if hardware:
398423
from huggingface_hub import SpaceHardware
@@ -416,6 +441,7 @@ def do_provision_deployment(
416441
) from e
417442

418443
# Set storage if specified (fail if this doesn't work)
444+
# Note: request_space_storage replaces the current storage tier
419445
storage = settings.space_storage or self.config.space_storage
420446
if storage:
421447
from huggingface_hub import SpaceStorage
@@ -472,9 +498,13 @@ def do_get_deployment_state(
472498
runtime = api.get_space_runtime(repo_id=space_id)
473499

474500
# Map Space stage to deployment status
475-
if runtime.stage in ["RUNNING", "RUNNING_BUILDING"]:
501+
if runtime.stage == "RUNNING":
476502
status = DeploymentStatus.RUNNING
477-
elif runtime.stage in ["BUILDING", "NO_APP_FILE"]:
503+
elif runtime.stage in [
504+
"BUILDING",
505+
"RUNNING_BUILDING",
506+
"NO_APP_FILE",
507+
]:
478508
status = DeploymentStatus.PENDING
479509
else:
480510
# BUILD_ERROR, RUNTIME_ERROR, STOPPED, etc.
@@ -483,7 +513,10 @@ def do_get_deployment_state(
483513
return DeploymentOperationalState(
484514
status=status,
485515
url=f"https://huggingface.co/spaces/{space_id}",
486-
metadata={"space_id": space_id},
516+
metadata={
517+
"space_id": space_id,
518+
"external_state": runtime.stage,
519+
},
487520
)
488521

489522
except Exception as e:
@@ -526,7 +559,8 @@ def do_deprovision_deployment(
526559
None, indicating immediate deletion completed.
527560
528561
Raises:
529-
DeploymentNotFoundError: If the Space is not found.
562+
DeploymentNotFoundError: If the Space ID is not in metadata.
563+
DeploymentDeprovisionError: If deletion fails.
530564
"""
531565
space_id = deployment.deployment_metadata.get("space_id")
532566
if not space_id:
@@ -544,6 +578,6 @@ def do_deprovision_deployment(
544578
if e.response.status_code == 404:
545579
logger.info(f"Space {space_id} already deleted")
546580
return None
547-
raise DeploymentNotFoundError(
581+
raise DeploymentDeprovisionError(
548582
f"Failed to delete Space {space_id}: {e}"
549583
) from e

0 commit comments

Comments
 (0)