Skip to content

Commit 20bbe85

Browse files
authored
Merge branch 'master' into boeker/handle-internal-ids-recall
2 parents 9511d48 + e3afd7f commit 20bbe85

File tree

2 files changed

+46
-28
lines changed

2 files changed

+46
-28
lines changed

tests/unit/test_deployment.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,21 @@ def test_check_production_build_status_deploying(self, mock_request):
172172

173173
self.assertEqual(status, {"deployed": False, "status": "deploying"})
174174

175+
@patch("vespa.deployment.VespaCloud._request")
176+
def test_wait_for_prod_deployment_raises_on_failed_job(self, mock_request):
177+
mock_request.return_value = {
178+
"deployed": False,
179+
"status": "deploying",
180+
"jobs": [
181+
{"jobName": "production-us-central-1", "runStatus": "success"},
182+
{"jobName": "production-us-east-3", "runStatus": "deploymentFailed"},
183+
],
184+
}
185+
186+
with self.assertRaises(RuntimeError) as ctx:
187+
self.vespa_cloud.wait_for_prod_deployment(456)
188+
self.assertIn("production-us-east-3: deploymentFailed", str(ctx.exception))
189+
175190
@patch("vespa.deployment.VespaCloud._try_get_access_token")
176191
def test_try_get_access_token(self, mock_get_token):
177192
mock_get_token.return_value = "fake_access_token"

vespa/deployment.py

Lines changed: 31 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -925,32 +925,31 @@ def check_production_build_status(self, build_no: Optional[int]) -> dict:
925925
vespa_cloud = VespaCloud(...)
926926
build_no = vespa_cloud.deploy_to_prod()
927927
status = vespa_cloud.check_production_build_status(build_no)
928-
# This can yield one of three responses:
929-
# 1. If the revision (build_no), or higher, has successfully converged everywhere, and nothing older has then been deployed on top of that again. Nothing more will happen in this case.
930-
# {
931-
# "deployed": True,
932-
# "status": "done"
933-
# }
934-
935-
# 2. If the revision (build_no), or newer, has not yet converged, but the system is (most likely) still trying to deploy it. There is a point in polling again later when this is the response.
936-
# {
937-
# "deployed": False,
938-
# "status": "deploying"
939-
# }
940-
# 3. If the revision, or newer, has not yet converged everywhere, and it's never going to, because it was similar to the previous build, or marked obsolete by a user. There is no point in asking again for this revision.
941-
# {
942-
# "deployed": False,
943-
# "status": "done"
944-
# }
928+
# The response contains:
929+
# - "deployed" (bool): True if the build has converged everywhere.
930+
# - "status" (str): "deploying" or "done".
931+
# - "skipReason" (str, optional): Why the build was skipped, e.g. "no-changes" or "cancelled".
932+
# - "jobs" (list): Per-zone deployment details, each with "jobName" and "runStatus".
933+
#
934+
# Example responses:
935+
# 1. Successfully deployed everywhere:
936+
# {"deployed": True, "status": "done", "jobs": [{"jobName": "production-us-east-3", "runStatus": "success"}]}
937+
#
938+
# 2. Still deploying:
939+
# {"deployed": False, "status": "deploying", "jobs": [{"jobName": "production-us-east-3", "runStatus": "running"}]}
940+
#
941+
# 3. Skipped (no changes to deploy):
942+
# {"deployed": False, "status": "done", "skipReason": "no-changes", "jobs": []}
943+
#
944+
# 4. A job failed:
945+
# {"deployed": False, "status": "deploying", "jobs": [{"jobName": "production-us-east-3", "runStatus": "deploymentFailed"}]}
945946
```
946947
947948
Args:
948949
build_no (int): The build number to check.
949950
950951
Returns:
951-
dict: A dictionary with the aggregated status of all deployment jobs for the given build number. The dictionary contains:
952-
- "deployed" (bool): Whether the build has successfully converged.
953-
- "status" (str): The current status of the build ("done", "deploying").
952+
dict: The build status response from the API. See example responses above for the full shape.
954953
955954
Raises:
956955
RuntimeError: If there are issues with retrieving the status of the build.
@@ -993,23 +992,27 @@ def wait_for_prod_deployment(
993992
poll_interval (int, optional): Polling interval in seconds. Default is 5 seconds.
994993
995994
Returns:
996-
bool: True if the deployment is done and converged, False if the deployment has failed.
995+
bool: True if the build was deployed to all production zones, False if it completed
996+
without deploying (e.g. no changes).
997997
998998
Raises:
999+
RuntimeError: If any production job reports a run status other than "success" or "running" (e.g. deploymentFailed, installationFailed).
9991000
TimeoutError: If the deployment did not finish within `max_wait` seconds.
10001001
"""
10011002
start_time = time.time()
10021003
while time.time() - start_time < max_wait:
10031004
status = self.check_production_build_status(build_no)
1005+
failed_jobs = [
1006+
job for job in status.get("jobs", [])
1007+
if job["runStatus"] not in ("success", "running")
1008+
]
1009+
if failed_jobs:
1010+
failures = ", ".join(
1011+
f"{job['jobName']}: {job['runStatus']}" for job in failed_jobs
1012+
)
1013+
raise RuntimeError(f"Deployment failed: {failures}")
10041014
if status["status"] == "done":
10051015
return status["deployed"]
1006-
if "detailed-status" in status and status["detailed-status"] not in [
1007-
"success",
1008-
"running",
1009-
]:
1010-
raise RuntimeError(
1011-
f"The build failed with status code: {status['detailed-status']}"
1012-
)
10131016
time.sleep(poll_interval)
10141017
raise TimeoutError(f"Deployment did not finish within {max_wait} seconds. ")
10151018

0 commit comments

Comments
 (0)