Skip to content

Commit e903f88

Browse files
jopemachine and claude
authored
feat(BA-5280): Consolidate deploying handlers and remove unused sub-steps (#10276)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 3e95c79 commit e903f88

File tree

13 files changed

+253
-365
lines changed

13 files changed

+253
-365
lines changed

changes/10276.feature.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Consolidate deploying handlers and remove unused sub-steps

src/ai/backend/manager/data/deployment/types.py

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -153,28 +153,22 @@ class DeploymentSubStatus(enum.StrEnum):
153153
154154
Each lifecycle type can define its own sub-status enum by
155155
inheriting from this class. For example, DEPLOYING handlers
156-
use ``DeploymentSubStep`` (provisioning, progressing, …).
156+
use ``DeploymentSubStep`` (provisioning, rolling_back, …).
157157
"""
158158

159159

160160
class DeploymentSubStep(DeploymentSubStatus):
161161
"""Sub-steps for the DEPLOYING lifecycle phase.
162162
163-
Active states:
164-
- PROVISIONING: New revision routes are being provisioned; waiting for readiness.
165-
- PROGRESSING: Actively replacing old routes with new routes.
166-
- ROLLING_BACK: Actively rolling back failed new routes to previous revision.
167-
168-
Terminal markers (no handler execution, trigger transition only):
169-
- COMPLETED: All strategy conditions satisfied; ready for revision swap.
170-
- ROLLED_BACK: Rollback finished; ready for cleanup and transition to READY.
163+
- PROVISIONING: New revision routes are being provisioned and old routes
164+
are being drained. The main handler for rolling updates.
165+
- ROLLING_BACK: Clearing deploying_revision and transitioning to READY.
166+
- COMPLETED: All strategy conditions satisfied; triggers revision swap.
171167
"""
172168

173169
PROVISIONING = "provisioning"
174-
PROGRESSING = "progressing"
175170
ROLLING_BACK = "rolling_back"
176171
COMPLETED = "completed"
177-
ROLLED_BACK = "rolled_back"
178172

179173

180174
@dataclass(frozen=True)
@@ -201,7 +195,11 @@ class DeploymentStatusTransitions:
201195
202196
Attributes:
203197
success: Target lifecycle when handler succeeds, None means no change
204-
need_retry: Target lifecycle when handler fails but can retry
198+
need_retry: Target lifecycle when handler fails but can retry, or when
199+
route mutations were executed but the deployment stays in the same
200+
sub-step (e.g. PROVISIONING → PROVISIONING after create/drain).
201+
Items explicitly returned as need_retry by handlers are never
202+
escalated to give_up — they represent normal progress.
205203
expired: Target lifecycle when time elapsed in current state
206204
give_up: Target lifecycle when retry count exceeded
207205
"""

src/ai/backend/manager/repositories/base/types.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
# QueryCondition now returns a ColumnElement (whereclause) instead of modifying stmt
2020
type QueryCondition = Callable[[], sa.sql.expression.ColumnElement[bool]]
2121

22+
2223
T = TypeVar("T")
2324

2425

src/ai/backend/manager/repositories/deployment/db_source/db_source.py

Lines changed: 23 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2519,38 +2519,22 @@ async def search_deployment_policies(
25192519

25202520
async def apply_strategy_mutations(
25212521
self,
2522-
assignments: Mapping[uuid.UUID, DeploymentSubStep],
25232522
rollout: Sequence[RBACEntityCreator[RoutingRow]],
25242523
drain: BatchUpdater[RoutingRow] | None,
25252524
completed_ids: set[uuid.UUID],
2526-
rolled_back_ids: set[uuid.UUID],
25272525
) -> int:
2528-
"""Apply all DB mutations from a strategy evaluation cycle in a single transaction.
2526+
"""Apply route mutations from a strategy evaluation cycle in a single transaction.
2527+
2528+
Sub-step transitions are handled exclusively by the coordinator
2529+
via ``EndpointLifecycleBatchUpdaterSpec``.
25292530
25302531
Returns:
25312532
Number of deployments whose revision was swapped.
25322533
"""
25332534
async with self._begin_session_read_committed() as db_sess:
2534-
await self._update_sub_steps(db_sess, assignments)
25352535
await self._create_routes(db_sess, rollout)
25362536
await self._drain_routes(db_sess, drain)
2537-
swapped = await self._complete_deployment_revision_swap(db_sess, completed_ids)
2538-
await self._clear_deploying_revision(db_sess, rolled_back_ids)
2539-
return swapped
2540-
2541-
@staticmethod
2542-
async def _update_sub_steps(
2543-
db_sess: SASession,
2544-
assignments: Mapping[uuid.UUID, DeploymentSubStep],
2545-
) -> None:
2546-
"""Update deployment sub-step assignments."""
2547-
for endpoint_id, sub_step in assignments.items():
2548-
query = (
2549-
sa.update(EndpointRow)
2550-
.where(EndpointRow.id == endpoint_id)
2551-
.values(sub_step=sub_step)
2552-
)
2553-
await db_sess.execute(query)
2537+
return await self._complete_deployment_revision_swap(db_sess, completed_ids)
25542538

25552539
@staticmethod
25562540
async def _create_routes(
@@ -2593,20 +2577,24 @@ async def _complete_deployment_revision_swap(
25932577
result = await db_sess.execute(query)
25942578
return cast(CursorResult[Any], result).rowcount
25952579

2596-
@staticmethod
2597-
async def _clear_deploying_revision(
2598-
db_sess: SASession,
2599-
rolled_back_ids: set[uuid.UUID],
2580+
async def clear_deploying_revision(
2581+
self,
2582+
deployment_ids: set[uuid.UUID],
26002583
) -> None:
2601-
"""Clear deploying_revision for rolled-back deployments."""
2602-
if not rolled_back_ids:
2584+
"""Clear deploying_revision and sub_step for rolled-back deployments.
2585+
2586+
This is called explicitly by ``DeployingRollingBackHandler`` after
2587+
rollback completes, NOT automatically by apply_strategy_mutations.
2588+
"""
2589+
if not deployment_ids:
26032590
return
2604-
query = (
2605-
sa.update(EndpointRow)
2606-
.where(EndpointRow.id.in_(rolled_back_ids))
2607-
.values(
2608-
deploying_revision=None,
2609-
sub_step=None,
2591+
async with self._begin_session_read_committed() as db_sess:
2592+
query = (
2593+
sa.update(EndpointRow)
2594+
.where(EndpointRow.id.in_(deployment_ids))
2595+
.values(
2596+
deploying_revision=None,
2597+
sub_step=None,
2598+
)
26102599
)
2611-
)
2612-
await db_sess.execute(query)
2600+
await db_sess.execute(query)

src/ai/backend/manager/repositories/deployment/repository.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1406,26 +1406,33 @@ async def search_deployment_policies(
14061406
"""
14071407
return await self._db_source.search_deployment_policies(querier)
14081408

1409+
@deployment_repository_resilience.apply()
14091410
async def apply_strategy_mutations(
14101411
self,
1411-
assignments: Mapping[UUID, DeploymentSubStep],
14121412
rollout: Sequence[RBACEntityCreator[RoutingRow]],
14131413
drain: BatchUpdater[RoutingRow] | None,
14141414
completed_ids: set[UUID],
1415-
rolled_back_ids: set[UUID],
14161415
) -> int:
1417-
"""Apply all DB mutations from a strategy evaluation cycle.
1416+
"""Apply route mutations from a strategy evaluation cycle.
14181417
1419-
Performs sub-step updates, route rollout/drain, revision swap,
1420-
and deploying_revision cleanup in a single transaction.
1418+
Performs route rollout/drain and revision swap in a single transaction.
1419+
Sub-step transitions are handled by the coordinator via
1420+
``EndpointLifecycleBatchUpdaterSpec``.
14211421
14221422
Returns:
14231423
Number of deployments whose revision was swapped.
14241424
"""
14251425
return await self._db_source.apply_strategy_mutations(
1426-
assignments=assignments,
14271426
rollout=rollout,
14281427
drain=drain,
14291428
completed_ids=completed_ids,
1430-
rolled_back_ids=rolled_back_ids,
14311429
)
1430+
1431+
@deployment_repository_resilience.apply()
1432+
async def clear_deploying_revision(self, deployment_ids: set[UUID]) -> None:
1433+
"""Clear deploying_revision and sub_step for rolled-back deployments.
1434+
1435+
Called explicitly by ``DeployingRollingBackHandler`` after rollback
1436+
completes — NOT automatically during strategy mutations.
1437+
"""
1438+
await self._db_source.clear_deploying_revision(deployment_ids)

src/ai/backend/manager/sokovan/deployment/coordinator.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@
6161
from .handlers import (
6262
CheckPendingDeploymentHandler,
6363
CheckReplicaDeploymentHandler,
64-
DeployingProgressingHandler,
6564
DeployingProvisioningHandler,
6665
DeployingRollingBackHandler,
6766
DeploymentHandler,
@@ -325,21 +324,11 @@ def _init_handlers(
325324
applier=applier,
326325
),
327326
),
328-
(
329-
(DeploymentLifecycleType.DEPLOYING, DeploymentSubStep.PROGRESSING),
330-
DeployingProgressingHandler(
331-
deployment_controller=self._deployment_controller,
332-
route_controller=self._route_controller,
333-
evaluator=evaluator,
334-
applier=applier,
335-
),
336-
),
337327
(
338328
(DeploymentLifecycleType.DEPLOYING, DeploymentSubStep.ROLLING_BACK),
339329
DeployingRollingBackHandler(
340330
deployment_controller=self._deployment_controller,
341331
route_controller=self._route_controller,
342-
evaluator=evaluator,
343332
applier=applier,
344333
),
345334
),
@@ -437,8 +426,8 @@ async def _handle_status_transitions(
437426

438427
transitions = handler.status_transitions()
439428

440-
# Success transitions (None = stay in current state)
441-
if transitions.success is not None and result.successes:
429+
# Success transitions
430+
if result.successes and transitions.success is not None:
442431
transition = self._build_success_transition(
443432
handler_name=handler_name,
444433
deployments=result.successes,
@@ -451,6 +440,21 @@ async def _handle_status_transitions(
451440
all_history_specs.extend(transition.history_specs)
452441
notification_events.extend(transition.notification_events)
453442

443+
# Explicit need_retry from handlers (e.g. route mutations in progress).
444+
# These are never escalated to give_up — they represent normal progress.
445+
if result.need_retry and transitions.need_retry is not None:
446+
transition = self._build_success_transition(
447+
handler_name=handler_name,
448+
deployments=result.need_retry,
449+
lifecycle_status=transitions.need_retry,
450+
target_lifecycles=target_statuses,
451+
records=records,
452+
timestamp_now=timestamp_now,
453+
)
454+
batch_updaters.append(transition.updater)
455+
all_history_specs.extend(transition.history_specs)
456+
notification_events.extend(transition.notification_events)
457+
454458
# Failure transitions — classify into need_retry/expired/give_up
455459
if result.errors:
456460
current_dbtime = await self._deployment_repository.get_db_now()

src/ai/backend/manager/sokovan/deployment/handlers/__init__.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
from .base import DeploymentHandler
66
from .deploying import (
7-
DeployingProgressingHandler,
87
DeployingProvisioningHandler,
98
DeployingRollingBackHandler,
109
)
@@ -17,7 +16,6 @@
1716
__all__ = [
1817
"CheckPendingDeploymentHandler",
1918
"CheckReplicaDeploymentHandler",
20-
"DeployingProgressingHandler",
2119
"DeployingProvisioningHandler",
2220
"DeployingRollingBackHandler",
2321
"DeploymentHandler",

0 commit comments

Comments
 (0)