From 4b849a6bf08aee00f7ddde5ea9f82429064df2d9 Mon Sep 17 00:00:00 2001 From: Tony Kao Date: Tue, 11 Feb 2025 08:15:55 -0800 Subject: [PATCH] torchx support early validation before workspace build Summary: Add a `_pre_build_validate()` hook to the torchx scheduler so that app validation can run before the workspace build step. Validating earlier surfaces issues sooner, without having to wait for the workspace build to complete. This change only exposes the `_pre_build_validate()` hook on the torchx scheduler; existing behavior is unchanged (validation still runs after the workspace build where required). Subsequent changes will move validation from `_validate()` to `_pre_build_validate()` within specific schedulers where that is possible. This change also adds event logging for the `build_workspace_and_update_role()` call. Differential Revision: D69463377 --- torchx/runner/api.py | 16 +++++++++++++++- torchx/schedulers/api.py | 13 ++++++++++--- 2 files changed, 25 insertions(+), 4 deletions(-) diff --git a/torchx/runner/api.py b/torchx/runner/api.py index cd66bd489..e63f7a2fd 100644 --- a/torchx/runner/api.py +++ b/torchx/runner/api.py @@ -412,6 +412,14 @@ def dryrun( ): sched = self._scheduler(scheduler) resolved_cfg = sched.run_opts().resolve(cfg) + + # early validation before build workspace + with log_event( + "pre_build_validate", + scheduler, + ): + sched._pre_build_validate(app, scheduler, resolved_cfg) + if workspace and isinstance(sched, WorkspaceMixin): role = app.roles[0] old_img = role.image @@ -420,7 +428,13 @@ def dryrun( logger.info( 'To disable workspaces pass: --workspace="" from CLI or workspace=None programmatically.' 
) - sched.build_workspace_and_update_role(role, workspace, resolved_cfg) + with log_event( + "build_workspace_and_update_role", + scheduler, + ) as ctx: + sched.build_workspace_and_update_role(role, workspace, resolved_cfg) + ctx._torchx_event.app_image = role.image + ctx._torchx_event.workspace = workspace if old_img != role.image: logger.info( diff --git a/torchx/schedulers/api.py b/torchx/schedulers/api.py index 3ef3c5899..6a9e1bf70 100644 --- a/torchx/schedulers/api.py +++ b/torchx/schedulers/api.py @@ -337,12 +337,19 @@ def log_iter( f"{self.__class__.__qualname__} does not support application log iteration" ) + def _pre_build_validate(self, app: AppDef, scheduler: str, cfg: T) -> None: + """ + validates before workspace build whether application is consistent with the scheduler. + + Raises error if application is not compatible with scheduler + """ + pass + def _validate(self, app: AppDef, scheduler: str, cfg: T) -> None: """ - Validates whether application is consistent with the scheduler. + Validates after workspace build whether application is consistent with the scheduler. - Raises: - ValueError: if application is not compatible with scheduler + Raises error if application is not compatible with scheduler """ for role in app.roles: if role.resource == NULL_RESOURCE: