Increase Lambda provisioning timeout and refactor (dstackai#2353)

jvstme · pranitnaik43 · commit 396444d8c18c · 2025-03-04T22:14:44.000+05:30
Increase the provisioning timeout for the `lambda`
backend to 30 minutes. In recent testing, 8:A100
instances took about 15 minutes to start and 8:H100
instances a little over 20 minutes.

Also refactor how the timeout is set to avoid
duplication between `_get_instance_timeout_interval`
and `_get_runner_timeout_interval`, replacing both
with a shared `get_provisioning_timeout`. These two
timeouts are safe to merge because they are both
tied to instance provisioning time:
`_get_instance_timeout_interval` is the time
between requesting instance creation in a backend
and the instance becoming `busy`/`idle`, while
`_get_runner_timeout_interval` is the time between
submitting a job and the job becoming
`pulling`/`running`.

As a side effect of the merge, the instance timeout
for `kubernetes` and the runner timeout for `runpod`
also increase from 10 to 20 minutes.
diff --git a/contributing/BACKENDS.md b/contributing/BACKENDS.md
@@ -183,6 +183,11 @@ and add it to `AnyBackendConfig` (in the same file).
 In [`src/dstack/_internal/server/services/backends/__init__.py`](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/server/services/backends/__init__.py), 
 add the `try`/`except` block that imports the backend configurator and appends it to `_CONFIGURATOR_CLASSES`.
 
+##### 2.4.11. (Optional) Override provisioning timeout
+
+If instances in the backend take more than 10 minutes to start, override the default provisioning timeout by adding a case to `get_provisioning_timeout` in
+[`src/dstack/_internal/server/background/tasks/common.py`](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/server/background/tasks/common.py).
+
 ## 3. Appendix
 
 #### 3.1. Backend compute type
diff --git a/src/dstack/_internal/server/background/tasks/common.py b/src/dstack/_internal/server/background/tasks/common.py
@@ -0,0 +1,22 @@
+from datetime import timedelta
+
+from dstack._internal.core.models.backends.base import BackendType
+
+
+def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
+    """
+    Return the provisioning timeout for the given backend and instance type (default: 10 minutes).
+    This timeout is used in a few places, but roughly refers to the max time between
+    requesting instance creation and the instance becoming ready to accept jobs.
+    For container-based backends, this also includes the image pulling time.
+    """
+    if backend_type == BackendType.LAMBDA:
+        return timedelta(minutes=30)
+    if backend_type == BackendType.RUNPOD:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.KUBERNETES:
+        return timedelta(minutes=20)
+    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
+        return timedelta(minutes=20)
+    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
+        return timedelta(minutes=55)
+    return timedelta(minutes=10)
diff --git a/src/dstack/_internal/server/background/tasks/process_instances.py b/src/dstack/_internal/server/background/tasks/process_instances.py
@@ -62,6 +62,7 @@
     Retry,
 )
 from dstack._internal.core.services.profiles import get_retry
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     FleetModel,
@@ -963,25 +964,12 @@ def _get_provisioning_deadline(
     instance: InstanceModel,
     job_provisioning_data: JobProvisioningData,
 ) -> datetime.datetime:
-    timeout_interval = _get_instance_timeout_interval(
+    timeout_interval = get_provisioning_timeout(
         backend_type=job_provisioning_data.get_base_backend(),
         instance_type_name=job_provisioning_data.instance_type.name,
     )
     return instance.started_at.replace(tzinfo=datetime.timezone.utc) + timeout_interval
 
 
-def _get_instance_timeout_interval(
-    backend_type: BackendType, instance_type_name: str
-) -> timedelta:
-    # when changing timeouts, also consider process_running_jobs._get_runner_timeout_interval
-    if backend_type == BackendType.RUNPOD:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)
-
-
 def _ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]:
     return [pkey_from_str(sk.private) for sk in ssh_keys if sk.private is not None]
diff --git a/src/dstack/_internal/server/background/tasks/process_running_jobs.py b/src/dstack/_internal/server/background/tasks/process_running_jobs.py
@@ -1,5 +1,4 @@
 import asyncio
-from datetime import timedelta
 from typing import Dict, List, Optional
 
 from sqlalchemy import select
@@ -29,6 +28,7 @@
     RunSpec,
 )
 from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
+from dstack._internal.server.background.tasks.common import get_provisioning_timeout
 from dstack._internal.server.db import get_session_ctx
 from dstack._internal.server.models import (
     InstanceModel,
@@ -244,7 +244,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
 
             if not success:
                 # check timeout
-                if job_submission.age > _get_runner_timeout_interval(
+                if job_submission.age > get_provisioning_timeout(
                     backend_type=job_provisioning_data.get_base_backend(),
                     instance_type_name=job_provisioning_data.instance_type.name,
                 ):
@@ -769,16 +769,3 @@ def _submit_job_to_runner(
     # do not log here, because the runner will send a new status
 
     return True
-
-
-def _get_runner_timeout_interval(backend_type: BackendType, instance_type_name: str) -> timedelta:
-    # when changing timeouts, also consider process_instances._get_instance_timeout_interval
-    if backend_type == BackendType.LAMBDA:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.KUBERNETES:
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
-        return timedelta(seconds=1200)
-    if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
-        return timedelta(seconds=3300)
-    return timedelta(seconds=600)