Skip to content

Commit 396444d

Browse files
jvstme authored and pranitnaik43 committed
Increase Lambda provisioning timeout and refactor (dstackai#2353)
Increase the provisioning timeout for the `lambda` backend to 30 minutes. In recent testing, 8:A100 instances took about 15 minutes to start and 8:H100 instances a little over 20 minutes. Also refactor the timeout logic to remove the duplication between `_get_instance_timeout_interval` and `_get_runner_timeout_interval`. These two timeouts are safe to merge because both are tied to instance provisioning time: `_get_instance_timeout_interval` is the time between requesting instance creation in a backend and the instance becoming `busy`/`idle`, while `_get_runner_timeout_interval` is the time between submitting a job and the job becoming `pulling`/`running`.
1 parent e32ac0e commit 396444d

File tree

4 files changed

+31
-29
lines changed

4 files changed

+31
-29
lines changed

contributing/BACKENDS.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,11 @@ and add it to `AnyBackendConfig` (in the same file).
183183
In [`src/dstack/_internal/server/services/backends/__init__.py`](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/server/services/backends/__init__.py),
184184
add the `try`/`except` block that imports the backend configurator and appends it to `_CONFIGURATOR_CLASSES`.
185185

186+
##### 2.4.11. (Optional) Override provisioning timeout
187+
188+
If instances in the backend take more than 10 minutes to start, override the default provisioning timeout in
189+
[`src/dstack/_internal/server/background/tasks/common.py`](https://github.com/dstackai/dstack/blob/master/src/dstack/_internal/server/background/tasks/common.py).
190+
186191
## 3. Appendix
187192

188193
#### 3.1. Backend compute type
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from datetime import timedelta
2+
3+
from dstack._internal.core.models.backends.base import BackendType
4+
5+
6+
def get_provisioning_timeout(backend_type: BackendType, instance_type_name: str) -> timedelta:
7+
"""
8+
This timeout is used in a few places, but roughly refers to the max time between
9+
requesting instance creation and the instance becoming ready to accept jobs.
10+
For container-based backends, this also includes the image pulling time.
11+
"""
12+
if backend_type == BackendType.LAMBDA:
13+
return timedelta(minutes=30)
14+
if backend_type == BackendType.RUNPOD:
15+
return timedelta(minutes=20)
16+
if backend_type == BackendType.KUBERNETES:
17+
return timedelta(minutes=20)
18+
if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
19+
return timedelta(minutes=20)
20+
if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
21+
return timedelta(minutes=55)
22+
return timedelta(minutes=10)

src/dstack/_internal/server/background/tasks/process_instances.py

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
Retry,
6363
)
6464
from dstack._internal.core.services.profiles import get_retry
65+
from dstack._internal.server.background.tasks.common import get_provisioning_timeout
6566
from dstack._internal.server.db import get_session_ctx
6667
from dstack._internal.server.models import (
6768
FleetModel,
@@ -963,25 +964,12 @@ def _get_provisioning_deadline(
963964
instance: InstanceModel,
964965
job_provisioning_data: JobProvisioningData,
965966
) -> datetime.datetime:
966-
timeout_interval = _get_instance_timeout_interval(
967+
timeout_interval = get_provisioning_timeout(
967968
backend_type=job_provisioning_data.get_base_backend(),
968969
instance_type_name=job_provisioning_data.instance_type.name,
969970
)
970971
return instance.started_at.replace(tzinfo=datetime.timezone.utc) + timeout_interval
971972

972973

973-
def _get_instance_timeout_interval(
974-
backend_type: BackendType, instance_type_name: str
975-
) -> timedelta:
976-
# when changing timeouts, also consider process_running_jobs._get_runner_timeout_interval
977-
if backend_type == BackendType.RUNPOD:
978-
return timedelta(seconds=1200)
979-
if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
980-
return timedelta(seconds=1200)
981-
if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
982-
return timedelta(seconds=3300)
983-
return timedelta(seconds=600)
984-
985-
986974
def _ssh_keys_to_pkeys(ssh_keys: list[SSHKey]) -> list[PKey]:
987975
return [pkey_from_str(sk.private) for sk in ssh_keys if sk.private is not None]

src/dstack/_internal/server/background/tasks/process_running_jobs.py

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import asyncio
2-
from datetime import timedelta
32
from typing import Dict, List, Optional
43

54
from sqlalchemy import select
@@ -29,6 +28,7 @@
2928
RunSpec,
3029
)
3130
from dstack._internal.core.models.volumes import InstanceMountPoint, Volume, VolumeMountPoint
31+
from dstack._internal.server.background.tasks.common import get_provisioning_timeout
3232
from dstack._internal.server.db import get_session_ctx
3333
from dstack._internal.server.models import (
3434
InstanceModel,
@@ -244,7 +244,7 @@ async def _process_running_job(session: AsyncSession, job_model: JobModel):
244244

245245
if not success:
246246
# check timeout
247-
if job_submission.age > _get_runner_timeout_interval(
247+
if job_submission.age > get_provisioning_timeout(
248248
backend_type=job_provisioning_data.get_base_backend(),
249249
instance_type_name=job_provisioning_data.instance_type.name,
250250
):
@@ -769,16 +769,3 @@ def _submit_job_to_runner(
769769
# do not log here, because the runner will send a new status
770770

771771
return True
772-
773-
774-
def _get_runner_timeout_interval(backend_type: BackendType, instance_type_name: str) -> timedelta:
775-
# when changing timeouts, also consider process_instances._get_instance_timeout_interval
776-
if backend_type == BackendType.LAMBDA:
777-
return timedelta(seconds=1200)
778-
if backend_type == BackendType.KUBERNETES:
779-
return timedelta(seconds=1200)
780-
if backend_type == BackendType.OCI and instance_type_name.startswith("BM."):
781-
return timedelta(seconds=1200)
782-
if backend_type == BackendType.VULTR and instance_type_name.startswith("vbm"):
783-
return timedelta(seconds=3300)
784-
return timedelta(seconds=600)

0 commit comments

Comments
 (0)