Skip to content

Commit 6f53b5d

Browse files
authored
Use unique names for backend resources (#2350)
* Use unique resource names on GCP
* Test generate unique backend name
* Use unique resource names on Azure
* Use unique resource names on Cudo
* Use unique resource names on AWS
* Use unique resource names on k8s
* Use unique resource names on Lambda
* Use unique resource names on OCI
* Use unique resource names on Tensordock
* Use unique resource names on Runpod
* Use unique resource names on VastAI
* Use unique resource names on Datacrunch
* Use unique resource names on Vultr
* Fix OCI missing from BackendType docstring
* Use unique volume names on Runpod
* Fix unique instance_name not used
1 parent e842232 commit 6f53b5d

File tree

22 files changed

+347
-88
lines changed

22 files changed

+347
-88
lines changed

src/dstack/_internal/core/backends/aws/compute.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,11 @@
1111
from dstack._internal.core.backends.aws.config import AWSConfig
1212
from dstack._internal.core.backends.base.compute import (
1313
Compute,
14+
generate_unique_gateway_instance_name,
15+
generate_unique_instance_name,
16+
generate_unique_volume_name,
1417
get_gateway_user_data,
15-
get_instance_name,
18+
get_job_instance_name,
1619
get_user_data,
1720
merge_tags,
1821
)
@@ -152,10 +155,12 @@ def create_instance(
152155
if zones is not None and len(zones) == 0:
153156
raise NoCapacityError("No eligible availability zones")
154157

158+
instance_name = generate_unique_instance_name(instance_config)
155159
tags = {
156-
"Name": instance_config.instance_name,
160+
"Name": instance_name,
157161
"owner": "dstack",
158162
"dstack_project": project_name,
163+
"dstack_name": instance_config.instance_name,
159164
"dstack_user": instance_config.user,
160165
}
161166
tags = merge_tags(tags=tags, backend_tags=self.config.tags)
@@ -274,7 +279,7 @@ def run_job(
274279
# TODO: run_job is the same for vm-based backends, refactor
275280
instance_config = InstanceConfiguration(
276281
project_name=run.project_name,
277-
instance_name=get_instance_name(run, job), # TODO: generate name
282+
instance_name=get_job_instance_name(run, job), # TODO: generate name
278283
ssh_keys=[
279284
SSHKey(public=project_ssh_public_key.strip()),
280285
],
@@ -342,10 +347,12 @@ def create_gateway(
342347
ec2_resource = self.session.resource("ec2", region_name=configuration.region)
343348
ec2_client = self.session.client("ec2", region_name=configuration.region)
344349

350+
instance_name = generate_unique_gateway_instance_name(configuration)
345351
tags = {
346-
"Name": configuration.instance_name,
352+
"Name": instance_name,
347353
"owner": "dstack",
348354
"dstack_project": configuration.project_name,
355+
"dstack_name": configuration.instance_name,
349356
}
350357
if settings.DSTACK_VERSION is not None:
351358
tags["dstack_version"] = settings.DSTACK_VERSION
@@ -403,7 +410,7 @@ def create_gateway(
403410

404411
logger.debug("Creating ALB for gateway %s...", configuration.instance_name)
405412
response = elb_client.create_load_balancer(
406-
Name=f"{configuration.instance_name}-lb",
413+
Name=f"{instance_name}-lb",
407414
Subnets=subnets_ids,
408415
SecurityGroups=[security_group_id],
409416
Scheme="internet-facing" if configuration.public_ip else "internal",
@@ -418,7 +425,7 @@ def create_gateway(
418425

419426
logger.debug("Creating Target Group for gateway %s...", configuration.instance_name)
420427
response = elb_client.create_target_group(
421-
Name=f"{configuration.instance_name}-tg",
428+
Name=f"{instance_name}-tg",
422429
Protocol="HTTP",
423430
Port=80,
424431
VpcId=vpc_id,
@@ -535,11 +542,13 @@ def register_volume(self, volume: Volume) -> VolumeProvisioningData:
535542
def create_volume(self, volume: Volume) -> VolumeProvisioningData:
536543
ec2_client = self.session.client("ec2", region_name=volume.configuration.region)
537544

545+
volume_name = generate_unique_volume_name(volume)
538546
tags = {
539-
"Name": volume.configuration.name,
547+
"Name": volume_name,
540548
"owner": "dstack",
541-
"dstack_user": volume.user,
542549
"dstack_project": volume.project_name,
550+
"dstack_name": volume.name,
551+
"dstack_user": volume.user,
543552
}
544553
tags = merge_tags(tags=tags, backend_tags=self.config.tags)
545554

src/dstack/_internal/core/backends/azure/compute.py

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,10 @@
3939
from dstack._internal.core.backends.azure.config import AzureConfig
4040
from dstack._internal.core.backends.base.compute import (
4141
Compute,
42+
generate_unique_gateway_instance_name,
43+
generate_unique_instance_name,
4244
get_gateway_user_data,
43-
get_instance_name,
45+
get_job_instance_name,
4446
get_user_data,
4547
merge_tags,
4648
)
@@ -103,6 +105,9 @@ def create_instance(
103105
instance_offer: InstanceOfferWithAvailability,
104106
instance_config: InstanceConfiguration,
105107
) -> JobProvisioningData:
108+
instance_name = generate_unique_instance_name(
109+
instance_config, max_length=azure_resources.MAX_RESOURCE_NAME_LEN
110+
)
106111
location = instance_offer.region
107112
logger.info(
108113
"Requesting %s %s instance in %s...",
@@ -129,6 +134,7 @@ def create_instance(
129134
tags = {
130135
"owner": "dstack",
131136
"dstack_project": instance_config.project_name,
137+
"dstack_name": instance_config.instance_name,
132138
"dstack_user": instance_config.user,
133139
}
134140
tags = merge_tags(tags=tags, backend_tags=self.config.tags)
@@ -150,9 +156,7 @@ def create_instance(
150156
variant=VMImageVariant.from_instance_type(instance_offer.instance),
151157
),
152158
vm_size=instance_offer.instance.name,
153-
# instance_name includes region because Azure may create an instance resource
154-
# even when provisioning fails.
155-
instance_name=f"{instance_config.instance_name}-{instance_offer.region}",
159+
instance_name=instance_name,
156160
user_data=get_user_data(authorized_keys=ssh_pub_keys),
157161
ssh_pub_keys=ssh_pub_keys,
158162
spot=instance_offer.instance.resources.spot,
@@ -197,7 +201,7 @@ def run_job(
197201
) -> JobProvisioningData:
198202
instance_config = InstanceConfiguration(
199203
project_name=run.project_name,
200-
instance_name=get_instance_name(run, job), # TODO: generate name
204+
instance_name=get_job_instance_name(run, job), # TODO: generate name
201205
ssh_keys=[
202206
SSHKey(public=project_ssh_public_key.strip()),
203207
],
@@ -223,7 +227,9 @@ def create_gateway(
223227
configuration.instance_name,
224228
configuration.region,
225229
)
226-
230+
instance_name = generate_unique_gateway_instance_name(
231+
configuration, max_length=azure_resources.MAX_RESOURCE_NAME_LEN
232+
)
227233
network_resource_group, network, subnet = get_resource_group_network_subnet_or_error(
228234
network_client=self._network_client,
229235
resource_group=self.config.resource_group,
@@ -237,9 +243,9 @@ def create_gateway(
237243
)
238244

239245
tags = {
240-
"Name": configuration.instance_name,
241246
"owner": "dstack",
242247
"dstack_project": configuration.project_name,
248+
"dstack_name": configuration.instance_name,
243249
}
244250
if settings.DSTACK_VERSION is not None:
245251
tags["dstack_version"] = settings.DSTACK_VERSION
@@ -256,7 +262,7 @@ def create_gateway(
256262
managed_identity=None,
257263
image_reference=_get_gateway_image_ref(),
258264
vm_size="Standard_B1ms",
259-
instance_name=configuration.instance_name,
265+
instance_name=instance_name,
260266
user_data=get_gateway_user_data(configuration.ssh_key_pub),
261267
ssh_pub_keys=[configuration.ssh_key_pub],
262268
spot=False,

src/dstack/_internal/core/backends/azure/resources.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
from dstack._internal.core.errors import BackendError
88

9+
MAX_RESOURCE_NAME_LEN = 64
10+
911

1012
def get_network_subnets(
1113
network_client: network_mgmt.NetworkManagementClient,

src/dstack/_internal/core/backends/base/compute.py

Lines changed: 102 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import os
2+
import random
23
import re
4+
import string
35
import threading
46
from abc import ABC, abstractmethod
57
from functools import lru_cache
@@ -31,6 +33,7 @@
3133
VolumeAttachmentData,
3234
VolumeProvisioningData,
3335
)
36+
from dstack._internal.core.services import is_valid_dstack_resource_name
3437
from dstack._internal.utils.logging import get_logger
3538

3639
logger = get_logger(__name__)
@@ -209,8 +212,105 @@ def get_offers_cached(
209212
return self.get_offers(requirements)
210213

211214

212-
def get_instance_name(run: Run, job: Job) -> str:
213-
return f"{run.project_name.lower()}-{job.job_spec.job_name}"
215+
def get_job_instance_name(run: Run, job: Job) -> str:
216+
return job.job_spec.job_name
217+
218+
219+
_DEFAULT_MAX_RESOURCE_NAME_LEN = 60
220+
_CLOUD_RESOURCE_SUFFIX_LEN = 8
221+
222+
223+
def generate_unique_instance_name(
224+
instance_configuration: InstanceConfiguration,
225+
max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
226+
) -> str:
227+
"""
228+
Generates a unique instance name valid across all backends.
229+
"""
230+
return generate_unique_backend_name(
231+
resource_name=instance_configuration.instance_name,
232+
project_name=instance_configuration.project_name,
233+
max_length=max_length,
234+
)
235+
236+
237+
def generate_unique_instance_name_for_job(
238+
run: Run,
239+
job: Job,
240+
max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
241+
) -> str:
242+
"""
243+
Generates a unique instance name for a job valid across all backends.
244+
"""
245+
return generate_unique_backend_name(
246+
resource_name=get_job_instance_name(run, job),
247+
project_name=run.project_name,
248+
max_length=max_length,
249+
)
250+
251+
252+
def generate_unique_gateway_instance_name(
253+
gateway_compute_configuration: GatewayComputeConfiguration,
254+
max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
255+
) -> str:
256+
"""
257+
Generates a unique gateway instance name valid across all backends.
258+
"""
259+
return generate_unique_backend_name(
260+
resource_name=gateway_compute_configuration.instance_name,
261+
project_name=gateway_compute_configuration.project_name,
262+
max_length=max_length,
263+
)
264+
265+
266+
def generate_unique_volume_name(
267+
volume: Volume,
268+
max_length: int = _DEFAULT_MAX_RESOURCE_NAME_LEN,
269+
) -> str:
270+
"""
271+
Generates a unique volume name valid across all backends.
272+
"""
273+
return generate_unique_backend_name(
274+
resource_name=volume.name,
275+
project_name=volume.project_name,
276+
max_length=max_length,
277+
)
278+
279+
280+
def generate_unique_backend_name(
281+
resource_name: str,
282+
project_name: Optional[str],
283+
max_length: int,
284+
) -> str:
285+
"""
286+
Generates a unique resource name valid across all backends.
287+
Backend resource names must be unique on every provisioning so that
288+
resource re-submission/re-creation doesn't lead to conflicts
289+
on backends that require unique names (e.g. Azure, GCP).
290+
"""
291+
# resource_name is guaranteed to be valid in all backends
292+
prefix = f"dstack-{resource_name}"
293+
if project_name is not None and is_valid_dstack_resource_name(project_name):
294+
# project_name is not guaranteed to be valid in all backends,
295+
# so we add it only if it passes the validation
296+
prefix = f"dstack-{project_name}-{resource_name}"
297+
return _generate_unique_backend_name_with_prefix(
298+
prefix=prefix,
299+
max_length=max_length,
300+
)
301+
302+
303+
def _generate_unique_backend_name_with_prefix(
304+
prefix: str,
305+
max_length: int,
306+
) -> str:
307+
prefix_len = max_length - _CLOUD_RESOURCE_SUFFIX_LEN - 1
308+
prefix = prefix[:prefix_len]
309+
suffix = "".join(
310+
random.choice(string.ascii_lowercase + string.digits)
311+
for _ in range(_CLOUD_RESOURCE_SUFFIX_LEN)
312+
)
313+
return f"{prefix}-{suffix}"
214314

215315

216316
def get_cloud_config(**config) -> str:

src/dstack/_internal/core/backends/cudo/compute.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@
44

55
from dstack._internal.core.backends.base import Compute
66
from dstack._internal.core.backends.base.compute import (
7-
get_instance_name,
7+
generate_unique_instance_name,
8+
get_job_instance_name,
89
get_shim_commands,
910
)
1011
from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -25,6 +26,9 @@
2526
logger = get_logger(__name__)
2627

2728

29+
MAX_RESOURCE_NAME_LEN = 30
30+
31+
2832
class CudoCompute(Compute):
2933
def __init__(self, config: CudoConfig):
3034
super().__init__()
@@ -58,7 +62,7 @@ def run_job(
5862
) -> JobProvisioningData:
5963
instance_config = InstanceConfiguration(
6064
project_name=run.project_name,
61-
instance_name=get_instance_name(run, job),
65+
instance_name=get_job_instance_name(run, job),
6266
ssh_keys=[
6367
SSHKey(public=project_ssh_public_key.strip()),
6468
],
@@ -71,6 +75,7 @@ def create_instance(
7175
instance_offer: InstanceOfferWithAvailability,
7276
instance_config: InstanceConfiguration,
7377
) -> JobProvisioningData:
78+
vm_id = generate_unique_instance_name(instance_config, max_length=MAX_RESOURCE_NAME_LEN)
7479
public_keys = instance_config.get_public_keys()
7580
memory_size = round(instance_offer.instance.resources.memory_mib / 1024)
7681
disk_size = round(instance_offer.instance.resources.disk.size_mib / 1024)
@@ -81,13 +86,12 @@ def create_instance(
8186
shim_commands if gpus_no > 0 else f"{install_docker_script()} && {shim_commands}"
8287
)
8388

84-
vm_id = f"{instance_config.instance_name}-{instance_offer.region}"
8589
try:
8690
resp_data = self.api_client.create_virtual_machine(
8791
project_id=self.config.project_id,
8892
boot_disk_storage_class="STORAGE_CLASS_NETWORK",
8993
boot_disk_size_gib=disk_size,
90-
book_disk_id=f"{instance_config.instance_name}_{instance_offer.region}_disk_id",
94+
book_disk_id=f"{vm_id}_disk_id",
9195
boot_disk_image_id=_get_image_id(gpus_no > 0),
9296
data_center_id=instance_offer.region,
9397
gpus=gpus_no,

src/dstack/_internal/core/backends/datacrunch/compute.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from dstack._internal.core.backends.base import Compute
44
from dstack._internal.core.backends.base.compute import (
5+
generate_unique_instance_name,
56
get_shim_commands,
67
)
78
from dstack._internal.core.backends.base.offers import get_catalog_offers
@@ -22,6 +23,8 @@
2223

2324
logger = get_logger("datacrunch.compute")
2425

26+
MAX_INSTANCE_NAME_LEN = 60
27+
2528
# Ubuntu 22.04 + CUDA 12.0 + Docker
2629
# from API https://datacrunch.stoplight.io/docs/datacrunch-public/c46ab45dbc508-get-all-image-types
2730
IMAGE_ID = "2088da25-bb0d-41cc-a191-dccae45d96fd"
@@ -78,6 +81,9 @@ def create_instance(
7881
instance_offer: InstanceOfferWithAvailability,
7982
instance_config: InstanceConfiguration,
8083
) -> JobProvisioningData:
84+
instance_name = generate_unique_instance_name(
85+
instance_config, max_length=MAX_INSTANCE_NAME_LEN
86+
)
8187
public_keys = instance_config.get_public_keys()
8288
ssh_ids = []
8389
for ssh_public_key in public_keys:
@@ -106,8 +112,8 @@ def create_instance(
106112
instance_type=instance_offer.instance.name,
107113
ssh_key_ids=ssh_ids,
108114
startup_script_id=startup_script_ids,
109-
hostname=instance_config.instance_name,
110-
description=instance_config.instance_name,
115+
hostname=instance_name,
116+
description=instance_name,
111117
image=IMAGE_ID,
112118
disk_size=disk_size,
113119
location=instance_offer.region,
@@ -119,8 +125,8 @@ def create_instance(
119125
"instance_type": instance_offer.instance.name,
120126
"ssh_key_ids": ssh_ids,
121127
"startup_script_id": startup_script_ids,
122-
"hostname": instance_config.instance_name,
123-
"description": instance_config.instance_name,
128+
"hostname": instance_name,
129+
"description": instance_name,
124130
"image": IMAGE_ID,
125131
"disk_size": disk_size,
126132
"location": instance_offer.region,

0 commit comments

Comments (0)