Skip to content

Commit d5d37dc

Browse files
committed
Refactor _get_image_id
1 parent e392a86 commit d5d37dc

File tree

4 files changed

+60
-80
lines changed

4 files changed

+60
-80
lines changed

src/dstack/_internal/core/backends/gcp/compute.py

Lines changed: 28 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010

1111
import dstack._internal.core.backends.gcp.auth as auth
1212
import dstack._internal.core.backends.gcp.resources as gcp_resources
13+
from dstack import version
1314
from dstack._internal.core.backends.base.compute import (
1415
Compute,
1516
ComputeWithCreateInstanceSupport,
@@ -26,6 +27,7 @@
2627
merge_tags,
2728
)
2829
from dstack._internal.core.backends.base.offers import get_catalog_offers
30+
from dstack._internal.core.backends.gcp.features import tcpx as tcpx_features
2931
from dstack._internal.core.backends.gcp.models import GCPConfig
3032
from dstack._internal.core.errors import (
3133
ComputeError,
@@ -273,8 +275,9 @@ def create_instance(
273275
request.project = self.config.project_id
274276
request.instance_resource = gcp_resources.create_instance_struct(
275277
disk_size=disk_size,
276-
image_id=gcp_resources.get_image_id(
277-
len(instance_offer.instance.resources.gpus) > 0,
278+
image_id=_get_image_id(
279+
instance_type_name=instance_offer.instance.name,
280+
cuda=len(instance_offer.instance.resources.gpus) > 0,
278281
),
279282
machine_type=instance_offer.instance.name,
280283
accelerators=gcp_resources.get_accelerators(
@@ -285,7 +288,9 @@ def create_instance(
285288
spot=instance_offer.instance.resources.spot,
286289
user_data=get_user_data(
287290
authorized_keys,
288-
backend_specific_commands=_get_backend_specific_commands_tcpxo(),
291+
backend_specific_commands=_get_backend_specific_commands(
292+
instance_offer.instance.name
293+
),
289294
),
290295
authorized_keys=authorized_keys,
291296
labels=labels,
@@ -467,7 +472,7 @@ def create_gateway(
467472
request.project = self.config.project_id
468473
request.instance_resource = gcp_resources.create_instance_struct(
469474
disk_size=10,
470-
image_id=gcp_resources.get_gateway_image_id(),
475+
image_id=_get_gateway_image_id(),
471476
machine_type="e2-small",
472477
accelerators=[],
473478
spot=False,
@@ -809,66 +814,25 @@ def _unique_instance_name(instance: InstanceType) -> str:
809814
return f"{name}-{gpu.name}-{gpu.memory_mib}"
810815

811816

812-
def _get_backend_specific_commands_tcpx() -> List[str]:
813-
return [
814-
"cos-extensions install gpu -- --version=latest",
815-
"sudo mount --bind /var/lib/nvidia /var/lib/nvidia",
816-
"sudo mount -o remount,exec /var/lib/nvidia",
817-
(
818-
"docker run "
819-
"--detach "
820-
"--pull=always"
821-
"--name receive-datapath-manager "
822-
"--privileged "
823-
"--cap-add=NET_ADMIN --network=host "
824-
"--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 "
825-
"--device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 "
826-
"--device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 "
827-
"--device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 "
828-
"--device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 "
829-
"--device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl "
830-
"--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 "
831-
"--volume /run/tcpx:/run/tcpx "
832-
"--entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd "
833-
"us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd "
834-
'--gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" --setup_param "--verbose 128 2 0"'
835-
),
836-
"sudo iptables -I INPUT -p tcp -m tcp -j ACCEPT",
837-
"docker run --rm -v /var/lib:/var/lib us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx install --install-nccl",
838-
"sudo mount --bind /var/lib/tcpx /var/lib/tcpx",
839-
"sudo mount -o remount,exec /var/lib/tcpx",
840-
]
841-
842-
843-
def _get_backend_specific_commands_tcpxo() -> List[str]:
844-
return [
845-
"modprobe import-helper",
846-
"gcloud -q auth configure-docker us-docker.pkg.dev",
847-
# Install the nccl, nccl-net lib into /var/lib/tcpxo/lib64/.
848-
(
849-
"docker run --rm --name nccl-installer "
850-
"--network=host "
851-
"--volume /var/lib:/var/lib "
852-
"us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1 "
853-
"install --install-nccl"
854-
),
855-
# Start FasTrak receive-datapath-manager
856-
(
857-
"docker run "
858-
"--detach "
859-
"--pull=always "
860-
"--name receive-datapath-manager "
861-
"--cap-add=NET_ADMIN "
862-
"--network=host "
863-
"--privileged "
864-
"--gpus all "
865-
"--volume /usr/lib32:/usr/local/nvidia/lib64 "
866-
"--volume /dev/dmabuf_import_helper:/dev/dmabuf_import_helper "
867-
"--env LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu "
868-
"us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14 "
869-
"--num_hops=2 --num_nics=8 --uid= --alsologtostderr"
870-
),
871-
]
817+
def _get_image_id(instance_type_name: str, cuda: bool) -> str:
818+
if instance_type_name == "a3-megagpu-8g":
819+
image_name = "dstack-a3mega-2"
820+
elif cuda:
821+
image_name = f"dstack-cuda-{version.base_image}"
822+
else:
823+
image_name = f"dstack-{version.base_image}"
824+
image_name = image_name.replace(".", "-")
825+
return f"projects/dstack/global/images/{image_name}"
826+
827+
828+
def _get_gateway_image_id() -> str:
829+
return "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20230714"
830+
831+
832+
def _get_backend_specific_commands(instance_type_name: str) -> List[str]:
833+
if instance_type_name == "a3-megagpu-8g":
834+
return tcpx_features.get_backend_specific_commands_tcpxo()
835+
return []
872836

873837

874838
def _get_volume_price(size: int) -> float:

src/dstack/_internal/core/backends/gcp/features/__init__.py

Whitespace-only changes.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
from typing import List
2+
3+
4+
def get_backend_specific_commands_tcpxo() -> List[str]:
    """Return the shell commands that set up GPUDirect-TCPXO on a host.

    Returns:
        Four commands, in order: load the ``import-helper`` kernel module,
        configure docker auth for ``us-docker.pkg.dev``, run the NCCL
        installer container, and start the receive-datapath-manager container.
    """
    # Install the nccl, nccl-net lib into /var/lib/tcpxo/lib64/.
    installer_args = (
        "docker run --rm --name nccl-installer",
        "--network=host",
        "--volume /var/lib:/var/lib",
        "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1",
        "install --install-nccl",
    )
    # Start FasTrak receive-datapath-manager
    manager_args = (
        "docker run",
        "--detach",
        "--pull=always",
        "--name receive-datapath-manager",
        "--cap-add=NET_ADMIN",
        "--network=host",
        "--privileged",
        "--gpus all",
        "--volume /usr/lib32:/usr/local/nvidia/lib64",
        "--volume /dev/dmabuf_import_helper:/dev/dmabuf_import_helper",
        "--env LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu",
        "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14",
        "--num_hops=2 --num_nics=8 --uid= --alsologtostderr",
    )
    return [
        "modprobe import-helper",
        "gcloud -q auth configure-docker us-docker.pkg.dev",
        " ".join(installer_args),
        " ".join(manager_args),
    ]

src/dstack/_internal/core/backends/gcp/resources.py

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -209,22 +209,6 @@ def _get_network_interfaces(
209209
return network_interfaces
210210

211211

212-
def get_image_id(cuda: bool) -> str:
    """Return the image resource path used to boot instances.

    Args:
        cuda: Currently ignored; the same image is returned either way.

    Returns:
        The fixed path of the a3mega (TCPXO) image.
    """
    # NOTE: the version-based selection (dstack-{base_image} /
    # dstack-cuda-{base_image}, with "." replaced by "-") is disabled here.
    # The image previously tried for TCPX was
    # projects/cos-cloud/global/images/cos-105-17412-535-78.
    tcpxo_image = "projects/dstack/global/images/slurm-a3mega-20250327t101736z-cloudinit"
    return tcpxo_image
222-
223-
224-
def get_gateway_image_id() -> str:
    """Return the fixed image resource path used for gateway instances."""
    gateway_image = "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20230714"
    return gateway_image
226-
227-
228212
def get_vpc_subnet_or_error(
229213
subnetworks_client: compute_v1.SubnetworksClient,
230214
vpc_project_id: str,

0 commit comments

Comments
 (0)