1010
1111import dstack ._internal .core .backends .gcp .auth as auth
1212import dstack ._internal .core .backends .gcp .resources as gcp_resources
13+ from dstack import version
1314from dstack ._internal .core .backends .base .compute import (
1415 Compute ,
1516 ComputeWithCreateInstanceSupport ,
2627 merge_tags ,
2728)
2829from dstack ._internal .core .backends .base .offers import get_catalog_offers
30+ from dstack ._internal .core .backends .gcp .features import tcpx as tcpx_features
2931from dstack ._internal .core .backends .gcp .models import GCPConfig
3032from dstack ._internal .core .errors import (
3133 ComputeError ,
@@ -273,8 +275,9 @@ def create_instance(
273275 request .project = self .config .project_id
274276 request .instance_resource = gcp_resources .create_instance_struct (
275277 disk_size = disk_size ,
276- image_id = gcp_resources .get_image_id (
277- len (instance_offer .instance .resources .gpus ) > 0 ,
278+ image_id = _get_image_id (
279+ instance_type_name = instance_offer .instance .name ,
280+ cuda = len (instance_offer .instance .resources .gpus ) > 0 ,
278281 ),
279282 machine_type = instance_offer .instance .name ,
280283 accelerators = gcp_resources .get_accelerators (
@@ -285,7 +288,9 @@ def create_instance(
285288 spot = instance_offer .instance .resources .spot ,
286289 user_data = get_user_data (
287290 authorized_keys ,
288- backend_specific_commands = _get_backend_specific_commands_tcpxo (),
291+ backend_specific_commands = _get_backend_specific_commands (
292+ instance_offer .instance .name
293+ ),
289294 ),
290295 authorized_keys = authorized_keys ,
291296 labels = labels ,
@@ -467,7 +472,7 @@ def create_gateway(
467472 request .project = self .config .project_id
468473 request .instance_resource = gcp_resources .create_instance_struct (
469474 disk_size = 10 ,
470- image_id = gcp_resources . get_gateway_image_id (),
475+ image_id = _get_gateway_image_id (),
471476 machine_type = "e2-small" ,
472477 accelerators = [],
473478 spot = False ,
@@ -809,66 +814,25 @@ def _unique_instance_name(instance: InstanceType) -> str:
809814 return f"{ name } -{ gpu .name } -{ gpu .memory_mib } "
810815
811816
812- def _get_backend_specific_commands_tcpx () -> List [str ]:
813- return [
814- "cos-extensions install gpu -- --version=latest" ,
815- "sudo mount --bind /var/lib/nvidia /var/lib/nvidia" ,
816- "sudo mount -o remount,exec /var/lib/nvidia" ,
817- (
818- "docker run "
819- "--detach "
820- "--pull=always"
821- "--name receive-datapath-manager "
822- "--privileged "
823- "--cap-add=NET_ADMIN --network=host "
824- "--volume /var/lib/nvidia/lib64:/usr/local/nvidia/lib64 "
825- "--device /dev/nvidia0:/dev/nvidia0 --device /dev/nvidia1:/dev/nvidia1 "
826- "--device /dev/nvidia2:/dev/nvidia2 --device /dev/nvidia3:/dev/nvidia3 "
827- "--device /dev/nvidia4:/dev/nvidia4 --device /dev/nvidia5:/dev/nvidia5 "
828- "--device /dev/nvidia6:/dev/nvidia6 --device /dev/nvidia7:/dev/nvidia7 "
829- "--device /dev/nvidia-uvm:/dev/nvidia-uvm --device /dev/nvidiactl:/dev/nvidiactl "
830- "--env LD_LIBRARY_PATH=/usr/local/nvidia/lib64 "
831- "--volume /run/tcpx:/run/tcpx "
832- "--entrypoint /tcpgpudmarxd/build/app/tcpgpudmarxd "
833- "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd "
834- '--gpu_nic_preset a3vm --gpu_shmem_type fd --uds_path "/run/tcpx" --setup_param "--verbose 128 2 0"'
835- ),
836- "sudo iptables -I INPUT -p tcp -m tcp -j ACCEPT" ,
837- "docker run --rm -v /var/lib:/var/lib us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx install --install-nccl" ,
838- "sudo mount --bind /var/lib/tcpx /var/lib/tcpx" ,
839- "sudo mount -o remount,exec /var/lib/tcpx" ,
840- ]
841-
842-
843- def _get_backend_specific_commands_tcpxo () -> List [str ]:
844- return [
845- "modprobe import-helper" ,
846- "gcloud -q auth configure-docker us-docker.pkg.dev" ,
847- # Install the nccl, nccl-net lib into /var/lib/tcpxo/lib64/.
848- (
849- "docker run --rm --name nccl-installer "
850- "--network=host "
851- "--volume /var/lib:/var/lib "
852- "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.8-1 "
853- "install --install-nccl"
854- ),
855- # Start FasTrak receive-datapath-manager
856- (
857- "docker run "
858- "--detach "
859- "--pull=always "
860- "--name receive-datapath-manager "
861- "--cap-add=NET_ADMIN "
862- "--network=host "
863- "--privileged "
864- "--gpus all "
865- "--volume /usr/lib32:/usr/local/nvidia/lib64 "
866- "--volume /dev/dmabuf_import_helper:/dev/dmabuf_import_helper "
867- "--env LD_LIBRARY_PATH=/usr/lib/x86_64-linux-gnu "
868- "us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.14 "
869- "--num_hops=2 --num_nics=8 --uid= --alsologtostderr"
870- ),
871- ]
817+ def _get_image_id (instance_type_name : str , cuda : bool ) -> str :
818+ if instance_type_name == "a3-megagpu-8g" :
819+ image_name = "dstack-a3mega-2"
820+ elif cuda :
821+ image_name = f"dstack-cuda-{ version .base_image } "
822+ else :
823+ image_name = f"dstack-{ version .base_image } "
824+ image_name = image_name .replace ("." , "-" )
825+ return f"projects/dstack/global/images/{ image_name } "
826+
827+
828+ def _get_gateway_image_id () -> str :
829+ return "projects/ubuntu-os-cloud/global/images/ubuntu-2204-jammy-v20230714"
830+
831+
832+ def _get_backend_specific_commands (instance_type_name : str ) -> List [str ]:
833+ if instance_type_name == "a3-megagpu-8g" :
834+ return tcpx_features .get_backend_specific_commands_tcpxo ()
835+ return []
872836
873837
874838def _get_volume_price (size : int ) -> float :
0 commit comments