Skip to content

Commit cdde3fd

Browse files
committed
Mount tcpxo dirs via instance mounts
1 parent 8ad206d commit cdde3fd

File tree

2 files changed

+20
-12
lines changed

2 files changed

+20
-12
lines changed

runner/internal/shim/docker.go

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,18 +1005,6 @@ func configureGpus(config *container.Config, hostConfig *container.HostConfig, v
10051005
DeviceIDs: ids,
10061006
},
10071007
)
1008-
hostConfig.Mounts = append(
1009-
hostConfig.Mounts,
1010-
mount.Mount{Type: mount.TypeBind, Source: "/dev/aperture_devices", Target: "/dev/aperture_devices"},
1011-
)
1012-
hostConfig.Mounts = append(
1013-
hostConfig.Mounts,
1014-
mount.Mount{Type: mount.TypeBind, Source: "/var/lib/tcpxo/lib64", Target: "/var/lib/tcpxo/lib64"},
1015-
)
1016-
hostConfig.Mounts = append(
1017-
hostConfig.Mounts,
1018-
mount.Mount{Type: mount.TypeBind, Source: "/var/lib/fastrak/lib64", Target: "/var/lib/fastrak/lib64"},
1019-
)
10201008
case host.GpuVendorAmd:
10211009
// All options are listed here: https://hub.docker.com/r/rocm/pytorch
10221010
// Only the --device options are mandatory; the others seem to be performance-related.

src/dstack/_internal/server/background/tasks/process_running_jobs.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -434,6 +434,10 @@ def _process_provisioning_with_shim(
434434
for volume, volume_mount in zip(volumes, volume_mounts):
435435
volume_mount.name = volume.name
436436

437+
instance_mounts += _get_instance_specific_mounts(
438+
job_provisioning_data.backend, job_provisioning_data.instance_type.name
439+
)
440+
437441
container_user = "root"
438442

439443
job_runtime_data = get_job_runtime_data(job_model)
@@ -825,3 +829,19 @@ def _submit_job_to_runner(
825829
# do not log here, because the runner will send a new status
826830

827831
return True
832+
833+
834+
def _get_instance_specific_mounts(
    backend_type: BackendType, instance_type_name: str
) -> List[InstanceMountPoint]:
    """
    Return extra instance mounts that depend on the backend and instance type.

    Only GCP `a3-megagpu-8g` instances get extra mounts. For them, three host
    paths are mounted into the container at the same location they have on
    the instance. Every other backend/instance type combination yields an
    empty list.

    Args:
        backend_type: backend the instance was provisioned in.
        instance_type_name: name of the instance type, e.g. `a3-megagpu-8g`.

    Returns:
        A new list of mount points; empty when no extra mounts apply.
    """
    needs_a3_mega_mounts = (
        backend_type == BackendType.GCP and instance_type_name == "a3-megagpu-8g"
    )
    if not needs_a3_mega_mounts:
        return []

    # Each path is exposed inside the container under the same path as on the host.
    mirrored_paths = (
        "/dev/aperture_devices",
        "/var/lib/tcpxo/lib64",
        "/var/lib/fastrak/lib64",
    )
    return [
        InstanceMountPoint(instance_path=host_path, path=host_path)
        for host_path in mirrored_paths
    ]

0 commit comments

Comments
 (0)