Commit c882888

fix: add shm device for shared limiter among process, communicate with hypervisor (#276)
1 parent 990eaef

File tree: 3 files changed (+23 −1 lines)


internal/constants/constants.go

Lines changed: 1 addition & 1 deletion

@@ -159,7 +159,7 @@ const (
 	ProvisionerNamePlaceholder = "__GPU_NODE_RESOURCE_NAME__"
 )
 
-const TFDataPath = "/tmp/tensor-fusion/data"
+const TFDataPath = "/run/tensor-fusion"
 const DataVolumeName = "tf-data"
 const TensorFusionPoolManualCompaction = Domain + "/manual-compaction"
 const AlertJobName = "tensor-fusion"
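
The data path moves from /tmp/tensor-fusion/data to /run/tensor-fusion; on typical Linux nodes /run is a tmpfs mount, so files under it live in memory and can act as the shared-memory channel between the limiter, the workers, and the hypervisor on the same node. Below is a minimal sketch, not the repository's actual code, of how TFDataPath and DataVolumeName could back the node-local volume that both sides mount; the hostPath source and DirectoryOrCreate type are assumptions, since the real volume definition is truncated in this diff view.

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// These mirror the constants from constants.go after this change.
const (
	TFDataPath     = "/run/tensor-fusion" // typically tmpfs-backed at runtime
	DataVolumeName = "tf-data"
)

// dataVolume builds a node-local hostPath volume from the two constants.
// The hostPath source and DirectoryOrCreate type are assumptions; the real
// volume definition in compose.go is not shown in this diff.
func dataVolume() v1.Volume {
	hostPathType := v1.HostPathDirectoryOrCreate
	return v1.Volume{
		Name: DataVolumeName,
		VolumeSource: v1.VolumeSource{
			HostPath: &v1.HostPathVolumeSource{
				Path: TFDataPath,
				Type: &hostPathType,
			},
		},
	}
}

func main() {
	fmt.Printf("%+v\n", dataVolume())
}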

internal/constants/env.go

Lines changed: 2 additions & 0 deletions

@@ -93,6 +93,8 @@ const (
 	LdPreloadEnv = "LD_PRELOAD"
 	LdPreloadLimiter = "/home/app/libcuda_limiter.so"
 
+	SharedMemResName = "tensor-fusion.ai/shm"
+
 	// disable GPU limiter, for emergency use
 	DisableGpuLimiterEnv = "DISABLE_GPU_LIMITER"
 	// directly forward CUDA calls to GPU driver in nGPU mode, for emergency use
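
SharedMemResName is a Kubernetes extended-resource name. The compose.go changes below set it as a limit of 1 on both the injected client container and the worker container, so the scheduler and kubelet only place those pods on nodes where the shm device is advertised (presumably by the hypervisor or a device plugin; the advertising side is not part of this commit). A minimal sketch of setting that limit, with a nil-map guard that the diff itself omits:

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

// SharedMemResName mirrors the constant added in env.go.
const SharedMemResName = "tensor-fusion.ai/shm"

// addShmLimit requests one unit of the extended resource on a container.
// Unlike the diff, it guards against a nil Limits map; the repository's code
// presumably initializes Limits earlier in the compose path.
func addShmLimit(c *v1.Container) {
	if c.Resources.Limits == nil {
		c.Resources.Limits = v1.ResourceList{}
	}
	c.Resources.Limits[SharedMemResName] = resource.MustParse("1")
}

func main() {
	c := v1.Container{Name: "tensor-fusion-worker"}
	addShmLimit(&c)
	q := c.Resources.Limits[SharedMemResName]
	fmt.Println("shm limit:", q.String()) // prints "shm limit: 1"
}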

internal/utils/compose.go

Lines changed: 20 additions & 0 deletions

@@ -243,6 +243,8 @@ func AddTFDefaultClientConfBeforePatch(
 		Value: constants.NGPUPathValue,
 	})
 
+	pod.Spec.Containers[injectContainerIndex].Resources.Limits[constants.SharedMemResName] = resource.MustParse("1")
+
 	// disable GPU limiter killer switch
 	if pod.Annotations[constants.DisableFeaturesAnnotation] != "" {
 		features := strings.Split(pod.Annotations[constants.DisableFeaturesAnnotation], ",")
@@ -334,13 +336,29 @@ func AddTFHypervisorConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, poo
 		},
 	})
 
+	composeHypervisorInitContainer(spec, pool)
 	composeHypervisorContainer(spec, pool)
 
 	if enableVector {
 		composeVectorContainer(spec, pool)
 	}
 }
 
+func composeHypervisorInitContainer(spec *v1.PodSpec, pool *tfv1.GPUPool) {
+	spec.InitContainers = append(spec.InitContainers, v1.Container{
+		Name:    "init-shm",
+		Image:   pool.Spec.ComponentConfig.Hypervisor.Image,
+		Command: []string{"hypervisor", "mount-shm"},
+		VolumeMounts: []v1.VolumeMount{
+			{
+				Name:      constants.DataVolumeName,
+				ReadOnly:  false,
+				MountPath: constants.TFDataPath,
+			},
+		},
+	})
+}
+
 func composeHypervisorContainer(spec *v1.PodSpec, pool *tfv1.GPUPool) {
 	spec.Containers[0].VolumeMounts = append(spec.Containers[0].VolumeMounts, v1.VolumeMount{
 		Name: constants.DataVolumeName,
@@ -553,6 +571,8 @@ func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerCon
 		},
 	})
 
+	spec.Containers[0].Resources.Limits[constants.SharedMemResName] = resource.MustParse("1")
+
 	// Add volume from host for CUDA hot migration and snapshot
 	spec.Volumes = append(spec.Volumes, v1.Volume{
 		Name: constants.DataVolumeName,
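
The new init container runs `hypervisor mount-shm` with the tf-data volume mounted at /run/tensor-fusion, and init containers finish before the main containers start, so the shared-memory path is prepared before anything uses it. With the hypervisor mounting that volume and the worker pod gaining the same volume (its mount point is truncated in this view), the per-process limiters and the hypervisor can coordinate through files mapped from that path. The sketch below illustrates the generic mechanism only, not TensorFusion's actual limiter protocol: two processes that map the same (hypothetical) file observe each other's atomic updates to a shared counter.

package main

import (
	"fmt"
	"os"
	"sync/atomic"
	"unsafe"

	"golang.org/x/sys/unix"
)

// shmFile is a hypothetical file name used only for this illustration; it is
// not a path the project actually creates.
const shmFile = "/run/tensor-fusion/example-counter"

func main() {
	f, err := os.OpenFile(shmFile, os.O_CREATE|os.O_RDWR, 0o644)
	if err != nil {
		panic(err)
	}
	defer f.Close()
	if err := f.Truncate(8); err != nil { // room for one int64
		panic(err)
	}

	// MAP_SHARED makes every process that maps this file see the same bytes.
	data, err := unix.Mmap(int(f.Fd()), 0, 8, unix.PROT_READ|unix.PROT_WRITE, unix.MAP_SHARED)
	if err != nil {
		panic(err)
	}
	defer unix.Munmap(data)

	// The mapping is page-aligned, so treating the first 8 bytes as an int64
	// and updating it atomically is safe; other mappers see the new value.
	counter := (*int64)(unsafe.Pointer(&data[0]))
	atomic.AddInt64(counter, 1)
	fmt.Println("shared counter is now", atomic.LoadInt64(counter))
}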
