Skip to content

Commit 96c8dab

Browse files
authored
fix: support vram hard-isolation (#414)
1 parent f1b261f commit 96c8dab

File tree

2 files changed

+17
-4
lines changed

2 files changed

+17
-4
lines changed

internal/constants/env.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,11 @@ const (
122122
DisableVRAMManagerEnv = "TF_DISABLE_MEMORY_MANAGER"
123123
DisableWorkerFeatureEnvVal = "1"
124124

125-
// hard limiter mode
125+
// hard limiter mode (not open sourced), in percent; currently only takes effect on the worker container
126126
HardSMLimiterEnv = "TF_CUDA_SM_PERCENT_LIMIT"
127+
// hard limiter (not open sourced), in megabytes; only takes effect on the worker container and when the open-source vgpu.rs gpu-limiter is disabled
128+
// when this mode is used, the memory request cannot autoscale dynamically
129+
HardMemLimiterEnv = "TF_CUDA_MEMORY_LIMIT"
127130

128131
TensorFusionRemoteWorkerPortNumber = 8000
129132
TensorFusionRemoteWorkerPortName = "remote-vgpu"

internal/utils/compose.go

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -804,9 +804,6 @@ func SetWorkerContainerSpec(
804804
}, v1.EnvVar{
805805
Name: constants.ContainerNameEnv,
806806
Value: constants.TFContainerNameWorker,
807-
}, v1.EnvVar{
808-
Name: constants.LdPreloadEnv,
809-
Value: constants.LdPreloadLimiter,
810807
}, v1.EnvVar{
811808
Name: constants.PodNamespaceEnv,
812809
ValueFrom: &v1.EnvVarSource{
@@ -816,15 +813,28 @@ func SetWorkerContainerSpec(
816813
},
817814
})
818815

816+
if !strings.Contains(disabledFeatures, constants.BuiltInFeaturesGpuLimiter) &&
817+
workloadProfile.ComputeIsolation != constants.ComputingIsolationModeHard {
818+
container.Env = append(container.Env, v1.EnvVar{
819+
Name: constants.LdPreloadEnv,
820+
Value: constants.LdPreloadLimiter,
821+
})
822+
}
823+
819824
if disabledFeatures != "" {
820825
container.Env = convertDisabledFeaturesToEnvs(disabledFeatures, container.Env)
821826
}
822827

823828
// TODO should calculate and set by hypervisor before container created
829+
// when the compute isolation mode is hard isolation, the memory limit also changes to hard mode
830+
// the open-source vgpu.rs memory limiter is feedback-loop based and can potentially cause resource contention
824831
if workloadProfile.ComputeIsolation == constants.ComputingIsolationModeHard {
825832
container.Env = append(container.Env, v1.EnvVar{
826833
Name: constants.HardSMLimiterEnv,
827834
Value: workloadProfile.Resources.Limits.ComputePercent.String(),
835+
}, v1.EnvVar{
836+
Name: constants.HardMemLimiterEnv,
837+
Value: strconv.FormatInt(workloadProfile.Resources.Limits.Vram.Value()/(1024*1024), 10),
828838
})
829839
}
830840

0 commit comments

Comments
 (0)