@@ -243,6 +243,8 @@ func AddTFDefaultClientConfBeforePatch(
243243 Value : constants .NGPUPathValue ,
244244 })
245245
246+ pod .Spec .Containers [injectContainerIndex ].Resources .Limits [constants .SharedMemResName ] = resource .MustParse ("1" )
247+
246248 // disable GPU limiter killer switch
247249 if pod .Annotations [constants .DisableFeaturesAnnotation ] != "" {
248250 features := strings .Split (pod .Annotations [constants .DisableFeaturesAnnotation ], "," )
@@ -334,13 +336,29 @@ func AddTFHypervisorConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, poo
334336 },
335337 })
336338
339+ composeHypervisorInitContainer (spec , pool )
337340 composeHypervisorContainer (spec , pool )
338341
339342 if enableVector {
340343 composeVectorContainer (spec , pool )
341344 }
342345}
343346
347+ func composeHypervisorInitContainer (spec * v1.PodSpec , pool * tfv1.GPUPool ) {
348+ spec .InitContainers = append (spec .InitContainers , v1.Container {
349+ Name : "init-shm" ,
350+ Image : pool .Spec .ComponentConfig .Hypervisor .Image ,
351+ Command : []string {"hypervisor" , "mount-shm" },
352+ VolumeMounts : []v1.VolumeMount {
353+ {
354+ Name : constants .DataVolumeName ,
355+ ReadOnly : false ,
356+ MountPath : constants .TFDataPath ,
357+ },
358+ },
359+ })
360+ }
361+
344362func composeHypervisorContainer (spec * v1.PodSpec , pool * tfv1.GPUPool ) {
345363 spec .Containers [0 ].VolumeMounts = append (spec .Containers [0 ].VolumeMounts , v1.VolumeMount {
346364 Name : constants .DataVolumeName ,
@@ -553,6 +571,8 @@ func AddWorkerConfAfterTemplate(ctx context.Context, spec *v1.PodSpec, workerCon
553571 },
554572 })
555573
574+ spec .Containers [0 ].Resources .Limits [constants .SharedMemResName ] = resource .MustParse ("1" )
575+
556576 // Add volume from host for CUDA hot migration and snapshot
557577 spec .Volumes = append (spec .Volumes , v1.Volume {
558578 Name : constants .DataVolumeName ,
0 commit comments