@@ -304,7 +304,7 @@ func (m *TensorFusionPodMutator) patchTFClient(
304304			return  nil , fmt .Errorf ("unmarshal patched container, invalid container patch: %w" , err )
305305		}
306306
307- 		removeNativeGPUResourceClaim ( container )
307+ 		removeNativeGPULimitsAndAddCountToAnnotation ( pod ,  container )
308308
309309		if  ! isLocalGPU  {
310310			addConnectionForRemoteFixedReplicaVirtualGPU (pod , container , clientConfig )
@@ -423,13 +423,21 @@ func addConnectionForRemoteFixedReplicaVirtualGPU(pod *corev1.Pod, container *co
423423	})
424424}
425425
426- // remove nvidia.com/gpu in resources 
427- func  removeNativeGPUResourceClaim ( container  * corev1.Container ) {
426+ // remove nvidia.com/gpu in resources, add the GPU number into annotation  
427+ func  removeNativeGPULimitsAndAddCountToAnnotation ( pod   * corev1. Pod ,  container  * corev1.Container ) {
428428	if  container .Resources .Requests  !=  nil  {
429429		delete (container .Resources .Requests , constants .NvidiaGPUKey )
430430	}
431431	if  container .Resources .Limits  !=  nil  {
432- 		delete (container .Resources .Limits , constants .NvidiaGPUKey )
432+ 		if  quantity , ok  :=  container .Resources .Limits [constants .NvidiaGPUKey ]; ok  {
433+ 			gpuNumber , err  :=  strconv .Atoi (quantity .String ())
434+ 			if  err  !=  nil  ||  gpuNumber  <=  0  {
435+ 				ctrl .Log .Error (err , "unrecognized nvidia.com/gpu in resources, not a valid number" , "pod" , pod .Name , "container" , container .Name )
436+ 			} else  {
437+ 				pod .Annotations [constants .GpuCountAnnotation ] =  strconv .Itoa (gpuNumber )
438+ 			}
439+ 			delete (container .Resources .Limits , constants .NvidiaGPUKey )
440+ 		}
433441	}
434442}
435443
0 commit comments