fix: parse nvidia.com/gpu to annotation when enabled (#401)

Code2Life · web-flow · commit 12c3dd562ba8 · 2025-10-22T17:06:57.000+08:00
* fix: disable ngpu mode by default

* chore: lint

* fix: parse nvidia.com/gpu to annotation when enabled

* fix: optimize
diff --git a/internal/webhook/v1/pod_webhook.go b/internal/webhook/v1/pod_webhook.go
@@ -304,7 +304,7 @@ func (m *TensorFusionPodMutator) patchTFClient(
 			return nil, fmt.Errorf("unmarshal patched container, invalid container patch: %w", err)
 		}
 
-		removeNativeGPUResourceClaim(container)
+		removeNativeGPULimitsAndAddCountToAnnotation(pod, container)
 
 		if !isLocalGPU {
 			addConnectionForRemoteFixedReplicaVirtualGPU(pod, container, clientConfig)
@@ -423,13 +423,21 @@ func addConnectionForRemoteFixedReplicaVirtualGPU(pod *corev1.Pod, container *co
 	})
 }
 
-// remove nvidia.com/gpu in resources
-func removeNativeGPUResourceClaim(container *corev1.Container) {
+// remove nvidia.com/gpu in resources, add the GPU number into annotation
+func removeNativeGPULimitsAndAddCountToAnnotation(pod *corev1.Pod, container *corev1.Container) {
 	if container.Resources.Requests != nil {
 		delete(container.Resources.Requests, constants.NvidiaGPUKey)
 	}
 	if container.Resources.Limits != nil {
-		delete(container.Resources.Limits, constants.NvidiaGPUKey)
+		if quantity, ok := container.Resources.Limits[constants.NvidiaGPUKey]; ok {
+			gpuNumber, err := strconv.Atoi(quantity.String())
+			if err != nil || gpuNumber <= 0 {
+				ctrl.Log.Error(err, "unrecognized nvidia.com/gpu in resources, not a valid number", "pod", pod.Name, "container", container.Name)
+			} else {
+				pod.Annotations[constants.GpuCountAnnotation] = strconv.Itoa(gpuNumber)
+			}
+			delete(container.Resources.Limits, constants.NvidiaGPUKey)
+		}
 	}
 }
 

Original file line number	Diff line number	Diff line change
`@@ -304,7 +304,7 @@ func (m *TensorFusionPodMutator) patchTFClient(`
`304`	`304`	`return nil, fmt.Errorf("unmarshal patched container, invalid container patch: %w", err)`
`305`	`305`	`}`
`306`	`306`
`307`		`- removeNativeGPUResourceClaim(container)`
	`307`	`+ removeNativeGPULimitsAndAddCountToAnnotation(pod, container)`
`308`	`308`
`309`	`309`	`if !isLocalGPU {`
`310`	`310`	`addConnectionForRemoteFixedReplicaVirtualGPU(pod, container, clientConfig)`
`@@ -423,13 +423,21 @@ func addConnectionForRemoteFixedReplicaVirtualGPU(pod *corev1.Pod, container *co`
`423`	`423`	`})`
`424`	`424`	`}`
`425`	`425`
`426`		`-// remove nvidia.com/gpu in resources`
`427`		`-func removeNativeGPUResourceClaim(container *corev1.Container) {`
	`426`	`+// remove nvidia.com/gpu in resources, add the GPU number into annotation`
	`427`	`+func removeNativeGPULimitsAndAddCountToAnnotation(pod corev1.Pod, container corev1.Container) {`
`428`	`428`	`if container.Resources.Requests != nil {`
`429`	`429`	`delete(container.Resources.Requests, constants.NvidiaGPUKey)`
`430`	`430`	`}`
`431`	`431`	`if container.Resources.Limits != nil {`
`432`		`- delete(container.Resources.Limits, constants.NvidiaGPUKey)`
	`432`	`+ if quantity, ok := container.Resources.Limits[constants.NvidiaGPUKey]; ok {`
	`433`	`+ gpuNumber, err := strconv.Atoi(quantity.String())`
	`434`	`+ if err != nil \|\| gpuNumber <= 0 {`
	`435`	`+ ctrl.Log.Error(err, "unrecognized nvidia.com/gpu in resources, not a valid number", "pod", pod.Name, "container", container.Name)`
	`436`	`+ } else {`
	`437`	`+ pod.Annotations[constants.GpuCountAnnotation] = strconv.Itoa(gpuNumber)`
	`438`	`+ }`
	`439`	`+ delete(container.Resources.Limits, constants.NvidiaGPUKey)`
	`440`	`+ }`
`433`	`441`	`}`
`434`	`442`	`}`
`435`	`443`