Commit e7b3a37

fix: support simpler migration from nvidia.com/gpu limits (#417)
* fix: update status for in mem gpus
* fix: support simpler migration from nvidia.com/gpu limits
* fix: support one label seamless migration
1 parent 6323781 commit e7b3a37
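
What the migration looks like from the user side: a Pod that already sets an nvidia.com/gpu limit can opt into TensorFusion with nothing more than the enabled label (plus, optionally, a pool annotation). The Go sketch below mirrors the new test case in internal/webhook/v1/pod_webhook_test.go; the Pod name, image, pool name and GPU count are illustrative, and it assumes the same imports the test file uses (corev1, metav1, resource, constants).

// A Pod using the "one label" migration path: no TensorFusion resource
// annotations, only the enabled label and the existing nvidia.com/gpu limit.
pod := &corev1.Pod{
    ObjectMeta: metav1.ObjectMeta{
        Name:      "migrated-app",        // illustrative
        Namespace: "default",
        Labels: map[string]string{
            constants.TensorFusionEnabledLabelKey: "true",
        },
        Annotations: map[string]string{
            constants.GpuPoolKey: "mock", // pool name is illustrative
        },
    },
    Spec: corev1.PodSpec{
        Containers: []corev1.Container{{
            Name:  "main",
            Image: "cuda-app:latest",     // illustrative
            Resources: corev1.ResourceRequirements{
                Limits: corev1.ResourceList{
                    corev1.ResourceName(constants.NvidiaGPUKey): resource.MustParse("3"),
                },
            },
        }},
    },
}
// Per the new test, admission is expected to add annotations equivalent to:
//   tensor-fusion.ai/compute-percent-request: "100"
//   tensor-fusion.ai/gpu-count: "3"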

File tree

8 files changed: +144, -71 lines


api/v1/gpu_types.go

Lines changed: 1 addition & 0 deletions
@@ -22,6 +22,7 @@ import (
 )
 
 // GPUStatus defines the observed state of GPU.
+// NOTE: When new fields added, remember to update syncGPUMetadataAndStatusFromCluster
 type GPUStatus struct {
 	// +kubebuilder:default=Pending
 	Phase TensorFusionGPUPhase `json:"phase"`

charts/tensor-fusion/crds/tensor-fusion.ai_gpus.yaml

Lines changed: 3 additions & 1 deletion
@@ -65,7 +65,9 @@ spec:
           metadata:
             type: object
           status:
-            description: GPUStatus defines the observed state of GPU.
+            description: |-
+              GPUStatus defines the observed state of GPU.
+              NOTE: When new fields added, remember to update syncGPUMetadataAndStatusFromCluster
             properties:
               available:
                 properties:

config/crd/bases/tensor-fusion.ai_gpus.yaml

Lines changed: 3 additions & 1 deletion
@@ -65,7 +65,9 @@ spec:
           metadata:
             type: object
          status:
-            description: GPUStatus defines the observed state of GPU.
+            description: |-
+              GPUStatus defines the observed state of GPU.
+              NOTE: When new fields added, remember to update syncGPUMetadataAndStatusFromCluster
             properties:
               available:
                 properties:

internal/gpuallocator/gpuallocator.go

Lines changed: 6 additions & 2 deletions
@@ -1063,6 +1063,9 @@ func syncGPUMetadataAndStatusFromCluster(old *tfv1.GPU, gpu *tfv1.GPU) {
 	old.Status.NodeSelector = gpu.Status.NodeSelector
 	old.Status.GPUModel = gpu.Status.GPUModel
 	old.Status.UsedBy = gpu.Status.UsedBy
+	old.Status.Vendor = gpu.Status.Vendor
+	old.Status.NUMANode = gpu.Status.NUMANode
+	old.Status.Index = gpu.Status.Index
 }
 
 func (s *GpuAllocator) handleGPUUpdateCapacityDiff(old, gpu *tfv1.GPU) {

@@ -1437,13 +1440,14 @@ func removeRunningApp(ctx context.Context, gpu *tfv1.GPU, workloadNameNamespace
 }
 
 func (s *GpuAllocator) ComposeAllocationRequest(pod *v1.Pod) (*tfv1.AllocRequest, string, error) {
+	// allow Pods with no requests/limits to use TensorFusion, Pod webhook will ensure at least one request/limit is set
 	gpuRequestResource, err := utils.GetGPUResource(pod, true)
 	if err != nil {
-		return &tfv1.AllocRequest{}, "invalid gpu request annotation", err
+		log.FromContext(s.ctx).Error(err, "Invalid gpu request annotation", "pod", pod.Name, "namespace", pod.Namespace)
 	}
 	gpuLimitResource, err := utils.GetGPUResource(pod, false)
 	if err != nil {
-		return &tfv1.AllocRequest{}, "invalid gpu limit annotation", err
+		log.FromContext(s.ctx).Error(err, "Invalid gpu limit annotation", "pod", pod.Name, "namespace", pod.Namespace)
 	}
 
 	count := 1

internal/utils/compose.go

Lines changed: 20 additions & 9 deletions
@@ -99,27 +99,37 @@ func AddOrOverrideTFClientMissingAnnotationsBeforePatch(pod *v1.Pod, tfInfo Tens
 	}
 
 	// add full annotations
-	pod.Annotations[constants.TFLOPSLimitAnnotation] = tfInfo.Profile.Resources.Limits.Tflops.String()
-	pod.Annotations[constants.VRAMLimitAnnotation] = tfInfo.Profile.Resources.Limits.Vram.String()
-	if tfInfo.Profile.Qos == "" {
-		pod.Annotations[constants.QoSLevelAnnotation] = string(tfv1.QoSMedium)
-	} else {
-		pod.Annotations[constants.QoSLevelAnnotation] = string(tfInfo.Profile.Qos)
+	if !tfInfo.Profile.Resources.Limits.Tflops.IsZero() {
+		pod.Annotations[constants.TFLOPSLimitAnnotation] = tfInfo.Profile.Resources.Limits.Tflops.String()
+	}
+	if !tfInfo.Profile.Resources.Limits.Vram.IsZero() {
+		pod.Annotations[constants.VRAMLimitAnnotation] = tfInfo.Profile.Resources.Limits.Vram.String()
+	}
+	if !tfInfo.Profile.Resources.Requests.Tflops.IsZero() {
+		pod.Annotations[constants.TFLOPSRequestAnnotation] = tfInfo.Profile.Resources.Requests.Tflops.String()
+	}
+	if !tfInfo.Profile.Resources.Requests.Vram.IsZero() {
+		pod.Annotations[constants.VRAMRequestAnnotation] = tfInfo.Profile.Resources.Requests.Vram.String()
 	}
-	pod.Annotations[constants.TFLOPSRequestAnnotation] = tfInfo.Profile.Resources.Requests.Tflops.String()
-	pod.Annotations[constants.VRAMRequestAnnotation] = tfInfo.Profile.Resources.Requests.Vram.String()
-
 	if !tfInfo.Profile.Resources.Requests.ComputePercent.IsZero() {
 		pod.Annotations[constants.ComputeRequestAnnotation] = tfInfo.Profile.Resources.Requests.ComputePercent.String()
 	}
 	if !tfInfo.Profile.Resources.Limits.ComputePercent.IsZero() {
 		pod.Annotations[constants.ComputeLimitAnnotation] = tfInfo.Profile.Resources.Limits.ComputePercent.String()
 	}
+	if tfInfo.Profile.Qos == "" {
+		pod.Annotations[constants.QoSLevelAnnotation] = string(tfv1.QoSMedium)
+	} else {
+		pod.Annotations[constants.QoSLevelAnnotation] = string(tfInfo.Profile.Qos)
+	}
 	pod.Annotations[constants.GpuCountAnnotation] = fmt.Sprintf("%d", tfInfo.Profile.GPUCount)
 	pod.Annotations[constants.GpuPoolKey] = tfInfo.Profile.PoolName
 	if tfInfo.Profile.GPUModel != "" {
 		pod.Annotations[constants.GPUModelAnnotation] = tfInfo.Profile.GPUModel
 	}
+	if tfInfo.Profile.GPUVendor != "" {
+		pod.Annotations[constants.GpuVendorAnnotation] = tfInfo.Profile.GPUVendor
+	}
 	pod.Annotations[constants.IsLocalGPUAnnotation] = strconv.FormatBool(tfInfo.Profile.IsLocalGPU)
 	pod.Annotations[constants.SidecarWorkerAnnotation] = strconv.FormatBool(tfInfo.Profile.SidecarWorker)
 	// add inject container annotation for client Pod, in case user doesn't specify it

@@ -185,6 +195,7 @@ func AppendTFWorkerLabelsAndAnnotationsAfterTemplate(
 			return strconv.Itoa(int(index))
 		}), ",")
 	}
+	annotations[constants.ComputingIsolationModeAnnotation] = string(workload.Spec.ComputeIsolation)
 	return labels, annotations
 }
 

internal/webhook/v1/pod_webhook.go

Lines changed: 9 additions & 18 deletions
@@ -125,8 +125,9 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque
 	if err := m.Client.Get(ctx, client.ObjectKey{Name: tfInfo.Profile.PoolName}, pool); err != nil {
 		return admission.Errored(http.StatusInternalServerError, fmt.Errorf("gpu pool(%s) does not exist", tfInfo.Profile.PoolName))
 	}
+	tfInfo.Profile.Qos = calculateQoSLevel(tfInfo.Profile, pool)
 
-	if workload, err := m.createOrUpdateWorkload(ctx, pod, &tfInfo, pool); err != nil {
+	if workload, err := m.createOrUpdateWorkload(ctx, pod, &tfInfo); err != nil {
 		return admission.Errored(http.StatusInternalServerError, fmt.Errorf("create tf workload: %w", err))
 	} else {
 		// Pod mutating webhook can not get Pod UID,

@@ -159,13 +160,12 @@ func (m *TensorFusionPodMutator) Handle(ctx context.Context, req admission.Reque
 	}
 
 	// Check if final profile is valid and contains valid GPU resource requests
-	if tfInfo.Profile.Resources.Limits.Tflops.IsZero() && tfInfo.Profile.Resources.Limits.ComputePercent.IsZero() {
+	if tfInfo.Profile.Resources.Requests.Tflops.IsZero() &&
+		tfInfo.Profile.Resources.Requests.ComputePercent.IsZero() &&
+		tfInfo.Profile.Resources.Requests.Vram.IsZero() {
 		return admission.Errored(http.StatusInternalServerError,
-			fmt.Errorf("tflops limit is not set, please set tensor-fusion.ai/tflops-limit or tensor-fusion.ai/compute-percent-limit annotation on Pod"))
-	}
-	if tfInfo.Profile.Resources.Limits.Vram.IsZero() {
-		return admission.Errored(http.StatusInternalServerError,
-			fmt.Errorf("vram limit is not set, please set tensor-fusion.ai/vram-limit annotation on Pod"))
+			fmt.Errorf("tflops request is not set, please set tensor-fusion.ai/tflops-request or/and tensor-fusion.ai/compute-percent-request"+
+				" or/and tensor-fusion.ai/vram-request annotation on Pod"))
 	}
 
 	// Add defaults and tensor-fusion injection logic

@@ -213,18 +213,9 @@ func (m *TensorFusionPodMutator) createOrUpdateWorkload(
 	ctx context.Context,
 	pod *corev1.Pod,
 	tfInfo *utils.TensorFusionInfo,
-	pool *tfv1.GPUPool) (*tfv1.TensorFusionWorkload, error) {
+) (*tfv1.TensorFusionWorkload, error) {
 	// Create the desired spec for comparison
-	desiredSpec := tfv1.WorkloadProfileSpec{
-		Replicas:          nil,
-		PoolName:          tfInfo.Profile.PoolName,
-		Resources:         tfInfo.Profile.Resources,
-		Qos:               calculateQoSLevel(tfInfo.Profile, pool),
-		IsLocalGPU:        tfInfo.Profile.IsLocalGPU,
-		GPUCount:          tfInfo.Profile.GPUCount,
-		GPUModel:          tfInfo.Profile.GPUModel,
-		AutoScalingConfig: tfInfo.Profile.AutoScalingConfig,
-	}
+	desiredSpec := *tfInfo.Profile.DeepCopy()
 
 	workload := &tfv1.TensorFusionWorkload{}
 	err := m.Client.Get(ctx, client.ObjectKey{Name: tfInfo.WorkloadName, Namespace: pod.Namespace}, workload)
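
The validity check in Handle now requires only that at least one GPU request dimension is non-zero after parsing (TFLOPS, compute-percent, or VRAM), instead of demanding both a TFLOPS/compute limit and a VRAM limit. A standalone sketch of that predicate, with a function name of my own choosing rather than the repo's:

package main

import (
    "fmt"

    "k8s.io/apimachinery/pkg/api/resource"
)

// hasGPURequest mirrors the webhook's relaxed admission condition: a Pod is
// valid as long as any one of the three request quantities is non-zero.
func hasGPURequest(tflops, computePercent, vram resource.Quantity) bool {
    return !tflops.IsZero() || !computePercent.IsZero() || !vram.IsZero()
}

func main() {
    // Migrated Pod: only compute-percent was derived from nvidia.com/gpu limits.
    fmt.Println(hasGPURequest(resource.Quantity{}, resource.MustParse("100"), resource.Quantity{})) // true
    // Nothing set at all: still rejected by the webhook.
    fmt.Println(hasGPURequest(resource.Quantity{}, resource.Quantity{}, resource.Quantity{})) // false
}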

internal/webhook/v1/pod_webhook_test.go

Lines changed: 55 additions & 0 deletions
@@ -45,6 +45,7 @@ func isAddOperation(patch jsonpatch.JsonPatchOperation) bool {
 	return patch.Operation == "add"
 }
 
+// nolint:goconst
 var _ = Describe("TensorFusionPodMutator", func() {
 	var (
 		mutator *TensorFusionPodMutator

@@ -359,6 +360,60 @@ var _ = Describe("TensorFusionPodMutator", func() {
 		Expect(op.Value).To(Equal("3"))
 	})
 
+	It("should handle gpu resource limits with only one label", func() {
+		pod := &corev1.Pod{
+			ObjectMeta: metav1.ObjectMeta{
+				Name:      "test-pod-local-gpu",
+				Namespace: "default",
+				Labels: map[string]string{
+					constants.TensorFusionEnabledLabelKey: "true",
+				},
+				Annotations: map[string]string{
+					constants.GpuPoolKey: "mock",
+				},
+			},
+			Spec: corev1.PodSpec{
+				Containers: []corev1.Container{
+					{
+						Name:  "main",
+						Image: "test-image",
+						Resources: corev1.ResourceRequirements{
+							Limits: corev1.ResourceList{
+								corev1.ResourceName(constants.NvidiaGPUKey): resource.MustParse("3"),
+							},
+						},
+					},
+				},
+			},
+		}
+		podBytes, err := json.Marshal(pod)
+		Expect(err).NotTo(HaveOccurred())
+		req := admission.Request{
+			AdmissionRequest: admissionv1.AdmissionRequest{
+				Object: runtime.RawExtension{
+					Raw: podBytes,
+				},
+				Operation: admissionv1.Create,
+				Namespace: "default",
+			},
+		}
+		resp := mutator.Handle(ctx, req)
+		Expect(resp.Allowed).To(BeTrue())
+
+		op, found := lo.Find(resp.Patches, func(patch jsonpatch.JsonPatchOperation) bool {
+			return isAddOperation(patch) &&
+				patch.Path == "/metadata/annotations/tensor-fusion.ai~1compute-percent-request"
+		})
+		Expect(found).To(BeTrue())
+		Expect(op.Value).To(Equal("100"))
+		op, found = lo.Find(resp.Patches, func(patch jsonpatch.JsonPatchOperation) bool {
+			return isAddOperation(patch) &&
+				patch.Path == "/metadata/annotations/tensor-fusion.ai~1gpu-count"
+		})
+		Expect(found).To(BeTrue())
+		Expect(op.Value).To(Equal("3"))
+	})
+
 	It("should handle ignore gpu resource limits when annotation is present", func() {
 		pod := &corev1.Pod{
 			ObjectMeta: metav1.ObjectMeta{

internal/webhook/v1/tf_parser.go

Lines changed: 47 additions & 40 deletions
@@ -181,32 +181,44 @@ func parseAutoScalingAnnotations(pod *corev1.Pod, workloadProfile *tfv1.Workload
 }
 
 func parseGPUResourcesAnnotations(pod *corev1.Pod, workloadProfile *tfv1.WorkloadProfile) error {
+	// extract any containers has GPU count limits and set to annotation
+	isMigratedFromContainerLimits := false
+	gpuCount, hasValue := pod.Annotations[constants.GpuCountAnnotation]
+	if hasValue {
+		val, err := strconv.ParseInt(gpuCount, 10, 32)
+		if err != nil {
+			return fmt.Errorf("invalid gpuCount value: %w", err)
+		}
+		workloadProfile.Spec.GPUCount = uint32(val)
+	} else if workloadProfile.Spec.GPUCount == 0 {
+		for _, container := range pod.Spec.Containers {
+			if quantity, ok := container.Resources.Limits[constants.NvidiaGPUKey]; ok {
+				gpuNumber, err := strconv.Atoi(quantity.String())
+				if err != nil || gpuNumber <= 0 {
+					ctrl.Log.Error(err, "unrecognized nvidia.com/gpu in resources, not a valid number", "pod", pod.Name, "container", container.Name)
+				} else {
+					workloadProfile.Spec.GPUCount = uint32(gpuNumber)
+					// For seamless migration with only one tensor-fusion.ai/enabled label
+					// and one tensor-fusion.ai/vram-limit annotation, convert this to 100% computing-percent
+					workloadProfile.Spec.Resources.Limits.ComputePercent = resource.MustParse("100")
+					isMigratedFromContainerLimits = true
+					break
+				}
+			}
+		}
+	}
+
 	if tflopsLimit, hasValue := parseResourceQuantity(pod, constants.TFLOPSLimitAnnotation); hasValue {
 		workloadProfile.Spec.Resources.Limits.Tflops = tflopsLimit
+		// clean compute percent limit when tflops limit is set in annotation
+		if isMigratedFromContainerLimits {
+			workloadProfile.Spec.Resources.Limits.ComputePercent = resource.Quantity{}
+		}
 	}
 	if vramLimit, hasValue := parseResourceQuantity(pod, constants.VRAMLimitAnnotation); hasValue {
 		workloadProfile.Spec.Resources.Limits.Vram = vramLimit
 	}
 
-	computeRequest, hasValue := parseResourceQuantity(pod, constants.ComputeLimitAnnotation)
-	if hasValue {
-		workloadProfile.Spec.Resources.Limits.ComputePercent = computeRequest
-	}
-	computeLimit, hasValue := parseResourceQuantity(pod, constants.ComputeRequestAnnotation)
-	if hasValue {
-		workloadProfile.Spec.Resources.Requests.ComputePercent = computeLimit
-	} else {
-		workloadProfile.Spec.Resources.Requests.ComputePercent = workloadProfile.Spec.Resources.Limits.ComputePercent
-	}
-
-	// tflops - computePercent are mutually exclusive
-	if !workloadProfile.Spec.Resources.Requests.Tflops.IsZero() && !workloadProfile.Spec.Resources.Requests.ComputePercent.IsZero() {
-		return fmt.Errorf("tflops- and computePercent request are mutually exclusive, please specify only one")
-	}
-	if !workloadProfile.Spec.Resources.Limits.Tflops.IsZero() && !workloadProfile.Spec.Resources.Limits.ComputePercent.IsZero() {
-		return fmt.Errorf("tflops- and computePercent limit are mutually exclusive, please specify only one")
-	}
-
 	if tflopsRequest, hasValue := parseResourceQuantity(pod, constants.TFLOPSRequestAnnotation); hasValue {
 		workloadProfile.Spec.Resources.Requests.Tflops = tflopsRequest
 	} else if workloadProfile.Spec.Resources.Requests.Tflops.IsZero() {

@@ -218,31 +230,26 @@ func parseGPUResourcesAnnotations(pod *corev1.Pod, workloadProfile *tfv1.Workloa
 		workloadProfile.Spec.Resources.Requests.Vram = workloadProfile.Spec.Resources.Limits.Vram
 	}
 
-	qosLevel, hasValue := pod.Annotations[constants.QoSLevelAnnotation]
+	// Percentage way to specify GPU resource request, not recommended, should use TFLOPs instead
+	computeLimit, hasValue := parseResourceQuantity(pod, constants.ComputeLimitAnnotation)
 	if hasValue {
-		workloadProfile.Spec.Qos = tfv1.QoSLevel(qosLevel)
+		workloadProfile.Spec.Resources.Limits.ComputePercent = computeLimit
+	}
+	computeRequest, hasValue := parseResourceQuantity(pod, constants.ComputeRequestAnnotation)
+	if hasValue {
+		workloadProfile.Spec.Resources.Requests.ComputePercent = computeRequest
+	} else if workloadProfile.Spec.Resources.Requests.Tflops.IsZero() && workloadProfile.Spec.Resources.Requests.ComputePercent.IsZero() {
+		workloadProfile.Spec.Resources.Requests.ComputePercent = workloadProfile.Spec.Resources.Limits.ComputePercent
 	}
 
-	// extract any containers has GPU count limits and set to annotation
-	gpuCount, hasValue := pod.Annotations[constants.GpuCountAnnotation]
+	// tflops - computePercent are mutually exclusive
+	if !workloadProfile.Spec.Resources.Requests.Tflops.IsZero() && !workloadProfile.Spec.Resources.Requests.ComputePercent.IsZero() {
+		return fmt.Errorf("tflops- and computePercent request are mutually exclusive, please specify only one")
+	}
+
+	qosLevel, hasValue := pod.Annotations[constants.QoSLevelAnnotation]
 	if hasValue {
-		val, err := strconv.ParseInt(gpuCount, 10, 32)
-		if err != nil {
-			return fmt.Errorf("invalid gpuCount value: %w", err)
-		}
-		workloadProfile.Spec.GPUCount = uint32(val)
-	} else if workloadProfile.Spec.GPUCount == 0 {
-		for _, container := range pod.Spec.Containers {
-			if quantity, ok := container.Resources.Limits[constants.NvidiaGPUKey]; ok {
-				gpuNumber, err := strconv.Atoi(quantity.String())
-				if err != nil || gpuNumber <= 0 {
-					ctrl.Log.Error(err, "unrecognized nvidia.com/gpu in resources, not a valid number", "pod", pod.Name, "container", container.Name)
-				} else {
-					workloadProfile.Spec.GPUCount = uint32(gpuNumber)
-					break
-				}
-			}
-		}
+		workloadProfile.Spec.Qos = tfv1.QoSLevel(qosLevel)
 	}
 
 	gpuVendor, hasValue := pod.Annotations[constants.GpuVendorAnnotation]