@@ -181,32 +181,44 @@ func parseAutoScalingAnnotations(pod *corev1.Pod, workloadProfile *tfv1.Workload
 }
 
 func parseGPUResourcesAnnotations(pod *corev1.Pod, workloadProfile *tfv1.WorkloadProfile) error {
+	// extract the GPU count from the pod annotation, or fall back to any container's GPU limits
+	isMigratedFromContainerLimits := false
+	gpuCount, hasValue := pod.Annotations[constants.GpuCountAnnotation]
+	if hasValue {
+		val, err := strconv.ParseInt(gpuCount, 10, 32)
+		if err != nil {
+			return fmt.Errorf("invalid gpuCount value: %w", err)
+		}
+		workloadProfile.Spec.GPUCount = uint32(val)
+	} else if workloadProfile.Spec.GPUCount == 0 {
+		for _, container := range pod.Spec.Containers {
+			if quantity, ok := container.Resources.Limits[constants.NvidiaGPUKey]; ok {
+				gpuNumber, err := strconv.Atoi(quantity.String())
+				if err != nil || gpuNumber <= 0 {
+					ctrl.Log.Error(err, "unrecognized nvidia.com/gpu in resources, not a valid number", "pod", pod.Name, "container", container.Name)
+				} else {
+					workloadProfile.Spec.GPUCount = uint32(gpuNumber)
+					// For seamless migration of workloads that only carry the tensor-fusion.ai/enabled label
+					// and a tensor-fusion.ai/vram-limit annotation, default the compute limit to 100%
+					workloadProfile.Spec.Resources.Limits.ComputePercent = resource.MustParse("100")
+					isMigratedFromContainerLimits = true
+					break
+				}
+			}
+		}
+	}
+
 	if tflopsLimit, hasValue := parseResourceQuantity(pod, constants.TFLOPSLimitAnnotation); hasValue {
 		workloadProfile.Spec.Resources.Limits.Tflops = tflopsLimit
+		// clear the migrated compute-percent limit when a tflops limit is set explicitly in the annotation
+		if isMigratedFromContainerLimits {
+			workloadProfile.Spec.Resources.Limits.ComputePercent = resource.Quantity{}
+		}
 	}
 	if vramLimit, hasValue := parseResourceQuantity(pod, constants.VRAMLimitAnnotation); hasValue {
 		workloadProfile.Spec.Resources.Limits.Vram = vramLimit
 	}
 
-	computeRequest, hasValue := parseResourceQuantity(pod, constants.ComputeLimitAnnotation)
-	if hasValue {
-		workloadProfile.Spec.Resources.Limits.ComputePercent = computeRequest
-	}
-	computeLimit, hasValue := parseResourceQuantity(pod, constants.ComputeRequestAnnotation)
-	if hasValue {
-		workloadProfile.Spec.Resources.Requests.ComputePercent = computeLimit
-	} else {
-		workloadProfile.Spec.Resources.Requests.ComputePercent = workloadProfile.Spec.Resources.Limits.ComputePercent
-	}
-
-	// tflops - computePercent are mutually exclusive
-	if !workloadProfile.Spec.Resources.Requests.Tflops.IsZero() && !workloadProfile.Spec.Resources.Requests.ComputePercent.IsZero() {
-		return fmt.Errorf("tflops- and computePercent request are mutually exclusive, please specify only one")
-	}
-	if !workloadProfile.Spec.Resources.Limits.Tflops.IsZero() && !workloadProfile.Spec.Resources.Limits.ComputePercent.IsZero() {
-		return fmt.Errorf("tflops- and computePercent limit are mutually exclusive, please specify only one")
-	}
-
 	if tflopsRequest, hasValue := parseResourceQuantity(pod, constants.TFLOPSRequestAnnotation); hasValue {
 		workloadProfile.Spec.Resources.Requests.Tflops = tflopsRequest
 	} else if workloadProfile.Spec.Resources.Requests.Tflops.IsZero() {
@@ -218,31 +230,26 @@ func parseGPUResourcesAnnotations(pod *corev1.Pod, workloadProfile *tfv1.Workloa
 		workloadProfile.Spec.Resources.Requests.Vram = workloadProfile.Spec.Resources.Limits.Vram
 	}
 
-	qosLevel, hasValue := pod.Annotations[constants.QoSLevelAnnotation]
+	// Percentage-based way to specify the GPU resource request; not recommended, prefer TFLOPS instead
+	computeLimit, hasValue := parseResourceQuantity(pod, constants.ComputeLimitAnnotation)
 	if hasValue {
-		workloadProfile.Spec.Qos = tfv1.QoSLevel(qosLevel)
+		workloadProfile.Spec.Resources.Limits.ComputePercent = computeLimit
+	}
+	computeRequest, hasValue := parseResourceQuantity(pod, constants.ComputeRequestAnnotation)
+	if hasValue {
+		workloadProfile.Spec.Resources.Requests.ComputePercent = computeRequest
+	} else if workloadProfile.Spec.Resources.Requests.Tflops.IsZero() && workloadProfile.Spec.Resources.Requests.ComputePercent.IsZero() {
+		workloadProfile.Spec.Resources.Requests.ComputePercent = workloadProfile.Spec.Resources.Limits.ComputePercent
 	}
 
-	// extract any containers has GPU count limits and set to annotation
-	gpuCount, hasValue := pod.Annotations[constants.GpuCountAnnotation]
+	// tflops and computePercent are mutually exclusive
+	if !workloadProfile.Spec.Resources.Requests.Tflops.IsZero() && !workloadProfile.Spec.Resources.Requests.ComputePercent.IsZero() {
+		return fmt.Errorf("tflops and computePercent request are mutually exclusive, please specify only one")
+	}
+
+	qosLevel, hasValue := pod.Annotations[constants.QoSLevelAnnotation]
 	if hasValue {
-		val, err := strconv.ParseInt(gpuCount, 10, 32)
-		if err != nil {
-			return fmt.Errorf("invalid gpuCount value: %w", err)
-		}
-		workloadProfile.Spec.GPUCount = uint32(val)
-	} else if workloadProfile.Spec.GPUCount == 0 {
-		for _, container := range pod.Spec.Containers {
-			if quantity, ok := container.Resources.Limits[constants.NvidiaGPUKey]; ok {
-				gpuNumber, err := strconv.Atoi(quantity.String())
-				if err != nil || gpuNumber <= 0 {
-					ctrl.Log.Error(err, "unrecognized nvidia.com/gpu in resources, not a valid number", "pod", pod.Name, "container", container.Name)
-				} else {
-					workloadProfile.Spec.GPUCount = uint32(gpuNumber)
-					break
-				}
-			}
-		}
+		workloadProfile.Spec.Qos = tfv1.QoSLevel(qosLevel)
 	}
 
 	gpuVendor, hasValue := pod.Annotations[constants.GpuVendorAnnotation]
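
To make the migration and exclusivity rules above concrete, here is a minimal test sketch. It is hypothetical, not part of this commit: it assumes it lives in the same package so that parseGPUResourcesAnnotations, constants, and tfv1 resolve, and the test name, container name, and literal values are illustrative. A pod carrying only a plain nvidia.com/gpu container limit should come out with GPUCount set and a 100% compute-percent limit, while combining tflops and compute-percent request annotations should be rejected.

import (
	"testing"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"
)

func TestParseGPUResourcesAnnotationsSketch(t *testing.T) {
	pod := &corev1.Pod{
		Spec: corev1.PodSpec{
			Containers: []corev1.Container{{
				Name: "main",
				Resources: corev1.ResourceRequirements{
					// plain nvidia.com/gpu limit, no tensor-fusion annotations
					Limits: corev1.ResourceList{
						constants.NvidiaGPUKey: resource.MustParse("2"),
					},
				},
			}},
		},
	}

	// Migration path: the container limit becomes GPUCount plus a 100% compute limit.
	profile := &tfv1.WorkloadProfile{}
	if err := parseGPUResourcesAnnotations(pod, profile); err != nil {
		t.Fatal(err)
	}
	if profile.Spec.GPUCount != 2 {
		t.Fatalf("expected GPUCount=2, got %d", profile.Spec.GPUCount)
	}
	if profile.Spec.Resources.Limits.ComputePercent.Cmp(resource.MustParse("100")) != 0 {
		t.Fatalf("expected 100%% compute limit, got %s", profile.Spec.Resources.Limits.ComputePercent.String())
	}

	// Exclusivity: requesting both tflops and compute-percent must return an error.
	pod.Annotations = map[string]string{
		constants.TFLOPSRequestAnnotation:  "10",
		constants.ComputeRequestAnnotation: "50",
	}
	if err := parseGPUResourcesAnnotations(pod, &tfv1.WorkloadProfile{}); err == nil {
		t.Fatal("expected tflops/computePercent mutual-exclusion error")
	}
}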