Commit f51c471

refactor(gpuallocator): use parameter struct for Alloc function (#227)
1 parent 3df147a commit f51c471
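
In short, the commit folds Alloc's allocation parameters (pool name, workload name/namespace, resource request, GPU count, GPU model) into a single AllocRequest struct, leaving only the context as a positional argument. The signatures before and after, as they appear in the diff below:

// before
func (s *GpuAllocator) Alloc(ctx context.Context, poolName string, workloadNameNamespace tfv1.NameNamespace, request tfv1.Resource, count uint, gpuModel string) ([]*tfv1.GPU, error)

// after
func (s *GpuAllocator) Alloc(ctx context.Context, req AllocRequest) ([]*tfv1.GPU, error)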

File tree

3 files changed (+44, -30 lines):
internal/controller/tensorfusionworkload_controller.go
internal/gpuallocator/gpuallocator.go
internal/gpuallocator/gpuallocator_test.go

internal/controller/tensorfusionworkload_controller.go

Lines changed: 9 additions & 8 deletions
@@ -357,16 +357,11 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
         return types.NamespacedName{Name: gpuName}
     })
     // Release GPU resources
-    if err := r.Allocator.Dealloc(ctx,
-        tfv1.NameNamespace{Namespace: workload.Namespace, Name: workload.Name},
-        workload.Spec.Resources.Requests, gpus); err != nil {
+    if err := r.Allocator.Dealloc(ctx, tfv1.NameNamespace{Name: workload.Name, Namespace: workload.Namespace}, workload.Spec.Resources.Requests, gpus); err != nil {
         log.Error(err, "Failed to release GPU resources, will retry", "gpus", gpus, "pod", pod.Name)
         return false, err
     }
     log.Info("Released GPU resources via finalizer", "gpus", gpus, "pod", pod.Name)
-    if pod.Annotations == nil {
-        pod.Annotations = make(map[string]string)
-    }
 
     return true, nil
 }
@@ -391,7 +386,13 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
     // Create worker pods
     for range count {
         // Schedule GPU for the worker
-        gpus, err := r.Allocator.Alloc(ctx, workload.Spec.PoolName, workloadNameNs, workload.Spec.Resources.Requests, workload.Spec.GPUCount, workload.Spec.GPUModel)
+        gpus, err := r.Allocator.Alloc(ctx, gpuallocator.AllocRequest{
+            PoolName:              workload.Spec.PoolName,
+            WorkloadNameNamespace: workloadNameNs,
+            Request:               workload.Spec.Resources.Requests,
+            Count:                 workload.Spec.GPUCount,
+            GPUModel:              workload.Spec.GPUModel,
+        })
         if err != nil {
             r.Recorder.Eventf(workload, corev1.EventTypeWarning, "ScheduleGPUFailed", "Failed to schedule GPU: %v", err)
             return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil
@@ -526,7 +527,7 @@ func (r *TensorFusionWorkloadReconciler) updateStatus(
 func (r *TensorFusionWorkloadReconciler) SetupWithManager(mgr ctrl.Manager) error {
     return ctrl.NewControllerManagedBy(mgr).
         For(&tfv1.TensorFusionWorkload{}).
-        Named("tensorfusionworkload").
         Owns(&corev1.Pod{}).
+        Named("tensorfusionworkload").
         Complete(r)
 }

internal/gpuallocator/gpuallocator.go

Lines changed: 28 additions & 21 deletions
@@ -53,28 +53,35 @@ type GpuAllocator struct {
     dirtyQueueLock sync.Mutex
 }
 
+// AllocRequest encapsulates all parameters needed for GPU allocation
+type AllocRequest struct {
+    // Name of the GPU pool to allocate from
+    PoolName string
+    // Namespace information for the workload
+    WorkloadNameNamespace tfv1.NameNamespace
+    // Resource requirements for the allocation
+    Request tfv1.Resource
+    // Number of GPUs to allocate
+    Count uint
+    // Specific GPU model to allocate, empty string means any model
+    GPUModel string
+}
+
 // Alloc allocates a request to a gpu or multiple gpus from the same node.
-func (s *GpuAllocator) Alloc(
-    ctx context.Context,
-    poolName string,
-    workloadNameNamespace tfv1.NameNamespace,
-    request tfv1.Resource,
-    count uint,
-    gpuModel string,
-) ([]*tfv1.GPU, error) {
+func (s *GpuAllocator) Alloc(ctx context.Context, req AllocRequest) ([]*tfv1.GPU, error) {
     // Get GPUs from the pool using the in-memory store
-    poolGPUs := s.listGPUsFromPool(poolName)
+    poolGPUs := s.listGPUsFromPool(req.PoolName)
 
     // Add SameNodeFilter if count > 1 to ensure GPUs are from the same node
-    filterRegistry := s.filterRegistry.With(filter.NewResourceFilter(request))
+    filterRegistry := s.filterRegistry.With(filter.NewResourceFilter(req.Request))
 
     // Add GPU model filter if specified
-    if gpuModel != "" {
-        filterRegistry = filterRegistry.With(filter.NewGPUModelFilter(gpuModel))
+    if req.GPUModel != "" {
+        filterRegistry = filterRegistry.With(filter.NewGPUModelFilter(req.GPUModel))
     }
 
-    if count > 1 {
-        filterRegistry = filterRegistry.With(filter.NewSameNodeFilter(count))
+    if req.Count > 1 {
+        filterRegistry = filterRegistry.With(filter.NewSameNodeFilter(req.Count))
     }
 
     // Apply the filters in sequence
@@ -84,12 +91,12 @@ func (s *GpuAllocator) Alloc(
     }
 
     if len(filteredGPUs) == 0 {
-        return nil, fmt.Errorf("no gpus available in pool %s after filtering", poolName)
+        return nil, fmt.Errorf("no gpus available in pool %s after filtering", req.PoolName)
     }
 
     pool := &tfv1.GPUPool{}
-    if err := s.Get(ctx, client.ObjectKey{Name: poolName}, pool); err != nil {
-        return nil, fmt.Errorf("get pool %s: %w", poolName, err)
+    if err := s.Get(ctx, client.ObjectKey{Name: req.PoolName}, pool); err != nil {
+        return nil, fmt.Errorf("get pool %s: %w", req.PoolName, err)
     }
 
     schedulingConfigTemplate := &tfv1.SchedulingConfigTemplate{}
@@ -100,7 +107,7 @@ func (s *GpuAllocator) Alloc(
     }
 
     strategy := NewStrategy(schedulingConfigTemplate.Spec.Placement.Mode)
-    selectedGPUs, err := strategy.SelectGPUs(filteredGPUs, count)
+    selectedGPUs, err := strategy.SelectGPUs(filteredGPUs, req.Count)
     if err != nil {
         return nil, fmt.Errorf("select GPU: %w", err)
     }
@@ -121,11 +128,11 @@ func (s *GpuAllocator) Alloc(
         }
 
         // reduce available resource on the GPU status
-        gpu.Status.Available.Tflops.Sub(request.Tflops)
-        gpu.Status.Available.Vram.Sub(request.Vram)
+        gpu.Status.Available.Tflops.Sub(req.Request.Tflops)
+        gpu.Status.Available.Vram.Sub(req.Request.Vram)
 
         if !appAdded {
-            addRunningApp(ctx, gpu, workloadNameNamespace)
+            addRunningApp(ctx, gpu, req.WorkloadNameNamespace)
             appAdded = true
         }
 
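
A minimal caller sketch of the new API (not part of the commit): it assumes the controller's tfv1 and gpuallocator imports, an already-initialized *gpuallocator.GpuAllocator, and that tfv1.Resource carries Tflops/Vram quantities as the diff implies; the pool and workload names below are made up.

// scheduleDemo is a hypothetical helper illustrating the parameter-struct call.
func scheduleDemo(ctx context.Context, allocator *gpuallocator.GpuAllocator) ([]*tfv1.GPU, error) {
    return allocator.Alloc(ctx, gpuallocator.AllocRequest{
        PoolName:              "shared-pool", // hypothetical pool name
        WorkloadNameNamespace: tfv1.NameNamespace{Name: "demo-workload", Namespace: "default"},
        Request: tfv1.Resource{
            Tflops: resource.MustParse("10"),   // resource = k8s.io/apimachinery/pkg/api/resource
            Vram:   resource.MustParse("16Gi"),
        },
        Count:    2,  // >1 makes Alloc add the same-node filter
        GPUModel: "", // empty string accepts any GPU model
    })
}

Because the fields are named, further allocation options can later be added to AllocRequest without touching every call site, which is the usual motivation for this kind of refactor.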

internal/gpuallocator/gpuallocator_test.go

Lines changed: 7 additions & 1 deletion
@@ -36,7 +36,13 @@ var _ = Describe("GPU Allocator", func() {
     var allocator *GpuAllocator
 
     allocateAndSync := func(poolName string, request tfv1.Resource, count uint, gpuModel string) ([]*tfv1.GPU, error) {
-        gpus, err := allocator.Alloc(ctx, poolName, workloadNameNs, request, count, gpuModel)
+        gpus, err := allocator.Alloc(ctx, AllocRequest{
+            PoolName:              poolName,
+            WorkloadNameNamespace: workloadNameNs,
+            Request:               request,
+            Count:                 count,
+            GPUModel:              gpuModel,
+        })
         allocator.syncToK8s(ctx)
         return gpus, err
     }
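
A hypothetical spec sketch (not part of this commit) showing how the helper above might be exercised; the pool name and quantities are invented, and Expect/HaveLen assume the suite's Gomega matchers.

It("allocates one GPU of any model", func() {
    gpus, err := allocateAndSync("test-pool", tfv1.Resource{
        Tflops: resource.MustParse("5"),
        Vram:   resource.MustParse("8Gi"),
    }, 1, "")
    Expect(err).NotTo(HaveOccurred())
    Expect(gpus).To(HaveLen(1))
})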
