
Commit 7d147df

0x5457 and Code2Life authored
fix: Skip deleted workers during allocation state reconciliation (#239)
* fix: Skip deleted workers during allocation state reconciliation
* fix: tsdb SQL schema typing issue

Co-authored-by: Joey <[email protected]>
1 parent: d26a533

File tree

internal/controller/tensorfusionworkload_controller.go
internal/gpuallocator/gpuallocator.go
internal/gpuallocator/gpuallocator_test.go
internal/metrics/recorder.go
internal/metrics/tag_parser.go
internal/metrics/types.go

6 files changed: +32 −25 lines


internal/controller/tensorfusionworkload_controller.go

Lines changed: 2 additions & 9 deletions

@@ -359,10 +359,7 @@ func (r *TensorFusionWorkloadReconciler) handlePodGPUCleanup(ctx context.Context
 		return types.NamespacedName{Name: gpuName}
 	})
 	// Release GPU resources
-	if err := r.Allocator.Dealloc(ctx, tfv1.NameNamespace{Name: workload.Name, Namespace: workload.Namespace}, workload.Spec.Resources.Requests, gpus); err != nil {
-		log.Error(err, "Failed to release GPU resources, will retry", "gpus", gpus, "pod", pod.Name)
-		return false, err
-	}
+	r.Allocator.Dealloc(ctx, tfv1.NameNamespace{Name: workload.Name, Namespace: workload.Namespace}, workload.Spec.Resources.Requests, gpus)
 	log.Info("Released GPU resources via finalizer", "gpus", gpus, "pod", pod.Name)

 	return true, nil
@@ -383,7 +380,6 @@ func (r *TensorFusionWorkloadReconciler) deletePod(ctx context.Context, pod *cor

 // scaleUpWorkers handles the scaling up of worker pods
 func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, workerGenerator *worker.WorkerGenerator, workload *tfv1.TensorFusionWorkload, count int, hash string) (ctrl.Result, error) {
-	log := log.FromContext(ctx)
 	workloadNameNs := tfv1.NameNamespace{Namespace: workload.Namespace, Name: workload.Name}
 	// Create worker pods
 	for range count {
@@ -408,10 +404,7 @@ func (r *TensorFusionWorkloadReconciler) scaleUpWorkers(ctx context.Context, wor
 	gpus := lo.Map(gpus, func(gpu *tfv1.GPU, _ int) types.NamespacedName {
 		return client.ObjectKeyFromObject(gpu)
 	})
-	releaseErr := r.Allocator.Dealloc(ctx, workloadNameNs, workload.Spec.Resources.Requests, gpus)
-	if releaseErr != nil {
-		log.Error(releaseErr, "Failed to release GPU after pod creation failure", "gpus", gpus)
-	}
+	r.Allocator.Dealloc(ctx, workloadNameNs, workload.Spec.Resources.Requests, gpus)
 	return ctrl.Result{}, fmt.Errorf("create worker pod: %w", err)
 }
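
Both hunks are the same simplification: Dealloc becomes infallible (its error return is dropped in gpuallocator.go below), so the callers' error-handling branches go away, along with the log variable whose only remaining use was in one of them. A self-contained sketch of the resulting best-effort-cleanup shape, with hypothetical allocator and cleanup names standing in for the real reconciler:

package main

import "fmt"

// allocator is a hypothetical stand-in for GpuAllocator: after the change,
// releasing is a pure in-memory bookkeeping update with no failure mode.
type allocator struct{ available int }

func (a *allocator) dealloc(n int) { a.available += n }

// cleanup mirrors the patched finalizer path: the release is best-effort,
// so there is no error branch left to requeue the reconcile on.
func cleanup(a *allocator, n int) (done bool, err error) {
	a.dealloc(n)
	return true, nil
}

func main() {
	a := &allocator{}
	done, err := cleanup(a, 2)
	fmt.Println(done, err, a.available) // true <nil> 2
}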

internal/gpuallocator/gpuallocator.go

Lines changed: 14 additions & 5 deletions

@@ -150,7 +150,7 @@ func (s *GpuAllocator) Alloc(ctx context.Context, req AllocRequest) ([]*tfv1.GPU
 }

 // Dealloc a request from gpu to release available resources on it.
-func (s *GpuAllocator) Dealloc(ctx context.Context, workloadNameNamespace tfv1.NameNamespace, request tfv1.Resource, gpus []types.NamespacedName) error {
+func (s *GpuAllocator) Dealloc(ctx context.Context, workloadNameNamespace tfv1.NameNamespace, request tfv1.Resource, gpus []types.NamespacedName) {
 	log := log.FromContext(ctx)
 	s.storeMutex.Lock()
 	defer s.storeMutex.Unlock()
@@ -175,7 +175,6 @@ func (s *GpuAllocator) Dealloc(ctx context.Context, workloadNameNamespace tfv1.N
 		s.markGPUDirty(gpu)
 	}

-	return nil
 }

 func NewGpuAllocator(ctx context.Context, client client.Client, syncInterval time.Duration) *GpuAllocator {
@@ -452,7 +451,7 @@ func (s *GpuAllocator) syncToK8s(ctx context.Context) {
 	if node.Annotations == nil {
 		// Create annotations if they don't exist
 		patch = []byte(`[{
-	"op": "add",
+		"op": "add",
 		"path": "/metadata/annotations",
 		"value": {
 			"` + constants.GPULastReportTimeAnnotationKey + `": "` + timeValue + `"
@@ -464,7 +463,7 @@ func (s *GpuAllocator) syncToK8s(ctx context.Context) {
 		"op": "add",
 		"path": "/metadata/annotations/` + encodedKey + `",
 		"value": "` + timeValue + `"
-	}]`)
+		}]`)
 	}

 	err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
@@ -501,6 +500,10 @@ func (s *GpuAllocator) markGPUDirty(key types.NamespacedName) {
 	s.dirtyQueue[key] = struct{}{}
 }

+func (s *GpuAllocator) markGPUDirtyLoced(key types.NamespacedName) {
+	s.dirtyQueue[key] = struct{}{}
+}
+
 // When it's leader, should reconcile state based on existing workers
 // this function is run inside storeMutex lock
 func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
@@ -517,6 +520,9 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
 	vramCapacityMap := make(map[types.NamespacedName]resource.Quantity)
 	gpuMap := make(map[types.NamespacedName]*tfv1.GPU)

+	defer s.storeMutex.Unlock()
+	s.storeMutex.Lock()
+
 	for gpuKey, gpu := range s.gpuStore {
 		if gpu.Status.Capacity != nil {
 			tflopsCapacityMap[gpuKey] = gpu.Status.Capacity.Tflops
@@ -527,6 +533,9 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
 	}

 	for _, worker := range workers.Items {
+		if !worker.DeletionTimestamp.IsZero() {
+			continue
+		}
 		tflopsRequest, _ := resource.ParseQuantity(worker.Annotations[constants.TFLOPSRequestAnnotation])
 		vramRequest, _ := resource.ParseQuantity(worker.Annotations[constants.VRAMRequestAnnotation])
 		gpuIds := worker.Annotations[constants.GpuKey]
@@ -559,7 +568,7 @@ func (s *GpuAllocator) reconcileAllocationState(ctx context.Context) {
 	if !sameTflops || !sameVRAM {
 		gpu.Status.Available.Tflops = tflopsCapacityMap[gpuKey]
 		gpu.Status.Available.Vram = vramCapacityMap[gpuKey]
-		s.markGPUDirty(gpuKey)
+		s.markGPUDirtyLoced(gpuKey)
 		log.FromContext(ctx).Info("Correcting gpu available resources", "gpu", gpuKey.Name, "tflops", gpu.Status.Available.Tflops.String(), "vram", gpu.Status.Available.Vram.String())
 	}
 }
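
The worker loop carries the headline fix: a worker whose DeletionTimestamp is set is already being torn down, and counting its requests while the finalizer path concurrently releases them is what skewed the reconciled availability numbers. A minimal sketch of the same guard over a pod list, using only the standard Kubernetes API types:

package main

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	now := metav1.NewTime(time.Now())
	workers := []corev1.Pod{
		{ObjectMeta: metav1.ObjectMeta{Name: "live"}},
		{ObjectMeta: metav1.ObjectMeta{Name: "terminating", DeletionTimestamp: &now}},
	}

	for _, w := range workers {
		// Same guard as the patch: a non-zero DeletionTimestamp means the
		// worker is already being deleted, so its requests must not be
		// subtracted from GPU availability again.
		if !w.DeletionTimestamp.IsZero() {
			continue
		}
		fmt.Println("counting", w.Name) // only "live" is counted
	}
}

markGPUDirtyLoced appears to be the variant for callers that already hold the necessary lock, matching the new storeMutex acquisition at the top of reconcileAllocationState. Note the commit registers defer s.storeMutex.Unlock() before calling Lock; Go permits this, since deferred calls only run at function return.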

internal/gpuallocator/gpuallocator_test.go

Lines changed: 1 addition & 2 deletions

@@ -48,10 +48,9 @@ var _ = Describe("GPU Allocator", func() {
 	}

 	deallocateAndSync := func(gpus []*tfv1.GPU, request tfv1.Resource) {
-		err := allocator.Dealloc(ctx, workloadNameNs, request, lo.Map(gpus, func(gpu *tfv1.GPU, _ int) types.NamespacedName {
+		allocator.Dealloc(ctx, workloadNameNs, request, lo.Map(gpus, func(gpu *tfv1.GPU, _ int) types.NamespacedName {
 			return client.ObjectKeyFromObject(gpu)
 		}))
-		Expect(err).NotTo(HaveOccurred())
 		allocator.syncToK8s(ctx)
 	}

internal/metrics/recorder.go

Lines changed: 6 additions & 3 deletions

@@ -40,17 +40,20 @@ type ActiveNodeAndWorker struct {
 }

 func RemoveWorkerMetrics(workerName string, deletionTime time.Time) {
+	defer workerMetricsLock.Unlock()
 	workerMetricsLock.Lock()
+
 	// to get more accurate metrics, should record the deletion timestamp to calculate duration for the last metrics
-	workerMetricsMap[workerName].deletionTimestamp = &deletionTime
-	workerMetricsLock.Unlock()
+	if _, ok := workerMetricsMap[workerName]; ok {
+		workerMetricsMap[workerName].deletionTimestamp = &deletionTime
+	}
 }

 func RemoveNodeMetrics(nodeName string) {
+	defer nodeMetricsLock.Unlock()
 	nodeMetricsLock.Lock()
 	// Node lifecycle is much longer than worker, so just delete the metrics, 1 minute metrics interval is enough
 	delete(nodeMetricsMap, nodeName)
-	nodeMetricsLock.Unlock()
 }

 func SetWorkerMetricsByWorkload(pod *corev1.Pod, workload *tfv1.TensorFusionWorkload, now time.Time) {
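
Two separate hardening fixes here: moving Unlock into a defer keeps the mutex released even if the body panics, and the comma-ok lookup stops RemoveWorkerMetrics from writing through a nil map entry when the worker was never recorded. The same shape in a self-contained sketch with hypothetical metric types:

package main

import (
	"fmt"
	"sync"
	"time"
)

type workerMetrics struct{ deletionTimestamp *time.Time }

var (
	mu      sync.Mutex
	metrics = map[string]*workerMetrics{"w1": {}}
)

func removeWorker(name string, deletionTime time.Time) {
	mu.Lock()
	defer mu.Unlock() // released even if the body panics

	// Comma-ok guard: writing through metrics[name] when the entry is
	// absent would dereference a nil *workerMetrics and panic.
	if m, ok := metrics[name]; ok {
		m.deletionTimestamp = &deletionTime
	}
}

func main() {
	removeWorker("w1", time.Now())      // records the deletion timestamp
	removeWorker("missing", time.Now()) // safely a no-op
	fmt.Println(metrics["w1"].deletionTimestamp != nil) // true
}

The commit registers each defer before calling Lock rather than after; Go allows this (deferred calls run at function return, by which point Lock has succeeded), though the conventional ordering is Lock first, then defer Unlock, as sketched above.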

internal/metrics/tag_parser.go

Lines changed: 8 additions & 5 deletions

@@ -41,18 +41,19 @@ func getInitTableSQL(model schema.Tabler, ttl string) string {
 	var indexClass string
 	var isIndex bool
 	var extraOption string
+	timePrecision := "ns"

 	// Split by semicolon first
-	parts := strings.Split(gormTag, ";")
-	for _, part := range parts {
+	parts := strings.SplitSeq(gormTag, ";")
+	for part := range parts {
 		if part == "" {
 			continue
 		}

 		// Split by colon
-		keyValue := strings.Split(part, ",")
+		keyValue := strings.SplitSeq(part, ",")

-		for _, key := range keyValue {
+		for key := range keyValue {
 			if strings.HasPrefix(key, "column:") {
 				columnName = strings.TrimPrefix(key, "column:")
 			} else if strings.HasPrefix(key, "index:") {
@@ -61,6 +62,8 @@ func getInitTableSQL(model schema.Tabler, ttl string) string {
 			indexClass = strings.TrimPrefix(key, "class:")
 		} else if strings.HasPrefix(key, "option:") {
 			extraOption = strings.TrimPrefix(key, "option:")
+		} else if strings.HasPrefix(key, "precision:") {
+			timePrecision = strings.TrimPrefix(key, "precision:")
 		}
 	}
 }
@@ -85,7 +88,7 @@ func getInitTableSQL(model schema.Tabler, ttl string) string {
 	default:
 		// Check if it's time.Time
 		if field.Type == reflect.TypeOf(time.Time{}) {
-			dbType = "Timestamp_ms"
+			dbType = fmt.Sprintf("Timestamp_%s", timePrecision)
 			isNullable = false
 		} else {
 			// Default to String for unknown types
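
strings.SplitSeq, added in Go 1.24, yields the split pieces as an iterator instead of allocating a slice, so the loops range over it directly. The new precision: key is what fixes the "tsdb SQL schema typing issue": a model can now choose its Timestamp_<precision> column type instead of every time.Time field being hard-coded to Timestamp_ms. A runnable sketch of the same parsing shape (requires Go 1.24+), using a tag string copied from the types.go change below:

package main

import (
	"fmt"
	"strings"
)

func main() {
	// Tag value taken from the TFSystemLog field in types.go below.
	tag := "column:greptime_timestamp;index:,class:TIME;precision:ms"

	timePrecision := "ns" // parser default when no precision: key is set
	for part := range strings.SplitSeq(tag, ";") {
		if part == "" {
			continue
		}
		for key := range strings.SplitSeq(part, ",") {
			if strings.HasPrefix(key, "precision:") {
				timePrecision = strings.TrimPrefix(key, "precision:")
			}
		}
	}
	fmt.Printf("Timestamp_%s\n", timePrecision) // prints: Timestamp_ms
}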

internal/metrics/types.go

Lines changed: 1 addition & 1 deletion

@@ -110,7 +110,7 @@ type TFSystemLog struct {

 	// NOTE: make sure new fields will be migrated in SetupTable function

-	GreptimeTimestamp time.Time `json:"greptime_timestamp" gorm:"column:greptime_timestamp;index:,class:TIME"`
+	GreptimeTimestamp time.Time `json:"greptime_timestamp" gorm:"column:greptime_timestamp;index:,class:TIME;precision:ms"`
 }

 func (sl TFSystemLog) TableName() string {
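
The precision:ms entry keeps this column's previous Timestamp_ms type now that getInitTableSQL defaults time.Time fields to Timestamp_ns; it is read off the struct field via reflection like any other gorm tag key. A minimal sketch of that tag lookup, with a hypothetical Row type mirroring the field above:

package main

import (
	"fmt"
	"reflect"
	"time"
)

// Row is a hypothetical model mirroring TFSystemLog's timestamp field.
type Row struct {
	GreptimeTimestamp time.Time `gorm:"column:greptime_timestamp;index:,class:TIME;precision:ms"`
}

func main() {
	field, _ := reflect.TypeOf(Row{}).FieldByName("GreptimeTimestamp")
	// getInitTableSQL reads this string and feeds it to the tag parser.
	fmt.Println(field.Tag.Get("gorm"))
}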
