@@ -327,12 +327,13 @@ func (s *GpuAllocator) SetupWithManager(ctx context.Context, mgr manager.Manager
327327 log .Error (err , "Failed to initialize GPU store" )
328328 return err
329329 }
330- readyCh <- struct {}{}
330+ close ( readyCh )
331331 return nil
332332 }))
333333
334334 go func () {
335335 <- mgr .Elected ()
336+ <- readyCh
336337 // reconcile allocation state based on existing workers, run only when it's elected as leader
337338 // and only if it's leader, it will start allocating resources to workers, and start sync loop here
338339 s .reconcileAllocationState (ctx )
@@ -435,12 +436,37 @@ func (s *GpuAllocator) syncToK8s(ctx context.Context) {
435436 }
436437
437438 for nodeName := range dirtyNodes {
438- // Refer https://datatracker.ietf.org/doc/html/rfc6901#section-3 encode `/` as `~1`
439- patch := []byte (`[{
440- "op": "add",
441- "path": "/metadata/annotations/` + strings .ReplaceAll (constants .GPULastReportTimeAnnotationKey , "/" , "~1" ) + `",
442- "value": "` + time .Now ().Format (time .RFC3339 ) + `"
443- }]` )
439+ // First, get the current node to check if annotations exist
440+ node := & tfv1.GPUNode {}
441+ nodeKey := client.ObjectKey {Name : nodeName }
442+ if err := s .Get (ctx , nodeKey , node ); err != nil {
443+ log .Error (err , "Failed to get GPU node for updating last report time" , "node" , nodeName )
444+ continue
445+ }
446+
447+ var patch []byte
448+ timeValue := time .Now ().Format (time .RFC3339 )
449+ encodedKey := strings .ReplaceAll (constants .GPULastReportTimeAnnotationKey , "/" , "~1" )
450+
451+ // Check if annotations already exist
452+ if node .Annotations == nil {
453+ // Create annotations if they don't exist
454+ patch = []byte (`[{
455+ "op": "add",
456+ "path": "/metadata/annotations",
457+ "value": {
458+ "` + constants .GPULastReportTimeAnnotationKey + `": "` + timeValue + `"
459+ }
460+ }]` )
461+ } else {
462+ // Add to existing annotations
463+ patch = []byte (`[{
464+ "op": "add",
465+ "path": "/metadata/annotations/` + encodedKey + `",
466+ "value": "` + timeValue + `"
467+ }]` )
468+ }
469+
444470 err := retry .RetryOnConflict (retry .DefaultBackoff , func () error {
445471 return s .Patch (ctx , & tfv1.GPUNode {
446472 ObjectMeta : metav1.ObjectMeta {
@@ -449,7 +475,7 @@ func (s *GpuAllocator) syncToK8s(ctx context.Context) {
449475 }, client .RawPatch (types .JSONPatchType , patch ))
450476 })
451477 if err != nil {
452- log .Error (err , "Failed to update GPU node last report time, will retry later " , "node" , nodeName )
478+ log .Error (err , "Failed to update GPU node last report time, allocation state may be inconsistent " , "node" , nodeName )
453479 }
454480 }
455481}
0 commit comments