@@ -35,6 +35,8 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/log/zap"
 )
 
+const TMP_PATH = "/tmp"
+
 var Scheme = runtime.NewScheme()
 
 func init() {
@@ -167,27 +169,22 @@ func main() {
 		availableVRAM.Add(gpu.Status.Available.Vram)
 	}
 
-	ns := nodeStatus()
-	ns.TotalTFlops = totalTFlops
-	ns.TotalVRAM = totalVRAM
-	ns.AvailableTFlops = availableTFlops
-	ns.AvailableVRAM = availableVRAM
-	ns.TotalGPUs = int32(count)
-	ns.ManagedGPUs = int32(count)
-	ns.ManagedGPUDeviceIDs = allDeviceIDs
-	ns.NodeInfo.RAMSize = *resource.NewQuantity(getTotalHostRAM(), resource.DecimalSI)
-	ns.NodeInfo.DataDiskSize = *resource.NewQuantity(getDiskInfo(constants.TFDataPath), resource.DecimalSI)
-	gpunode.Status = *ns
-
+	// Use proper patch-based update with retry on conflict
 	err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
+		// Get the latest version of the resource
 		currentGPUNode := &tfv1.GPUNode{}
 		if err := k8sClient.Get(ctx, client.ObjectKeyFromObject(gpunode), currentGPUNode); err != nil {
 			return err
 		}
 
-		currentGPUNode.Status = *ns
+		// Create a patch from the original to the desired state
+		patch := client.MergeFrom(currentGPUNode.DeepCopy())
+
+		// Update status fields conditionally
+		updateGPUNodeStatus(&currentGPUNode.Status, totalTFlops, totalVRAM, int32(count), allDeviceIDs)
 
-		return k8sClient.Status().Update(ctx, currentGPUNode)
+		// Apply the patch using the status subresource
+		return k8sClient.Status().Patch(ctx, currentGPUNode, patch)
 	})
 	if err != nil {
 		ctrl.Log.Error(err, "failed to update status of GPUNode after retries")
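Aside: the block above is the standard client-go conflict-retry idiom combined with a controller-runtime merge patch against the status subresource. A minimal sketch of that pattern, written as if it lived in the same file (reusing the file's existing context, retry, client, and tfv1 imports; mutateStatus is a hypothetical stand-in for updateGPUNodeStatus), not part of this commit:

// Sketch only: re-read the latest object, mutate a copy of its status, and
// send only the resulting diff through the status subresource, retrying
// whenever the resourceVersion has moved underneath us.
func patchGPUNodeStatus(ctx context.Context, c client.Client, key client.ObjectKey,
	mutateStatus func(*tfv1.GPUNodeStatus)) error {
	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
		latest := &tfv1.GPUNode{}
		if err := c.Get(ctx, key, latest); err != nil {
			return err
		}
		base := latest.DeepCopy() // snapshot before mutation; becomes the patch base
		mutateStatus(&latest.Status)
		// MergeFrom(base) serializes only the fields that differ from base, so
		// status fields owned by other writers are left untouched.
		return c.Status().Patch(ctx, latest, client.MergeFrom(base))
	})
}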
@@ -268,6 +265,9 @@ func createOrUpdateTensorFusionGPU(
 		if gpu.Status.UsedBy == "" {
 			gpu.Status.UsedBy = tfv1.UsedByTensorFusion
 		}
+		if gpu.Status.Phase == "" {
+			gpu.Status.Phase = tfv1.TensorFusionGPUPhasePending
+		}
 		return k8sClient.Status().Patch(ctx, gpu, client.Merge)
 	})
 	if err != nil {
@@ -278,12 +278,6 @@ func createOrUpdateTensorFusionGPU(
 	return gpu
 }
 
-func nodeStatus() *tfv1.GPUNodeStatus {
-	return &tfv1.GPUNodeStatus{
-		Phase: tfv1.TensorFusionGPUNodePhaseRunning,
-	}
-}
-
 func kubeClient() (client.Client, error) {
 	kubeConfigEnvVar := os.Getenv("KUBECONFIG")
 	var config *rest.Config
@@ -316,7 +310,7 @@ func kubeClient() (client.Client, error) {
 func getTotalHostRAM() int64 {
 	v, err := mem.VirtualMemory()
 	if err != nil {
-		fmt.Printf("error getting memory info: %v\n", err)
+		fmt.Printf("[warning] getting memory info failed: %v\n", err)
 		return 0
 	}
 	return int64(v.Total)
@@ -325,7 +319,7 @@ func getTotalHostRAM() int64 {
 func getDiskInfo(path string) (total int64) {
 	absPath, err := filepath.Abs(path)
 	if err != nil {
-		fmt.Printf("error getting disk path: %v\n", err)
+		fmt.Printf("[warning] getting disk path failed: %v\n", err)
 		return 0
 	}
 
@@ -335,20 +329,42 @@ func getDiskInfo(path string) (total int64) {
 		if errors.Is(err, syscall.ENOENT) {
 			err = os.MkdirAll(absPath, 0o755)
 			if err != nil {
-				fmt.Printf("error creating folder: %s, err: %v\n", absPath, err)
+				fmt.Printf("[warning] creating folder to discover disk space failed: %s, err: %v\n", absPath, err)
 				return 0
 			}
 			err = syscall.Statfs(absPath, &stat)
 			if err != nil {
-				fmt.Printf("error getting disk stats after creation: %v\n", err)
+				fmt.Printf("[warning] getting disk stats after creation failed: %v\n", err)
 				return 0
 			}
 		} else {
-			fmt.Printf("error getting disk stats: %v\n", err)
+			fmt.Printf("[warning] getting disk stats failed: %v\n", err)
 			return 0
 		}
 	}
 
 	total = int64(stat.Blocks * uint64(stat.Bsize))
 	return total
 }
+
+// updateGPUNodeStatus refreshes the GPUNode status fields reported by this node;
+// the phase is only set while it is still empty, so a phase assigned elsewhere is preserved.
+func updateGPUNodeStatus(
+	status *tfv1.GPUNodeStatus,
+	totalTFlops, totalVRAM resource.Quantity,
+	totalGPUs int32, deviceIDs []string) {
+	// Always update these fields as they represent current state
+	status.TotalTFlops = totalTFlops
+	status.TotalVRAM = totalVRAM
+	status.TotalGPUs = totalGPUs
+	status.ManagedGPUs = totalGPUs
+	status.ManagedGPUDeviceIDs = deviceIDs
+	status.NodeInfo = tfv1.GPUNodeInfo{
+		RAMSize:      *resource.NewQuantity(getTotalHostRAM(), resource.DecimalSI),
+		DataDiskSize: *resource.NewQuantity(getDiskInfo(TMP_PATH), resource.DecimalSI),
+	}
+	// Only update phase if it's empty (unset)
+	if status.Phase == "" {
+		status.Phase = tfv1.TensorFusionGPUNodePhasePending
+	}
+}
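The empty-phase guard mirrors the UsedBy and Phase guards added in createOrUpdateTensorFusionGPU: the node agent only seeds Pending on its first report and never overwrites a phase a controller has since advanced. Roughly, as an illustrative snippet in the same package (sample quantities and device IDs are made up):

// A second report does not clobber a controller-assigned phase.
var st tfv1.GPUNodeStatus
updateGPUNodeStatus(&st, resource.MustParse("100"), resource.MustParse("80Gi"), 2, []string{"gpu-0", "gpu-1"})
// st.Phase is now TensorFusionGPUNodePhasePending (it was empty).
st.Phase = tfv1.TensorFusionGPUNodePhaseRunning // e.g. set by the reconciling controller
updateGPUNodeStatus(&st, resource.MustParse("100"), resource.MustParse("80Gi"), 2, []string{"gpu-0", "gpu-1"})
// st.Phase is still TensorFusionGPUNodePhaseRunning.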