@@ -152,71 +152,8 @@ func main() {
152152 } else {
153153 ctrl .Log .Info ("found GPU info from config" , "deviceName" , deviceName , "FP16 TFlops" , tflops , "uuid" , uuid )
154154 }
155- gpu := & tfv1.GPU {
156- ObjectMeta : metav1.ObjectMeta {
157- Name : uuid ,
158- },
159- }
160-
161- gpuStatus := tfv1.GPUStatus {
162- Phase : tfv1 .TensorFusionGPUPhaseRunning ,
163- Capacity : & tfv1.Resource {
164- Vram : resource .MustParse (fmt .Sprintf ("%dKi" , memInfo .Total / 1024 )),
165- Tflops : tflops ,
166- },
167- UUID : uuid ,
168- GPUModel : deviceName ,
169- NodeSelector : map [string ]string {
170- "kubernetes.io/hostname" : k8sNodeName ,
171- },
172- }
173-
174- err = retry .OnError (retry .DefaultBackoff , func (err error ) bool {
175- return true // Retry on all errors for now
176- }, func () error {
177- _ , err := controllerutil .CreateOrUpdate (ctx , k8sClient , gpu , func () error {
178- // Set metadata fields
179- gpu .Labels = map [string ]string {
180- constants .LabelKeyOwner : gpunode .Name ,
181- }
182- gpu .Annotations = map [string ]string {
183- constants .GPULastReportTimeAnnotationKey : time .Now ().Format (time .RFC3339 ),
184- }
185-
186- // Set controller reference
187- return controllerutil .SetControllerReference (gpunode , gpu , Scheme )
188- })
189- return err
190- })
191-
192- if err != nil {
193- ctrl .Log .Error (err , "failed to create or update GPU after retries" , "gpu" , gpu )
194- os .Exit (1 )
195- }
196-
197- available := gpuStatus .Available
198- gpu .Status = gpuStatus
199- if available == nil {
200- gpu .Status .Available = gpuStatus .Capacity
201- } else {
202- gpu .Status .Available = available
203- }
204155
205- err = retry .RetryOnConflict (retry .DefaultBackoff , func () error {
206- currentGPU := & tfv1.GPU {}
207- if err := k8sClient .Get (ctx , client .ObjectKeyFromObject (gpu ), currentGPU ); err != nil {
208- return err
209- }
210-
211- currentGPU .Status = gpu .Status
212-
213- return k8sClient .Status ().Update (ctx , currentGPU )
214- })
215-
216- if err != nil {
217- ctrl .Log .Error (err , "failed to update status of GPU after retries" , "gpu" , gpu )
218- os .Exit (1 )
219- }
156+ gpu := createOrUpdateTensorFusionGPU (k8sClient , ctx , k8sNodeName , gpunode , uuid , deviceName , memInfo , tflops )
220157
221158 totalTFlops .Add (gpu .Status .Capacity .Tflops )
222159 totalVRAM .Add (gpu .Status .Capacity .Vram )
@@ -246,13 +183,82 @@ func main() {
246183
247184 return k8sClient .Status ().Update (ctx , currentGPUNode )
248185 })
249-
250186 if err != nil {
251187 ctrl .Log .Error (err , "failed to update status of GPUNode after retries" )
252188 os .Exit (1 )
253189 }
254190}
255191
192+ func createOrUpdateTensorFusionGPU (
193+ k8sClient client.Client , ctx context.Context , k8sNodeName string , gpunode * tfv1.GPUNode ,
194+ uuid string , deviceName string , memInfo nvml.Memory_v2 , tflops resource.Quantity ) * tfv1.GPU {
195+ gpu := & tfv1.GPU {
196+ ObjectMeta : metav1.ObjectMeta {
197+ Name : uuid ,
198+ },
199+ }
200+
201+ err := retry .OnError (retry .DefaultBackoff , func (err error ) bool {
202+ return true // Retry on all errors for now
203+ }, func () error {
204+ _ , err := controllerutil .CreateOrUpdate (ctx , k8sClient , gpu , func () error {
205+ // Set metadata fields
206+ gpu .Labels = map [string ]string {
207+ constants .LabelKeyOwner : gpunode .Name ,
208+ }
209+ gpu .Annotations = map [string ]string {
210+ constants .GPULastReportTimeAnnotationKey : time .Now ().Format (time .RFC3339 ),
211+ }
212+
213+ if ! metav1 .IsControlledBy (gpu , gpunode ) {
214+ gpu .OwnerReferences = []metav1.OwnerReference {
215+ * metav1 .NewControllerRef (gpunode , gpunode .GroupVersionKind ()),
216+ }
217+ }
218+
219+ return nil
220+ })
221+ return err
222+ })
223+ if err != nil {
224+ ctrl .Log .Error (err , "failed to create or update GPU after retries" , "gpu" , gpu )
225+ os .Exit (1 )
226+ }
227+
228+ err = retry .RetryOnConflict (retry .DefaultBackoff , func () error {
229+ if err := k8sClient .Get (ctx , client .ObjectKeyFromObject (gpu ), gpu ); err != nil {
230+ return err
231+ }
232+
233+ newStatus := tfv1.GPUStatus {
234+ Phase : tfv1 .TensorFusionGPUPhaseRunning ,
235+ Capacity : & tfv1.Resource {
236+ Vram : resource .MustParse (fmt .Sprintf ("%dKi" , memInfo .Total / 1024 )),
237+ Tflops : tflops ,
238+ },
239+ UUID : uuid ,
240+ GPUModel : deviceName ,
241+ NodeSelector : map [string ]string {
242+ "kubernetes.io/hostname" : k8sNodeName ,
243+ },
244+ }
245+
246+ if gpu .Status .Available == nil {
247+ newStatus .Available = newStatus .Capacity
248+ } else {
249+ newStatus .Available = gpu .Status .Available
250+ }
251+ gpu .Status = newStatus
252+ return k8sClient .Status ().Update (ctx , gpu )
253+ })
254+ if err != nil {
255+ ctrl .Log .Error (err , "failed to update status of GPU after retries" , "gpu" , gpu )
256+ os .Exit (1 )
257+ }
258+
259+ return gpu
260+ }
261+
256262func nodeStatus (k8sNodeName string ) * tfv1.GPUNodeStatus {
257263 return & tfv1.GPUNodeStatus {
258264 KubernetesNodeName : k8sNodeName ,
@@ -309,7 +315,7 @@ func getDiskInfo(path string) (total int64) {
309315 err = syscall .Statfs (absPath , & stat )
310316 if err != nil {
311317 if errors .Is (err , syscall .ENOENT ) {
312- err = os .MkdirAll (absPath , 0755 )
318+ err = os .MkdirAll (absPath , 0o755 )
313319 if err != nil {
314320 fmt .Printf ("error creating folder: %s, err: %v\n " , absPath , err )
315321 return 0
0 commit comments