@@ -24,6 +24,7 @@ import (
2424 utilruntime "k8s.io/apimachinery/pkg/util/runtime"
2525 "k8s.io/client-go/rest"
2626 "k8s.io/client-go/tools/clientcmd"
27+ "k8s.io/client-go/util/retry"
2728 ctrl "sigs.k8s.io/controller-runtime"
2829 "sigs.k8s.io/controller-runtime/pkg/client"
2930 "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"
@@ -140,28 +141,23 @@ func main() {
140141 })
141142 tflops := info .Fp16TFlops
142143 if ! ok {
143- tflops = resource.Quantity {}
144- ctrl .Log .Info ("unable to find GPU info from config" , "deviceName" , deviceName , "uuid" , uuid )
144+ ctrl .Log .Info (
145+ "[Error] Unknown GPU model, please update `gpu-public-gpu-info` configMap " +
146+ " to match your GPU model name in `nvidia-smi`, this may cause you workload stuck, " +
147+ "refer this doc to resolve it in detail: " +
148+ "https://tensor-fusion.ai/guide/troubleshooting/handbook" +
149+ "#pod-stuck-in-starting-status-after-enabling-tensorfusion" ,
150+ "deviceName" , deviceName , "uuid" , uuid )
151+ os .Exit (1 )
145152 } else {
146- ctrl .Log .Info ("found GPU info from config" , "deviceName" , deviceName , "baseline FP16 TFlops" , tflops , "uuid" , uuid )
153+ ctrl .Log .Info ("found GPU info from config" , "deviceName" , deviceName , "FP16 TFlops" , tflops , "uuid" , uuid )
147154 }
148155 gpu := & tfv1.GPU {
149156 ObjectMeta : metav1.ObjectMeta {
150157 Name : uuid ,
151- Labels : map [string ]string {
152- constants .LabelKeyOwner : gpunode .Name ,
153- },
154- Annotations : map [string ]string {
155- constants .GPULastReportTimeAnnotationKey : time .Now ().Format (time .RFC3339 ),
156- },
157158 },
158159 }
159160
160- if err := controllerutil .SetControllerReference (gpunode , gpu , Scheme ); err != nil {
161- ctrl .Log .Error (err , "failed to set controller reference" )
162- os .Exit (1 )
163- }
164-
165161 gpuStatus := tfv1.GPUStatus {
166162 Phase : tfv1 .TensorFusionGPUPhaseRunning ,
167163 Capacity : & tfv1.Resource {
@@ -174,11 +170,30 @@ func main() {
174170 "kubernetes.io/hostname" : k8sNodeName ,
175171 },
176172 }
177- _ , err = controllerutil .CreateOrUpdate (ctx , k8sClient , gpu , func () error { return nil })
173+
174+ err = retry .OnError (retry .DefaultBackoff , func (err error ) bool {
175+ return true // Retry on all errors for now
176+ }, func () error {
177+ _ , err := controllerutil .CreateOrUpdate (ctx , k8sClient , gpu , func () error {
178+ // Set metadata fields
179+ gpu .Labels = map [string ]string {
180+ constants .LabelKeyOwner : gpunode .Name ,
181+ }
182+ gpu .Annotations = map [string ]string {
183+ constants .GPULastReportTimeAnnotationKey : time .Now ().Format (time .RFC3339 ),
184+ }
185+
186+ // Set controller reference
187+ return controllerutil .SetControllerReference (gpunode , gpu , Scheme )
188+ })
189+ return err
190+ })
191+
178192 if err != nil {
179- ctrl .Log .Error (err , "failed to create GPU" , "gpu" , gpu )
193+ ctrl .Log .Error (err , "failed to create or update GPU after retries " , "gpu" , gpu )
180194 os .Exit (1 )
181195 }
196+
182197 available := gpuStatus .Available
183198 gpu .Status = gpuStatus
184199 if available == nil {
@@ -187,8 +202,19 @@ func main() {
187202 gpu .Status .Available = available
188203 }
189204
190- if err := k8sClient .Status ().Patch (ctx , gpu , client .Merge ); err != nil {
191- ctrl .Log .Error (err , "failed to update status of GPU" , "gpu" , gpu )
205+ err = retry .RetryOnConflict (retry .DefaultBackoff , func () error {
206+ currentGPU := & tfv1.GPU {}
207+ if err := k8sClient .Get (ctx , client .ObjectKeyFromObject (gpu ), currentGPU ); err != nil {
208+ return err
209+ }
210+
211+ currentGPU .Status = gpu .Status
212+
213+ return k8sClient .Status ().Update (ctx , currentGPU )
214+ })
215+
216+ if err != nil {
217+ ctrl .Log .Error (err , "failed to update status of GPU after retries" , "gpu" , gpu )
192218 os .Exit (1 )
193219 }
194220
@@ -209,8 +235,20 @@ func main() {
209235 ns .NodeInfo .RAMSize = * resource .NewQuantity (getTotalHostRAM (), resource .DecimalSI )
210236 ns .NodeInfo .DataDiskSize = * resource .NewQuantity (getDiskInfo (constants .TFDataPath ), resource .DecimalSI )
211237 gpunode .Status = * ns
212- if err := k8sClient .Status ().Patch (ctx , gpunode , client .Merge ); err != nil {
213- ctrl .Log .Error (err , "failed to update status of GPUNode" )
238+
239+ err = retry .RetryOnConflict (retry .DefaultBackoff , func () error {
240+ currentGPUNode := & tfv1.GPUNode {}
241+ if err := k8sClient .Get (ctx , client .ObjectKeyFromObject (gpunode ), currentGPUNode ); err != nil {
242+ return err
243+ }
244+
245+ currentGPUNode .Status = * ns
246+
247+ return k8sClient .Status ().Update (ctx , currentGPUNode )
248+ })
249+
250+ if err != nil {
251+ ctrl .Log .Error (err , "failed to update status of GPUNode after retries" )
214252 os .Exit (1 )
215253 }
216254}
0 commit comments