@@ -33,8 +33,8 @@ import (
3333// RetryItem represents a failed exporter instance that needs to be retried
3434type RetryItem struct {
3535 ExporterInstance * api.ExporterInstance
36- HostSSH ssh.HostManager
3736 HostName string
37+ RenderedHost * api.ExporterHost // The rendered host with templates applied
3838 Attempts int
3939 LastError error
4040 LastAttemptTime time.Time
@@ -70,10 +70,11 @@ func NewExporterHostSyncer(cfg *config.Config,
7070 dryRun : dryRun ,
7171 debugConfigs : debugConfigs ,
7272 exporterFilter : exporterFilter ,
73+ // This provides roughly 10 minutes of retries, with the delay between attempts capped at 120 seconds.
7374 retryConfig : RetryConfig {
74- MaxAttempts : 3 ,
75+ MaxAttempts : 9 ,
7576 BaseDelay : 5 * time .Second ,
76- MaxDelay : 60 * time .Second ,
77+ MaxDelay : 120 * time .Second ,
7778 BackoffMultiplier : 2.0 ,
7879 },
7980 }
@@ -193,30 +194,73 @@ func (e *ExporterHostSyncer) addToRetryQueue(retryItem *RetryItem, err error, ne
193194 * nextRetryQueue = append (* nextRetryQueue , * retryItem )
194195}
195196
197+ // getRetryItemDescription returns a human-readable description of what is being retried
198+ func getRetryItemDescription (retryItem RetryItem ) string {
199+ if retryItem .ExporterInstance == nil {
200+ return "bootc upgrade"
201+ } else {
202+ return fmt .Sprintf ("instance %s" , retryItem .ExporterInstance .Name )
203+ }
204+ }
205+
196206// processExporterInstancesAndBootc processes exporter instances and adds failures to global retry queue
197- func (e * ExporterHostSyncer ) processExporterInstancesAndBootc (exporterInstances []* api.ExporterInstance , hostSsh ssh.HostManager , hostName string , retryQueue * []RetryItem ) {
207+ func (e * ExporterHostSyncer ) processExporterInstancesAndBootc (exporterInstances []* api.ExporterInstance , hostName string , renderedHost * api.ExporterHost , retryQueue * []RetryItem ) {
208+ // Create SSH connection
209+ hostSsh , err := ssh .NewSSHHostManager (renderedHost )
210+ if err == nil {
211+ _ , err = hostSsh .Status ()
212+ }
213+ if err != nil {
214+ fmt .Printf (" ❌ Failed to create/test SSH connection: %v\n " , err )
215+ // Queue all exporter instances for retry
216+ for _ , exporterInstance := range exporterInstances {
217+ * retryQueue = append (* retryQueue , RetryItem {
218+ ExporterInstance : exporterInstance ,
219+ HostName : hostName ,
220+ RenderedHost : renderedHost ,
221+ Attempts : 1 ,
222+ LastError : err ,
223+ LastAttemptTime : time .Now (),
224+ })
225+ }
226+ // Also queue bootc upgrade for retry
227+ * retryQueue = append (* retryQueue , RetryItem {
228+ ExporterInstance : nil ,
229+ HostName : hostName ,
230+ RenderedHost : renderedHost ,
231+ Attempts : 1 ,
232+ LastError : err ,
233+ LastAttemptTime : time .Now (),
234+ })
235+ return
236+ }
237+
238+ defer func () {
239+ _ = hostSsh .Close ()
240+ }()
198241
242+ // Process exporter instances
199243 for _ , exporterInstance := range exporterInstances {
200244 if err := e .processExporterInstance (exporterInstance , hostSsh ); err != nil {
201245 fmt .Printf (" ❌ Failed to process %s: %v\n " , exporterInstance .Name , err )
202246 * retryQueue = append (* retryQueue , RetryItem {
203247 ExporterInstance : exporterInstance ,
204- HostSSH : hostSsh ,
205248 HostName : hostName ,
249+ RenderedHost : renderedHost ,
206250 Attempts : 1 ,
207251 LastError : err ,
208252 LastAttemptTime : time .Now (),
209253 })
210254 }
211255 }
212256
257+ // Handle bootc upgrade
213258 if err := hostSsh .HandleBootcUpgrade (e .dryRun ); err != nil {
214- // For other errors, just log them and continue
215259 fmt .Printf (" ⚠️ Bootc upgrade error: %v\n " , err )
216260 * retryQueue = append (* retryQueue , RetryItem {
217261 ExporterInstance : nil ,
218- HostSSH : hostSsh ,
219262 HostName : hostName ,
263+ RenderedHost : renderedHost ,
220264 Attempts : 1 ,
221265 LastError : err ,
222266 LastAttemptTime : time .Now (),
@@ -236,9 +280,15 @@ func (e *ExporterHostSyncer) processGlobalRetryQueue(retryQueue []RetryItem) err
236280 for _ , retryItem := range retryQueue {
237281 // Check if we've exceeded max attempts
238282 if retryItem .Attempts >= e .retryConfig .MaxAttempts {
239- fmt .Printf ("💀 Max retry attempts exceeded for %s on %s, giving up: %v\n " ,
240- retryItem .ExporterInstance .Name , retryItem .HostName , retryItem .LastError )
241- finalErrors = append (finalErrors , fmt .Sprintf ("%s on %s: %v" , retryItem .ExporterInstance .Name , retryItem .HostName , retryItem .LastError ))
283+ if retryItem .ExporterInstance == nil {
284+ fmt .Printf ("💀 Max retry attempts exceeded for bootc upgrade on %s, giving up: %v\n " ,
285+ retryItem .HostName , retryItem .LastError )
286+ finalErrors = append (finalErrors , fmt .Sprintf ("bootc upgrade on %s: %v" , retryItem .HostName , retryItem .LastError ))
287+ } else {
288+ fmt .Printf ("💀 Max retry attempts exceeded for %s on %s, giving up: %v\n " ,
289+ retryItem .ExporterInstance .Name , retryItem .HostName , retryItem .LastError )
290+ finalErrors = append (finalErrors , fmt .Sprintf ("%s on %s: %v" , retryItem .ExporterInstance .Name , retryItem .HostName , retryItem .LastError ))
291+ }
242292 continue
243293 }
244294
@@ -257,21 +307,43 @@ func (e *ExporterHostSyncer) processGlobalRetryQueue(retryQueue []RetryItem) err
257307
258308 // Second pass: retry items that are ready
259309 for _ , retryItem := range itemsToRetry {
310+ fmt .Printf ("🔄 Retrying %s on %s (attempt %d/%d)...\n " ,
311+ getRetryItemDescription (retryItem ), retryItem .HostName , retryItem .Attempts + 1 , e .retryConfig .MaxAttempts )
312+
313+ // Create a fresh SSH connection
314+ hostSsh , err := ssh .NewSSHHostManager (retryItem .RenderedHost )
315+ if err != nil {
316+ fmt .Printf ("❌ SSH connection failed for %s: %v\n " , retryItem .HostName , err )
317+ e .addToRetryQueue (& retryItem , err , & nextRetryQueue )
318+ continue
319+ }
320+
321+ defer func () {
322+ _ = hostSsh .Close ()
323+ }()
324+
325+ // Test the connection
326+ status , err := hostSsh .Status ()
327+ if err != nil {
328+ fmt .Printf ("❌ SSH connection test failed for %s: %v\n " , retryItem .HostName , err )
329+ e .addToRetryQueue (& retryItem , err , & nextRetryQueue )
330+ continue
331+ }
332+
333+ fmt .Printf ("✅ SSH connection established for %s: %s\n " , retryItem .HostName , status )
334+
335+ // Now perform the actual retry operation
260336 if retryItem .ExporterInstance == nil {
261- fmt .Printf ("🔄 Retrying bootc upgrade on %s (attempt %d/%d)...\n " ,
262- retryItem .HostName , retryItem .Attempts + 1 , e .retryConfig .MaxAttempts )
263- if err := retryItem .HostSSH .HandleBootcUpgrade (e .dryRun ); err != nil {
337+ // This was a bootc upgrade failure
338+ if err := hostSsh .HandleBootcUpgrade (e .dryRun ); err != nil {
264339 fmt .Printf ("❌ Retry failed for bootc upgrade on %s: %v\n " , retryItem .HostName , err )
265340 e .addToRetryQueue (& retryItem , err , & nextRetryQueue )
266341 } else {
267342 fmt .Printf ("✅ Retry succeeded for bootc upgrade on %s\n " , retryItem .HostName )
268343 }
269344 } else {
270- fmt .Printf ("🔄 Retrying instance %s on %s (attempt %d/%d)...\n " ,
271- retryItem .ExporterInstance .Name , retryItem .HostName , retryItem .Attempts + 1 , e .retryConfig .MaxAttempts )
272-
273- if err := e .processExporterInstance (retryItem .ExporterInstance , retryItem .HostSSH ); err != nil {
274- // Still failed, increment attempts and add to next retry queue
345+ // This was an exporter instance failure
346+ if err := e .processExporterInstance (retryItem .ExporterInstance , hostSsh ); err != nil {
275347 fmt .Printf ("❌ Retry failed for %s on %s: %v\n " , retryItem .ExporterInstance .Name , retryItem .HostName , err )
276348 e .addToRetryQueue (& retryItem , err , & nextRetryQueue )
277349 } else {
@@ -332,28 +404,20 @@ func (e *ExporterHostSyncer) SyncExporterHosts() error {
332404 continue
333405 }
334406
407+ // Apply templates to the host
335408 hostCopy := host .DeepCopy ()
336409 if err := e .tapplier .Apply (hostCopy ); err != nil {
337410 return fmt .Errorf ("error applying template for %s: %w" , host .Name , err )
338411 }
339-
340- fmt .Printf ("\n 💻 Exporter host: %s\n " , hostCopy .Spec .Addresses [0 ])
341-
342- hostSsh , err := ssh .NewSSHHostManager (hostCopy )
343- if err != nil {
344- return fmt .Errorf ("error creating SSH host manager for %s: %w" , host .Name , err )
345- }
346-
347- status , err := hostSsh .Status ()
348- if err != nil {
349- return fmt .Errorf ("error getting status for %s: %w" , host .Name , err )
350- }
351- if e .dryRun {
352- fmt .Printf (" ✅ Connection: %s\n " , status )
412+ // if there are no addresses, skip the host
413+ if len (hostCopy .Spec .Addresses ) == 0 {
414+ fmt .Printf (" ❌ Skipping %s - no addresses\n " , host .Name )
415+ continue
353416 }
417+ fmt .Printf ("\n 💻 Exporter host: %s\n " , hostCopy .Spec .Addresses [0 ])
354418
355419 // Process each exporter instance and add failures to global retry queue
356- e .processExporterInstancesAndBootc (exporterInstances , hostSsh , host .Name , & retryQueue )
420+ e .processExporterInstancesAndBootc (exporterInstances , host .Name , hostCopy , & retryQueue )
357421 }
358422
359423 // Second pass: retry all failed instances globally
0 commit comments