Skip to content

Commit 8f608da

Browse files
authored
Merge pull request #20 from jumpstarter-dev/retry-all-ssh
Also retry SSH connections
2 parents e2e0d13 + 490d49f commit 8f608da

File tree

2 files changed

+118
-33
lines changed

2 files changed

+118
-33
lines changed

internal/exporter/host/host.go

Lines changed: 97 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,8 @@ import (
3333
// RetryItem represents a failed exporter instance that needs to be retried
3434
type RetryItem struct {
3535
ExporterInstance *api.ExporterInstance
36-
HostSSH ssh.HostManager
3736
HostName string
37+
RenderedHost *api.ExporterHost // The rendered host with templates applied
3838
Attempts int
3939
LastError error
4040
LastAttemptTime time.Time
@@ -70,10 +70,11 @@ func NewExporterHostSyncer(cfg *config.Config,
7070
dryRun: dryRun,
7171
debugConfigs: debugConfigs,
7272
exporterFilter: exporterFilter,
73+
// This allows roughly 10 minutes of retries, with the backoff delay between attempts capped at 120 seconds.
7374
retryConfig: RetryConfig{
74-
MaxAttempts: 3,
75+
MaxAttempts: 9,
7576
BaseDelay: 5 * time.Second,
76-
MaxDelay: 60 * time.Second,
77+
MaxDelay: 120 * time.Second,
7778
BackoffMultiplier: 2.0,
7879
},
7980
}
@@ -193,30 +194,73 @@ func (e *ExporterHostSyncer) addToRetryQueue(retryItem *RetryItem, err error, ne
193194
*nextRetryQueue = append(*nextRetryQueue, *retryItem)
194195
}
195196

197+
// getRetryItemDescription returns a human-readable description of what is being retried
198+
func getRetryItemDescription(retryItem RetryItem) string {
199+
if retryItem.ExporterInstance == nil {
200+
return "bootc upgrade"
201+
} else {
202+
return fmt.Sprintf("instance %s", retryItem.ExporterInstance.Name)
203+
}
204+
}
205+
196206
// processExporterInstancesAndBootc processes exporter instances and adds failures to global retry queue
197-
func (e *ExporterHostSyncer) processExporterInstancesAndBootc(exporterInstances []*api.ExporterInstance, hostSsh ssh.HostManager, hostName string, retryQueue *[]RetryItem) {
207+
func (e *ExporterHostSyncer) processExporterInstancesAndBootc(exporterInstances []*api.ExporterInstance, hostName string, renderedHost *api.ExporterHost, retryQueue *[]RetryItem) {
208+
// Create SSH connection
209+
hostSsh, err := ssh.NewSSHHostManager(renderedHost)
210+
if err == nil {
211+
_, err = hostSsh.Status()
212+
}
213+
if err != nil {
214+
fmt.Printf(" ❌ Failed to create/test SSH connection: %v\n", err)
215+
// Queue all exporter instances for retry
216+
for _, exporterInstance := range exporterInstances {
217+
*retryQueue = append(*retryQueue, RetryItem{
218+
ExporterInstance: exporterInstance,
219+
HostName: hostName,
220+
RenderedHost: renderedHost,
221+
Attempts: 1,
222+
LastError: err,
223+
LastAttemptTime: time.Now(),
224+
})
225+
}
226+
// Also queue bootc upgrade for retry
227+
*retryQueue = append(*retryQueue, RetryItem{
228+
ExporterInstance: nil,
229+
HostName: hostName,
230+
RenderedHost: renderedHost,
231+
Attempts: 1,
232+
LastError: err,
233+
LastAttemptTime: time.Now(),
234+
})
235+
return
236+
}
237+
238+
defer func() {
239+
_ = hostSsh.Close()
240+
}()
198241

242+
// Process exporter instances
199243
for _, exporterInstance := range exporterInstances {
200244
if err := e.processExporterInstance(exporterInstance, hostSsh); err != nil {
201245
fmt.Printf(" ❌ Failed to process %s: %v\n", exporterInstance.Name, err)
202246
*retryQueue = append(*retryQueue, RetryItem{
203247
ExporterInstance: exporterInstance,
204-
HostSSH: hostSsh,
205248
HostName: hostName,
249+
RenderedHost: renderedHost,
206250
Attempts: 1,
207251
LastError: err,
208252
LastAttemptTime: time.Now(),
209253
})
210254
}
211255
}
212256

257+
// Handle bootc upgrade
213258
if err := hostSsh.HandleBootcUpgrade(e.dryRun); err != nil {
214-
// For other errors, just log them and continue
215259
fmt.Printf(" ⚠️ Bootc upgrade error: %v\n", err)
216260
*retryQueue = append(*retryQueue, RetryItem{
217261
ExporterInstance: nil,
218-
HostSSH: hostSsh,
219262
HostName: hostName,
263+
RenderedHost: renderedHost,
220264
Attempts: 1,
221265
LastError: err,
222266
LastAttemptTime: time.Now(),
@@ -236,9 +280,15 @@ func (e *ExporterHostSyncer) processGlobalRetryQueue(retryQueue []RetryItem) err
236280
for _, retryItem := range retryQueue {
237281
// Check if we've exceeded max attempts
238282
if retryItem.Attempts >= e.retryConfig.MaxAttempts {
239-
fmt.Printf("💀 Max retry attempts exceeded for %s on %s, giving up: %v\n",
240-
retryItem.ExporterInstance.Name, retryItem.HostName, retryItem.LastError)
241-
finalErrors = append(finalErrors, fmt.Sprintf("%s on %s: %v", retryItem.ExporterInstance.Name, retryItem.HostName, retryItem.LastError))
283+
if retryItem.ExporterInstance == nil {
284+
fmt.Printf("💀 Max retry attempts exceeded for bootc upgrade on %s, giving up: %v\n",
285+
retryItem.HostName, retryItem.LastError)
286+
finalErrors = append(finalErrors, fmt.Sprintf("bootc upgrade on %s: %v", retryItem.HostName, retryItem.LastError))
287+
} else {
288+
fmt.Printf("💀 Max retry attempts exceeded for %s on %s, giving up: %v\n",
289+
retryItem.ExporterInstance.Name, retryItem.HostName, retryItem.LastError)
290+
finalErrors = append(finalErrors, fmt.Sprintf("%s on %s: %v", retryItem.ExporterInstance.Name, retryItem.HostName, retryItem.LastError))
291+
}
242292
continue
243293
}
244294

@@ -257,21 +307,43 @@ func (e *ExporterHostSyncer) processGlobalRetryQueue(retryQueue []RetryItem) err
257307

258308
// Second pass: retry items that are ready
259309
for _, retryItem := range itemsToRetry {
310+
fmt.Printf("🔄 Retrying %s on %s (attempt %d/%d)...\n",
311+
getRetryItemDescription(retryItem), retryItem.HostName, retryItem.Attempts+1, e.retryConfig.MaxAttempts)
312+
313+
// Create a fresh SSH connection
314+
hostSsh, err := ssh.NewSSHHostManager(retryItem.RenderedHost)
315+
if err != nil {
316+
fmt.Printf("❌ SSH connection failed for %s: %v\n", retryItem.HostName, err)
317+
e.addToRetryQueue(&retryItem, err, &nextRetryQueue)
318+
continue
319+
}
320+
321+
defer func() {
322+
_ = hostSsh.Close()
323+
}()
324+
325+
// Test the connection
326+
status, err := hostSsh.Status()
327+
if err != nil {
328+
fmt.Printf("❌ SSH connection test failed for %s: %v\n", retryItem.HostName, err)
329+
e.addToRetryQueue(&retryItem, err, &nextRetryQueue)
330+
continue
331+
}
332+
333+
fmt.Printf("✅ SSH connection established for %s: %s\n", retryItem.HostName, status)
334+
335+
// Now perform the actual retry operation
260336
if retryItem.ExporterInstance == nil {
261-
fmt.Printf("🔄 Retrying bootc upgrade on %s (attempt %d/%d)...\n",
262-
retryItem.HostName, retryItem.Attempts+1, e.retryConfig.MaxAttempts)
263-
if err := retryItem.HostSSH.HandleBootcUpgrade(e.dryRun); err != nil {
337+
// This was a bootc upgrade failure
338+
if err := hostSsh.HandleBootcUpgrade(e.dryRun); err != nil {
264339
fmt.Printf("❌ Retry failed for bootc upgrade on %s: %v\n", retryItem.HostName, err)
265340
e.addToRetryQueue(&retryItem, err, &nextRetryQueue)
266341
} else {
267342
fmt.Printf("✅ Retry succeeded for bootc upgrade on %s\n", retryItem.HostName)
268343
}
269344
} else {
270-
fmt.Printf("🔄 Retrying instance %s on %s (attempt %d/%d)...\n",
271-
retryItem.ExporterInstance.Name, retryItem.HostName, retryItem.Attempts+1, e.retryConfig.MaxAttempts)
272-
273-
if err := e.processExporterInstance(retryItem.ExporterInstance, retryItem.HostSSH); err != nil {
274-
// Still failed, increment attempts and add to next retry queue
345+
// This was an exporter instance failure
346+
if err := e.processExporterInstance(retryItem.ExporterInstance, hostSsh); err != nil {
275347
fmt.Printf("❌ Retry failed for %s on %s: %v\n", retryItem.ExporterInstance.Name, retryItem.HostName, err)
276348
e.addToRetryQueue(&retryItem, err, &nextRetryQueue)
277349
} else {
@@ -332,28 +404,20 @@ func (e *ExporterHostSyncer) SyncExporterHosts() error {
332404
continue
333405
}
334406

407+
// Apply templates to the host
335408
hostCopy := host.DeepCopy()
336409
if err := e.tapplier.Apply(hostCopy); err != nil {
337410
return fmt.Errorf("error applying template for %s: %w", host.Name, err)
338411
}
339-
340-
fmt.Printf("\n💻 Exporter host: %s\n", hostCopy.Spec.Addresses[0])
341-
342-
hostSsh, err := ssh.NewSSHHostManager(hostCopy)
343-
if err != nil {
344-
return fmt.Errorf("error creating SSH host manager for %s: %w", host.Name, err)
345-
}
346-
347-
status, err := hostSsh.Status()
348-
if err != nil {
349-
return fmt.Errorf("error getting status for %s: %w", host.Name, err)
350-
}
351-
if e.dryRun {
352-
fmt.Printf(" ✅ Connection: %s\n", status)
412+
// if there are no addresses, skip the host
413+
if len(hostCopy.Spec.Addresses) == 0 {
414+
fmt.Printf(" ❌ Skipping %s - no addresses\n", host.Name)
415+
continue
353416
}
417+
fmt.Printf("\n💻 Exporter host: %s\n", hostCopy.Spec.Addresses[0])
354418

355419
// Process each exporter instance and add failures to global retry queue
356-
e.processExporterInstancesAndBootc(exporterInstances, hostSsh, host.Name, &retryQueue)
420+
e.processExporterInstancesAndBootc(exporterInstances, host.Name, hostCopy, &retryQueue)
357421
}
358422

359423
// Second pass: retry all failed instances globally

internal/exporter/ssh/ssh.go

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ type HostManager interface {
3737
RunHostCommand(command string) (*CommandResult, error)
3838
GetBootcStatus() BootcStatus
3939
HandleBootcUpgrade(dryRun bool) error
40+
Close() error
4041
}
4142

4243
// CommandResult represents the result of running a command via SSH
@@ -545,3 +546,23 @@ func (m *SSHHostManager) createSshClient() (*ssh.Client, error) {
545546
return client, nil
546547

547548
}
549+
550+
func (m *SSHHostManager) Close() error {
551+
var sftpCloseError error = nil
552+
var sshCloseError error = nil
553+
if m.sftpClient != nil {
554+
sftpCloseError = m.sftpClient.Close()
555+
}
556+
if m.sshClient != nil {
557+
sshCloseError = m.sshClient.Close()
558+
}
559+
if sshCloseError != nil {
560+
return sshCloseError
561+
}
562+
563+
if sftpCloseError != nil {
564+
return sftpCloseError
565+
}
566+
567+
return nil
568+
}

0 commit comments

Comments
 (0)