Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 74 additions & 19 deletions internal/exporter/host/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ import (
// RetryItem represents a failed exporter instance that needs to be retried
type RetryItem struct {
ExporterInstance *api.ExporterInstance
HostSSH ssh.HostManager
HostName string
RenderedHost *api.ExporterHost // The rendered host with templates applied
Attempts int
LastError error
LastAttemptTime time.Time
Expand Down Expand Up @@ -193,16 +193,24 @@ func (e *ExporterHostSyncer) addToRetryQueue(retryItem *RetryItem, err error, ne
*nextRetryQueue = append(*nextRetryQueue, *retryItem)
}

// processExporterInstancesAndBootc processes exporter instances and adds failures to global retry queue
func (e *ExporterHostSyncer) processExporterInstancesAndBootc(exporterInstances []*api.ExporterInstance, hostSsh ssh.HostManager, hostName string, retryQueue *[]RetryItem) {
// getRetryItemDescription returns a short, human-readable label for the
// operation a retry item represents, suitable for embedding in log lines.
//
// An item whose ExporterInstance is nil is labelled "bootc upgrade"; any
// other item is labelled "instance <name>", where <name> is the Name of its
// ExporterInstance.
//
// NOTE(review): the label is derived solely from ExporterInstance; confirm
// that every producer of nil-instance items really means a bootc upgrade.
func getRetryItemDescription(retryItem RetryItem) string {
	// Guard clause: a host-level item has no instance name to print.
	if retryItem.ExporterInstance == nil {
		return "bootc upgrade"
	}
	return fmt.Sprintf("instance %s", retryItem.ExporterInstance.Name)
}

// processExporterInstancesAndBootc processes exporter instances and adds failures to global retry queue
func (e *ExporterHostSyncer) processExporterInstancesAndBootc(exporterInstances []*api.ExporterInstance, hostSsh ssh.HostManager, hostName string, renderedHost *api.ExporterHost, retryQueue *[]RetryItem) {
for _, exporterInstance := range exporterInstances {
if err := e.processExporterInstance(exporterInstance, hostSsh); err != nil {
fmt.Printf(" ❌ Failed to process %s: %v\n", exporterInstance.Name, err)
*retryQueue = append(*retryQueue, RetryItem{
ExporterInstance: exporterInstance,
HostSSH: hostSsh,
HostName: hostName,
RenderedHost: renderedHost,
Attempts: 1,
LastError: err,
LastAttemptTime: time.Now(),
Expand All @@ -215,8 +223,8 @@ func (e *ExporterHostSyncer) processExporterInstancesAndBootc(exporterInstances
fmt.Printf(" ⚠️ Bootc upgrade error: %v\n", err)
*retryQueue = append(*retryQueue, RetryItem{
ExporterInstance: nil,
HostSSH: hostSsh,
HostName: hostName,
RenderedHost: renderedHost,
Attempts: 1,
LastError: err,
LastAttemptTime: time.Now(),
Expand All @@ -236,9 +244,15 @@ func (e *ExporterHostSyncer) processGlobalRetryQueue(retryQueue []RetryItem) err
for _, retryItem := range retryQueue {
// Check if we've exceeded max attempts
if retryItem.Attempts >= e.retryConfig.MaxAttempts {
fmt.Printf("💀 Max retry attempts exceeded for %s on %s, giving up: %v\n",
retryItem.ExporterInstance.Name, retryItem.HostName, retryItem.LastError)
finalErrors = append(finalErrors, fmt.Sprintf("%s on %s: %v", retryItem.ExporterInstance.Name, retryItem.HostName, retryItem.LastError))
if retryItem.ExporterInstance == nil {
fmt.Printf("💀 Max retry attempts exceeded for bootc upgrade on %s, giving up: %v\n",
retryItem.HostName, retryItem.LastError)
finalErrors = append(finalErrors, fmt.Sprintf("bootc upgrade on %s: %v", retryItem.HostName, retryItem.LastError))
} else {
fmt.Printf("💀 Max retry attempts exceeded for %s on %s, giving up: %v\n",
retryItem.ExporterInstance.Name, retryItem.HostName, retryItem.LastError)
finalErrors = append(finalErrors, fmt.Sprintf("%s on %s: %v", retryItem.ExporterInstance.Name, retryItem.HostName, retryItem.LastError))
}
continue
}

Expand All @@ -257,21 +271,40 @@ func (e *ExporterHostSyncer) processGlobalRetryQueue(retryQueue []RetryItem) err

// Second pass: retry items that are ready
for _, retryItem := range itemsToRetry {
// Always create a fresh SSH connection for every retry attempt
fmt.Printf("🔄 Retrying %s on %s (attempt %d/%d)...\n",
getRetryItemDescription(retryItem), retryItem.HostName, retryItem.Attempts+1, e.retryConfig.MaxAttempts)

// Create a fresh SSH connection
hostSsh, err := ssh.NewSSHHostManager(retryItem.RenderedHost)
if err != nil {
fmt.Printf("❌ SSH connection failed for %s: %v\n", retryItem.HostName, err)
e.addToRetryQueue(&retryItem, err, &nextRetryQueue)
continue
}

// Test the connection
status, err := hostSsh.Status()
if err != nil {
fmt.Printf("❌ SSH connection test failed for %s: %v\n", retryItem.HostName, err)
e.addToRetryQueue(&retryItem, err, &nextRetryQueue)
continue
}

fmt.Printf("✅ SSH connection established for %s: %s\n", retryItem.HostName, status)

// Now perform the actual retry operation
if retryItem.ExporterInstance == nil {
fmt.Printf("🔄 Retrying bootc upgrade on %s (attempt %d/%d)...\n",
retryItem.HostName, retryItem.Attempts+1, e.retryConfig.MaxAttempts)
if err := retryItem.HostSSH.HandleBootcUpgrade(e.dryRun); err != nil {
// This was a bootc upgrade failure
if err := hostSsh.HandleBootcUpgrade(e.dryRun); err != nil {
fmt.Printf("❌ Retry failed for bootc upgrade on %s: %v\n", retryItem.HostName, err)
e.addToRetryQueue(&retryItem, err, &nextRetryQueue)
} else {
fmt.Printf("✅ Retry succeeded for bootc upgrade on %s\n", retryItem.HostName)
}
} else {
fmt.Printf("🔄 Retrying instance %s on %s (attempt %d/%d)...\n",
retryItem.ExporterInstance.Name, retryItem.HostName, retryItem.Attempts+1, e.retryConfig.MaxAttempts)

if err := e.processExporterInstance(retryItem.ExporterInstance, retryItem.HostSSH); err != nil {
// Still failed, increment attempts and add to next retry queue
// This was an exporter instance failure
if err := e.processExporterInstance(retryItem.ExporterInstance, hostSsh); err != nil {
fmt.Printf("❌ Retry failed for %s on %s: %v\n", retryItem.ExporterInstance.Name, retryItem.HostName, err)
e.addToRetryQueue(&retryItem, err, &nextRetryQueue)
} else {
Expand Down Expand Up @@ -332,28 +365,50 @@ func (e *ExporterHostSyncer) SyncExporterHosts() error {
continue
}

// Apply templates to the host
hostCopy := host.DeepCopy()
if err := e.tapplier.Apply(hostCopy); err != nil {
return fmt.Errorf("error applying template for %s: %w", host.Name, err)
}

fmt.Printf("\n💻 Exporter host: %s\n", hostCopy.Spec.Addresses[0])

// Try to create SSH connection
hostSsh, err := ssh.NewSSHHostManager(hostCopy)
if err != nil {
return fmt.Errorf("error creating SSH host manager for %s: %w", host.Name, err)
fmt.Printf(" ❌ Failed to create SSH connection: %v\n", err)
// Add SSH connection failure to retry queue
retryQueue = append(retryQueue, RetryItem{
ExporterInstance: nil,
HostName: host.Name,
RenderedHost: hostCopy,
Attempts: 1,
LastError: err,
LastAttemptTime: time.Now(),
})
continue
}

status, err := hostSsh.Status()
if err != nil {
return fmt.Errorf("error getting status for %s: %w", host.Name, err)
fmt.Printf(" ❌ Failed to get SSH status: %v\n", err)
// Add SSH status failure to retry queue
retryQueue = append(retryQueue, RetryItem{
ExporterInstance: nil,
HostName: host.Name,
RenderedHost: hostCopy,
Attempts: 1,
LastError: err,
LastAttemptTime: time.Now(),
})
continue
}
if e.dryRun {
fmt.Printf(" ✅ Connection: %s\n", status)
}

// Process each exporter instance and add failures to global retry queue
e.processExporterInstancesAndBootc(exporterInstances, hostSsh, host.Name, &retryQueue)
e.processExporterInstancesAndBootc(exporterInstances, hostSsh, host.Name, hostCopy, &retryQueue)
}

// Second pass: retry all failed instances globally
Expand Down