Skip to content

Commit 833c79c

Browse files
authored
Merge pull request #195 from pfeifferj/feat-circuit-breaker-logging
feat: improve logging for circuitbreaker
2 parents 08628e7 + 07efdd2 commit 833c79c

File tree

5 files changed

21 additions and 7 deletions

5 files changed

21 additions and 7 deletions

docs/troubleshooting.md

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,7 @@ kubectl patch ibmnodeclass YOUR-NODECLASS --type='merge' \
104104

105105
**Step 2c: Test Connectivity from Node**
106106
```bash
107-
# Attach floating IP for debugging
107+
# Attach floating IP for debugging
108108

109109
# Then SSH and test
110110
ssh -i ~/.ssh/eb root@FLOATING_IP
@@ -190,7 +190,7 @@ kubectl get ibmnodeclass YOUR-NODECLASS -o yaml
190190
```bash
191191
# Verify instances are created in correct resource group
192192
ibmcloud is instances --output json | \
193-
jq '.[] | select(.name | contains("nodepool")) |
193+
jq '.[] | select(.name | contains("nodepool")) |
194194
{name: .name, resource_group: .resource_group.id}'
195195

196196
# Should match the resource group in IBMNodeClass

pkg/cloudprovider/circuitbreaker.go

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -212,11 +212,20 @@ func (cb *CircuitBreaker) RecordFailure(nodeClass, region string, err error) {
212212
if recentFailures >= cb.config.FailureThreshold {
213213
if cb.state != CircuitBreakerOpen {
214214
cb.transitionToOpen()
215+
// Include recent failure details for better debugging
216+
recentErrors := make([]string, 0, len(cb.failures))
217+
for _, f := range cb.failures {
218+
if f.Timestamp.After(time.Now().Add(-cb.config.FailureWindow)) {
219+
recentErrors = append(recentErrors, fmt.Sprintf("%s: %s", f.Timestamp.Format("15:04:05"), f.Error))
220+
}
221+
}
215222
cb.logger.Error(fmt.Errorf("circuit breaker opened due to failure threshold exceeded"), "Circuit breaker OPENED",
216223
"failures", recentFailures,
217224
"threshold", cb.config.FailureThreshold,
218225
"nodeClass", nodeClass,
219-
"region", region)
226+
"region", region,
227+
"recentErrors", recentErrors,
228+
"recoveryTimeout", cb.config.RecoveryTimeout)
220229
}
221230
} else if cb.state == CircuitBreakerHalfOpen {
222231
// Failure in half-open state - go back to open

pkg/cloudprovider/cloudprovider.go

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -298,7 +298,7 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
298298
"nodeClass", nodeClass.Name,
299299
"region", nodeClass.Spec.Region)
300300
c.recorder.Publish(ibmevents.NodeClaimCircuitBreakerBlocked(nodeClaim, cbErr.Error()))
301-
return nil, cloudprovider.NewInsufficientCapacityError(fmt.Errorf("circuit breaker blocked provisioning: %w", cbErr))
301+
return nil, fmt.Errorf("provisioning temporarily blocked by circuit breaker: %w", cbErr)
302302
}
303303

304304
// Get the appropriate instance provider based on NodeClass configuration
@@ -311,7 +311,12 @@ func (c *CloudProvider) Create(ctx context.Context, nodeClaim *karpv1.NodeClaim)
311311

312312
node, err := instanceProvider.Create(ctx, nodeClaim)
313313
if err != nil {
314-
log.Error(err, "Failed to create instance")
314+
// Log the actual error details for better troubleshooting
315+
log.Error(err, "Failed to create instance",
316+
"nodeClass", nodeClass.Name,
317+
"region", nodeClass.Spec.Region,
318+
"zone", nodeClass.Spec.Zone,
319+
"instanceTypes", lo.Map(compatible, func(it *cloudprovider.InstanceType, _ int) string { return it.Name }))
315320
c.circuitBreaker.RecordFailure(nodeClass.Name, nodeClass.Spec.Region, err)
316321
return nil, fmt.Errorf("creating instance, %w", err)
317322
}

pkg/cloudprovider/cloudprovider_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -576,7 +576,7 @@ func TestCloudProvider_Create_CircuitBreakerEventPublishing(t *testing.T) {
576576

577577
// Verify error
578578
assert.Error(t, err)
579-
assert.Contains(t, err.Error(), "circuit breaker blocked provisioning")
579+
assert.Contains(t, err.Error(), "provisioning temporarily blocked by circuit breaker")
580580

581581
// Verify the correct event was published
582582
assert.Len(t, eventRecorder.events, 1)

pkg/providers/common/types/bootstrap_test.go

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -248,7 +248,7 @@ func TestParseKubeconfigWithSpacing(t *testing.T) {
248248
clusters:
249249
- cluster:
250250
certificate-authority-data: ` + base64.StdEncoding.EncodeToString([]byte("test-ca-data")) + `
251-
server: https://test.example.com:6443
251+
server: https://test.example.com:6443
252252
name: test-cluster`
253253

254254
endpoint, caData, err := ParseKubeconfig(kubeconfigWithSpacing)

0 commit comments

Comments
 (0)