Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
178 changes: 57 additions & 121 deletions pkg/connectorbuilder/connectorbuilder.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ import (
"context"
"errors"
"fmt"
"math"
"slices"
"sort"
"time"
Expand All @@ -23,6 +22,7 @@ import (
"github.com/conductorone/baton-sdk/pkg/crypto"
"github.com/conductorone/baton-sdk/pkg/metrics"
"github.com/conductorone/baton-sdk/pkg/pagination"
"github.com/conductorone/baton-sdk/pkg/retry"
"github.com/conductorone/baton-sdk/pkg/types"
"github.com/conductorone/baton-sdk/pkg/types/tasks"
"github.com/conductorone/baton-sdk/pkg/uhttp"
Expand Down Expand Up @@ -915,54 +915,48 @@ func (b *builderImpl) Grant(ctx context.Context, request *v2.GrantManagerService
tt := tasks.GrantType
l := ctxzap.Extract(ctx)

var (
attempt = 0
baseDelay = 30 * time.Second
rt = request.Entitlement.Resource.Id.ResourceType
)
rt := request.Entitlement.Resource.Id.ResourceType

provisioner, v1ok := b.resourceProvisioners[rt]
provisionerV2, v2ok := b.resourceProvisionersV2[rt]
if !v1ok && !v2ok {
retryer := retry.NewRetryer(ctx, retry.RetryConfig{
MaxAttempts: 3,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MaxAttempts should be 2.
The Temporal activity has a timeout of 5 minutes, so the time from start to close shouldn't exceed that limit. The C1 ticket would end up in a weird state if we retry 3 times.

For example,
image

Customers are not aware of the current status of the ticket.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Three retries would wait 30, 60, then 90 seconds, which is 3 minutes total.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Three minutes seems to be the ideal timing. I haven’t looked into the code to see how C1 tickets change status, but based on my testing, three attempts don’t work — the C1 ticket seems to be stuck.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

image I am not sure if it's because of the dev env. I retried twice, which is 30 + 60 seconds, but the C1 ticket still took around 4 minutes to respond, so I don't think it would work if the last retry takes 90 seconds.

InitialDelay: 15 * time.Second,
MaxDelay: 60 * time.Second,
})

var grantFunc func(ctx context.Context, principal *v2.Resource, entitlement *v2.Entitlement) ([]*v2.Grant, annotations.Annotations, error)
provisioner, ok := b.resourceProvisioners[rt]
if ok {
grantFunc = func(ctx context.Context, principal *v2.Resource, entitlement *v2.Entitlement) ([]*v2.Grant, annotations.Annotations, error) {
annos, err := provisioner.Grant(ctx, principal, entitlement)
if err != nil {
return nil, annos, err
}
return nil, annos, nil
}
}
provisionerV2, ok := b.resourceProvisionersV2[rt]
if ok {
grantFunc = provisionerV2.Grant
}

if grantFunc == nil {
l.Error("error: resource type does not have provisioner configured", zap.String("resource_type", rt))
b.m.RecordTaskFailure(ctx, tt, b.nowFunc().Sub(start))
return nil, fmt.Errorf("error: resource type does not have provisioner configured")
}

for {
if v1ok {
annos, err := provisioner.Grant(ctx, request.Principal, request.Entitlement)
if err != nil {
l.Error("error: grant failed", zap.Error(err))
if !b.shouldWaitAndRetry(ctx, err, baseDelay) || attempt >= 2 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this change I think increased the attempt count fwiw

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, the old code tried twice before erroring. I changed it to three because that seems like a good number of attempts before giving up.

b.m.RecordTaskFailure(ctx, tt, b.nowFunc().Sub(start))
return nil, fmt.Errorf("err: grant failed: %w", err)
}

attempt++
baseDelay *= 2
continue
}

grants, annos, err := grantFunc(ctx, request.Principal, request.Entitlement)
if err == nil {
b.m.RecordTaskSuccess(ctx, tt, b.nowFunc().Sub(start))
return &v2.GrantManagerServiceGrantResponse{Annotations: annos}, nil
return &v2.GrantManagerServiceGrantResponse{Annotations: annos, Grants: grants}, nil
}

grants, annos, err := provisionerV2.Grant(ctx, request.Principal, request.Entitlement)
if err != nil {
l.Error("error: grant failed", zap.Error(err))
if !b.shouldWaitAndRetry(ctx, err, baseDelay) || attempt >= 2 {
b.m.RecordTaskFailure(ctx, tt, b.nowFunc().Sub(start))
return nil, fmt.Errorf("err: grant failed: %w", err)
}

attempt++
baseDelay *= 2
if retryer.ShouldWaitAndRetry(ctx, err) {
continue
}

b.m.RecordTaskSuccess(ctx, tt, b.nowFunc().Sub(start))
return &v2.GrantManagerServiceGrantResponse{Annotations: annos, Grants: grants}, nil
l.Error("error: grant failed", zap.Error(err))
b.m.RecordTaskFailure(ctx, tt, b.nowFunc().Sub(start))
return nil, fmt.Errorf("err: grant failed: %w", err)
}
}

Expand All @@ -975,54 +969,42 @@ func (b *builderImpl) Revoke(ctx context.Context, request *v2.GrantManagerServic

l := ctxzap.Extract(ctx)

var (
attempt = 0
baseDelay = 30 * time.Second
rt = request.Grant.Entitlement.Resource.Id.ResourceType
)
rt := request.Grant.Entitlement.Resource.Id.ResourceType

retryer := retry.NewRetryer(ctx, retry.RetryConfig{
MaxAttempts: 3,
InitialDelay: 15 * time.Second,
MaxDelay: 60 * time.Second,
})

provisioner, v1ok := b.resourceProvisioners[rt]
provisionerV2, v2ok := b.resourceProvisionersV2[rt]
if !v1ok && !v2ok {
var revokeFunc func(ctx context.Context, grant *v2.Grant) (annotations.Annotations, error)
provisioner, ok := b.resourceProvisioners[rt]
if ok {
revokeFunc = provisioner.Revoke
}
provisionerV2, ok := b.resourceProvisionersV2[rt]
if ok {
revokeFunc = provisionerV2.Revoke
}

if revokeFunc == nil {
l.Error("error: resource type does not have provisioner configured", zap.String("resource_type", rt))
b.m.RecordTaskFailure(ctx, tt, b.nowFunc().Sub(start))
return nil, fmt.Errorf("error: resource type does not have provisioner configured")
}

for {
if v1ok {
annos, err := provisioner.Revoke(ctx, request.Grant)
if err != nil {
l.Error("error: revoke failed", zap.Error(err))

if !b.shouldWaitAndRetry(ctx, err, baseDelay) || attempt >= 2 {
b.m.RecordTaskFailure(ctx, tt, b.nowFunc().Sub(start))
return nil, fmt.Errorf("error: revoke failed: %w", err)
}

attempt++
baseDelay *= 2
continue
}
annos, err := revokeFunc(ctx, request.Grant)
if err == nil {
b.m.RecordTaskSuccess(ctx, tt, b.nowFunc().Sub(start))
return &v2.GrantManagerServiceRevokeResponse{Annotations: annos}, nil
}

annos, err := provisionerV2.Revoke(ctx, request.Grant)
if err != nil {
l.Error("error: revoke failed", zap.Error(err))

if !b.shouldWaitAndRetry(ctx, err, baseDelay) || attempt >= 2 {
b.m.RecordTaskFailure(ctx, tt, b.nowFunc().Sub(start))
return nil, fmt.Errorf("error: revoke failed: %w", err)
}

attempt++
baseDelay *= 2
if retryer.ShouldWaitAndRetry(ctx, err) {
continue
}

b.m.RecordTaskSuccess(ctx, tt, b.nowFunc().Sub(start))
return &v2.GrantManagerServiceRevokeResponse{Annotations: annos}, nil
l.Error("error: revoke failed", zap.Error(err))
b.m.RecordTaskFailure(ctx, tt, b.nowFunc().Sub(start))
return nil, fmt.Errorf("error: revoke failed: %w", err)
}
}

Expand Down Expand Up @@ -1368,49 +1350,3 @@ func (b *builderImpl) GetActionStatus(ctx context.Context, request *v2.GetAction
b.m.RecordTaskSuccess(ctx, tt, b.nowFunc().Sub(start))
return resp, nil
}

func (b *builderImpl) shouldWaitAndRetry(ctx context.Context, err error, baseDelay time.Duration) bool {
ctx, span := tracer.Start(ctx, "provisioner.shouldWaitAndRetry")
defer span.End()

if err == nil {
return false
}

if status.Code(err) != codes.Unavailable && status.Code(err) != codes.DeadlineExceeded {
return false
}

// If error contains rate limit data, use that instead
if st, ok := status.FromError(err); ok {
details := st.Details()
for _, detail := range details {
if rlData, ok := detail.(*v2.RateLimitDescription); ok {
waitResetAt := time.Until(rlData.ResetAt.AsTime())
if waitResetAt <= 0 {
continue
}
duration := time.Duration(rlData.Limit)
if duration <= 0 {
continue
}
waitResetAt /= duration
// Round up to the nearest second to make sure we don't hit the rate limit again
waitResetAt = time.Duration(math.Ceil(waitResetAt.Seconds())) * time.Second
if waitResetAt > 0 {
baseDelay = waitResetAt
break
}
}
}
}

for {
select {
case <-time.After(baseDelay):
return true
case <-ctx.Done():
return false
}
}
}
113 changes: 113 additions & 0 deletions pkg/retry/retry.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
package retry

import (
"context"
"math"
"time"

v2 "github.com/conductorone/baton-sdk/pb/c1/connector/v2"
"github.com/grpc-ecosystem/go-grpc-middleware/logging/zap/ctxzap"
"go.opentelemetry.io/otel"
"go.uber.org/zap"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

var tracer = otel.Tracer("baton-sdk/retry")

// Retryer tracks retry state across a sequence of attempts against a single
// operation, applying linear backoff between failed attempts.
type Retryer struct {
	attempts     uint          // failed attempts observed so far; reset to 0 on success
	maxAttempts  uint          // 0 means no limit on attempts
	initialDelay time.Duration // base delay; attempt N waits roughly N * initialDelay
	maxDelay     time.Duration // upper bound on any single wait
}

// RetryConfig configures a Retryer created by NewRetryer.
type RetryConfig struct {
	MaxAttempts  uint          // 0 means no limit (which is also the default).
	InitialDelay time.Duration // Default is 1 second.
	MaxDelay     time.Duration // Default (and the value used when 0) is 60 seconds.
}

// NewRetryer builds a Retryer from config, substituting defaults for any
// zero-valued delay settings (1s initial delay, 60s max delay). The context
// argument is currently unused but kept for interface stability.
func NewRetryer(ctx context.Context, config RetryConfig) *Retryer {
	initialDelay := config.InitialDelay
	if initialDelay == 0 {
		initialDelay = time.Second
	}
	maxDelay := config.MaxDelay
	if maxDelay == 0 {
		maxDelay = 60 * time.Second
	}
	return &Retryer{
		attempts:     0,
		maxAttempts:  config.MaxAttempts,
		initialDelay: initialDelay,
		maxDelay:     maxDelay,
	}
}

func (r *Retryer) ShouldWaitAndRetry(ctx context.Context, err error) bool {
ctx, span := tracer.Start(ctx, "retry.ShouldWaitAndRetry")
defer span.End()

if err == nil {
r.attempts = 0
return true
}
if status.Code(err) != codes.Unavailable && status.Code(err) != codes.DeadlineExceeded {
return false
}
Comment on lines +55 to +57
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

💡 Verification agent

🧩 Analysis chain

Consider handling additional recoverable gRPC codes (e.g. ResourceExhausted, Internal).

Only Unavailable and DeadlineExceeded are currently retried. Connectors often surface transient throttling under ResourceExhausted and some load-balancers use Internal for “connection reset” conditions. You may unintentionally fail otherwise transient operations.

-	if status.Code(err) != codes.Unavailable && status.Code(err) != codes.DeadlineExceeded {
+	switch status.Code(err) {
+	case codes.Unavailable, codes.DeadlineExceeded, codes.ResourceExhausted, codes.Internal:
+		// retryable
+	default:
 		return false
-	}
+	}

🏁 Script executed:

#!/bin/bash
# Locate retry.go and display the relevant retry logic context
file=$(fd --exact-name retry.go)
echo "Found file: $file"
sed -n '40,80p' "$file"

Length of output: 345


🏁 Script executed:

#!/bin/bash
# Locate retry.go using find
file=$(find . -type f -name "retry.go" | head -n1)
if [ -z "$file" ]; then
  echo "retry.go not found"
  exit 1
fi
echo "Found file: $file"
# Show lines around the retry condition (lines 40–80)
sed -n '40,80p' "$file"

Length of output: 1269


Enhance retry logic to include additional transient gRPC codes

Currently only Unavailable and DeadlineExceeded are retried. In practice you’ll also see transient throttling surfaced as ResourceExhausted and connection‐reset errors as Internal. Without including these, short‐lived failures will fall through and not be retried.

Suggested change in pkg/retry/retry.go around line 55:

-   if status.Code(err) != codes.Unavailable && status.Code(err) != codes.DeadlineExceeded {
-       return false
-   }
+   switch status.Code(err) {
+   case codes.Unavailable,
+        codes.DeadlineExceeded,
+        codes.ResourceExhausted,
+        codes.Internal:
+       // retryable
+   default:
+       return false
+   }

This ensures you’ll back off and retry on throttling (ResourceExhausted) and connection resets (Internal) as well.


r.attempts++
l := ctxzap.Extract(ctx)

if r.maxAttempts > 0 && r.attempts > r.maxAttempts {
l.Warn("max attempts reached", zap.Error(err), zap.Uint("max_attempts", r.maxAttempts))
return false
}

// use linear backoff by default
var wait time.Duration
if r.attempts > math.MaxInt64 {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's a lot of attempts

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It was to get the linter to shut up.

wait = r.maxDelay
} else {
wait = time.Duration(int64(r.attempts)) * r.initialDelay
}

// If error contains rate limit data, use that instead
if st, ok := status.FromError(err); ok {
details := st.Details()
for _, detail := range details {
if rlData, ok := detail.(*v2.RateLimitDescription); ok {
waitResetAt := time.Until(rlData.ResetAt.AsTime())
if waitResetAt <= 0 {
continue
}
duration := time.Duration(rlData.Limit)
if duration <= 0 {
continue
}
waitResetAt /= duration
// Round up to the nearest second to make sure we don't hit the rate limit again
waitResetAt = time.Duration(math.Ceil(waitResetAt.Seconds())) * time.Second
if waitResetAt > 0 {
wait = waitResetAt
break
}
}
}
Comment on lines +75 to +96
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue

Rate-limit wait calculation appears incorrect.

waitResetAt /= duration divides a duration by the numeric limit, yielding an arbitrary nanosecond value.
Usually you want time.Until(reset) + smallBuffer or time.Until(reset)/remaining, not “duration ÷ limit”.

Please revisit the algorithm; otherwise the retryer may sleep far too short or far too long.

}

if wait > r.maxDelay {
wait = r.maxDelay
}

l.Warn("retrying operation", zap.Error(err), zap.Duration("wait", wait))

for {
select {
case <-time.After(wait):
return true
case <-ctx.Done():
return false
}
}
}
50 changes: 50 additions & 0 deletions pkg/retry/retry_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
package retry

import (
"context"
"errors"
"testing"
"time"

"github.com/stretchr/testify/require"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/status"
)

// TestBasicRetry exercises the retry decision logic end to end: error
// classification, the attempt counter reset on success, backoff timing,
// and exhaustion of the attempt budget.
func TestBasicRetry(t *testing.T) {
	ctx := context.Background()
	r := NewRetryer(ctx, RetryConfig{
		MaxAttempts:  3,
		InitialDelay: 100 * time.Millisecond,
		MaxDelay:     1 * time.Second,
	})

	require.False(t, r.ShouldWaitAndRetry(ctx, errors.New("generic unrecoverable error")),
		"generic unrecoverable error should not be retried")

	require.True(t, r.ShouldWaitAndRetry(ctx, status.Error(codes.Unavailable, "recoverable error")),
		"recoverable error should be retried")

	require.False(t, r.ShouldWaitAndRetry(ctx, status.Error(codes.Unknown, "unknown error")),
		"unknown error should not be retried")

	// A nil error reports retryable and, as a side effect, resets attempts to 0.
	require.True(t, r.ShouldWaitAndRetry(ctx, nil), "nil error should be retried")

	require.True(t, r.ShouldWaitAndRetry(ctx, status.Error(codes.Unavailable, "first attempt")),
		"first attempt should be retried")

	began := time.Now()
	require.True(t, r.ShouldWaitAndRetry(ctx, status.Error(codes.Unavailable, "second attempt")),
		"second attempt should be retried")
	waited := time.Since(began)
	require.Greater(t, waited, 100*time.Millisecond, "second attempt should take longer than 100ms")
	require.Less(t, waited, 300*time.Millisecond, "second attempt should take less than 300ms")

	require.True(t, r.ShouldWaitAndRetry(ctx, status.Error(codes.Unavailable, "third attempt")),
		"third attempt should be retried")

	require.False(t, r.ShouldWaitAndRetry(ctx, status.Error(codes.Unavailable, "fourth attempt")),
		"fourth attempt should not be retried")
}
Loading
Loading