[azservicebus] Make it so retry sleeps can be cancelled. (Azure#19216)

richardpark-msft · web-flow · commit eb78c5ea5f83 · 2022-09-27T14:31:17.000-07:00
Make it so retry sleeps can be cancelled. Without this, the user has to wait for the entire sleep to finish before they can exit the function.
diff --git a/sdk/messaging/azservicebus/CHANGELOG.md b/sdk/messaging/azservicebus/CHANGELOG.md
@@ -10,7 +10,8 @@
 
 - AcceptNextSessionForQueue and AcceptNextSessionForSubscription now return an azservicebus.Error with 
   Code set to CodeTimeout when they fail due to no sessions being available. Examples for this have 
-  been added for `AcceptNextSessionForQueue`. PR#TBD.
+  been added for `AcceptNextSessionForQueue`. PR#19113.
+- Retries now respect cancellation when they're in the "delay before next try" phase. PR#19216.
 
 ### Other Changes
 
diff --git a/sdk/messaging/azservicebus/internal/utils/retrier.go b/sdk/messaging/azservicebus/internal/utils/retrier.go
@@ -51,7 +51,12 @@ func Retry(ctx context.Context, eventName log.Event, operation string, fn func(c
 		if i > 0 {
 			sleep := calcDelay(ro, i)
 			log.Writef(eventName, "(%s) Retry attempt %d sleeping for %s", operation, i, sleep)
-			time.Sleep(sleep)
+
+			select {
+			case <-ctx.Done():
+				return ctx.Err()
+			case <-time.After(sleep):
+			}
 		}
 
 		args := RetryFnArgs{
diff --git a/sdk/messaging/azservicebus/internal/utils/retrier_test.go b/sdk/messaging/azservicebus/internal/utils/retrier_test.go
@@ -86,30 +86,6 @@ func TestRetrier(t *testing.T) {
 		require.EqualValues(t, 1, called)
 	})
 
-	t.Run("Cancellation", func(t *testing.T) {
-		ctx, cancel := context.WithCancel(context.Background())
-		cancel()
-
-		isFatalFn := func(err error) bool {
-			return errors.Is(err, context.Canceled)
-		}
-
-		// it's up to
-		err := Retry(ctx, testLogEvent, "notused", func(ctx context.Context, args *RetryFnArgs) error {
-			// NOTE: it's up to the underlying function to handle cancellation. `Retry` doesn't
-			// do anything but propagate it.
-			select {
-			case <-ctx.Done():
-			default:
-				require.Fail(t, "Context should have been cancelled")
-			}
-
-			return context.Canceled
-		}, isFatalFn, exported.RetryOptions{})
-
-		require.ErrorIs(t, context.Canceled, err)
-	})
-
 	t.Run("ResetAttempts", func(t *testing.T) {
 		isFatalFn := func(err error) bool {
 			return errors.Is(err, context.Canceled)
@@ -165,6 +141,77 @@ func TestRetrier(t *testing.T) {
 	})
 }
 
+// TestCancellationCancelsSleep checks that a cancelled context ends the delay between retries.
+func TestCancellationCancelsSleep(t *testing.T) {
+	ctx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	isFatalFn := func(err error) bool {
+		return errors.Is(err, context.Canceled)
+	}
+
+	called := 0
+	err := Retry(ctx, testLogEvent, "notused", func(ctx context.Context, args *RetryFnArgs) error {
+		called++
+		return errors.New("try again")
+	}, isFatalFn, exported.RetryOptions{
+		RetryDelay: time.Hour, // the test can only finish promptly if cancellation interrupts the sleep
+	})
+
+	require.Error(t, err)
+	require.ErrorIs(t, err, context.Canceled)
+	require.Equal(t, 1, called) // expected value first, as testify's signature requires
+}
+
+// TestCancellationFromUserFunc checks that a fatal error returned by the user callback is what Retry returns.
+func TestCancellationFromUserFunc(t *testing.T) {
+	alreadyCancelledCtx, cancel := context.WithCancel(context.Background())
+	cancel()
+
+	canceledfromFunc := errors.New("the user func got the cancellation signal")
+	isFatalFn := func(err error) bool {
+		return errors.Is(err, canceledfromFunc)
+	}
+
+	called := 0
+	err := Retry(alreadyCancelledCtx, testLogEvent, "notused", func(ctx context.Context, args *RetryFnArgs) error {
+		called++
+
+		select {
+		case <-ctx.Done():
+			return canceledfromFunc
+		default:
+			panic("Context should have been cancelled")
+		}
+	}, isFatalFn, exported.RetryOptions{})
+
+	require.Error(t, err)
+	require.ErrorIs(t, err, canceledfromFunc)
+	require.Equal(t, 1, called, "a fatal error must stop Retry after the first attempt")
+}
+
+// TestCancellationTimeoutsArentPropagatedToUser checks that the wait between retries never shows up
+// as a context error inside the user callback, and that the callback's own error is what Retry returns.
+func TestCancellationTimeoutsArentPropagatedToUser(t *testing.T) {
+	isFatalFn := func(err error) bool {
+		// we want to exhaust all retries and run through the "sleep between retries" logic.
+		return false
+	}
+	tryAgainErr := errors.New("try again")
+	called := 0
+	err := Retry(context.Background(), testLogEvent, "notused", func(ctx context.Context, args *RetryFnArgs) error {
+		called++
+		require.NoError(t, ctx.Err(), "our sleep/timeout doesn't show up for users")
+		return tryAgainErr
+	}, isFatalFn, exported.RetryOptions{
+		RetryDelay: time.Millisecond,
+	})
+
+	require.Error(t, err)
+	require.ErrorIs(t, err, tryAgainErr, "error should be propagated from user callback")
+	require.Equal(t, 1+3, called, "all attempts exhausted since we never returned a fatal error")
+}
+
 func Test_calcDelay(t *testing.T) {
 	t.Run("can't exceed max retry delay", func(t *testing.T) {
 		duration := calcDelay(exported.RetryOptions{