forked from microsoft/FluidFramework
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrunWithRetry.ts
More file actions
203 lines (191 loc) · 6.12 KB
/
runWithRetry.ts
File metadata and controls
203 lines (191 loc) · 6.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
/*!
* Copyright (c) Microsoft Corporation and contributors. All rights reserved.
* Licensed under the MIT License.
*/
import { performanceNow } from "@fluid-internal/client-utils";
import { delay } from "@fluidframework/core-utils/internal";
import { DriverErrorTypes } from "@fluidframework/driver-definitions/internal";
import {
isFluidError,
wrapError,
type ITelemetryLoggerExt,
} from "@fluidframework/telemetry-utils/internal";
import { NonRetryableError, canRetryOnError, getRetryDelayFromError } from "./network.js";
import { pkgVersion } from "./packageVersion.js";
/**
 * Interface describing an object passed to various network APIs.
 * It allows the caller to control cancellation, as well as learn about any retry delays.
 * @internal
 */
export interface IProgress {
	/**
	 * Abort signal used to cancel the operation.
	 *
	 * @remarks Note that most of the layers do not use this signal yet. We need to change that over time.
	 * Please consult the API documentation / implementation.
	 * Note that a number of layers may not check this signal while holding this request in a queue,
	 * so it may take a while before it takes effect. This can be improved in the future.
	 *
	 * The layers in question are:
	 *
	 * - driver (RateLimiter)
	 *
	 * - runWithRetry
	 */
	cancel?: AbortSignal;
	/**
	 * Called whenever the api returns a retriable error and the call is going to be retried.
	 * Any exception thrown from this callback results in cancellation of the operation
	 * and propagation of the thrown exception.
	 * @param delayInMs - delay before the next retry. This value will depend on internal back-off logic,
	 * as well as information provided by the service (like a 429 error asking to wait for some time before retry)
	 * @param error - error object returned from the call.
	 */
	onRetry?(delayInMs: number, error: unknown): void;
}
/**
 * Runs the provided API call, retrying retriable failures with exponential back-off
 * (see {@link calculateMaxWaitTime}) until it succeeds, is aborted, becomes
 * non-retriable, or exhausts `maxRetries`.
 *
 * @param api - operation to invoke; receives the abort signal from `progress`, if any.
 * @param fetchCallName - name used as the prefix for all telemetry events emitted here.
 * @param logger - telemetry sink for retry/cancel/abort events.
 * @param progress - cancellation signal plus optional per-retry callback.
 * @param maxRetries - optional cap on retry attempts; exceeding it throws a NonRetryableError.
 * @returns result of the first successful invocation of `api`.
 *
 * @internal
 */
export async function runWithRetry<T>(
	api: (cancel?: AbortSignal) => Promise<T>,
	fetchCallName: string,
	logger: ITelemetryLoggerExt,
	progress: IProgress,
	maxRetries?: number,
): Promise<T> {
	// Doubled by calculateMaxWaitTime before the first wait, so the initial
	// delay is actually 1s. Has to be positive!
	let waitMs = 500;
	let attempts = 0;
	const start = performanceNow();
	let mostRecentError: unknown;
	for (;;) {
		try {
			const outcome = await api(progress.cancel);
			// If we had to retry, record the error that preceded the eventual success.
			if (attempts > 0) {
				logger.sendTelemetryEvent(
					{
						eventName: `${fetchCallName}_lastError`,
						retry: attempts,
						duration: performanceNow() - start,
						fetchCallName,
					},
					mostRecentError,
				);
			}
			return outcome;
		} catch (error) {
			// Non-retriable errors propagate to the caller immediately.
			if (!canRetryOnError(error)) {
				logger.sendTelemetryEvent(
					{
						eventName: `${fetchCallName}_cancel`,
						retry: attempts,
						duration: performanceNow() - start,
						fetchCallName,
					},
					error,
				);
				throw error;
			}
			// Caller asked us to stop: surface a non-retriable error carrying the abort reason.
			if (progress.cancel?.aborted === true) {
				const abortReason = progress.cancel.reason as string;
				logger.sendTelemetryEvent(
					{
						eventName: `${fetchCallName}_runWithRetryAborted`,
						retry: attempts,
						duration: performanceNow() - start,
						fetchCallName,
						reason: abortReason,
					},
					error,
				);
				throw new NonRetryableError("runWithRetry was Aborted", DriverErrorTypes.genericError, {
					driverVersion: pkgVersion,
					fetchCallName,
					reason: abortReason,
				});
			}
			// Log only the first failure rather than every attempt: a tight retry
			// loop while offline would otherwise flood telemetry, yet we still want
			// to know what caused the failure in the first place.
			if (attempts === 0) {
				logger.sendTelemetryEvent(
					{
						eventName: `${fetchCallName}_firstFailed`,
						duration: performanceNow() - start,
						fetchCallName,
					},
					error,
				);
			}
			attempts += 1;
			// Give up once the configured retry budget is spent, wrapping the
			// original error so its details survive while marking it non-retriable.
			if (maxRetries !== undefined && attempts > maxRetries) {
				logger.sendTelemetryEvent(
					{
						eventName: `${fetchCallName}_maxRetriesExceeded`,
						retry: attempts - 1,
						maxRetries,
						duration: performanceNow() - start,
						fetchCallName,
					},
					error,
				);
				throw wrapError(
					error,
					(message) =>
						new NonRetryableError(
							`runWithRetry failed after max retries: ${message}`,
							DriverErrorTypes.genericError,
							{
								driverVersion: pkgVersion,
								fetchCallName,
								maxRetries,
							},
						),
				);
			}
			mostRecentError = error;
			// Back off before the next attempt; an exception from onRetry
			// intentionally cancels the whole operation (see IProgress).
			waitMs = calculateMaxWaitTime(waitMs, error);
			progress.onRetry?.(waitMs, error);
			await delay(waitMs);
		}
	}
}
const MaxReconnectDelayInMsWhenEndpointIsReachable = 60000;
const MaxReconnectDelayInMsWhenEndpointIsNotReachable = 8000;
/**
* Calculates time to wait for after an error based on the error and wait time for previous iteration.
* In case endpoint(service or socket) is not reachable, then we maybe offline or may have got some
* transient error not related to endpoint, in that case we want to try at faster pace and hence the
* max wait is lesser 8s as compared to when endpoint is reachable in which case it is 60s.
* @param delayMs - wait time for previous iteration
* @param error - error based on which we decide wait time.
* @returns Wait time to wait for.
* @internal
*/
export function calculateMaxWaitTime(delayMs: number, error: unknown): number {
const retryDelayFromError = getRetryDelayFromError(error);
let newDelayMs = Math.max(retryDelayFromError ?? 0, delayMs * 2);
newDelayMs = Math.min(
newDelayMs,
isFluidError(error) && error.getTelemetryProperties().endpointReached === true
? MaxReconnectDelayInMsWhenEndpointIsReachable
: MaxReconnectDelayInMsWhenEndpointIsNotReachable,
);
return newDelayMs;
}