Skip to content

Commit f84aa34

Browse files
Extend configuration for consumer handling of retry on container attach (#26698)
Currently, we continue to retry retriable errors indefinitely on container creation. However in case of certain consumers where there is limited time (30/60 seconds) for the create, we should not have infinite retries even on retriable errors. If the consumer of `attach` wants to handle the retry mechanism, they can enable the `"Fluid.Container.DisableCloseOnAttachFailure"` flag. This flag will cause no internal retries to occur and let any error surface to the consumer of `attach`. [AB#57593](https://dev.azure.com/fluidframework/235294da-091d-4c29-84fc-cdfc3d90890b/_workitems/edit/57593)
1 parent 56ba312 commit f84aa34

File tree

8 files changed

+146
-8
lines changed

8 files changed

+146
-8
lines changed

packages/common/container-definitions/src/loader.ts

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -421,7 +421,7 @@ export interface IContainer extends IEventProvider<IContainerEvents> {
421421
*
422422
* By default, the container will close if attach fails.
423423
* However, closure can now be avoided in most cased by setting:
424-
* Fluid.Container.RetryOnAttachFailure to true
424+
* Fluid.Container.DisableCloseOnAttachFailure to true
425425
* via the config provider passed to the loader.
426426
*
427427
* If attach fails, check the closed property to discover if retry is possible.
@@ -431,7 +431,9 @@ export interface IContainer extends IEventProvider<IContainerEvents> {
431431
*/
432432
attach(
433433
request: IRequest,
434-
attachProps?: { deltaConnection?: "none" | "delayed" },
434+
attachProps?: {
435+
deltaConnection?: "none" | "delayed";
436+
},
435437
): Promise<void>;
436438

437439
/**

packages/loader/container-loader/src/container.ts

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1301,7 +1301,9 @@ export class Container
13011301
});
13021302

13031303
// only enable the new behavior if the config is set
1304-
if (this.mc.config.getBoolean("Fluid.Container.RetryOnAttachFailure") !== true) {
1304+
if (
1305+
this.mc.config.getBoolean("Fluid.Container.DisableCloseOnAttachFailure") !== true
1306+
) {
13051307
attachP = attachP.catch((error) => {
13061308
throw normalizeErrorAndClose(error);
13071309
});
@@ -1547,6 +1549,12 @@ export class Container
15471549
service.on("metadataUpdate", this.metadataUpdateHandler);
15481550
}
15491551
} else {
1552+
// When DisableCloseOnAttachFailure is enabled, use no internal retries
1553+
// The consumer will own the retry policy
1554+
const disableCloseOnAttachFailure =
1555+
this.mc.config.getBoolean("Fluid.Container.DisableCloseOnAttachFailure") === true;
1556+
const maxRetries = disableCloseOnAttachFailure ? 0 : undefined;
1557+
15501558
service = await runWithRetry(
15511559
async () =>
15521560
this.serviceFactory.createContainer(
@@ -1560,6 +1568,7 @@ export class Container
15601568
{
15611569
cancel: this._deltaManager.closeAbortController.signal,
15621570
}, // progress
1571+
maxRetries,
15631572
);
15641573
}
15651574
return service;

packages/loader/container-loader/src/test/loader.spec.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,7 @@ describe("loader unit test", () => {
9898
urlResolver: failProxy(),
9999
configProvider: {
100100
getRawConfig: (name): ConfigTypes =>
101-
name === "Fluid.Container.RetryOnAttachFailure" ? true : undefined,
101+
name === "Fluid.Container.DisableCloseOnAttachFailure" ? true : undefined,
102102
},
103103
});
104104
const detached = await loader.createDetachedContainer({ package: "none" });
@@ -136,7 +136,7 @@ describe("loader unit test", () => {
136136
}),
137137
configProvider: {
138138
getRawConfig: (name): ConfigTypes =>
139-
name === "Fluid.Container.RetryOnAttachFailure" ? true : undefined,
139+
name === "Fluid.Container.DisableCloseOnAttachFailure" ? true : undefined,
140140
},
141141
});
142142
const detached = await loader.createDetachedContainer({ package: "none" });

packages/loader/driver-utils/src/runWithRetry.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ import { delay } from "@fluidframework/core-utils/internal";
88
import { DriverErrorTypes } from "@fluidframework/driver-definitions/internal";
99
import {
1010
isFluidError,
11+
wrapError,
1112
type ITelemetryLoggerExt,
1213
} from "@fluidframework/telemetry-utils/internal";
1314

@@ -57,6 +58,7 @@ export async function runWithRetry<T>(
5758
fetchCallName: string,
5859
logger: ITelemetryLoggerExt,
5960
progress: IProgress,
61+
maxRetries?: number,
6062
): Promise<T> {
6163
let result: T | undefined;
6264
let success = false;
@@ -122,6 +124,35 @@ export async function runWithRetry<T>(
122124
}
123125

124126
numRetries++;
127+
128+
// Check if max retries limit has been reached
129+
if (maxRetries !== undefined && numRetries > maxRetries) {
130+
logger.sendTelemetryEvent(
131+
{
132+
eventName: `${fetchCallName}_maxRetriesExceeded`,
133+
retry: numRetries - 1,
134+
maxRetries,
135+
duration: performanceNow() - startTime,
136+
fetchCallName,
137+
},
138+
error,
139+
);
140+
// Wrap the original error to preserve its details while marking it non-retriable
141+
throw wrapError(
142+
error,
143+
(message) =>
144+
new NonRetryableError(
145+
`runWithRetry failed after max retries: ${message}`,
146+
DriverErrorTypes.genericError,
147+
{
148+
driverVersion: pkgVersion,
149+
fetchCallName,
150+
maxRetries,
151+
},
152+
),
153+
);
154+
}
155+
125156
lastError = error;
126157
// Wait for the calculated time before retrying.
127158
retryAfterMs = calculateMaxWaitTime(retryAfterMs, error);

packages/loader/driver-utils/src/test/runWithRetry.spec.ts

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -221,4 +221,100 @@ describe("runWithRetry Tests", () => {
221221
assert.strictEqual((error as any).reason, "Sample abort reason");
222222
}
223223
});
224+
225+
it("Should stop retrying after maxRetries is exceeded", async () => {
226+
let retryTimes = 0;
227+
const api = async (): Promise<boolean> => {
228+
retryTimes += 1;
229+
const error = new Error("Throw error");
230+
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access -- TODO: use a real type
231+
(error as any).canRetry = true;
232+
throw error;
233+
};
234+
235+
try {
236+
await runWithFastSetTimeout(async () => runWithRetry(api, "test", logger, {}, 3));
237+
assert.fail("Should not succeed");
238+
} catch (error) {
239+
// Verify the wrapped error includes the original error message
240+
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-assignment -- TODO: use a real type
241+
const errorMessage = (error as any).message;
242+
assert.strictEqual(errorMessage, "runWithRetry failed after max retries: Throw error");
243+
// Verify the original error is preserved in the cause property
244+
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-assignment -- TODO: use a real type
245+
const causeMessage = (error as any).cause?.message;
246+
assert.strictEqual(causeMessage, "Throw error");
247+
}
248+
// Initial call + maxRetries attempts
249+
assert.strictEqual(retryTimes, 4, "Should retry exactly maxRetries times");
250+
});
251+
252+
it("Should succeed before maxRetries is exceeded", async () => {
253+
let retryTimes = 0;
254+
const api = async (): Promise<boolean> => {
255+
retryTimes += 1;
256+
// Succeed on the 3rd attempt (after 2 failures)
257+
if (retryTimes < 3) {
258+
const error = new Error("Throw error");
259+
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access -- TODO: use a real type
260+
(error as any).canRetry = true;
261+
throw error;
262+
}
263+
return true;
264+
};
265+
266+
const success = await runWithFastSetTimeout(async () =>
267+
runWithRetry(api, "test", logger, {}, 5),
268+
);
269+
assert.strictEqual(success, true, "Should succeed");
270+
assert.strictEqual(retryTimes, 3, "Should take 3 attempts to succeed");
271+
});
272+
273+
it("Should retry infinitely when maxRetries is undefined", async () => {
274+
const totalRetries = 10;
275+
let retryTimes = 0;
276+
const api = async (): Promise<boolean> => {
277+
retryTimes += 1;
278+
if (retryTimes <= totalRetries) {
279+
const error = new Error("Throw error");
280+
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access -- TODO: use a real type
281+
(error as any).canRetry = true;
282+
throw error;
283+
}
284+
return true;
285+
};
286+
287+
const success = await runWithFastSetTimeout(async () =>
288+
runWithRetry(api, "test", logger, {}),
289+
);
290+
assert.strictEqual(success, true, "Should succeed");
291+
assert.strictEqual(retryTimes, totalRetries + 1, "Should retry until success");
292+
});
293+
294+
it("Should fail immediately with maxRetries set to 0", async () => {
295+
let retryTimes = 0;
296+
const api = async (): Promise<boolean> => {
297+
retryTimes += 1;
298+
const error = new Error("Throw error");
299+
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access -- TODO: use a real type
300+
(error as any).canRetry = true;
301+
throw error;
302+
};
303+
304+
try {
305+
await runWithFastSetTimeout(async () => runWithRetry(api, "test", logger, {}, 0));
306+
assert.fail("Should not succeed");
307+
} catch (error) {
308+
// Verify the wrapped error includes the original error message
309+
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-assignment -- TODO: use a real type
310+
const errorMessage = (error as any).message;
311+
assert.strictEqual(errorMessage, "runWithRetry failed after max retries: Throw error");
312+
// Verify the original error is preserved in the cause property
313+
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-assignment -- TODO: use a real type
314+
const causeMessage = (error as any).cause?.message;
315+
assert.strictEqual(causeMessage, "Throw error");
316+
}
317+
// Only the initial call, no retries
318+
assert.strictEqual(retryTimes, 1, "Should not retry at all");
319+
});
224320
});

packages/test/test-end-to-end-tests/src/test/blobs.spec.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -583,7 +583,7 @@ function serializationTests({
583583
loaderProps: {
584584
documentServiceFactory,
585585
configProvider: createTestConfigProvider({
586-
"Fluid.Container.RetryOnAttachFailure": true,
586+
"Fluid.Container.DisableCloseOnAttachFailure": true,
587587
}),
588588
},
589589
});

packages/test/test-end-to-end-tests/src/test/detachedContainerTests.spec.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -956,7 +956,7 @@ describeCompat("Detached Container", "NoCompat", (getTestObjectProvider, apis) =
956956
{
957957
configProvider: {
958958
getRawConfig: (name) =>
959-
name === "Fluid.Container.RetryOnAttachFailure" ? true : undefined,
959+
name === "Fluid.Container.DisableCloseOnAttachFailure" ? true : undefined,
960960
},
961961
},
962962
);

packages/test/test-end-to-end-tests/src/test/serializeAfterFailedAttach.spec.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ describeCompat(
8787
logger: provider.logger,
8888
configProvider: {
8989
getRawConfig: (name) =>
90-
name === "Fluid.Container.RetryOnAttachFailure" ? true : undefined,
90+
name === "Fluid.Container.DisableCloseOnAttachFailure" ? true : undefined,
9191
},
9292
});
9393
return testLoader;

0 commit comments

Comments
 (0)