Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion packages/common/container-definitions/src/loader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,9 @@ export interface IContainer extends IEventProvider<IContainerEvents> {
*/
attach(
request: IRequest,
attachProps?: { deltaConnection?: "none" | "delayed" },
attachProps?: {
deltaConnection?: "none" | "delayed";
},
): Promise<void>;

/**
Expand Down
11 changes: 10 additions & 1 deletion packages/loader/container-loader/src/container.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1214,7 +1214,9 @@ export class Container
public readonly attach = runSingle(
async (
request: IRequest,
attachProps?: { deltaConnection?: "none" | "delayed" },
attachProps?: {
deltaConnection?: "none" | "delayed";
},
): Promise<void> => {
await PerformanceEvent.timedExecAsync(
this.mc.logger,
Expand Down Expand Up @@ -1547,6 +1549,12 @@ export class Container
service.on("metadataUpdate", this.metadataUpdateHandler);
}
} else {
// When RetryOnAttachFailure is enabled, use no internal retries
// The consumer will own the retry policy
const retryOnAttachFailure =
this.mc.config.getBoolean("Fluid.Container.RetryOnAttachFailure") === true;
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this flag name already locked in? To me its naming is unintuitive -- it sounds like it's telling the Fluid Container to do retries, but instead its saying the caller will do the retries.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, does it make sense to give the caller control over how many retries to do? If they want to disable it, set it to zero, but that way it gives them the opportunity to optimize for their specific startup time requirements

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

+1, MaxRetriesOnAttachFailure seems nice, otherwise DisableRetryOnAttachFailure maybe

Copy link
Copy Markdown
Contributor Author

@kian-thompson kian-thompson Mar 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@anthony-murphy Thoughts on changing the name to something like Fluid.Container.DisableRetryOnAttachFailure?

Also, does it make sense to give the caller control over how many retries to do?

The important thing here is that the errors are bubbled up to the consumer of attach. I don't have any strong preference on how consumers can implement a retry mechanism, but retries are not necessary for this ask.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the flag primarily allows attach to be retired by the consumer, as without it the container will close when attach fails

Copy link
Copy Markdown
Contributor

@anthony-murphy anthony-murphy Mar 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the goal is to get out of the container level retry game here, and let the consumer decide what they want to do, retry calling, serialize the container and save it somewhere, or dispose the container throw it away

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed the flag name to DisableCloseOnAttachFailure to be extra clear about what it does

const maxRetries = retryOnAttachFailure ? 0 : undefined;

service = await runWithRetry(
async () =>
this.serviceFactory.createContainer(
Expand All @@ -1559,6 +1567,7 @@ export class Container
this.mc.logger,
{
cancel: this._deltaManager.closeAbortController.signal,
maxRetries,
}, // progress
);
}
Expand Down
36 changes: 36 additions & 0 deletions packages/loader/driver-utils/src/runWithRetry.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { delay } from "@fluidframework/core-utils/internal";
import { DriverErrorTypes } from "@fluidframework/driver-definitions/internal";
import {
isFluidError,
wrapError,
type ITelemetryLoggerExt,
} from "@fluidframework/telemetry-utils/internal";

Expand Down Expand Up @@ -45,6 +46,12 @@ export interface IProgress {
* @param error - error object returned from the call.
*/
onRetry?(delayInMs: number, error: unknown): void;

/**
* Maximum number of retries before giving up on a retriable error.
* If undefined, retries will continue indefinitely (default behavior).
*/
maxRetries?: number;
}

/**
Expand Down Expand Up @@ -122,6 +129,35 @@ export async function runWithRetry<T>(
}

numRetries++;

// Check if max retries limit has been reached
if (progress.maxRetries !== undefined && numRetries > progress.maxRetries) {
logger.sendTelemetryEvent(
Comment on lines +128 to +130
Copy link

Copilot AI Mar 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

progress.maxRetries is used without validation. If a caller passes NaN, Infinity, a negative value, or a non-integer, the comparison numRetries > progress.maxRetries can behave unexpectedly (e.g., NaN makes the limit never trigger, reintroducing infinite retries). Consider normalizing the option (e.g., require a finite, non-negative integer; otherwise treat as undefined or throw a non-retriable UsageError).

Copilot uses AI. Check for mistakes.
{
eventName: `${fetchCallName}_maxRetriesExceeded`,
retry: numRetries - 1,
maxRetries: progress.maxRetries,
duration: performanceNow() - startTime,
fetchCallName,
},
error,
);
// Wrap the original error to preserve its details while marking it non-retriable
throw wrapError(
error,
(message) =>
new NonRetryableError(
`runWithRetry failed after max retries: ${message}`,
DriverErrorTypes.genericError,
{
driverVersion: pkgVersion,
fetchCallName,
maxRetries: progress.maxRetries,
},
),
);
}

lastError = error;
// Wait for the calculated time before retrying.
retryAfterMs = calculateMaxWaitTime(retryAfterMs, error);
Expand Down
108 changes: 108 additions & 0 deletions packages/loader/driver-utils/src/test/runWithRetry.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -221,4 +221,112 @@ describe("runWithRetry Tests", () => {
assert.strictEqual((error as any).reason, "Sample abort reason");
}
});

it("Should stop retrying after maxRetries is exceeded", async () => {
const maxRetries = 3;
let retryTimes = 0;
const api = async (): Promise<boolean> => {
retryTimes += 1;
const error = new Error("Throw error");
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access -- TODO: use a real type
(error as any).canRetry = true;
throw error;
};

try {
await runWithFastSetTimeout(async () =>
runWithRetry(api, "test", logger, {
maxRetries,
}),
);
assert.fail("Should not succeed");
} catch (error) {
// Verify the wrapped error includes the original error message
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-assignment -- TODO: use a real type
const errorMessage = (error as any).message;
assert.strictEqual(errorMessage, "runWithRetry failed after max retries: Throw error");
// Verify the original error is preserved in the cause property
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-assignment -- TODO: use a real type
const causeMessage = (error as any).cause?.message;
assert.strictEqual(causeMessage, "Throw error");
}
// Initial call + maxRetries attempts
assert.strictEqual(retryTimes, maxRetries + 1, "Should retry exactly maxRetries times");
});

it("Should succeed before maxRetries is exceeded", async () => {
const maxRetries = 5;
let retryTimes = 0;
const api = async (): Promise<boolean> => {
retryTimes += 1;
// Succeed on the 3rd attempt (after 2 failures)
if (retryTimes < 3) {
const error = new Error("Throw error");
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access -- TODO: use a real type
(error as any).canRetry = true;
throw error;
}
return true;
};

const success = await runWithFastSetTimeout(async () =>
runWithRetry(api, "test", logger, {
maxRetries,
}),
);
assert.strictEqual(success, true, "Should succeed");
assert.strictEqual(retryTimes, 3, "Should take 3 attempts to succeed");
});

it("Should retry infinitely when maxRetries is undefined", async () => {
const totalRetries = 10;
let retryTimes = 0;
const api = async (): Promise<boolean> => {
retryTimes += 1;
if (retryTimes <= totalRetries) {
const error = new Error("Throw error");
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access -- TODO: use a real type
(error as any).canRetry = true;
throw error;
}
return true;
};

const success = await runWithFastSetTimeout(async () =>
runWithRetry(api, "test", logger, {}),
);
assert.strictEqual(success, true, "Should succeed");
assert.strictEqual(retryTimes, totalRetries + 1, "Should retry until success");
});

it("Should fail immediately with maxRetries set to 0", async () => {
let retryTimes = 0;
const api = async (): Promise<boolean> => {
retryTimes += 1;
const error = new Error("Throw error");
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access -- TODO: use a real type
(error as any).canRetry = true;
throw error;
};

try {
await runWithFastSetTimeout(async () =>
runWithRetry(api, "test", logger, {
maxRetries: 0,
}),
);
assert.fail("Should not succeed");
} catch (error) {
// Verify the wrapped error includes the original error message
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-assignment -- TODO: use a real type
const errorMessage = (error as any).message;
assert.strictEqual(errorMessage, "runWithRetry failed after max retries: Throw error");
// Verify the original error is preserved in the cause property
// eslint-disable-next-line @typescript-eslint/no-explicit-any, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-assignment -- TODO: use a real type
const causeMessage = (error as any).cause?.message;
assert.strictEqual(causeMessage, "Throw error");
}
// Only the initial call, no retries
assert.strictEqual(retryTimes, 1, "Should not retry at all");
});
});
Loading