Skip to content

Commit 73c5a63

Browse files
Add timeouts to summarizer stopping (microsoft#26271)
There are some peculiar cases of documents hitting 10k ops where a summarizer doesn't shut down properly. The parent container continually recognizes it is the elected client, but it still has a summarizer hanging somewhere. Some timeouts were added in various places to ensure the summarizer closes properly for the parent to spawn a new one. Looking at telemetry, the P99.9 of a summarizer's first event to proper connection was 35 seconds. Thus, a timeout of 2 minutes should be more than enough. One interesting potential case is the following: 1. Parent container spawns summarizer 2. Summarizer starts load flow 3. Parent container disconnects (`summarizer?.stop(...)` is not called since load flow has not finished and returned the `summarizer` instance) 4. Summarizer finishes load flow, and we attempt to call `summarizer.run(...)` 5. At the start of `summarizer.run(...)`, it waits to see a `"connected"` event from the ContainerRuntime i. This may not always happen if the summarizer gets into some weird state and isn't able to get a connection from service, causing the whole summarizer flow to hang [AB#54693](https://dev.azure.com/fluidframework/235294da-091d-4c29-84fc-cdfc3d90890b/_workitems/edit/54693)
1 parent eec5a71 commit 73c5a63

File tree

2 files changed

+85
-14
lines changed

2 files changed

+85
-14
lines changed

packages/runtime/container-runtime/src/summary/summaryDelayLoadedModule/summarizer.ts

Lines changed: 46 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ import type {
99
SummarizerStopReason,
1010
} from "@fluidframework/container-runtime-definitions/internal";
1111
import type { IFluidHandleContext } from "@fluidframework/core-interfaces/internal";
12-
import { Deferred } from "@fluidframework/core-utils/internal";
12+
import { assert, Deferred } from "@fluidframework/core-utils/internal";
1313
import {
1414
type IFluidErrorBase,
1515
type ITelemetryLoggerExt,
@@ -165,9 +165,37 @@ export class Summarizer extends TypedEventEmitter<ISummarizerEvents> implements
165165
}
166166

167167
private async runCore(onBehalfOf: string): Promise<SummarizerStopReason> {
168-
const runCoordinator: ICancellableSummarizerController = await this.runCoordinatorCreateFn(
169-
this.runtime,
170-
);
168+
// Race coordinator creation against a timeout to ensure we don't hang indefinitely
169+
// if the summarizer container fails to connect.
170+
let coordinatorTimedOut = false;
171+
const coordinatorTimeoutMs = 2 * 60 * 1000; // 2 minutes
172+
const coordinatorResult = await Promise.race([
173+
this.runCoordinatorCreateFn(this.runtime).then((coordinator) => {
174+
if (coordinatorTimedOut) {
175+
coordinator.stop("summarizerClientDisconnected");
176+
}
177+
return coordinator;
178+
}),
179+
new Promise<undefined>((resolve) =>
180+
setTimeout(() => {
181+
coordinatorTimedOut = true;
182+
resolve(undefined);
183+
}, coordinatorTimeoutMs),
184+
),
185+
]);
186+
187+
// If we timed out before coordinator was created, exit early
188+
if (coordinatorTimedOut) {
189+
this.logger.sendTelemetryEvent({
190+
eventName: "CreateRunCoordinatorTimeout",
191+
onBehalfOf,
192+
timeoutMs: coordinatorTimeoutMs,
193+
});
194+
return "summarizerClientDisconnected";
195+
}
196+
197+
assert(coordinatorResult !== undefined, "Expect coordinatorResult to be defined");
198+
const runCoordinator = coordinatorResult;
171199

172200
// Wait for either external signal to cancel, or loss of connectivity.
173201
const stopP = Promise.race([runCoordinator.waitCancelled, this.stopDeferred.promise]);
@@ -216,10 +244,22 @@ export class Summarizer extends TypedEventEmitter<ISummarizerEvents> implements
216244
// summarizer client to not be created until current summarizer fully moves to exit, and that would reduce
217245
// cons of #2 substantially.
218246

219-
// Cleanup after running
220-
await runningSummarizer.waitStop(
247+
// Cleanup after running with a timeout to prevent hanging
248+
const waitStopPromise = runningSummarizer.waitStop(
221249
!runCoordinator.cancelled && Summarizer.stopReasonCanRunLastSummary(stopReason),
222250
);
251+
const summarizerStopTimeoutMs = 2 * 60 * 1000; // 2 minutes
252+
const timeoutPromise = new Promise<"timeout">((resolve) =>
253+
setTimeout(() => resolve("timeout"), summarizerStopTimeoutMs),
254+
);
255+
const waitStopResult = await Promise.race([waitStopPromise, timeoutPromise]);
256+
if (waitStopResult === "timeout") {
257+
this.logger.sendTelemetryEvent({
258+
eventName: "SummarizerStopTimeout",
259+
onBehalfOf,
260+
timeoutMs: summarizerStopTimeoutMs,
261+
});
262+
}
223263

224264
// Propagate reason and ensure that if someone is waiting for cancellation token, they are moving to exit
225265
runCoordinator.stop(stopReason);

packages/runtime/container-runtime/src/summary/summaryManager.ts

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ export class SummaryManager
103103
private state = SummaryManagerState.Off;
104104
private summarizer?: ISummarizer;
105105
private _disposed = false;
106+
private summarizerStopTimeout?: ReturnType<typeof setTimeout>;
106107

107108
public get disposed(): boolean {
108109
return this._disposed;
@@ -347,16 +348,26 @@ export class SummaryManager
347348
})
348349
.finally(() => {
349350
assert(this.state !== SummaryManagerState.Off, 0x264 /* "Expected: Not Off" */);
350-
this.state = SummaryManagerState.Off;
351+
this.cleanupAfterSummarizerStop();
352+
});
353+
}
351354

352-
this.cleanupForwardedEvents();
353-
this.summarizer?.close();
354-
this.summarizer = undefined;
355+
private cleanupAfterSummarizerStop(): void {
356+
this.state = SummaryManagerState.Off;
355357

356-
if (this.getShouldSummarizeState().shouldSummarize) {
357-
this.startSummarization();
358-
}
359-
});
358+
// Clear any pending stop timeout to avoid it firing for a different summarizer
359+
if (this.summarizerStopTimeout !== undefined) {
360+
clearTimeout(this.summarizerStopTimeout);
361+
this.summarizerStopTimeout = undefined;
362+
}
363+
364+
this.cleanupForwardedEvents();
365+
this.summarizer?.close();
366+
this.summarizer = undefined;
367+
368+
if (this.getShouldSummarizeState().shouldSummarize) {
369+
this.startSummarization();
370+
}
360371
}
361372

362373
private stop(reason: SummarizerStopReason): void {
@@ -368,6 +379,23 @@ export class SummaryManager
368379
// Stopping the running summarizer client should trigger a change
369380
// in states when the running summarizer closes
370381
this.summarizer?.stop(reason);
382+
383+
const summarizerCloseTimeoutMs = 2 * 60 * 1000; // 2 minutes
384+
// Clear any existing timeout before setting a new one
385+
if (this.summarizerStopTimeout !== undefined) {
386+
clearTimeout(this.summarizerStopTimeout);
387+
}
388+
// Set a timeout to force cleanup if the summarizer doesn't close in time
389+
this.summarizerStopTimeout = setTimeout(() => {
390+
if (this.state === SummaryManagerState.Stopping && this.summarizer !== undefined) {
391+
this.logger.sendTelemetryEvent({
392+
eventName: "SummarizerStopTimeout",
393+
timeoutMs: summarizerCloseTimeoutMs,
394+
stopReason: reason,
395+
});
396+
this.cleanupAfterSummarizerStop();
397+
}
398+
}, summarizerCloseTimeoutMs);
371399
}
372400

373401
/**
@@ -453,6 +481,9 @@ export class SummaryManager
453481
this.connectedState.off("connected", this.handleConnected);
454482
this.connectedState.off("disconnected", this.handleDisconnected);
455483
this.cleanupForwardedEvents();
484+
if (this.summarizerStopTimeout !== undefined) {
485+
clearTimeout(this.summarizerStopTimeout);
486+
}
456487
this._disposed = true;
457488
}
458489

0 commit comments

Comments
 (0)