Skip to content

Commit 7b1e3a0

Browse files
authored
feat(cloud): Add telemetry retry queue for network resilience (#7597)
* feat(cloud): Add telemetry retry queue for network resilience - Implement RetryQueue class with workspace-scoped persistence - Queue failed telemetry events for automatic retry - Retry events every 60 seconds with fresh auth tokens - FIFO eviction when queue reaches 100 events - Persist queue across VS Code restarts This ensures telemetry data isn't lost during network failures or temporary server issues. Migrated from RooCodeInc/Roo-Code-Cloud#744 * fix: address PR review feedback for retry queue - Fix retry order to use consistent FIFO processing - Add retry limit enforcement with max retries check - Add configurable request timeout (default 30s) - Add comprehensive tests for retryAll() method - Add request-max-retries-exceeded event - Fix timeout test to avoid timing issues * fix: resolve TypeScript errors in RetryQueue tests * fix(cloud): Address PR feedback for telemetry retry queue - Handle HTTP error status codes (500s, 401/403, 429) as failures that trigger retry - Remove queuing of backfill operations since they're user-initiated - Fix race condition in concurrent retry processing with isProcessing flag - Add specialized retry logic for 429 with Retry-After header support - Clean up unnecessary comments - Add comprehensive tests for new status code handling - Add temporary debug logs with emojis for testing * refactor: address PR feedback for telemetry retry queue - Remove unused X-Organization-Id header from auth header provider - Simplify enqueue() API by removing operation parameter - Fix error retry logic: only retry 5xx, 429, and network failures - Stop retrying 4xx client errors (400, 401, 403, 404, 422) - Implement queue-wide pause for 429 rate limiting - Add auth state management integration: - Pause queue when not in active-session - Clear queue on logout or user change - Preserve queue when same user logs back in - Remove debug comments - Fix ESLint no-case-declarations error with proper block scope - Update tests for all new behaviors
1 parent 6ac6a92 commit 7b1e3a0

File tree

8 files changed

+1263
-48
lines changed

8 files changed

+1263
-48
lines changed

packages/cloud/src/CloudService.ts

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import { StaticSettingsService } from "./StaticSettingsService.js"
2424
import { CloudTelemetryClient as TelemetryClient } from "./TelemetryClient.js"
2525
import { CloudShareService } from "./CloudShareService.js"
2626
import { CloudAPI } from "./CloudAPI.js"
27+
import { RetryQueue } from "./retry-queue/index.js"
2728

2829
type AuthStateChangedPayload = CloudServiceEvents["auth-state-changed"][0]
2930
type AuthUserInfoPayload = CloudServiceEvents["user-info"][0]
@@ -75,13 +76,21 @@ export class CloudService extends EventEmitter<CloudServiceEvents> implements Di
7576
return this._cloudAPI
7677
}
7778

79+
private _retryQueue: RetryQueue | null = null
80+
81+
public get retryQueue() {
82+
return this._retryQueue
83+
}
84+
7885
private constructor(context: ExtensionContext, log?: (...args: unknown[]) => void) {
7986
super()
8087

8188
this.context = context
8289
this.log = log || console.log
8390

8491
this.authStateListener = (data: AuthStateChangedPayload) => {
92+
// Handle retry queue based on auth state changes
93+
this.handleAuthStateChangeForRetryQueue(data)
8594
this.emit("auth-state-changed", data)
8695
}
8796

@@ -131,7 +140,24 @@ export class CloudService extends EventEmitter<CloudServiceEvents> implements Di
131140

132141
this._cloudAPI = new CloudAPI(this._authService, this.log)
133142

134-
this._telemetryClient = new TelemetryClient(this._authService, this._settingsService)
143+
// Initialize retry queue with auth header provider
144+
this._retryQueue = new RetryQueue(
145+
this.context,
146+
undefined, // Use default config
147+
this.log,
148+
() => {
149+
// Provide fresh auth headers for retries
150+
const sessionToken = this._authService?.getSessionToken()
151+
if (sessionToken) {
152+
return {
153+
Authorization: `Bearer ${sessionToken}`,
154+
}
155+
}
156+
return undefined
157+
},
158+
)
159+
160+
this._telemetryClient = new TelemetryClient(this._authService, this._settingsService, this._retryQueue)
135161

136162
this._shareService = new CloudShareService(this._cloudAPI, this._settingsService, this.log)
137163

@@ -303,6 +329,10 @@ export class CloudService extends EventEmitter<CloudServiceEvents> implements Di
303329
this.settingsService.dispose()
304330
}
305331

332+
if (this._retryQueue) {
333+
this._retryQueue.dispose()
334+
}
335+
306336
this.isInitialized = false
307337
}
308338

@@ -365,4 +395,67 @@ export class CloudService extends EventEmitter<CloudServiceEvents> implements Di
365395
static isEnabled(): boolean {
366396
return !!this._instance?.isAuthenticated()
367397
}
398+
399+
/**
400+
* Handle auth state changes for the retry queue
401+
* - Pause queue when not in 'active-session' state
402+
* - Clear queue when user logs out or logs in as different user
403+
* - Resume queue when returning to active-session with same user
404+
*/
405+
private handleAuthStateChangeForRetryQueue(data: AuthStateChangedPayload): void {
406+
if (!this._retryQueue) {
407+
return
408+
}
409+
410+
const newState = data.state
411+
const userInfo = this.getUserInfo()
412+
const newUserId = userInfo?.id
413+
414+
this.log(`[CloudService] Auth state changed to: ${newState}, user: ${newUserId}`)
415+
416+
// Handle different auth states
417+
switch (newState) {
418+
case "active-session": {
419+
// Check if user changed (different user logged in)
420+
const wasCleared = this._retryQueue.clearIfUserChanged(newUserId)
421+
422+
if (!wasCleared) {
423+
// Same user or first login, resume the queue
424+
this._retryQueue.resume()
425+
this.log("[CloudService] Resuming retry queue for active session")
426+
} else {
427+
// Different user, queue was cleared, but we can resume processing
428+
this._retryQueue.resume()
429+
this.log("[CloudService] Retry queue cleared for new user, resuming processing")
430+
}
431+
break
432+
}
433+
434+
case "logged-out":
435+
// User is logged out, clear the queue
436+
this._retryQueue.clearIfUserChanged(undefined)
437+
this._retryQueue.pause()
438+
this.log("[CloudService] Pausing and clearing retry queue for logged-out state")
439+
break
440+
441+
case "initializing":
442+
case "attempting-session":
443+
// Transitional states, pause the queue but don't clear
444+
this._retryQueue.pause()
445+
this.log(`[CloudService] Pausing retry queue during ${newState}`)
446+
break
447+
448+
case "inactive-session":
449+
// Session is inactive (possibly expired), pause but don't clear
450+
// The queue might resume if the session becomes active again
451+
this._retryQueue.pause()
452+
this.log("[CloudService] Pausing retry queue for inactive session")
453+
break
454+
455+
default:
456+
// Unknown state, pause as a safety measure
457+
this._retryQueue.pause()
458+
this.log(`[CloudService] Pausing retry queue for unknown state: ${newState}`)
459+
}
460+
}
368461
}

packages/cloud/src/TelemetryClient.ts

Lines changed: 59 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import {
1111
} from "@roo-code/types"
1212

1313
import { getRooCodeApiUrl } from "./config.js"
14+
import type { RetryQueue } from "./retry-queue/index.js"
1415

1516
abstract class BaseTelemetryClient implements TelemetryClient {
1617
protected providerRef: WeakRef<TelemetryPropertiesProvider> | null = null
@@ -82,21 +83,21 @@ abstract class BaseTelemetryClient implements TelemetryClient {
8283
}
8384

8485
export class CloudTelemetryClient extends BaseTelemetryClient {
86+
private retryQueue: RetryQueue | null = null
87+
8588
constructor(
8689
private authService: AuthService,
8790
private settingsService: SettingsService,
88-
debug = false,
91+
retryQueue?: RetryQueue,
8992
) {
90-
super(
91-
{
92-
type: "exclude",
93-
events: [TelemetryEventName.TASK_CONVERSATION_MESSAGE],
94-
},
95-
debug,
96-
)
93+
super({
94+
type: "exclude",
95+
events: [TelemetryEventName.TASK_CONVERSATION_MESSAGE],
96+
})
97+
this.retryQueue = retryQueue || null
9798
}
9899

99-
private async fetch(path: string, options: RequestInit) {
100+
private async fetch(path: string, options: RequestInit, allowQueueing = true) {
100101
if (!this.authService.isAuthenticated()) {
101102
return
102103
}
@@ -108,18 +109,46 @@ export class CloudTelemetryClient extends BaseTelemetryClient {
108109
return
109110
}
110111

111-
const response = await fetch(`${getRooCodeApiUrl()}/api/${path}`, {
112+
const url = `${getRooCodeApiUrl()}/api/${path}`
113+
const fetchOptions: RequestInit = {
112114
...options,
113115
headers: {
114116
Authorization: `Bearer ${token}`,
115117
"Content-Type": "application/json",
116118
},
117-
})
119+
}
118120

119-
if (!response.ok) {
120-
console.error(
121-
`[TelemetryClient#fetch] ${options.method} ${path} -> ${response.status} ${response.statusText}`,
122-
)
121+
try {
122+
const response = await fetch(url, fetchOptions)
123+
124+
if (!response.ok) {
125+
console.error(
126+
`[TelemetryClient#fetch] ${options.method} ${path} -> ${response.status} ${response.statusText}`,
127+
)
128+
129+
// Queue for retry on server errors (5xx) or rate limiting (429)
130+
// Do NOT retry on client errors (4xx) except 429 - they won't succeed
131+
if (this.retryQueue && allowQueueing && (response.status >= 500 || response.status === 429)) {
132+
await this.retryQueue.enqueue(url, fetchOptions, "telemetry")
133+
}
134+
}
135+
136+
return response
137+
} catch (error) {
138+
console.error(`[TelemetryClient#fetch] Network error for ${options.method} ${path}: ${error}`)
139+
140+
// Queue for retry on network failures (typically TypeError with "fetch failed" message)
141+
// These are transient network issues that may succeed on retry
142+
if (
143+
this.retryQueue &&
144+
allowQueueing &&
145+
error instanceof TypeError &&
146+
error.message.includes("fetch failed")
147+
) {
148+
await this.retryQueue.enqueue(url, fetchOptions, "telemetry")
149+
}
150+
151+
throw error
123152
}
124153
}
125154

@@ -158,6 +187,7 @@ export class CloudTelemetryClient extends BaseTelemetryClient {
158187
})
159188
} catch (error) {
160189
console.error(`[TelemetryClient#capture] Error sending telemetry event: ${error}`)
190+
// Error is already queued for retry in the fetch method
161191
}
162192
}
163193

@@ -199,22 +229,26 @@ export class CloudTelemetryClient extends BaseTelemetryClient {
199229
)
200230
}
201231

202-
// Custom fetch for multipart - don't set Content-Type header (let browser set it)
203-
const response = await fetch(`${getRooCodeApiUrl()}/api/events/backfill`, {
232+
const url = `${getRooCodeApiUrl()}/api/events/backfill`
233+
const fetchOptions: RequestInit = {
204234
method: "POST",
205235
headers: {
206236
Authorization: `Bearer ${token}`,
207-
// Note: No Content-Type header - browser will set multipart/form-data with boundary
208237
},
209238
body: formData,
210-
})
239+
}
211240

212-
if (!response.ok) {
213-
console.error(
214-
`[TelemetryClient#backfillMessages] POST events/backfill -> ${response.status} ${response.statusText}`,
215-
)
216-
} else if (this.debug) {
217-
console.info(`[TelemetryClient#backfillMessages] Successfully uploaded messages for task ${taskId}`)
241+
try {
242+
const response = await fetch(url, fetchOptions)
243+
244+
if (!response.ok) {
245+
console.error(
246+
`[TelemetryClient#backfillMessages] POST events/backfill -> ${response.status} ${response.statusText}`,
247+
)
248+
}
249+
} catch (fetchError) {
250+
console.error(`[TelemetryClient#backfillMessages] Network error: ${fetchError}`)
251+
throw fetchError
218252
}
219253
} catch (error) {
220254
console.error(`[TelemetryClient#backfillMessages] Error uploading messages: ${error}`)

packages/cloud/src/__tests__/TelemetryClient.test.ts

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -641,28 +641,6 @@ describe("TelemetryClient", () => {
641641
)
642642
})
643643

644-
it("should log debug information when debug is enabled", async () => {
645-
const client = new TelemetryClient(mockAuthService, mockSettingsService, true)
646-
647-
const messages = [
648-
{
649-
ts: 1,
650-
type: "say" as const,
651-
say: "text" as const,
652-
text: "test message",
653-
},
654-
]
655-
656-
await client.backfillMessages(messages, "test-task-id")
657-
658-
expect(console.info).toHaveBeenCalledWith(
659-
"[TelemetryClient#backfillMessages] Uploading 1 messages for task test-task-id",
660-
)
661-
expect(console.info).toHaveBeenCalledWith(
662-
"[TelemetryClient#backfillMessages] Successfully uploaded messages for task test-task-id",
663-
)
664-
})
665-
666644
it("should handle empty messages array", async () => {
667645
const client = new TelemetryClient(mockAuthService, mockSettingsService)
668646

packages/cloud/src/index.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,6 @@ export * from "./config.js"
33
export { CloudService } from "./CloudService.js"
44

55
export { BridgeOrchestrator } from "./bridge/index.js"
6+
7+
export { RetryQueue } from "./retry-queue/index.js"
8+
export type { QueuedRequest, QueueStats, RetryQueueConfig, RetryQueueEvents } from "./retry-queue/index.js"

0 commit comments

Comments
 (0)