Skip to content

Commit a14a0da

Browse files
committed
fix(cloud): Address PR feedback for telemetry retry queue
- Handle HTTP error status codes (500s, 401/403, 429) as failures that trigger retry - Remove queuing of backfill operations since they're user-initiated - Fix race condition in concurrent retry processing with isProcessing flag - Add specialized retry logic for 429 with Retry-After header support - Clean up unnecessary comments - Add comprehensive tests for new status code handling - Add temporary debug logs with emojis for testing
1 parent 59393d8 commit a14a0da

File tree

4 files changed

+253
-41
lines changed

4 files changed

+253
-41
lines changed

packages/cloud/src/TelemetryClient.ts

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ export class CloudTelemetryClient extends BaseTelemetryClient {
9797
this.retryQueue = retryQueue || null
9898
}
9999

100-
private async fetch(path: string, options: RequestInit) {
100+
private async fetch(path: string, options: RequestInit, allowQueueing = true) {
101101
if (!this.authService.isAuthenticated()) {
102102
return
103103
}
@@ -125,14 +125,36 @@ export class CloudTelemetryClient extends BaseTelemetryClient {
125125
console.error(
126126
`[TelemetryClient#fetch] ${options.method} ${path} -> ${response.status} ${response.statusText}`,
127127
)
128+
129+
// Queue for retry on server errors (5xx), rate limiting (429), or auth errors (401/403)
130+
if (
131+
this.retryQueue &&
132+
allowQueueing &&
133+
(response.status >= 500 ||
134+
response.status === 429 ||
135+
response.status === 401 ||
136+
response.status === 403)
137+
) {
138+
await this.retryQueue.enqueue(
139+
url,
140+
fetchOptions,
141+
"telemetry",
142+
`Telemetry: ${options.method} /api/${path}`,
143+
)
144+
}
128145
}
129146

130147
return response
131148
} catch (error) {
132149
console.error(`[TelemetryClient#fetch] Network error for ${options.method} ${path}: ${error}`)
133150

134151
// Queue for retry if we have a retry queue and it's a network error
135-
if (this.retryQueue && error instanceof TypeError && error.message.includes("fetch failed")) {
152+
if (
153+
this.retryQueue &&
154+
allowQueueing &&
155+
error instanceof TypeError &&
156+
error.message.includes("fetch failed")
157+
) {
136158
await this.retryQueue.enqueue(
137159
url,
138160
fetchOptions,
@@ -222,13 +244,11 @@ export class CloudTelemetryClient extends BaseTelemetryClient {
222244
)
223245
}
224246

225-
// Custom fetch for multipart - don't set Content-Type header (let browser set it)
226247
const url = `${getRooCodeApiUrl()}/api/events/backfill`
227248
const fetchOptions: RequestInit = {
228249
method: "POST",
229250
headers: {
230251
Authorization: `Bearer ${token}`,
231-
// Note: No Content-Type header - browser will set multipart/form-data with boundary
232252
},
233253
body: formData,
234254
}
@@ -242,15 +262,7 @@ export class CloudTelemetryClient extends BaseTelemetryClient {
242262
)
243263
}
244264
} catch (fetchError) {
245-
// For backfill, also queue for retry on network errors
246-
if (this.retryQueue && fetchError instanceof TypeError && fetchError.message.includes("fetch failed")) {
247-
await this.retryQueue.enqueue(
248-
url,
249-
fetchOptions,
250-
"telemetry",
251-
`Telemetry: Backfill messages for task ${taskId}`,
252-
)
253-
}
265+
console.error(`[TelemetryClient#backfillMessages] Network error: ${fetchError}`)
254266
throw fetchError
255267
}
256268
} catch (error) {

packages/cloud/src/retry-queue/RetryQueue.ts

Lines changed: 82 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ export class RetryQueue extends EventEmitter<RetryQueueEvents> {
3131
maxQueueSize: 100,
3232
persistQueue: true,
3333
networkCheckInterval: 60000,
34-
requestTimeout: 30000, // Make timeout configurable
34+
requestTimeout: 30000,
3535
...config,
3636
}
3737

@@ -98,6 +98,7 @@ export class RetryQueue extends EventEmitter<RetryQueueEvents> {
9898

9999
public async retryAll(): Promise<void> {
100100
if (this.isProcessing) {
101+
this.log("[RetryQueue] Already processing, skipping retry cycle")
101102
return
102103
}
103104

@@ -108,38 +109,76 @@ export class RetryQueue extends EventEmitter<RetryQueueEvents> {
108109

109110
this.isProcessing = true
110111

111-
// Sort by timestamp to process in FIFO order (oldest first)
112-
requests.sort((a, b) => a.timestamp - b.timestamp)
113-
114-
// Process all requests in FIFO order
115-
for (const request of requests) {
116-
try {
117-
await this.retryRequest(request)
118-
this.queue.delete(request.id)
119-
this.emit("request-retry-success", request)
120-
} catch (error) {
121-
request.retryCount++
122-
request.lastError = error instanceof Error ? error.message : String(error)
123-
124-
// Check if we've exceeded max retries
125-
if (this.config.maxRetries > 0 && request.retryCount >= this.config.maxRetries) {
112+
try {
113+
// Sort by timestamp to process in FIFO order (oldest first)
114+
requests.sort((a, b) => a.timestamp - b.timestamp)
115+
116+
// Process all requests in FIFO order
117+
for (const request of requests) {
118+
// Skip if request should not be retried yet (rate limiting)
119+
if (request.nextRetryAfter && Date.now() < request.nextRetryAfter) {
126120
this.log(
127-
`[RetryQueue] Max retries (${this.config.maxRetries}) reached for request: ${request.operation || request.url}`,
121+
`[RetryQueue] Skipping rate-limited request until ${new Date(request.nextRetryAfter).toISOString()}`,
128122
)
129-
this.queue.delete(request.id)
130-
this.emit("request-max-retries-exceeded", request, error as Error)
131-
} else {
132-
this.queue.set(request.id, request)
133-
this.emit("request-retry-failed", request, error as Error)
123+
continue
134124
}
135125

136-
// Add a small delay between retry attempts
137-
await this.delay(100)
126+
try {
127+
const response = await this.retryRequest(request)
128+
129+
// Check if we got a Retry-After header for rate limiting
130+
if (response && response.status === 429) {
131+
const retryAfter = response.headers.get("Retry-After")
132+
if (retryAfter) {
133+
// Parse Retry-After (could be seconds or a date)
134+
let delayMs: number
135+
const retryAfterSeconds = parseInt(retryAfter, 10)
136+
if (!isNaN(retryAfterSeconds)) {
137+
delayMs = retryAfterSeconds * 1000
138+
} else {
139+
// Try parsing as a date
140+
const retryDate = new Date(retryAfter)
141+
if (!isNaN(retryDate.getTime())) {
142+
delayMs = retryDate.getTime() - Date.now()
143+
} else {
144+
delayMs = 60000 // Default to 1 minute if we can't parse
145+
}
146+
}
147+
request.nextRetryAfter = Date.now() + delayMs
148+
this.log(`[RetryQueue] Rate limited, will retry after ${delayMs}ms`)
149+
this.queue.set(request.id, request)
150+
continue
151+
}
152+
}
153+
154+
this.queue.delete(request.id)
155+
this.emit("request-retry-success", request)
156+
} catch (error) {
157+
request.retryCount++
158+
request.lastError = error instanceof Error ? error.message : String(error)
159+
160+
// Check if we've exceeded max retries
161+
if (this.config.maxRetries > 0 && request.retryCount >= this.config.maxRetries) {
162+
this.log(
163+
`[RetryQueue] Max retries (${this.config.maxRetries}) reached for request: ${request.operation || request.url}`,
164+
)
165+
this.queue.delete(request.id)
166+
this.emit("request-max-retries-exceeded", request, error as Error)
167+
} else {
168+
this.queue.set(request.id, request)
169+
this.emit("request-retry-failed", request, error as Error)
170+
}
171+
172+
// Add a small delay between retry attempts
173+
await this.delay(100)
174+
}
138175
}
139-
}
140176

141-
await this.persistQueue()
142-
this.isProcessing = false
177+
await this.persistQueue()
178+
} finally {
179+
// Always reset the processing flag, even if an error occurs
180+
this.isProcessing = false
181+
}
143182
}
144183

145184
private async retryRequest(request: QueuedRequest): Promise<Response> {
@@ -171,8 +210,23 @@ export class RetryQueue extends EventEmitter<RetryQueueEvents> {
171210

172211
clearTimeout(timeoutId)
173212

213+
// Check for error status codes that should trigger retry
174214
if (!response.ok) {
175-
throw new Error(`Request failed with status ${response.status}`)
215+
// Handle different status codes appropriately
216+
if (response.status >= 500) {
217+
// Server errors should be retried
218+
throw new Error(`Server error: ${response.status} ${response.statusText}`)
219+
} else if (response.status === 429) {
220+
// Rate limiting - return response to let caller handle Retry-After
221+
return response
222+
} else if (response.status === 401 || response.status === 403) {
223+
// Auth errors - retry with fresh auth headers from provider
224+
throw new Error(`Auth error: ${response.status}`)
225+
} else if (response.status >= 400 && response.status < 500) {
226+
// Other client errors (400, 404, etc.) should not be retried
227+
this.log(`[RetryQueue] Non-retryable status ${response.status}, removing from queue`)
228+
return response
229+
}
176230
}
177231

178232
return response

packages/cloud/src/retry-queue/__tests__/RetryQueue.test.ts

Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,5 +357,150 @@ describe("RetryQueue", () => {
357357
// The timeout configuration is being used (verified by the constructor accepting it)
358358
// The actual timeout behavior is handled by the browser's AbortController
359359
})
360+
361+
it("should retry on 500+ status codes", async () => {
362+
const failListener = vi.fn()
363+
const successListener = vi.fn()
364+
retryQueue.on("request-retry-failed", failListener)
365+
retryQueue.on("request-retry-success", successListener)
366+
367+
await retryQueue.enqueue("https://api.example.com/test", { method: "POST" }, "telemetry")
368+
369+
// First attempt: 500 error
370+
fetchMock.mockResolvedValueOnce({ ok: false, status: 500, statusText: "Internal Server Error" })
371+
372+
await retryQueue.retryAll()
373+
374+
// Should fail and remain in queue
375+
expect(failListener).toHaveBeenCalledWith(
376+
expect.objectContaining({
377+
url: "https://api.example.com/test",
378+
retryCount: 1,
379+
lastError: "Server error: 500 Internal Server Error",
380+
}),
381+
expect.any(Error),
382+
)
383+
384+
let stats = retryQueue.getStats()
385+
expect(stats.totalQueued).toBe(1)
386+
387+
// Second attempt: success
388+
fetchMock.mockResolvedValueOnce({ ok: true, status: 200 })
389+
390+
await retryQueue.retryAll()
391+
392+
// Should succeed and be removed from queue
393+
expect(successListener).toHaveBeenCalled()
394+
stats = retryQueue.getStats()
395+
expect(stats.totalQueued).toBe(0)
396+
})
397+
398+
it("should handle 429 rate limiting with Retry-After header", async () => {
399+
await retryQueue.enqueue("https://api.example.com/test", { method: "POST" }, "telemetry")
400+
401+
// Mock 429 response with Retry-After header (in seconds)
402+
const retryAfterResponse = {
403+
ok: false,
404+
status: 429,
405+
headers: {
406+
get: vi.fn((header: string) => {
407+
if (header === "Retry-After") return "2" // 2 seconds
408+
return null
409+
}),
410+
},
411+
}
412+
413+
fetchMock.mockResolvedValueOnce(retryAfterResponse)
414+
415+
await retryQueue.retryAll()
416+
417+
// Request should still be in queue with nextRetryAfter set
418+
const stats = retryQueue.getStats()
419+
expect(stats.totalQueued).toBe(1)
420+
421+
// Try to retry immediately - should be skipped due to rate limiting
422+
fetchMock.mockClear()
423+
await retryQueue.retryAll()
424+
425+
// Fetch should not be called because request is rate-limited
426+
expect(fetchMock).not.toHaveBeenCalled()
427+
})
428+
429+
it("should retry on 401/403 auth errors", async () => {
430+
const failListener = vi.fn()
431+
retryQueue.on("request-retry-failed", failListener)
432+
433+
await retryQueue.enqueue("https://api.example.com/test", { method: "POST" }, "telemetry")
434+
435+
// Mock 401 error
436+
fetchMock.mockResolvedValueOnce({ ok: false, status: 401, statusText: "Unauthorized" })
437+
438+
await retryQueue.retryAll()
439+
440+
// Should fail and remain in queue for retry
441+
expect(failListener).toHaveBeenCalledWith(
442+
expect.objectContaining({
443+
url: "https://api.example.com/test",
444+
retryCount: 1,
445+
lastError: "Auth error: 401",
446+
}),
447+
expect.any(Error),
448+
)
449+
450+
const stats = retryQueue.getStats()
451+
expect(stats.totalQueued).toBe(1)
452+
})
453+
454+
it("should not retry on 400/404 client errors", async () => {
455+
const successListener = vi.fn()
456+
retryQueue.on("request-retry-success", successListener)
457+
458+
await retryQueue.enqueue("https://api.example.com/test", { method: "POST" }, "telemetry")
459+
460+
// Mock 404 error
461+
fetchMock.mockResolvedValueOnce({ ok: false, status: 404, statusText: "Not Found" })
462+
463+
await retryQueue.retryAll()
464+
465+
// Should be removed from queue without retry
466+
expect(successListener).toHaveBeenCalled()
467+
const stats = retryQueue.getStats()
468+
expect(stats.totalQueued).toBe(0)
469+
})
470+
471+
it("should prevent concurrent processing", async () => {
472+
// Add a single request
473+
await retryQueue.enqueue("https://api.example.com/test1", { method: "POST" }, "telemetry")
474+
475+
// Mock slow response
476+
let resolveFirst: () => void
477+
const firstPromise = new Promise<{ ok: boolean }>((resolve) => {
478+
resolveFirst = () => resolve({ ok: true })
479+
})
480+
481+
fetchMock.mockReturnValueOnce(firstPromise)
482+
483+
// Start first retryAll (don't await)
484+
const firstCall = retryQueue.retryAll()
485+
486+
// Try to call retryAll again immediately - should return immediately without processing
487+
const secondCall = retryQueue.retryAll()
488+
489+
// Second call should return immediately
490+
await secondCall
491+
492+
// Fetch should only be called once (from first call)
493+
expect(fetchMock).toHaveBeenCalledTimes(1)
494+
495+
// Resolve the promise
496+
resolveFirst!()
497+
498+
// Wait for first call to complete
499+
await firstCall
500+
501+
// Queue should be empty
502+
const stats = retryQueue.getStats()
503+
expect(stats.totalQueued).toBe(0)
504+
})
360505
})
361506
})

packages/cloud/src/retry-queue/types.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ export interface QueuedRequest {
77
type: "api-call" | "telemetry" | "settings" | "other"
88
operation?: string
99
lastError?: string
10+
nextRetryAfter?: number // Timestamp for when to retry next (for rate limiting)
1011
}
1112

1213
export interface QueueStats {

0 commit comments

Comments
 (0)