fix: properly handle exponential backoff for rate limiting in embedders

roomote · roomote · commit ac1770808481 · 2025-08-13T00:14:20.000Z
- Fixed an issue where multiple concurrent requests hitting rate limits would all log retry messages simultaneously
- Improved global rate limit state management to coordinate retry delays across parallel requests
- Added proper delay calculation that considers both global consecutive errors and per-request attempt numbers
- Added a success callback to reset the consecutive error count when requests succeed
- Ensures exponential backoff delays are applied sequentially rather than all at once

Fixes #7029
diff --git a/src/services/code-index/embedders/__tests__/openai-compatible-rate-limit.spec.ts b/src/services/code-index/embedders/__tests__/openai-compatible-rate-limit.spec.ts
@@ -182,8 +182,8 @@ describe("OpenAICompatibleEmbedder - Global Rate Limiting", () => {
 			usage: { prompt_tokens: 10, total_tokens: 15 },
 		})
 
-		// Trigger the updateGlobalRateLimitState method
-		await (embedder as any).updateGlobalRateLimitState(rateLimitError)
+		// Trigger the updateGlobalRateLimitState method with attempt number
+		await (embedder as any).updateGlobalRateLimitState(rateLimitError, 0)
 
 		// Should reset to 1 since more than 60 seconds passed
 		expect(state.consecutiveRateLimitErrors).toBe(1)
@@ -199,12 +199,8 @@ describe("OpenAICompatibleEmbedder - Global Rate Limiting", () => {
 		const rateLimitError = new Error("Rate limit exceeded") as any
 		rateLimitError.status = 429
 
-		// Trigger the updateGlobalRateLimitState method
-		await (embedder as any).updateGlobalRateLimitState(rateLimitError)
-
-		// Calculate the expected delay
-		const now = Date.now()
-		const delay = state.rateLimitResetTime - now
+		// Trigger the updateGlobalRateLimitState method with attempt number
+		const delay = await (embedder as any).updateGlobalRateLimitState(rateLimitError, 0)
 
 		// Should be capped at 5 minutes (300000ms)
 		expect(delay).toBeLessThanOrEqual(300000)
diff --git a/src/services/code-index/embedders/openai-compatible.ts b/src/services/code-index/embedders/openai-compatible.ts
@@ -294,6 +294,9 @@ export class OpenAICompatibleEmbedder implements IEmbedder {
 
 				const embeddings = response.data.map((item) => item.embedding as number[])
 
+				// Reset consecutive errors on success
+				await this.resetGlobalRateLimitOnSuccess()
+
 				return {
 					embeddings: embeddings,
 					usage: {
@@ -315,14 +318,9 @@ export class OpenAICompatibleEmbedder implements IEmbedder {
 				// Check if it's a rate limit error
 				const httpError = error as HttpError
 				if (httpError?.status === 429) {
-					// Update global rate limit state
-					await this.updateGlobalRateLimitState(httpError)
-
 					if (hasMoreAttempts) {
-						// Calculate delay based on global rate limit state
-						const baseDelay = INITIAL_DELAY_MS * Math.pow(2, attempts)
-						const globalDelay = await this.getGlobalRateLimitDelay()
-						const delayMs = Math.max(baseDelay, globalDelay)
+						// Update global rate limit state and get the delay
+						const delayMs = await this.updateGlobalRateLimitState(httpError, attempts)
 
 						console.warn(
 							t("embeddings:rateLimitRetry", {
@@ -434,14 +432,20 @@ export class OpenAICompatibleEmbedder implements IEmbedder {
 	}
 
 	/**
-	 * Updates global rate limit state when a 429 error occurs
+	 * Updates global rate limit state when a 429 error occurs and returns the delay to use
 	 */
-	private async updateGlobalRateLimitState(error: HttpError): Promise<void> {
+	private async updateGlobalRateLimitState(error: HttpError, attemptNumber: number): Promise<number> {
 		const release = await OpenAICompatibleEmbedder.globalRateLimitState.mutex.acquire()
 		try {
 			const state = OpenAICompatibleEmbedder.globalRateLimitState
 			const now = Date.now()
 
+			// Check if we're already in a rate limit period
+			if (state.isRateLimited && state.rateLimitResetTime > now) {
+				// Return the remaining wait time
+				return state.rateLimitResetTime - now
+			}
+
 			// Increment consecutive rate limit errors
 			if (now - state.lastRateLimitError < 60000) {
 				// Within 1 minute
@@ -452,16 +456,47 @@ export class OpenAICompatibleEmbedder implements IEmbedder {
 
 			state.lastRateLimitError = now
 
-			// Calculate exponential backoff based on consecutive errors
+			// Calculate exponential backoff based on consecutive errors AND attempt number
+			// Use the maximum of the two to ensure proper backoff
 			const baseDelay = 5000 // 5 seconds base
 			const maxDelay = 300000 // 5 minutes max
-			const exponentialDelay = Math.min(baseDelay * Math.pow(2, state.consecutiveRateLimitErrors - 1), maxDelay)
+
+			// Calculate delay based on consecutive errors across all requests
+			const globalExponentialDelay = Math.min(
+				baseDelay * Math.pow(2, state.consecutiveRateLimitErrors - 1),
+				maxDelay,
+			)
+
+			// Calculate delay based on this specific request's attempt number
+			const attemptExponentialDelay = Math.min(INITIAL_DELAY_MS * Math.pow(2, attemptNumber), maxDelay)
+
+			// Use the larger of the two delays
+			const exponentialDelay = Math.max(globalExponentialDelay, attemptExponentialDelay)
 
 			// Set global rate limit
 			state.isRateLimited = true
 			state.rateLimitResetTime = now + exponentialDelay
 
-			// Silent rate limit activation - no logging to prevent flooding
+			return exponentialDelay
+		} finally {
+			release()
+		}
+	}
+
+	/**
+	 * Resets the consecutive error count on successful request
+	 */
+	private async resetGlobalRateLimitOnSuccess(): Promise<void> {
+		const release = await OpenAICompatibleEmbedder.globalRateLimitState.mutex.acquire()
+		try {
+			const state = OpenAICompatibleEmbedder.globalRateLimitState
+
+			// Reset rate limit state on success
+			if (state.consecutiveRateLimitErrors > 0) {
+				state.consecutiveRateLimitErrors = 0
+				state.isRateLimited = false
+				state.rateLimitResetTime = 0
+			}
 		} finally {
 			release()
 		}