Skip to content

Commit a597784

Browse files
committed
fix: improve codebase indexing reliability and restart functionality
- Enhanced OpenAI embedder with better rate limit detection for text-embedding-3-large
- Added exponential backoff with longer delays and jitter for large models
- Improved error propagation from batch processing to UI layer with specific guidance
- Fixed UI state management to allow restarting indexing after failures
- Added progress persistence to cache manager for potential future resume functionality
- Updated orchestrator to handle partial failures more gracefully with better error messages
- Added model-specific error handling and recovery suggestions

Fixes #5819
1 parent fb374b3 commit a597784

File tree

4 files changed

+237
-37
lines changed

4 files changed

+237
-37
lines changed

src/services/code-index/cache-manager.ts

Lines changed: 121 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,22 @@ import { TelemetryEventName } from "@roo-code/types"
1111
*/
1212
export class CacheManager implements ICacheManager {
1313
private cachePath: vscode.Uri
14+
private progressPath: vscode.Uri
1415
private fileHashes: Record<string, string> = {}
16+
private indexingProgress: {
17+
lastIndexedBlock: number
18+
totalBlocks: number
19+
failedBatches: string[]
20+
lastError?: string
21+
timestamp: number
22+
} = {
23+
lastIndexedBlock: 0,
24+
totalBlocks: 0,
25+
failedBatches: [],
26+
timestamp: Date.now()
27+
}
1528
private _debouncedSaveCache: () => void
29+
private _debouncedSaveProgress: () => void
1630

1731
/**
1832
* Creates a new cache manager
@@ -23,17 +37,25 @@ export class CacheManager implements ICacheManager {
2337
private context: vscode.ExtensionContext,
2438
private workspacePath: string,
2539
) {
40+
const workspaceHash = createHash("sha256").update(workspacePath).digest("hex")
2641
this.cachePath = vscode.Uri.joinPath(
2742
context.globalStorageUri,
28-
`roo-index-cache-${createHash("sha256").update(workspacePath).digest("hex")}.json`,
43+
`roo-index-cache-${workspaceHash}.json`,
44+
)
45+
this.progressPath = vscode.Uri.joinPath(
46+
context.globalStorageUri,
47+
`roo-index-progress-${workspaceHash}.json`,
2948
)
3049
this._debouncedSaveCache = debounce(async () => {
3150
await this._performSave()
3251
}, 1500)
52+
this._debouncedSaveProgress = debounce(async () => {
53+
await this._performProgressSave()
54+
}, 1000)
3355
}
3456

3557
/**
36-
* Initializes the cache manager by loading the cache file
58+
* Initializes the cache manager by loading the cache file and progress
3759
*/
3860
async initialize(): Promise<void> {
3961
try {
@@ -44,9 +66,23 @@ export class CacheManager implements ICacheManager {
4466
TelemetryService.instance.captureEvent(TelemetryEventName.CODE_INDEX_ERROR, {
4567
error: error instanceof Error ? error.message : String(error),
4668
stack: error instanceof Error ? error.stack : undefined,
47-
location: "initialize",
69+
location: "initialize:cache",
4870
})
4971
}
72+
73+
// Load progress data
74+
try {
75+
const progressData = await vscode.workspace.fs.readFile(this.progressPath)
76+
this.indexingProgress = JSON.parse(progressData.toString())
77+
} catch (error) {
78+
// Progress file doesn't exist or is corrupted - start fresh
79+
this.indexingProgress = {
80+
lastIndexedBlock: 0,
81+
totalBlocks: 0,
82+
failedBatches: [],
83+
timestamp: Date.now()
84+
}
85+
}
5086
}
5187

5288
/**
@@ -117,4 +153,86 @@ export class CacheManager implements ICacheManager {
117153
getAllHashes(): Record<string, string> {
118154
return { ...this.fileHashes }
119155
}
156+
157+
/**
158+
* Saves progress data to disk
159+
*/
160+
private async _performProgressSave(): Promise<void> {
161+
try {
162+
await safeWriteJson(this.progressPath.fsPath, this.indexingProgress)
163+
} catch (error) {
164+
console.error("Failed to save progress:", error)
165+
TelemetryService.instance.captureEvent(TelemetryEventName.CODE_INDEX_ERROR, {
166+
error: error instanceof Error ? error.message : String(error),
167+
stack: error instanceof Error ? error.stack : undefined,
168+
location: "_performProgressSave",
169+
})
170+
}
171+
}
172+
173+
/**
174+
* Updates indexing progress
175+
* @param indexedBlocks Number of blocks indexed so far
176+
* @param totalBlocks Total number of blocks to index
177+
*/
178+
updateProgress(indexedBlocks: number, totalBlocks: number): void {
179+
this.indexingProgress.lastIndexedBlock = indexedBlocks
180+
this.indexingProgress.totalBlocks = totalBlocks
181+
this.indexingProgress.timestamp = Date.now()
182+
this._debouncedSaveProgress()
183+
}
184+
185+
/**
186+
* Records a failed batch for potential retry
187+
* @param batchId Identifier for the failed batch
188+
* @param error Error message
189+
*/
190+
recordFailedBatch(batchId: string, error: string): void {
191+
if (!this.indexingProgress.failedBatches.includes(batchId)) {
192+
this.indexingProgress.failedBatches.push(batchId)
193+
}
194+
this.indexingProgress.lastError = error
195+
this.indexingProgress.timestamp = Date.now()
196+
this._debouncedSaveProgress()
197+
}
198+
199+
/**
200+
* Gets the current indexing progress
201+
* @returns Progress information
202+
*/
203+
getProgress(): {
204+
lastIndexedBlock: number
205+
totalBlocks: number
206+
failedBatches: string[]
207+
lastError?: string
208+
timestamp: number
209+
} {
210+
return { ...this.indexingProgress }
211+
}
212+
213+
/**
214+
* Clears progress data
215+
*/
216+
clearProgress(): void {
217+
this.indexingProgress = {
218+
lastIndexedBlock: 0,
219+
totalBlocks: 0,
220+
failedBatches: [],
221+
timestamp: Date.now()
222+
}
223+
this._debouncedSaveProgress()
224+
}
225+
226+
/**
227+
* Clears both cache and progress files
228+
*/
229+
async clearAll(): Promise<void> {
230+
await this.clearCacheFile()
231+
this.clearProgress()
232+
try {
233+
await vscode.workspace.fs.delete(this.progressPath)
234+
} catch (error) {
235+
// Progress file might not exist, which is fine
236+
}
237+
}
120238
}

src/services/code-index/embedders/openai.ts

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,10 @@ export class OpenAiEmbedder extends OpenAiNativeHandler implements IEmbedder {
126126
batchTexts: string[],
127127
model: string,
128128
): Promise<{ embeddings: number[][]; usage: { promptTokens: number; totalTokens: number } }> {
129+
// Use longer delays for text-embedding-3-large model due to stricter rate limits
130+
const isLargeModel = model.includes("text-embedding-3-large")
131+
const baseDelayMs = isLargeModel ? INITIAL_DELAY_MS * 4 : INITIAL_DELAY_MS // 2 seconds for large model
132+
129133
for (let attempts = 0; attempts < MAX_RETRIES; attempts++) {
130134
try {
131135
const response = await this.embeddingsClient.embeddings.create({
@@ -143,17 +147,46 @@ export class OpenAiEmbedder extends OpenAiNativeHandler implements IEmbedder {
143147
} catch (error: any) {
144148
const hasMoreAttempts = attempts < MAX_RETRIES - 1
145149

146-
// Check if it's a rate limit error
150+
// Enhanced rate limit detection
147151
const httpError = error as HttpError
148-
if (httpError?.status === 429 && hasMoreAttempts) {
149-
const delayMs = INITIAL_DELAY_MS * Math.pow(2, attempts)
152+
const isRateLimit = httpError?.status === 429 ||
153+
(error?.message && (
154+
error.message.includes("rate limit") ||
155+
error.message.includes("Rate limit") ||
156+
error.message.includes("too many requests") ||
157+
error.message.includes("quota exceeded")
158+
))
159+
160+
if (isRateLimit && hasMoreAttempts) {
161+
// Use longer exponential backoff for large models and rate limit errors
162+
const multiplier = isLargeModel ? 3 : 2
163+
const delayMs = baseDelayMs * Math.pow(multiplier, attempts)
164+
150165
console.warn(
151166
t("embeddings:rateLimitRetry", {
152167
delayMs,
153168
attempt: attempts + 1,
154169
maxRetries: MAX_RETRIES,
155170
}),
156171
)
172+
173+
// Add jitter to prevent thundering herd
174+
const jitter = Math.random() * 1000
175+
await new Promise((resolve) => setTimeout(resolve, delayMs + jitter))
176+
continue
177+
}
178+
179+
// Check for other retryable errors (network issues, timeouts)
180+
const isRetryableError = error?.code === 'ECONNRESET' ||
181+
error?.code === 'ETIMEDOUT' ||
182+
error?.code === 'ENOTFOUND' ||
183+
(httpError?.status && httpError.status >= 500)
184+
185+
if (isRetryableError && hasMoreAttempts) {
186+
const delayMs = baseDelayMs * Math.pow(2, attempts)
187+
console.warn(
188+
`Retrying OpenAI request due to ${error?.code || httpError?.status} (attempt ${attempts + 1}/${MAX_RETRIES}) after ${delayMs}ms`,
189+
)
157190
await new Promise((resolve) => setTimeout(resolve, delayMs))
158191
continue
159192
}
@@ -164,13 +197,24 @@ export class OpenAiEmbedder extends OpenAiNativeHandler implements IEmbedder {
164197
stack: error instanceof Error ? error.stack : undefined,
165198
location: "OpenAiEmbedder:_embedBatchWithRetries",
166199
attempt: attempts + 1,
200+
model: model,
201+
batchSize: batchTexts.length,
202+
isRateLimit,
203+
isRetryableError,
167204
})
168205

169206
// Log the error for debugging
170207
console.error(`OpenAI embedder error (attempt ${attempts + 1}/${MAX_RETRIES}):`, error)
171208

172-
// Format and throw the error
173-
throw formatEmbeddingError(error, MAX_RETRIES)
209+
// Format and throw the error with more context
210+
const formattedError = formatEmbeddingError(error, MAX_RETRIES)
211+
212+
// Add model-specific context to error message
213+
if (isLargeModel && isRateLimit) {
214+
throw new Error(`${formattedError.message} Note: text-embedding-3-large has stricter rate limits. Consider using text-embedding-3-small for large codebases.`)
215+
}
216+
217+
throw formattedError
174218
}
175219
}
176220

src/services/code-index/orchestrator.ts

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ export class CodeIndexOrchestrator {
100100
return
101101
}
102102

103+
// Allow restarting from Error state - this fixes the restart issue
103104
if (
104105
this._isProcessing ||
105106
(this.stateManager.state !== "Standby" &&
@@ -112,6 +113,12 @@ export class CodeIndexOrchestrator {
112113
return
113114
}
114115

116+
// Reset error state when restarting
117+
if (this.stateManager.state === "Error") {
118+
console.log("[CodeIndexOrchestrator] Restarting indexing from error state")
119+
this.stateManager.setSystemState("Standby", "Restarting indexing...")
120+
}
121+
115122
this._isProcessing = true
116123
this.stateManager.setSystemState("Indexing", "Initializing services...")
117124

@@ -157,42 +164,64 @@ export class CodeIndexOrchestrator {
157164

158165
const { stats } = result
159166

167+
// Enhanced failure handling with better error messages and recovery options
168+
const totalBlocksFound = cumulativeBlocksFoundSoFar
169+
const totalBlocksIndexed = cumulativeBlocksIndexed
170+
const failureRate = totalBlocksFound > 0 ? (totalBlocksFound - totalBlocksIndexed) / totalBlocksFound : 0
171+
160172
// Check if any blocks were actually indexed successfully
161-
// If no blocks were indexed but blocks were found, it means all batches failed
162-
if (cumulativeBlocksIndexed === 0 && cumulativeBlocksFoundSoFar > 0) {
173+
if (totalBlocksIndexed === 0 && totalBlocksFound > 0) {
163174
if (batchErrors.length > 0) {
164175
// Use the first batch error as it's likely representative of the main issue
165176
const firstError = batchErrors[0]
166-
throw new Error(`Indexing failed: ${firstError.message}`)
177+
const errorMessage = `Indexing failed: ${firstError.message}`
178+
179+
// Add specific guidance for common issues
180+
if (firstError.message.includes("rate limit") || firstError.message.includes("429")) {
181+
throw new Error(`${errorMessage}\n\nSuggestion: The API rate limit was exceeded. Try again in a few minutes, or consider using a smaller embedding model like text-embedding-3-small for large codebases.`)
182+
} else if (firstError.message.includes("authentication") || firstError.message.includes("401")) {
183+
throw new Error(`${errorMessage}\n\nSuggestion: Check your API key configuration in the settings.`)
184+
} else if (firstError.message.includes("quota") || firstError.message.includes("billing")) {
185+
throw new Error(`${errorMessage}\n\nSuggestion: Check your OpenAI account billing and usage limits.`)
186+
}
187+
188+
throw new Error(errorMessage)
167189
} else {
168190
throw new Error(
169-
"Indexing failed: No code blocks were successfully indexed. This usually indicates an embedder configuration issue.",
191+
"Indexing failed: No code blocks were successfully indexed. This usually indicates an embedder configuration issue.\n\nSuggestion: Verify your API settings and try again.",
170192
)
171193
}
172194
}
173195

174-
// Check for partial failures - if a significant portion of blocks failed
175-
const failureRate = (cumulativeBlocksFoundSoFar - cumulativeBlocksIndexed) / cumulativeBlocksFoundSoFar
176-
if (batchErrors.length > 0 && failureRate > 0.1) {
177-
// More than 10% of blocks failed to index
196+
// Handle partial failures more gracefully
197+
if (batchErrors.length > 0) {
178198
const firstError = batchErrors[0]
179-
throw new Error(
180-
`Indexing partially failed: Only ${cumulativeBlocksIndexed} of ${cumulativeBlocksFoundSoFar} blocks were indexed. ${firstError.message}`,
181-
)
182-
}
183-
184-
// CRITICAL: If there were ANY batch errors and NO blocks were successfully indexed,
185-
// this is a complete failure regardless of the failure rate calculation
186-
if (batchErrors.length > 0 && cumulativeBlocksIndexed === 0) {
187-
const firstError = batchErrors[0]
188-
throw new Error(`Indexing failed completely: ${firstError.message}`)
199+
200+
// If failure rate is high (>50%), treat as critical failure
201+
if (failureRate > 0.5) {
202+
throw new Error(
203+
`Indexing mostly failed: Only ${totalBlocksIndexed} of ${totalBlocksFound} blocks were indexed (${Math.round(failureRate * 100)}% failure rate). ${firstError.message}\n\nSuggestion: Check your network connection and API configuration, then try again.`,
204+
)
205+
}
206+
207+
// If failure rate is moderate (10-50%), log warning but continue
208+
if (failureRate > 0.1) {
209+
console.warn(
210+
`[CodeIndexOrchestrator] Partial indexing failure: ${totalBlocksIndexed}/${totalBlocksFound} blocks indexed (${Math.round(failureRate * 100)}% failure rate). Error: ${firstError.message}`,
211+
)
212+
213+
// Set a warning state but don't fail completely
214+
this.stateManager.setSystemState("Indexed",
215+
`Indexing completed with warnings: ${totalBlocksIndexed}/${totalBlocksFound} blocks indexed. Some files may have been skipped due to API issues.`
216+
)
217+
}
189218
}
190219

191220
// Final sanity check: If we found blocks but indexed none and somehow no errors were reported,
192221
// this is still a failure
193-
if (cumulativeBlocksFoundSoFar > 0 && cumulativeBlocksIndexed === 0) {
222+
if (totalBlocksFound > 0 && totalBlocksIndexed === 0 && batchErrors.length === 0) {
194223
throw new Error(
195-
"Indexing failed: No code blocks were successfully indexed despite finding files to process. This indicates a critical embedder failure.",
224+
"Indexing failed: No code blocks were successfully indexed despite finding files to process. This indicates a critical embedder failure.\n\nSuggestion: Check your embedder configuration and try again.",
196225
)
197226
}
198227

src/services/code-index/processors/scanner.ts

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -415,15 +415,24 @@ export class DirectoryScanner implements IDirectoryScanner {
415415
// Preserve the original error message from embedders which now have detailed i18n messages
416416
const errorMessage = lastError.message || "Unknown error"
417417

418-
// For other errors, provide context
419-
onError(
420-
new Error(
421-
t("embeddings:scanner.failedToProcessBatchWithError", {
422-
maxRetries: MAX_BATCH_RETRIES,
423-
errorMessage,
424-
}),
425-
),
426-
)
418+
// Enhanced error context with recovery suggestions
419+
let enhancedMessage = t("embeddings:scanner.failedToProcessBatchWithError", {
420+
maxRetries: MAX_BATCH_RETRIES,
421+
errorMessage,
422+
})
423+
424+
// Add specific guidance based on error type
425+
if (errorMessage.includes("rate limit") || errorMessage.includes("429")) {
426+
enhancedMessage += "\n\nThis appears to be a rate limiting issue. The indexing process can be restarted once the rate limit resets."
427+
} else if (errorMessage.includes("authentication") || errorMessage.includes("401")) {
428+
enhancedMessage += "\n\nThis appears to be an authentication issue. Please check your API key configuration."
429+
} else if (errorMessage.includes("network") || errorMessage.includes("timeout") || errorMessage.includes("ECONNRESET")) {
430+
enhancedMessage += "\n\nThis appears to be a network connectivity issue. Please check your internet connection and try again."
431+
} else {
432+
enhancedMessage += "\n\nYou can restart the indexing process from the Code Index settings."
433+
}
434+
435+
onError(new Error(enhancedMessage))
427436
}
428437
}
429438
}

0 commit comments

Comments (0)