Skip to content

Commit a597784

Browse files
committed
fix: improve codebase indexing reliability and restart functionality
- Enhanced OpenAI embedder with better rate limit detection for text-embedding-3-large
- Added exponential backoff with longer delays and jitter for large models
- Improved error propagation from batch processing to UI layer with specific guidance
- Fixed UI state management to allow restarting indexing after failures
- Added progress persistence to cache manager for potential future resume functionality
- Updated orchestrator to handle partial failures more gracefully with better error messages
- Added model-specific error handling and recovery suggestions

Fixes #5819
1 parent fb374b3 commit a597784

File tree

4 files changed

+237
-37
lines changed

4 files changed

+237
-37
lines changed

src/services/code-index/cache-manager.ts

Lines changed: 121 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,22 @@ import { TelemetryEventName } from "@roo-code/types"
1111
*/
1212
export class CacheManager implements ICacheManager {
1313
private cachePath: vscode.Uri
14+
private progressPath: vscode.Uri
1415
private fileHashes: Record<string, string> = {}
16+
private indexingProgress: {
17+
lastIndexedBlock: number
18+
totalBlocks: number
19+
failedBatches: string[]
20+
lastError?: string
21+
timestamp: number
22+
} = {
23+
lastIndexedBlock: 0,
24+
totalBlocks: 0,
25+
failedBatches: [],
26+
timestamp: Date.now()
27+
}
1528
private _debouncedSaveCache: () => void
29+
private _debouncedSaveProgress: () => void
1630

1731
/**
1832
* Creates a new cache manager
@@ -23,17 +37,25 @@ export class CacheManager implements ICacheManager {
2337
private context: vscode.ExtensionContext,
2438
private workspacePath: string,
2539
) {
40+
const workspaceHash = createHash("sha256").update(workspacePath).digest("hex")
2641
this.cachePath = vscode.Uri.joinPath(
2742
context.globalStorageUri,
28-
`roo-index-cache-${createHash("sha256").update(workspacePath).digest("hex")}.json`,
43+
`roo-index-cache-${workspaceHash}.json`,
44+
)
45+
this.progressPath = vscode.Uri.joinPath(
46+
context.globalStorageUri,
47+
`roo-index-progress-${workspaceHash}.json`,
2948
)
3049
this._debouncedSaveCache = debounce(async () => {
3150
await this._performSave()
3251
}, 1500)
52+
this._debouncedSaveProgress = debounce(async () => {
53+
await this._performProgressSave()
54+
}, 1000)
3355
}
3456

3557
/**
36-
* Initializes the cache manager by loading the cache file
58+
* Initializes the cache manager by loading the cache file and progress
3759
*/
3860
async initialize(): Promise<void> {
3961
try {
@@ -44,9 +66,23 @@ export class CacheManager implements ICacheManager {
4466
TelemetryService.instance.captureEvent(TelemetryEventName.CODE_INDEX_ERROR, {
4567
error: error instanceof Error ? error.message : String(error),
4668
stack: error instanceof Error ? error.stack : undefined,
47-
location: "initialize",
69+
location: "initialize:cache",
4870
})
4971
}
72+
73+
// Load progress data
74+
try {
75+
const progressData = await vscode.workspace.fs.readFile(this.progressPath)
76+
this.indexingProgress = JSON.parse(progressData.toString())
77+
} catch (error) {
78+
// Progress file doesn't exist or is corrupted - start fresh
79+
this.indexingProgress = {
80+
lastIndexedBlock: 0,
81+
totalBlocks: 0,
82+
failedBatches: [],
83+
timestamp: Date.now()
84+
}
85+
}
5086
}
5187

5288
/**
@@ -117,4 +153,86 @@ export class CacheManager implements ICacheManager {
117153
getAllHashes(): Record<string, string> {
118154
return { ...this.fileHashes }
119155
}
156+
157+
/**
158+
* Saves progress data to disk
159+
*/
160+
private async _performProgressSave(): Promise<void> {
161+
try {
162+
await safeWriteJson(this.progressPath.fsPath, this.indexingProgress)
163+
} catch (error) {
164+
console.error("Failed to save progress:", error)
165+
TelemetryService.instance.captureEvent(TelemetryEventName.CODE_INDEX_ERROR, {
166+
error: error instanceof Error ? error.message : String(error),
167+
stack: error instanceof Error ? error.stack : undefined,
168+
location: "_performProgressSave",
169+
})
170+
}
171+
}
172+
173+
/**
174+
* Updates indexing progress
175+
* @param indexedBlocks Number of blocks indexed so far
176+
* @param totalBlocks Total number of blocks to index
177+
*/
178+
updateProgress(indexedBlocks: number, totalBlocks: number): void {
179+
this.indexingProgress.lastIndexedBlock = indexedBlocks
180+
this.indexingProgress.totalBlocks = totalBlocks
181+
this.indexingProgress.timestamp = Date.now()
182+
this._debouncedSaveProgress()
183+
}
184+
185+
/**
186+
* Records a failed batch for potential retry
187+
* @param batchId Identifier for the failed batch
188+
* @param error Error message
189+
*/
190+
recordFailedBatch(batchId: string, error: string): void {
191+
if (!this.indexingProgress.failedBatches.includes(batchId)) {
192+
this.indexingProgress.failedBatches.push(batchId)
193+
}
194+
this.indexingProgress.lastError = error
195+
this.indexingProgress.timestamp = Date.now()
196+
this._debouncedSaveProgress()
197+
}
198+
199+
/**
200+
* Gets the current indexing progress
201+
* @returns Progress information
202+
*/
203+
getProgress(): {
204+
lastIndexedBlock: number
205+
totalBlocks: number
206+
failedBatches: string[]
207+
lastError?: string
208+
timestamp: number
209+
} {
210+
return { ...this.indexingProgress }
211+
}
212+
213+
/**
214+
* Clears progress data
215+
*/
216+
clearProgress(): void {
217+
this.indexingProgress = {
218+
lastIndexedBlock: 0,
219+
totalBlocks: 0,
220+
failedBatches: [],
221+
timestamp: Date.now()
222+
}
223+
this._debouncedSaveProgress()
224+
}
225+
226+
/**
227+
* Clears both cache and progress files
228+
*/
229+
async clearAll(): Promise<void> {
230+
await this.clearCacheFile()
231+
this.clearProgress()
232+
try {
233+
await vscode.workspace.fs.delete(this.progressPath)
234+
} catch (error) {
235+
// Progress file might not exist, which is fine
236+
}
237+
}
120238
}

src/services/code-index/embedders/openai.ts

Lines changed: 49 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,10 @@ export class OpenAiEmbedder extends OpenAiNativeHandler implements IEmbedder {
126126
batchTexts: string[],
127127
model: string,
128128
): Promise<{ embeddings: number[][]; usage: { promptTokens: number; totalTokens: number } }> {
129+
// Use longer delays for text-embedding-3-large model due to stricter rate limits
130+
const isLargeModel = model.includes("text-embedding-3-large")
131+
const baseDelayMs = isLargeModel ? INITIAL_DELAY_MS * 4 : INITIAL_DELAY_MS // 2 seconds for large model
132+
129133
for (let attempts = 0; attempts < MAX_RETRIES; attempts++) {
130134
try {
131135
const response = await this.embeddingsClient.embeddings.create({
@@ -143,17 +147,46 @@ export class OpenAiEmbedder extends OpenAiNativeHandler implements IEmbedder {
143147
} catch (error: any) {
144148
const hasMoreAttempts = attempts < MAX_RETRIES - 1
145149

146-
// Check if it's a rate limit error
150+
// Enhanced rate limit detection
147151
const httpError = error as HttpError
148-
if (httpError?.status === 429 && hasMoreAttempts) {
149-
const delayMs = INITIAL_DELAY_MS * Math.pow(2, attempts)
152+
const isRateLimit = httpError?.status === 429 ||
153+
(error?.message && (
154+
error.message.includes("rate limit") ||
155+
error.message.includes("Rate limit") ||
156+
error.message.includes("too many requests") ||
157+
error.message.includes("quota exceeded")
158+
))
159+
160+
if (isRateLimit && hasMoreAttempts) {
161+
// Use longer exponential backoff for large models and rate limit errors
162+
const multiplier = isLargeModel ? 3 : 2
163+
const delayMs = baseDelayMs * Math.pow(multiplier, attempts)
164+
150165
console.warn(
151166
t("embeddings:rateLimitRetry", {
152167
delayMs,
153168
attempt: attempts + 1,
154169
maxRetries: MAX_RETRIES,
155170
}),
156171
)
172+
173+
// Add jitter to prevent thundering herd
174+
const jitter = Math.random() * 1000
175+
await new Promise((resolve) => setTimeout(resolve, delayMs + jitter))
176+
continue
177+
}
178+
179+
// Check for other retryable errors (network issues, timeouts)
180+
const isRetryableError = error?.code === 'ECONNRESET' ||
181+
error?.code === 'ETIMEDOUT' ||
182+
error?.code === 'ENOTFOUND' ||
183+
(httpError?.status && httpError.status >= 500)
184+
185+
if (isRetryableError && hasMoreAttempts) {
186+
const delayMs = baseDelayMs * Math.pow(2, attempts)
187+
console.warn(
188+
`Retrying OpenAI request due to ${error?.code || httpError?.status} (attempt ${attempts + 1}/${MAX_RETRIES}) after ${delayMs}ms`,
189+
)
157190
await new Promise((resolve) => setTimeout(resolve, delayMs))
158191
continue
159192
}
@@ -164,13 +197,24 @@ export class OpenAiEmbedder extends OpenAiNativeHandler implements IEmbedder {
164197
stack: error instanceof Error ? error.stack : undefined,
165198
location: "OpenAiEmbedder:_embedBatchWithRetries",
166199
attempt: attempts + 1,
200+
model: model,
201+
batchSize: batchTexts.length,
202+
isRateLimit,
203+
isRetryableError,
167204
})
168205

169206
// Log the error for debugging
170207
console.error(`OpenAI embedder error (attempt ${attempts + 1}/${MAX_RETRIES}):`, error)
171208

172-
// Format and throw the error
173-
throw formatEmbeddingError(error, MAX_RETRIES)
209+
// Format and throw the error with more context
210+
const formattedError = formatEmbeddingError(error, MAX_RETRIES)
211+
212+
// Add model-specific context to error message
213+
if (isLargeModel && isRateLimit) {
214+
throw new Error(`${formattedError.message} Note: text-embedding-3-large has stricter rate limits. Consider using text-embedding-3-small for large codebases.`)
215+
}
216+
217+
throw formattedError
174218
}
175219
}
176220

src/services/code-index/orchestrator.ts

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ export class CodeIndexOrchestrator {
100100
return
101101
}
102102

103+
// Allow restarting from Error state - this fixes the restart issue
103104
if (
104105
this._isProcessing ||
105106
(this.stateManager.state !== "Standby" &&
@@ -112,6 +113,12 @@ export class CodeIndexOrchestrator {
112113
return
113114
}
114115

116+
// Reset error state when restarting
117+
if (this.stateManager.state === "Error") {
118+
console.log("[CodeIndexOrchestrator] Restarting indexing from error state")
119+
this.stateManager.setSystemState("Standby", "Restarting indexing...")
120+
}
121+
115122
this._isProcessing = true
116123
this.stateManager.setSystemState("Indexing", "Initializing services...")
117124

@@ -157,42 +164,64 @@ export class CodeIndexOrchestrator {
157164

158165
const { stats } = result
159166

167+
// Enhanced failure handling with better error messages and recovery options
168+
const totalBlocksFound = cumulativeBlocksFoundSoFar
169+
const totalBlocksIndexed = cumulativeBlocksIndexed
170+
const failureRate = totalBlocksFound > 0 ? (totalBlocksFound - totalBlocksIndexed) / totalBlocksFound : 0
171+
160172
// Check if any blocks were actually indexed successfully
161-
// If no blocks were indexed but blocks were found, it means all batches failed
162-
if (cumulativeBlocksIndexed === 0 && cumulativeBlocksFoundSoFar > 0) {
173+
if (totalBlocksIndexed === 0 && totalBlocksFound > 0) {
163174
if (batchErrors.length > 0) {
164175
// Use the first batch error as it's likely representative of the main issue
165176
const firstError = batchErrors[0]
166-
throw new Error(`Indexing failed: ${firstError.message}`)
177+
const errorMessage = `Indexing failed: ${firstError.message}`
178+
179+
// Add specific guidance for common issues
180+
if (firstError.message.includes("rate limit") || firstError.message.includes("429")) {
181+
throw new Error(`${errorMessage}\n\nSuggestion: The API rate limit was exceeded. Try again in a few minutes, or consider using a smaller embedding model like text-embedding-3-small for large codebases.`)
182+
} else if (firstError.message.includes("authentication") || firstError.message.includes("401")) {
183+
throw new Error(`${errorMessage}\n\nSuggestion: Check your API key configuration in the settings.`)
184+
} else if (firstError.message.includes("quota") || firstError.message.includes("billing")) {
185+
throw new Error(`${errorMessage}\n\nSuggestion: Check your OpenAI account billing and usage limits.`)
186+
}
187+
188+
throw new Error(errorMessage)
167189
} else {
168190
throw new Error(
169-
"Indexing failed: No code blocks were successfully indexed. This usually indicates an embedder configuration issue.",
191+
"Indexing failed: No code blocks were successfully indexed. This usually indicates an embedder configuration issue.\n\nSuggestion: Verify your API settings and try again.",
170192
)
171193
}
172194
}
173195

174-
// Check for partial failures - if a significant portion of blocks failed
175-
const failureRate = (cumulativeBlocksFoundSoFar - cumulativeBlocksIndexed) / cumulativeBlocksFoundSoFar
176-
if (batchErrors.length > 0 && failureRate > 0.1) {
177-
// More than 10% of blocks failed to index
196+
// Handle partial failures more gracefully
197+
if (batchErrors.length > 0) {
178198
const firstError = batchErrors[0]
179-
throw new Error(
180-
`Indexing partially failed: Only ${cumulativeBlocksIndexed} of ${cumulativeBlocksFoundSoFar} blocks were indexed. ${firstError.message}`,
181-
)
182-
}
183-
184-
// CRITICAL: If there were ANY batch errors and NO blocks were successfully indexed,
185-
// this is a complete failure regardless of the failure rate calculation
186-
if (batchErrors.length > 0 && cumulativeBlocksIndexed === 0) {
187-
const firstError = batchErrors[0]
188-
throw new Error(`Indexing failed completely: ${firstError.message}`)
199+
200+
// If failure rate is high (>50%), treat as critical failure
201+
if (failureRate > 0.5) {
202+
throw new Error(
203+
`Indexing mostly failed: Only ${totalBlocksIndexed} of ${totalBlocksFound} blocks were indexed (${Math.round(failureRate * 100)}% failure rate). ${firstError.message}\n\nSuggestion: Check your network connection and API configuration, then try again.`,
204+
)
205+
}
206+
207+
// If failure rate is moderate (10-50%), log warning but continue
208+
if (failureRate > 0.1) {
209+
console.warn(
210+
`[CodeIndexOrchestrator] Partial indexing failure: ${totalBlocksIndexed}/${totalBlocksFound} blocks indexed (${Math.round(failureRate * 100)}% failure rate). Error: ${firstError.message}`,
211+
)
212+
213+
// Set a warning state but don't fail completely
214+
this.stateManager.setSystemState("Indexed",
215+
`Indexing completed with warnings: ${totalBlocksIndexed}/${totalBlocksFound} blocks indexed. Some files may have been skipped due to API issues.`
216+
)
217+
}
189218
}
190219

191220
// Final sanity check: If we found blocks but indexed none and somehow no errors were reported,
192221
// this is still a failure
193-
if (cumulativeBlocksFoundSoFar > 0 && cumulativeBlocksIndexed === 0) {
222+
if (totalBlocksFound > 0 && totalBlocksIndexed === 0 && batchErrors.length === 0) {
194223
throw new Error(
195-
"Indexing failed: No code blocks were successfully indexed despite finding files to process. This indicates a critical embedder failure.",
224+
"Indexing failed: No code blocks were successfully indexed despite finding files to process. This indicates a critical embedder failure.\n\nSuggestion: Check your embedder configuration and try again.",
196225
)
197226
}
198227

src/services/code-index/processors/scanner.ts

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -415,15 +415,24 @@ export class DirectoryScanner implements IDirectoryScanner {
415415
// Preserve the original error message from embedders which now have detailed i18n messages
416416
const errorMessage = lastError.message || "Unknown error"
417417

418-
// For other errors, provide context
419-
onError(
420-
new Error(
421-
t("embeddings:scanner.failedToProcessBatchWithError", {
422-
maxRetries: MAX_BATCH_RETRIES,
423-
errorMessage,
424-
}),
425-
),
426-
)
418+
// Enhanced error context with recovery suggestions
419+
let enhancedMessage = t("embeddings:scanner.failedToProcessBatchWithError", {
420+
maxRetries: MAX_BATCH_RETRIES,
421+
errorMessage,
422+
})
423+
424+
// Add specific guidance based on error type
425+
if (errorMessage.includes("rate limit") || errorMessage.includes("429")) {
426+
enhancedMessage += "\n\nThis appears to be a rate limiting issue. The indexing process can be restarted once the rate limit resets."
427+
} else if (errorMessage.includes("authentication") || errorMessage.includes("401")) {
428+
enhancedMessage += "\n\nThis appears to be an authentication issue. Please check your API key configuration."
429+
} else if (errorMessage.includes("network") || errorMessage.includes("timeout") || errorMessage.includes("ECONNRESET")) {
430+
enhancedMessage += "\n\nThis appears to be a network connectivity issue. Please check your internet connection and try again."
431+
} else {
432+
enhancedMessage += "\n\nYou can restart the indexing process from the Code Index settings."
433+
}
434+
435+
onError(new Error(enhancedMessage))
427436
}
428437
}
429438
}

0 commit comments

Comments (0)