Skip to content

Commit a7a6bcb

Browse files
authored
fix: resolve DirectoryScanner memory leak and improve file limit handling (RooCodeInc#5785)
1 parent 0f994fc commit a7a6bcb

File tree

4 files changed

+55
-34
lines changed

4 files changed

+55
-34
lines changed

src/services/code-index/constants/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ export const QDRANT_CODE_BLOCK_NAMESPACE = "f47ac10b-58cc-4372-a567-0e02b2c3d479
1515
export const MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024 // 1MB
1616

1717
/**Directory Scanner */
18-
export const MAX_LIST_FILES_LIMIT = 3_000
18+
export const MAX_LIST_FILES_LIMIT_CODE_INDEX = 50_000
1919
export const BATCH_SEGMENT_THRESHOLD = 60 // Number of code segments to batch for embeddings/upserts
2020
export const MAX_BATCH_RETRIES = 3
2121
export const INITIAL_RETRY_DELAY_MS = 500

src/services/code-index/interfaces/file-processor.ts

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ export interface IDirectoryScanner {
3838
onBlocksIndexed?: (indexedCount: number) => void,
3939
onFileParsed?: (fileBlockCount: number) => void,
4040
): Promise<{
41-
codeBlocks: CodeBlock[]
4241
stats: {
4342
processed: number
4443
skipped: number

src/services/code-index/processors/__tests__/scanner.spec.ts

Lines changed: 22 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,16 @@ describe("DirectoryScanner", () => {
168168
expect(mockCodeParser.parseFile).not.toHaveBeenCalled()
169169
})
170170

171-
it("should parse changed files and return code blocks", async () => {
171+
it("should parse changed files and return empty codeBlocks array", async () => {
172+
// Create scanner without embedder to test the non-embedding path
173+
const scannerNoEmbeddings = new DirectoryScanner(
174+
null as any, // No embedder
175+
null as any, // No vector store
176+
mockCodeParser,
177+
mockCacheManager,
178+
mockIgnoreInstance,
179+
)
180+
172181
const { listFiles } = await import("../../../glob/list-files")
173182
vi.mocked(listFiles).mockResolvedValue([["test/file1.js"], false])
174183
const mockBlocks: any[] = [
@@ -185,8 +194,7 @@ describe("DirectoryScanner", () => {
185194
]
186195
;(mockCodeParser.parseFile as any).mockResolvedValue(mockBlocks)
187196

188-
const result = await scanner.scanDirectory("/test")
189-
expect(result.codeBlocks).toEqual(mockBlocks)
197+
const result = await scannerNoEmbeddings.scanDirectory("/test")
190198
expect(result.stats.processed).toBe(1)
191199
})
192200

@@ -252,6 +260,15 @@ describe("DirectoryScanner", () => {
252260
})
253261

254262
it("should process markdown files alongside code files", async () => {
263+
// Create scanner without embedder to test the non-embedding path
264+
const scannerNoEmbeddings = new DirectoryScanner(
265+
null as any, // No embedder
266+
null as any, // No vector store
267+
mockCodeParser,
268+
mockCacheManager,
269+
mockIgnoreInstance,
270+
)
271+
255272
const { listFiles } = await import("../../../glob/list-files")
256273
vi.mocked(listFiles).mockResolvedValue([["test/README.md", "test/app.js", "docs/guide.markdown"], false])
257274

@@ -306,24 +323,15 @@ describe("DirectoryScanner", () => {
306323
return []
307324
})
308325

309-
const result = await scanner.scanDirectory("/test")
326+
const result = await scannerNoEmbeddings.scanDirectory("/test")
310327

311328
// Verify all files were processed
312329
expect(mockCodeParser.parseFile).toHaveBeenCalledTimes(3)
313330
expect(mockCodeParser.parseFile).toHaveBeenCalledWith("test/README.md", expect.any(Object))
314331
expect(mockCodeParser.parseFile).toHaveBeenCalledWith("test/app.js", expect.any(Object))
315332
expect(mockCodeParser.parseFile).toHaveBeenCalledWith("docs/guide.markdown", expect.any(Object))
316333

317-
// Verify code blocks include both markdown and code content
318-
expect(result.codeBlocks).toHaveLength(3)
319-
expect(result.codeBlocks).toEqual(
320-
expect.arrayContaining([
321-
expect.objectContaining({ type: "markdown_header_h1" }),
322-
expect.objectContaining({ type: "function" }),
323-
expect.objectContaining({ type: "markdown_header_h2" }),
324-
]),
325-
)
326-
334+
// Verify processing still works without codeBlocks accumulation
327335
expect(result.stats.processed).toBe(3)
328336
})
329337

src/services/code-index/processors/scanner.ts

Lines changed: 32 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import { t } from "../../../i18n"
1717
import {
1818
QDRANT_CODE_BLOCK_NAMESPACE,
1919
MAX_FILE_SIZE_BYTES,
20-
MAX_LIST_FILES_LIMIT,
20+
MAX_LIST_FILES_LIMIT_CODE_INDEX,
2121
BATCH_SEGMENT_THRESHOLD,
2222
MAX_BATCH_RETRIES,
2323
INITIAL_RETRY_DELAY_MS,
@@ -51,13 +51,13 @@ export class DirectoryScanner implements IDirectoryScanner {
5151
onError?: (error: Error) => void,
5252
onBlocksIndexed?: (indexedCount: number) => void,
5353
onFileParsed?: (fileBlockCount: number) => void,
54-
): Promise<{ codeBlocks: CodeBlock[]; stats: { processed: number; skipped: number }; totalBlockCount: number }> {
54+
): Promise<{ stats: { processed: number; skipped: number }; totalBlockCount: number }> {
5555
const directoryPath = directory
5656
// Capture workspace context at scan start
5757
const scanWorkspace = getWorkspacePathForContext(directoryPath)
5858

5959
// Get all files recursively (handles .gitignore automatically)
60-
const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT)
60+
const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT_CODE_INDEX)
6161

6262
// Filter out directories (marked with trailing '/')
6363
const filePaths = allPaths.filter((p) => !p.endsWith("/"))
@@ -85,7 +85,6 @@ export class DirectoryScanner implements IDirectoryScanner {
8585

8686
// Initialize tracking variables
8787
const processedFiles = new Set<string>()
88-
const codeBlocks: CodeBlock[] = []
8988
let processedCount = 0
9089
let skippedCount = 0
9190

@@ -98,7 +97,7 @@ export class DirectoryScanner implements IDirectoryScanner {
9897
let currentBatchBlocks: CodeBlock[] = []
9998
let currentBatchTexts: string[] = []
10099
let currentBatchFileInfos: { filePath: string; fileHash: string; isNew: boolean }[] = []
101-
const activeBatchPromises: Promise<void>[] = []
100+
const activeBatchPromises = new Set<Promise<void>>()
102101

103102
// Initialize block counter
104103
let totalBlockCount = 0
@@ -125,6 +124,7 @@ export class DirectoryScanner implements IDirectoryScanner {
125124

126125
// Check against cache
127126
const cachedFileHash = this.cacheManager.getHash(filePath)
127+
const isNewFile = !cachedFileHash
128128
if (cachedFileHash === currentFileHash) {
129129
// File is unchanged
130130
skippedCount++
@@ -135,7 +135,6 @@ export class DirectoryScanner implements IDirectoryScanner {
135135
const blocks = await this.codeParser.parseFile(filePath, { content, fileHash: currentFileHash })
136136
const fileBlockCount = blocks.length
137137
onFileParsed?.(fileBlockCount)
138-
codeBlocks.push(...blocks)
139138
processedCount++
140139

141140
// Process embeddings if configured
@@ -146,20 +145,11 @@ export class DirectoryScanner implements IDirectoryScanner {
146145
const trimmedContent = block.content.trim()
147146
if (trimmedContent) {
148147
const release = await mutex.acquire()
149-
totalBlockCount += fileBlockCount
150148
try {
151149
currentBatchBlocks.push(block)
152150
currentBatchTexts.push(trimmedContent)
153151
addedBlocksFromFile = true
154152

155-
if (addedBlocksFromFile) {
156-
currentBatchFileInfos.push({
157-
filePath,
158-
fileHash: currentFileHash,
159-
isNew: !this.cacheManager.getHash(filePath),
160-
})
161-
}
162-
163153
// Check if batch threshold is met
164154
if (currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD) {
165155
// Copy current batch data and clear accumulators
@@ -181,13 +171,33 @@ export class DirectoryScanner implements IDirectoryScanner {
181171
onBlocksIndexed,
182172
),
183173
)
184-
activeBatchPromises.push(batchPromise)
174+
activeBatchPromises.add(batchPromise)
175+
176+
// Clean up completed promises to prevent memory accumulation
177+
batchPromise.finally(() => {
178+
activeBatchPromises.delete(batchPromise)
179+
})
185180
}
186181
} finally {
187182
release()
188183
}
189184
}
190185
}
186+
187+
// Add file info once per file (outside the block loop)
188+
if (addedBlocksFromFile) {
189+
const release = await mutex.acquire()
190+
try {
191+
totalBlockCount += fileBlockCount
192+
currentBatchFileInfos.push({
193+
filePath,
194+
fileHash: currentFileHash,
195+
isNew: isNewFile,
196+
})
197+
} finally {
198+
release()
199+
}
200+
}
191201
} else {
192202
// Only update hash if not being processed in a batch
193203
await this.cacheManager.updateHash(filePath, currentFileHash)
@@ -232,7 +242,12 @@ export class DirectoryScanner implements IDirectoryScanner {
232242
const batchPromise = batchLimiter(() =>
233243
this.processBatch(batchBlocks, batchTexts, batchFileInfos, scanWorkspace, onError, onBlocksIndexed),
234244
)
235-
activeBatchPromises.push(batchPromise)
245+
activeBatchPromises.add(batchPromise)
246+
247+
// Clean up completed promises to prevent memory accumulation
248+
batchPromise.finally(() => {
249+
activeBatchPromises.delete(batchPromise)
250+
})
236251
} finally {
237252
release()
238253
}
@@ -280,7 +295,6 @@ export class DirectoryScanner implements IDirectoryScanner {
280295
}
281296

282297
return {
283-
codeBlocks,
284298
stats: {
285299
processed: processedCount,
286300
skipped: skippedCount,

0 commit comments

Comments
 (0)