From 66f84f677bc716265aaf51d0bba81725a8c17f7e Mon Sep 17 00:00:00 2001 From: Daniel Riccio Date: Wed, 16 Jul 2025 09:22:12 -0500 Subject: [PATCH 1/5] Fix memory leak in DirectoryScanner - Remove codeBlocks accumulation that was causing memory exhaustion - Fix batch processing bugs where file info was added multiple times per file - Move totalBlockCount increment outside block loop to fix counting bug - Return empty codeBlocks array since it's not used by main orchestrator logic - Update tests to expect empty codeBlocks array This fixes the extension running out of memory during indexing of large codebases. The memory usage should drop from ~500MB-1GB to ~10-50MB for large projects. --- .../processors/__tests__/scanner.spec.ts | 37 ++++++++++++------- src/services/code-index/processors/scanner.ts | 23 ++++++------ 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/src/services/code-index/processors/__tests__/scanner.spec.ts b/src/services/code-index/processors/__tests__/scanner.spec.ts index f90a6c8159..b1114ba9fd 100644 --- a/src/services/code-index/processors/__tests__/scanner.spec.ts +++ b/src/services/code-index/processors/__tests__/scanner.spec.ts @@ -168,7 +168,16 @@ describe("DirectoryScanner", () => { expect(mockCodeParser.parseFile).not.toHaveBeenCalled() }) - it("should parse changed files and return code blocks", async () => { + it("should parse changed files and return empty codeBlocks array", async () => { + // Create scanner without embedder to test the non-embedding path + const scannerNoEmbeddings = new DirectoryScanner( + null as any, // No embedder + null as any, // No vector store + mockCodeParser, + mockCacheManager, + mockIgnoreInstance, + ) + const { listFiles } = await import("../../../glob/list-files") vi.mocked(listFiles).mockResolvedValue([["test/file1.js"], false]) const mockBlocks: any[] = [ @@ -185,8 +194,8 @@ describe("DirectoryScanner", () => { ] ;(mockCodeParser.parseFile as any).mockResolvedValue(mockBlocks) - const 
result = await scanner.scanDirectory("/test") - expect(result.codeBlocks).toEqual(mockBlocks) + const result = await scannerNoEmbeddings.scanDirectory("/test") + expect(result.codeBlocks).toEqual([]) // Now returns empty array for memory optimization expect(result.stats.processed).toBe(1) }) @@ -252,6 +261,15 @@ describe("DirectoryScanner", () => { }) it("should process markdown files alongside code files", async () => { + // Create scanner without embedder to test the non-embedding path + const scannerNoEmbeddings = new DirectoryScanner( + null as any, // No embedder + null as any, // No vector store + mockCodeParser, + mockCacheManager, + mockIgnoreInstance, + ) + const { listFiles } = await import("../../../glob/list-files") vi.mocked(listFiles).mockResolvedValue([["test/README.md", "test/app.js", "docs/guide.markdown"], false]) @@ -306,7 +324,7 @@ describe("DirectoryScanner", () => { return [] }) - const result = await scanner.scanDirectory("/test") + const result = await scannerNoEmbeddings.scanDirectory("/test") // Verify all files were processed expect(mockCodeParser.parseFile).toHaveBeenCalledTimes(3) @@ -314,15 +332,8 @@ describe("DirectoryScanner", () => { expect(mockCodeParser.parseFile).toHaveBeenCalledWith("test/app.js", expect.any(Object)) expect(mockCodeParser.parseFile).toHaveBeenCalledWith("docs/guide.markdown", expect.any(Object)) - // Verify code blocks include both markdown and code content - expect(result.codeBlocks).toHaveLength(3) - expect(result.codeBlocks).toEqual( - expect.arrayContaining([ - expect.objectContaining({ type: "markdown_header_h1" }), - expect.objectContaining({ type: "function" }), - expect.objectContaining({ type: "markdown_header_h2" }), - ]), - ) + // Verify codeBlocks is empty (memory optimization) but processing still works + expect(result.codeBlocks).toEqual([]) expect(result.stats.processed).toBe(3) }) diff --git a/src/services/code-index/processors/scanner.ts b/src/services/code-index/processors/scanner.ts index 
538a1252d7..a2a5543450 100644 --- a/src/services/code-index/processors/scanner.ts +++ b/src/services/code-index/processors/scanner.ts @@ -85,7 +85,6 @@ export class DirectoryScanner implements IDirectoryScanner { // Initialize tracking variables const processedFiles = new Set() - const codeBlocks: CodeBlock[] = [] let processedCount = 0 let skippedCount = 0 @@ -135,7 +134,6 @@ export class DirectoryScanner implements IDirectoryScanner { const blocks = await this.codeParser.parseFile(filePath, { content, fileHash: currentFileHash }) const fileBlockCount = blocks.length onFileParsed?.(fileBlockCount) - codeBlocks.push(...blocks) processedCount++ // Process embeddings if configured @@ -146,20 +144,11 @@ export class DirectoryScanner implements IDirectoryScanner { const trimmedContent = block.content.trim() if (trimmedContent) { const release = await mutex.acquire() - totalBlockCount += fileBlockCount try { currentBatchBlocks.push(block) currentBatchTexts.push(trimmedContent) addedBlocksFromFile = true - if (addedBlocksFromFile) { - currentBatchFileInfos.push({ - filePath, - fileHash: currentFileHash, - isNew: !this.cacheManager.getHash(filePath), - }) - } - // Check if batch threshold is met if (currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD) { // Copy current batch data and clear accumulators @@ -188,6 +177,16 @@ export class DirectoryScanner implements IDirectoryScanner { } } } + + // Add file info once per file (outside the block loop) + if (addedBlocksFromFile) { + totalBlockCount += fileBlockCount + currentBatchFileInfos.push({ + filePath, + fileHash: currentFileHash, + isNew: !this.cacheManager.getHash(filePath), + }) + } } else { // Only update hash if not being processed in a batch await this.cacheManager.updateHash(filePath, currentFileHash) @@ -280,7 +279,7 @@ export class DirectoryScanner implements IDirectoryScanner { } return { - codeBlocks, + codeBlocks: [], // Return empty array to prevent memory accumulation stats: { processed: processedCount, 
skipped: skippedCount, From 13ad9cb6b55bd1edad0de5f69f5387855395ce40 Mon Sep 17 00:00:00 2001 From: Daniel Riccio Date: Wed, 16 Jul 2025 09:23:39 -0500 Subject: [PATCH 2/5] Remove unused codeBlocks from scanner interface and implementation - Remove codeBlocks property from IDirectoryScanner interface - Update scanner implementation to not return codeBlocks - Update tests to remove codeBlocks assertions - This completes the memory optimization by eliminating the unused return value The scanner now only returns stats and totalBlockCount, which are the only values actually used by the orchestrator. This further reduces memory usage and simplifies the interface. --- src/services/code-index/interfaces/file-processor.ts | 1 - src/services/code-index/processors/__tests__/scanner.spec.ts | 5 +---- src/services/code-index/processors/scanner.ts | 3 +-- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/services/code-index/interfaces/file-processor.ts b/src/services/code-index/interfaces/file-processor.ts index f00c19c619..88b19007c3 100644 --- a/src/services/code-index/interfaces/file-processor.ts +++ b/src/services/code-index/interfaces/file-processor.ts @@ -38,7 +38,6 @@ export interface IDirectoryScanner { onBlocksIndexed?: (indexedCount: number) => void, onFileParsed?: (fileBlockCount: number) => void, ): Promise<{ - codeBlocks: CodeBlock[] stats: { processed: number skipped: number diff --git a/src/services/code-index/processors/__tests__/scanner.spec.ts b/src/services/code-index/processors/__tests__/scanner.spec.ts index b1114ba9fd..4d4150b443 100644 --- a/src/services/code-index/processors/__tests__/scanner.spec.ts +++ b/src/services/code-index/processors/__tests__/scanner.spec.ts @@ -195,7 +195,6 @@ describe("DirectoryScanner", () => { ;(mockCodeParser.parseFile as any).mockResolvedValue(mockBlocks) const result = await scannerNoEmbeddings.scanDirectory("/test") - expect(result.codeBlocks).toEqual([]) // Now returns empty array for memory optimization 
expect(result.stats.processed).toBe(1) }) @@ -332,9 +331,7 @@ describe("DirectoryScanner", () => { expect(mockCodeParser.parseFile).toHaveBeenCalledWith("test/app.js", expect.any(Object)) expect(mockCodeParser.parseFile).toHaveBeenCalledWith("docs/guide.markdown", expect.any(Object)) - // Verify codeBlocks is empty (memory optimization) but processing still works - expect(result.codeBlocks).toEqual([]) - + // Verify processing still works without codeBlocks accumulation expect(result.stats.processed).toBe(3) }) diff --git a/src/services/code-index/processors/scanner.ts b/src/services/code-index/processors/scanner.ts index a2a5543450..f3b415b74d 100644 --- a/src/services/code-index/processors/scanner.ts +++ b/src/services/code-index/processors/scanner.ts @@ -51,7 +51,7 @@ export class DirectoryScanner implements IDirectoryScanner { onError?: (error: Error) => void, onBlocksIndexed?: (indexedCount: number) => void, onFileParsed?: (fileBlockCount: number) => void, - ): Promise<{ codeBlocks: CodeBlock[]; stats: { processed: number; skipped: number }; totalBlockCount: number }> { + ): Promise<{ stats: { processed: number; skipped: number }; totalBlockCount: number }> { const directoryPath = directory // Capture workspace context at scan start const scanWorkspace = getWorkspacePathForContext(directoryPath) @@ -279,7 +279,6 @@ export class DirectoryScanner implements IDirectoryScanner { } return { - codeBlocks: [], // Return empty array to prevent memory accumulation stats: { processed: processedCount, skipped: skippedCount, From 128f45d0a344a3e5e3cfcf4857e1f6bfa3bfbd58 Mon Sep 17 00:00:00 2001 From: Daniel Riccio Date: Wed, 16 Jul 2025 12:12:39 -0500 Subject: [PATCH 3/5] feat: increase file limit for code indexing and create dedicated constant - Rename MAX_LIST_FILES_LIMIT to MAX_LIST_FILES_LIMIT_CODE_INDEX for clarity - Increase limit from 3,000 to 50,000 files to handle larger codebases - This complements the memory leak fixes by allowing proper scanning of enterprise 
projects --- src/services/code-index/constants/index.ts | 2 +- src/services/code-index/processors/scanner.ts | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/services/code-index/constants/index.ts b/src/services/code-index/constants/index.ts index c2567f5635..706a73935a 100644 --- a/src/services/code-index/constants/index.ts +++ b/src/services/code-index/constants/index.ts @@ -15,7 +15,7 @@ export const QDRANT_CODE_BLOCK_NAMESPACE = "f47ac10b-58cc-4372-a567-0e02b2c3d479 export const MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024 // 1MB /**Directory Scanner */ -export const MAX_LIST_FILES_LIMIT = 3_000 +export const MAX_LIST_FILES_LIMIT_CODE_INDEX = 50_000 export const BATCH_SEGMENT_THRESHOLD = 60 // Number of code segments to batch for embeddings/upserts export const MAX_BATCH_RETRIES = 3 export const INITIAL_RETRY_DELAY_MS = 500 diff --git a/src/services/code-index/processors/scanner.ts b/src/services/code-index/processors/scanner.ts index f3b415b74d..5ff29f8e37 100644 --- a/src/services/code-index/processors/scanner.ts +++ b/src/services/code-index/processors/scanner.ts @@ -17,7 +17,7 @@ import { t } from "../../../i18n" import { QDRANT_CODE_BLOCK_NAMESPACE, MAX_FILE_SIZE_BYTES, - MAX_LIST_FILES_LIMIT, + MAX_LIST_FILES_LIMIT_CODE_INDEX, BATCH_SEGMENT_THRESHOLD, MAX_BATCH_RETRIES, INITIAL_RETRY_DELAY_MS, @@ -57,7 +57,7 @@ export class DirectoryScanner implements IDirectoryScanner { const scanWorkspace = getWorkspacePathForContext(directoryPath) // Get all files recursively (handles .gitignore automatically) - const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT) + const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT_CODE_INDEX) // Filter out directories (marked with trailing '/') const filePaths = allPaths.filter((p) => !p.endsWith("/")) From e4863fc8c7e270c5aaa84af9940f5b3bf7da8d56 Mon Sep 17 00:00:00 2001 From: Daniel Riccio Date: Wed, 16 Jul 2025 12:19:06 -0500 Subject: [PATCH 4/5] fix: 
address race condition and duplicate hash call in DirectoryScanner - Wrap totalBlockCount and currentBatchFileInfos updates in mutex lock to prevent race conditions - Cache isNewFile result to avoid duplicate cacheManager.getHash() calls - Ensures thread-safe batch processing in concurrent file parsing --- src/services/code-index/processors/scanner.ts | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/services/code-index/processors/scanner.ts b/src/services/code-index/processors/scanner.ts index 5ff29f8e37..29f65e8bc7 100644 --- a/src/services/code-index/processors/scanner.ts +++ b/src/services/code-index/processors/scanner.ts @@ -124,6 +124,7 @@ export class DirectoryScanner implements IDirectoryScanner { // Check against cache const cachedFileHash = this.cacheManager.getHash(filePath) + const isNewFile = !cachedFileHash if (cachedFileHash === currentFileHash) { // File is unchanged skippedCount++ @@ -180,12 +181,17 @@ export class DirectoryScanner implements IDirectoryScanner { // Add file info once per file (outside the block loop) if (addedBlocksFromFile) { - totalBlockCount += fileBlockCount - currentBatchFileInfos.push({ - filePath, - fileHash: currentFileHash, - isNew: !this.cacheManager.getHash(filePath), - }) + const release = await mutex.acquire() + try { + totalBlockCount += fileBlockCount + currentBatchFileInfos.push({ + filePath, + fileHash: currentFileHash, + isNew: isNewFile, + }) + } finally { + release() + } } } else { // Only update hash if not being processed in a batch From b16dc448286c03bd9d74061c33d6ff9e60751355 Mon Sep 17 00:00:00 2001 From: Daniel Riccio Date: Wed, 16 Jul 2025 12:43:56 -0500 Subject: [PATCH 5/5] fix: prevent memory accumulation in activeBatchPromises - Convert activeBatchPromises from Array to Set for efficient removal - Clean up completed promises immediately after they finish - Remove unnecessary Array.from() when passing Set to Promise.all - Prevents unbounded growth of promise 
references during large scans --- src/services/code-index/processors/scanner.ts | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/services/code-index/processors/scanner.ts b/src/services/code-index/processors/scanner.ts index 29f65e8bc7..e6ca297399 100644 --- a/src/services/code-index/processors/scanner.ts +++ b/src/services/code-index/processors/scanner.ts @@ -97,7 +97,7 @@ export class DirectoryScanner implements IDirectoryScanner { let currentBatchBlocks: CodeBlock[] = [] let currentBatchTexts: string[] = [] let currentBatchFileInfos: { filePath: string; fileHash: string; isNew: boolean }[] = [] - const activeBatchPromises: Promise<void>[] = [] + const activeBatchPromises = new Set<Promise<void>>() // Initialize block counter let totalBlockCount = 0 @@ -171,7 +171,12 @@ export class DirectoryScanner implements IDirectoryScanner { onBlocksIndexed, ), ) - activeBatchPromises.push(batchPromise) + activeBatchPromises.add(batchPromise) + + // Clean up completed promises to prevent memory accumulation + batchPromise.finally(() => { + activeBatchPromises.delete(batchPromise) + }) } } finally { release() @@ -237,7 +242,12 @@ export class DirectoryScanner implements IDirectoryScanner { const batchPromise = batchLimiter(() => this.processBatch(batchBlocks, batchTexts, batchFileInfos, scanWorkspace, onError, onBlocksIndexed), ) - activeBatchPromises.push(batchPromise) + activeBatchPromises.add(batchPromise) + + // Clean up completed promises to prevent memory accumulation + batchPromise.finally(() => { + activeBatchPromises.delete(batchPromise) + }) } finally { release() }