diff --git a/src/services/code-index/constants/index.ts b/src/services/code-index/constants/index.ts index c2567f5635..706a73935a 100644 --- a/src/services/code-index/constants/index.ts +++ b/src/services/code-index/constants/index.ts @@ -15,7 +15,7 @@ export const QDRANT_CODE_BLOCK_NAMESPACE = "f47ac10b-58cc-4372-a567-0e02b2c3d479 export const MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024 // 1MB /**Directory Scanner */ -export const MAX_LIST_FILES_LIMIT = 3_000 +export const MAX_LIST_FILES_LIMIT_CODE_INDEX = 50_000 export const BATCH_SEGMENT_THRESHOLD = 60 // Number of code segments to batch for embeddings/upserts export const MAX_BATCH_RETRIES = 3 export const INITIAL_RETRY_DELAY_MS = 500 diff --git a/src/services/code-index/interfaces/file-processor.ts b/src/services/code-index/interfaces/file-processor.ts index f00c19c619..88b19007c3 100644 --- a/src/services/code-index/interfaces/file-processor.ts +++ b/src/services/code-index/interfaces/file-processor.ts @@ -38,7 +38,6 @@ export interface IDirectoryScanner { onBlocksIndexed?: (indexedCount: number) => void, onFileParsed?: (fileBlockCount: number) => void, ): Promise<{ - codeBlocks: CodeBlock[] stats: { processed: number skipped: number diff --git a/src/services/code-index/processors/__tests__/scanner.spec.ts b/src/services/code-index/processors/__tests__/scanner.spec.ts index f90a6c8159..4d4150b443 100644 --- a/src/services/code-index/processors/__tests__/scanner.spec.ts +++ b/src/services/code-index/processors/__tests__/scanner.spec.ts @@ -168,7 +168,16 @@ describe("DirectoryScanner", () => { expect(mockCodeParser.parseFile).not.toHaveBeenCalled() }) - it("should parse changed files and return code blocks", async () => { + it("should parse changed files and return empty codeBlocks array", async () => { + // Create scanner without embedder to test the non-embedding path + const scannerNoEmbeddings = new DirectoryScanner( + null as any, // No embedder + null as any, // No vector store + mockCodeParser, + mockCacheManager, + mockIgnoreInstance, + ) + const { listFiles } = await import("../../../glob/list-files") vi.mocked(listFiles).mockResolvedValue([["test/file1.js"], false]) const mockBlocks: any[] = [ @@ -185,8 +194,7 @@ describe("DirectoryScanner", () => { ] ;(mockCodeParser.parseFile as any).mockResolvedValue(mockBlocks) - const result = await scanner.scanDirectory("/test") - expect(result.codeBlocks).toEqual(mockBlocks) + const result = await scannerNoEmbeddings.scanDirectory("/test") expect(result.stats.processed).toBe(1) }) @@ -252,6 +260,15 @@ describe("DirectoryScanner", () => { }) it("should process markdown files alongside code files", async () => { + // Create scanner without embedder to test the non-embedding path + const scannerNoEmbeddings = new DirectoryScanner( + null as any, // No embedder + null as any, // No vector store + mockCodeParser, + mockCacheManager, + mockIgnoreInstance, + ) + const { listFiles } = await import("../../../glob/list-files") vi.mocked(listFiles).mockResolvedValue([["test/README.md", "test/app.js", "docs/guide.markdown"], false]) @@ -306,7 +323,7 @@ describe("DirectoryScanner", () => { return [] }) - const result = await scanner.scanDirectory("/test") + const result = await scannerNoEmbeddings.scanDirectory("/test") // Verify all files were processed expect(mockCodeParser.parseFile).toHaveBeenCalledTimes(3) @@ -314,16 +331,7 @@ describe("DirectoryScanner", () => { expect(mockCodeParser.parseFile).toHaveBeenCalledWith("test/app.js", expect.any(Object)) expect(mockCodeParser.parseFile).toHaveBeenCalledWith("docs/guide.markdown", expect.any(Object)) - // Verify code blocks include both markdown and code content - expect(result.codeBlocks).toHaveLength(3) - expect(result.codeBlocks).toEqual( - expect.arrayContaining([ - expect.objectContaining({ type: "markdown_header_h1" }), - expect.objectContaining({ type: "function" }), - expect.objectContaining({ type: "markdown_header_h2" }), - ]), - ) - + // Verify processing still works without codeBlocks accumulation expect(result.stats.processed).toBe(3) }) diff --git a/src/services/code-index/processors/scanner.ts b/src/services/code-index/processors/scanner.ts index 538a1252d7..e6ca297399 100644 --- a/src/services/code-index/processors/scanner.ts +++ b/src/services/code-index/processors/scanner.ts @@ -17,7 +17,7 @@ import { t } from "../../../i18n" import { QDRANT_CODE_BLOCK_NAMESPACE, MAX_FILE_SIZE_BYTES, - MAX_LIST_FILES_LIMIT, + MAX_LIST_FILES_LIMIT_CODE_INDEX, BATCH_SEGMENT_THRESHOLD, MAX_BATCH_RETRIES, INITIAL_RETRY_DELAY_MS, @@ -51,13 +51,13 @@ export class DirectoryScanner implements IDirectoryScanner { onError?: (error: Error) => void, onBlocksIndexed?: (indexedCount: number) => void, onFileParsed?: (fileBlockCount: number) => void, - ): Promise<{ codeBlocks: CodeBlock[]; stats: { processed: number; skipped: number }; totalBlockCount: number }> { + ): Promise<{ stats: { processed: number; skipped: number }; totalBlockCount: number }> { const directoryPath = directory // Capture workspace context at scan start const scanWorkspace = getWorkspacePathForContext(directoryPath) // Get all files recursively (handles .gitignore automatically) - const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT) + const [allPaths, _] = await listFiles(directoryPath, true, MAX_LIST_FILES_LIMIT_CODE_INDEX) // Filter out directories (marked with trailing '/') const filePaths = allPaths.filter((p) => !p.endsWith("/")) @@ -85,7 +85,6 @@ export class DirectoryScanner implements IDirectoryScanner { // Initialize tracking variables const processedFiles = new Set() - const codeBlocks: CodeBlock[] = [] let processedCount = 0 let skippedCount = 0 @@ -98,7 +97,7 @@ export class DirectoryScanner implements IDirectoryScanner { let currentBatchBlocks: CodeBlock[] = [] let currentBatchTexts: string[] = [] let currentBatchFileInfos: { filePath: string; fileHash: string; isNew: boolean }[] = [] - const activeBatchPromises: Promise[] = [] + const activeBatchPromises = new Set>() // Initialize block counter let totalBlockCount = 0 @@ -125,6 +124,7 @@ export class DirectoryScanner implements IDirectoryScanner { // Check against cache const cachedFileHash = this.cacheManager.getHash(filePath) + const isNewFile = !cachedFileHash if (cachedFileHash === currentFileHash) { // File is unchanged skippedCount++ @@ -135,7 +135,6 @@ export class DirectoryScanner implements IDirectoryScanner { const blocks = await this.codeParser.parseFile(filePath, { content, fileHash: currentFileHash }) const fileBlockCount = blocks.length onFileParsed?.(fileBlockCount) - codeBlocks.push(...blocks) processedCount++ // Process embeddings if configured @@ -146,20 +145,11 @@ export class DirectoryScanner implements IDirectoryScanner { const trimmedContent = block.content.trim() if (trimmedContent) { const release = await mutex.acquire() - totalBlockCount += fileBlockCount try { currentBatchBlocks.push(block) currentBatchTexts.push(trimmedContent) addedBlocksFromFile = true - if (addedBlocksFromFile) { - currentBatchFileInfos.push({ - filePath, - fileHash: currentFileHash, - isNew: !this.cacheManager.getHash(filePath), - }) - } - // Check if batch threshold is met if (currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD) { // Copy current batch data and clear accumulators @@ -181,13 +171,33 @@ export class DirectoryScanner implements IDirectoryScanner { onBlocksIndexed, ), ) - activeBatchPromises.push(batchPromise) + activeBatchPromises.add(batchPromise) + + // Clean up completed promises to prevent memory accumulation + batchPromise.finally(() => { + activeBatchPromises.delete(batchPromise) + }) } } finally { release() } } } + + // Add file info once per file (outside the block loop) + if (addedBlocksFromFile) { + const release = await mutex.acquire() + try { + totalBlockCount += fileBlockCount + currentBatchFileInfos.push({ + filePath, + fileHash: currentFileHash, + isNew: isNewFile, + }) + } finally { + release() + } + } } else { // Only update hash if not being processed in a batch await this.cacheManager.updateHash(filePath, currentFileHash) @@ -232,7 +242,12 @@ export class DirectoryScanner implements IDirectoryScanner { const batchPromise = batchLimiter(() => this.processBatch(batchBlocks, batchTexts, batchFileInfos, scanWorkspace, onError, onBlocksIndexed), ) - activeBatchPromises.push(batchPromise) + activeBatchPromises.add(batchPromise) + + // Clean up completed promises to prevent memory accumulation + batchPromise.finally(() => { + activeBatchPromises.delete(batchPromise) + }) } finally { release() } @@ -280,7 +295,6 @@ export class DirectoryScanner implements IDirectoryScanner { } return { - codeBlocks, stats: { processed: processedCount, skipped: skippedCount,