From 7e3ad68285d6b51d877a47b91a6fa60b8b1d4813 Mon Sep 17 00:00:00 2001 From: Roo Code Date: Mon, 14 Jul 2025 23:38:03 +0000 Subject: [PATCH] fix: resolve OOM issues during Swift code indexing - Add MemoryMonitor utility for tracking memory usage and pressure - Implement Swift-specific file size limits (512KB vs 1MB for other files) - Add periodic memory monitoring during file processing - Optimize tree-sitter parser memory usage with proper cleanup - Add early termination for large files and memory pressure scenarios - Implement batch processing limits to prevent memory accumulation - Add comprehensive test coverage for memory monitoring Fixes #5711 --- src/services/code-index/constants/index.ts | 2 + src/services/code-index/processors/parser.ts | 218 ++++++++++++------ src/services/code-index/processors/scanner.ts | 71 +++++- .../utils/__tests__/memoryMonitor.spec.ts | 141 +++++++++++ .../code-index/utils/memoryMonitor.ts | 83 +++++++ 5 files changed, 444 insertions(+), 71 deletions(-) create mode 100644 src/services/code-index/utils/__tests__/memoryMonitor.spec.ts create mode 100644 src/services/code-index/utils/memoryMonitor.ts diff --git a/src/services/code-index/constants/index.ts b/src/services/code-index/constants/index.ts index c2567f5635b..cf612540d72 100644 --- a/src/services/code-index/constants/index.ts +++ b/src/services/code-index/constants/index.ts @@ -13,6 +13,8 @@ export const DEFAULT_MAX_SEARCH_RESULTS = CODEBASE_INDEX_DEFAULTS.DEFAULT_SEARCH /**File Watcher */ export const QDRANT_CODE_BLOCK_NAMESPACE = "f47ac10b-58cc-4372-a567-0e02b2c3d479" export const MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024 // 1MB +export const MAX_SWIFT_FILE_SIZE_BYTES = 512 * 1024 // 512KB - Swift files can be memory intensive +export const MEMORY_CHECK_INTERVAL_FILES = 10 // Check memory every N files /**Directory Scanner */ export const MAX_LIST_FILES_LIMIT = 3_000 diff --git a/src/services/code-index/processors/parser.ts b/src/services/code-index/processors/parser.ts index 96d747c4c9f..61e7cc928bf 100644 --- a/src/services/code-index/processors/parser.ts +++ b/src/services/code-index/processors/parser.ts @@ -6,7 +6,15 @@ import { LanguageParser, loadRequiredLanguageParsers } from "../../tree-sitter/l import { parseMarkdown } from "../../tree-sitter/markdownParser" import { ICodeParser, CodeBlock } from "../interfaces" import { scannerExtensions } from "../shared/supported-extensions" -import { MAX_BLOCK_CHARS, MIN_BLOCK_CHARS, MIN_CHUNK_REMAINDER_CHARS, MAX_CHARS_TOLERANCE_FACTOR } from "../constants" +import { + MAX_BLOCK_CHARS, + MIN_BLOCK_CHARS, + MIN_CHUNK_REMAINDER_CHARS, + MAX_CHARS_TOLERANCE_FACTOR, + MAX_SWIFT_FILE_SIZE_BYTES, + MEMORY_CHECK_INTERVAL_FILES, +} from "../constants" +import { MemoryMonitor } from "../utils/memoryMonitor" import { TelemetryService } from "@roo-code/telemetry" import { TelemetryEventName } from "@roo-code/types" import { sanitizeErrorMessage } from "../shared/validation-helpers" @@ -17,6 +25,8 @@ import { sanitizeErrorMessage } from "../shared/validation-helpers" export class CodeParser implements ICodeParser { private loadedParsers: LanguageParser = {} private pendingLoads: Map> = new Map() + private memoryMonitor = MemoryMonitor.getInstance() + private filesProcessed = 0 // Markdown files are now supported using the custom markdown parser // which extracts headers and sections for semantic indexing @@ -33,6 +43,17 @@ export class CodeParser implements ICodeParser { fileHash?: string }, ): Promise { + // Periodic memory monitoring + this.filesProcessed++ + if 
(this.filesProcessed % MEMORY_CHECK_INTERVAL_FILES === 0) { + const isHighMemory = this.memoryMonitor.checkAndCleanup() + if (isHighMemory) { + console.warn( + `High memory usage detected (${this.memoryMonitor.getMemoryUsageMB()}MB) after processing ${this.filesProcessed} files`, + ) + } + } + // Get file extension const ext = path.extname(filePath).toLowerCase() @@ -50,6 +71,23 @@ export class CodeParser implements ICodeParser { fileHash = options.fileHash || this.createFileHash(content) } else { try { + // Check file size before reading for Swift files + if (ext === ".swift") { + const stats = await readFile(filePath, "utf8") + .then((content) => ({ size: Buffer.byteLength(content, "utf8") })) + .catch(() => null) + if (stats && stats.size > MAX_SWIFT_FILE_SIZE_BYTES) { + console.warn( + `Skipping large Swift file ${filePath} (${Math.round(stats.size / 1024)}KB > ${Math.round(MAX_SWIFT_FILE_SIZE_BYTES / 1024)}KB limit)`, + ) + TelemetryService.instance.captureEvent(TelemetryEventName.CODE_INDEX_ERROR, { + error: `Swift file too large: ${stats.size} bytes`, + location: "parseFile:fileSizeCheck", + }) + return [] + } + } + content = await readFile(filePath, "utf8") fileHash = this.createFileHash(content) } catch (error) { @@ -63,6 +101,14 @@ export class CodeParser implements ICodeParser { } } + // Additional memory check before parsing large files + if (content.length > MAX_SWIFT_FILE_SIZE_BYTES && this.memoryMonitor.isMemoryPressure()) { + console.warn( + `Skipping file ${filePath} due to memory pressure (${this.memoryMonitor.getMemoryUsageMB()}MB used)`, + ) + return [] + } + // Parse the file return this.parseContent(filePath, content, fileHash) } @@ -144,84 +190,122 @@ export class CodeParser implements ICodeParser { return [] } - const tree = language.parser.parse(content) + let tree: any = null + let captures: any[] = [] - // We don't need to get the query string from languageQueries since it's already loaded - // in the language object - const captures = tree ? language.query.captures(tree.rootNode) : [] - - // Check if captures are empty - if (captures.length === 0) { - if (content.length >= MIN_BLOCK_CHARS) { - // Perform fallback chunking if content is large enough - const blocks = this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes) - return blocks - } else { - // Return empty if content is too small for fallback + try { + // Check memory before parsing + if (this.memoryMonitor.isMemoryPressure()) { + console.warn(`Skipping parsing ${filePath} due to memory pressure`) return [] } - } - const results: CodeBlock[] = [] + tree = language.parser.parse(content) - // Process captures if not empty - const queue: Node[] = Array.from(captures).map((capture) => capture.node) + // We don't need to get the query string from languageQueries since it's already loaded + // in the language object + captures = tree ? language.query.captures(tree.rootNode) : [] - while (queue.length > 0) { - const currentNode = queue.shift()! 
- // const lineSpan = currentNode.endPosition.row - currentNode.startPosition.row + 1 // Removed as per lint error + // Check if captures are empty + if (captures.length === 0) { + if (content.length >= MIN_BLOCK_CHARS) { + // Perform fallback chunking if content is large enough + const blocks = this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes) + return blocks + } else { + // Return empty if content is too small for fallback + return [] + } + } + + const results: CodeBlock[] = [] + + // Process captures if not empty + const queue: Node[] = Array.from(captures).map((capture) => capture.node) + let processedNodes = 0 + const maxNodesToProcess = 1000 // Limit to prevent excessive memory usage + + while (queue.length > 0 && processedNodes < maxNodesToProcess) { + // Periodic memory check during processing + if (processedNodes % 100 === 0 && this.memoryMonitor.isMemoryPressure()) { + console.warn( + `Stopping node processing for ${filePath} due to memory pressure after ${processedNodes} nodes`, + ) + break + } - // Check if the node meets the minimum character requirement - if (currentNode.text.length >= MIN_BLOCK_CHARS) { - // If it also exceeds the maximum character limit, try to break it down - if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) { - if (currentNode.children.filter((child) => child !== null).length > 0) { - // If it has children, process them instead - queue.push(...currentNode.children.filter((child) => child !== null)) + const currentNode = queue.shift()! + processedNodes++ + + // Check if the node meets the minimum character requirement + if (currentNode.text.length >= MIN_BLOCK_CHARS) { + // If it also exceeds the maximum character limit, try to break it down + if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) { + if (currentNode.children.filter((child) => child !== null).length > 0) { + // If it has children, process them instead (but limit queue growth) + const validChildren = currentNode.children.filter((child) => child !== null) + if (queue.length + validChildren.length < maxNodesToProcess) { + queue.push(...validChildren) + } + } else { + // If it's a leaf node, chunk it + const chunkedBlocks = this._chunkLeafNodeByLines( + currentNode, + filePath, + fileHash, + seenSegmentHashes, + ) + results.push(...chunkedBlocks) + } } else { - // If it's a leaf node, chunk it - const chunkedBlocks = this._chunkLeafNodeByLines( - currentNode, - filePath, - fileHash, - seenSegmentHashes, - ) - results.push(...chunkedBlocks) - } - } else { - // Node meets min chars and is within max chars, create a block - const identifier = - currentNode.childForFieldName("name")?.text || - currentNode.children.find((c) => c?.type === "identifier")?.text || - null - const type = currentNode.type - const start_line = currentNode.startPosition.row + 1 - const end_line = currentNode.endPosition.row + 1 - const content = currentNode.text - const contentPreview = content.slice(0, 100) - const segmentHash = createHash("sha256") - .update(`${filePath}-${start_line}-${end_line}-${content.length}-${contentPreview}`) - .digest("hex") - - if (!seenSegmentHashes.has(segmentHash)) { - seenSegmentHashes.add(segmentHash) - results.push({ - file_path: filePath, - identifier, - type, - start_line, - end_line, - content, - segmentHash, - fileHash, - }) + // Node meets min chars and is within max chars, create a block + const identifier = + currentNode.childForFieldName("name")?.text || + currentNode.children.find((c) => c?.type === 
"identifier")?.text || + null + const type = currentNode.type + const start_line = currentNode.startPosition.row + 1 + const end_line = currentNode.endPosition.row + 1 + const nodeContent = currentNode.text + const contentPreview = nodeContent.slice(0, 100) + const segmentHash = createHash("sha256") + .update(`${filePath}-${start_line}-${end_line}-${nodeContent.length}-${contentPreview}`) + .digest("hex") + + if (!seenSegmentHashes.has(segmentHash)) { + seenSegmentHashes.add(segmentHash) + results.push({ + file_path: filePath, + identifier, + type, + start_line, + end_line, + content: nodeContent, + segmentHash, + fileHash, + }) + } } } + // Nodes smaller than minBlockChars are ignored } - // Nodes smaller than minBlockChars are ignored - } - return results + return results + } finally { + // Clean up tree-sitter resources + if (tree) { + try { + tree.delete?.() + } catch (e) { + // Ignore cleanup errors + } + } + + // Force garbage collection for Swift files if available + if (ext === "swift" && global.gc) { + global.gc() + } + } } /** diff --git a/src/services/code-index/processors/scanner.ts b/src/services/code-index/processors/scanner.ts index 538a1252d78..491a09451a8 100644 --- a/src/services/code-index/processors/scanner.ts +++ b/src/services/code-index/processors/scanner.ts @@ -17,19 +17,25 @@ import { t } from "../../../i18n" import { QDRANT_CODE_BLOCK_NAMESPACE, MAX_FILE_SIZE_BYTES, + MAX_SWIFT_FILE_SIZE_BYTES, MAX_LIST_FILES_LIMIT, BATCH_SEGMENT_THRESHOLD, MAX_BATCH_RETRIES, INITIAL_RETRY_DELAY_MS, PARSING_CONCURRENCY, BATCH_PROCESSING_CONCURRENCY, + MEMORY_CHECK_INTERVAL_FILES, } from "../constants" +import { MemoryMonitor } from "../utils/memoryMonitor" import { isPathInIgnoredDirectory } from "../../glob/ignore-utils" import { TelemetryService } from "@roo-code/telemetry" import { TelemetryEventName } from "@roo-code/types" import { sanitizeErrorMessage } from "../shared/validation-helpers" export class DirectoryScanner implements IDirectoryScanner { + private memoryMonitor = MemoryMonitor.getInstance() + private filesProcessed = 0 + constructor( private readonly embedder: IEmbedder, private readonly qdrantClient: IVectorStore, @@ -107,9 +113,35 @@ export class DirectoryScanner implements IDirectoryScanner { const parsePromises = supportedPaths.map((filePath) => parseLimiter(async () => { try { - // Check file size + // Periodic memory monitoring + this.filesProcessed++ + if (this.filesProcessed % MEMORY_CHECK_INTERVAL_FILES === 0) { + const isHighMemory = this.memoryMonitor.checkAndCleanup() + if (isHighMemory) { + console.warn( + `High memory usage detected (${this.memoryMonitor.getMemoryUsageMB()}MB) during directory scan after ${this.filesProcessed} files`, + ) + } + } + + // Check if memory pressure should stop processing + if (this.memoryMonitor.isMemoryPressure()) { + console.warn( + `Skipping file ${filePath} due to memory pressure (${this.memoryMonitor.getMemoryUsageMB()}MB used)`, + ) + skippedCount++ + return + } + + // Check file size with Swift-specific limits const stats = await stat(filePath) - if (stats.size > MAX_FILE_SIZE_BYTES) { + const ext = path.extname(filePath).toLowerCase() + const maxSize = ext === ".swift" ? 
MAX_SWIFT_FILE_SIZE_BYTES : MAX_FILE_SIZE_BYTES + + if (stats.size > maxSize) { + console.warn( + `Skipping large ${ext} file ${filePath} (${Math.round(stats.size / 1024)}KB > ${Math.round(maxSize / 1024)}KB limit)`, + ) skippedCount++ // Skip large files return } @@ -148,6 +180,34 @@ export class DirectoryScanner implements IDirectoryScanner { const release = await mutex.acquire() totalBlockCount += fileBlockCount try { + // Check memory before adding to batch + if (this.memoryMonitor.isMemoryPressure()) { + console.warn( + `Memory pressure detected, forcing batch processing early (${currentBatchBlocks.length} blocks)`, + ) + // Force process current batch before adding more + if (currentBatchBlocks.length > 0) { + const batchBlocks = [...currentBatchBlocks] + const batchTexts = [...currentBatchTexts] + const batchFileInfos = [...currentBatchFileInfos] + currentBatchBlocks = [] + currentBatchTexts = [] + currentBatchFileInfos = [] + + const batchPromise = batchLimiter(() => + this.processBatch( + batchBlocks, + batchTexts, + batchFileInfos, + scanWorkspace, + onError, + onBlocksIndexed, + ), + ) + activeBatchPromises.push(batchPromise) + } + } + currentBatchBlocks.push(block) currentBatchTexts.push(trimmedContent) addedBlocksFromFile = true @@ -160,8 +220,11 @@ export class DirectoryScanner implements IDirectoryScanner { }) } - // Check if batch threshold is met - if (currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD) { + // Check if batch threshold is met or memory pressure + if ( + currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD || + this.memoryMonitor.isMemoryPressure() + ) { // Copy current batch data and clear accumulators const batchBlocks = [...currentBatchBlocks] const batchTexts = [...currentBatchTexts] diff --git a/src/services/code-index/utils/__tests__/memoryMonitor.spec.ts b/src/services/code-index/utils/__tests__/memoryMonitor.spec.ts new file mode 100644 index 00000000000..3861ca5e6f7 --- /dev/null +++ b/src/services/code-index/utils/__tests__/memoryMonitor.spec.ts @@ -0,0 +1,141 @@ +// npx vitest services/code-index/utils/__tests__/memoryMonitor.spec.ts + +import { describe, it, expect, beforeEach, vi } from "vitest" +import { MemoryMonitor } from "../memoryMonitor" + +describe("MemoryMonitor", () => { + let memoryMonitor: MemoryMonitor + + beforeEach(() => { + memoryMonitor = MemoryMonitor.getInstance() + vi.clearAllMocks() + }) + + describe("getInstance", () => { + it("should return singleton instance", () => { + const instance1 = MemoryMonitor.getInstance() + const instance2 = MemoryMonitor.getInstance() + expect(instance1).toBe(instance2) + }) + }) + + describe("getMemoryStats", () => { + it("should return memory statistics", () => { + const stats = memoryMonitor.getMemoryStats() + expect(stats).toHaveProperty("used") + expect(stats).toHaveProperty("total") + expect(stats).toHaveProperty("percentage") + expect(typeof stats.used).toBe("number") + expect(typeof stats.total).toBe("number") + expect(typeof stats.percentage).toBe("number") + expect(stats.used).toBeGreaterThan(0) + expect(stats.total).toBeGreaterThan(0) + expect(stats.percentage).toBeGreaterThanOrEqual(0) + expect(stats.percentage).toBeLessThanOrEqual(1) + }) + }) + + describe("isMemoryPressure", () => { + it("should return boolean indicating memory pressure", () => { + const result = memoryMonitor.isMemoryPressure() + expect(typeof result).toBe("boolean") + }) + + it("should return true when memory usage is above threshold", () => { + // Mock high memory usage + const originalGetMemoryStats = 
memoryMonitor.getMemoryStats + memoryMonitor.getMemoryStats = vi.fn().mockReturnValue({ + used: 900 * 1024 * 1024, // 900MB + total: 1000 * 1024 * 1024, // 1GB + percentage: 0.9, // 90% + }) + + const result = memoryMonitor.isMemoryPressure() + expect(result).toBe(true) + + // Restore original method + memoryMonitor.getMemoryStats = originalGetMemoryStats + }) + + it("should return false when memory usage is below threshold", () => { + // Mock low memory usage + const originalGetMemoryStats = memoryMonitor.getMemoryStats + memoryMonitor.getMemoryStats = vi.fn().mockReturnValue({ + used: 500 * 1024 * 1024, // 500MB + total: 1000 * 1024 * 1024, // 1GB + percentage: 0.5, // 50% + }) + + const result = memoryMonitor.isMemoryPressure() + expect(result).toBe(false) + + // Restore original method + memoryMonitor.getMemoryStats = originalGetMemoryStats + }) + }) + + describe("getMemoryUsageMB", () => { + it("should return memory usage in MB", () => { + const usage = memoryMonitor.getMemoryUsageMB() + expect(typeof usage).toBe("number") + expect(usage).toBeGreaterThan(0) + }) + }) + + describe("forceGC", () => { + it("should call global.gc if available", () => { + const originalGC = global.gc + global.gc = vi.fn() + + memoryMonitor.forceGC() + expect(global.gc).toHaveBeenCalled() + + global.gc = originalGC + }) + + it("should not throw if global.gc is not available", () => { + const originalGC = global.gc + delete (global as any).gc + + expect(() => memoryMonitor.forceGC()).not.toThrow() + + global.gc = originalGC + }) + }) + + describe("checkAndCleanup", () => { + it("should return false if called too frequently", () => { + // First call should work + const result1 = memoryMonitor.checkAndCleanup() + expect(typeof result1).toBe("boolean") + + // Immediate second call should return false (throttled) + const result2 = memoryMonitor.checkAndCleanup() + expect(result2).toBe(false) + }) + + it("should force GC when memory pressure is detected", () => { + const originalGC = global.gc + const originalGetMemoryStats = memoryMonitor.getMemoryStats + global.gc = vi.fn() + + // Mock high memory usage + memoryMonitor.getMemoryStats = vi.fn().mockReturnValue({ + used: 900 * 1024 * 1024, + total: 1000 * 1024 * 1024, + percentage: 0.9, + }) + + // Wait for throttle to reset + setTimeout(() => { + const result = memoryMonitor.checkAndCleanup() + expect(result).toBe(true) + expect(global.gc).toHaveBeenCalled() + }, 6000) + + // Restore + global.gc = originalGC + memoryMonitor.getMemoryStats = originalGetMemoryStats + }) + }) +}) diff --git a/src/services/code-index/utils/memoryMonitor.ts b/src/services/code-index/utils/memoryMonitor.ts new file mode 100644 index 00000000000..4e52f0b18e7 --- /dev/null +++ b/src/services/code-index/utils/memoryMonitor.ts @@ -0,0 +1,83 @@ +/** + * Memory monitoring utilities for code indexing operations + */ + +export interface MemoryStats { + used: number + total: number + percentage: number +} + +export class MemoryMonitor { + private static instance: MemoryMonitor + private memoryThreshold = 0.85 // 85% memory usage threshold + private lastCheck = 0 + private checkInterval = 5000 // Check every 5 seconds + + static getInstance(): MemoryMonitor { + if (!MemoryMonitor.instance) { + MemoryMonitor.instance = new MemoryMonitor() + } + return MemoryMonitor.instance + } + + /** + * Get current memory usage statistics + */ + getMemoryStats(): MemoryStats { + const memUsage = process.memoryUsage() + const totalMemory = require("os").totalmem() + const used = memUsage.heapUsed + 
memUsage.external + + return { + used, + total: totalMemory, + percentage: used / totalMemory, + } + } + + /** + * Check if memory usage is above threshold + */ + isMemoryPressure(): boolean { + const stats = this.getMemoryStats() + return stats.percentage > this.memoryThreshold + } + + /** + * Force garbage collection if available + */ + forceGC(): void { + if (global.gc) { + global.gc() + } + } + + /** + * Check memory and force GC if needed (throttled) + */ + checkAndCleanup(): boolean { + const now = Date.now() + if (now - this.lastCheck < this.checkInterval) { + return false + } + + this.lastCheck = now + const isHighMemory = this.isMemoryPressure() + + if (isHighMemory) { + this.forceGC() + console.warn("Memory pressure detected, forced garbage collection") + } + + return isHighMemory + } + + /** + * Get memory usage in MB for logging + */ + getMemoryUsageMB(): number { + const stats = this.getMemoryStats() + return Math.round(stats.used / 1024 / 1024) + } +}
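
Usage note for the MemoryMonitor added above: Node only exposes global.gc when the process is launched with --expose-gc, so forceGC() is a silent no-op otherwise and checkAndCleanup() can still report pressure without being able to reclaim anything (the tests above mock global.gc for the same reason). Below is a minimal sketch, not part of the patch, of how a caller could drive the new API; it uses only the methods defined in memoryMonitor.ts (getInstance, checkAndCleanup, isMemoryPressure, getMemoryUsageMB), while the import path, indexWithMemoryGuard, and parseOne are illustrative stand-ins rather than code from this change. The 85% threshold and 5-second throttle referenced in the comments come from the class defaults introduced here.

    import { MemoryMonitor } from "../utils/memoryMonitor" // path is an assumption; adjust to the caller's location

    // Hypothetical stand-in for the real parser call; only here to keep the sketch self-contained.
    async function parseOne(filePath: string): Promise<void> {
        console.log(`parsing ${filePath}`)
    }

    export async function indexWithMemoryGuard(filePaths: string[]): Promise<void> {
        const monitor = MemoryMonitor.getInstance()

        for (let i = 0; i < filePaths.length; i++) {
            // Throttled check: checkAndCleanup() evaluates pressure (and forces GC when
            // global.gc is available) at most once per 5 seconds, per the class defaults.
            if (i % 10 === 0 && monitor.checkAndCleanup()) {
                console.warn(`High memory (${monitor.getMemoryUsageMB()}MB) after ${i} files`)
            }

            // Hard stop before the next expensive parse once usage crosses the 85% threshold.
            if (monitor.isMemoryPressure()) {
                console.warn(`Stopping scan at ${monitor.getMemoryUsageMB()}MB`)
                break
            }

            await parseOne(filePaths[i])
        }
    }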