Skip to content

Commit 7e3ad68

Browse files
committed
fix: resolve OOM issues during Swift code indexing
- Add MemoryMonitor utility for tracking memory usage and pressure
- Implement Swift-specific file size limits (512KB vs 1MB for other files)
- Add periodic memory monitoring during file processing
- Optimize tree-sitter parser memory usage with proper cleanup
- Add early termination for large files and memory pressure scenarios
- Implement batch processing limits to prevent memory accumulation
- Add comprehensive test coverage for memory monitoring

Fixes #5711
1 parent 88c4261 commit 7e3ad68

File tree

5 files changed

+444
-71
lines changed

5 files changed

+444
-71
lines changed

src/services/code-index/constants/index.ts

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ export const DEFAULT_MAX_SEARCH_RESULTS = CODEBASE_INDEX_DEFAULTS.DEFAULT_SEARCH
1313
/**File Watcher */
1414
export const QDRANT_CODE_BLOCK_NAMESPACE = "f47ac10b-58cc-4372-a567-0e02b2c3d479"
1515
export const MAX_FILE_SIZE_BYTES = 1 * 1024 * 1024 // 1MB
16+
export const MAX_SWIFT_FILE_SIZE_BYTES = 512 * 1024 // 512KB - Swift files can be memory intensive
17+
export const MEMORY_CHECK_INTERVAL_FILES = 10 // Check memory every N files
1618

1719
/**Directory Scanner */
1820
export const MAX_LIST_FILES_LIMIT = 3_000

src/services/code-index/processors/parser.ts

Lines changed: 151 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,15 @@ import { LanguageParser, loadRequiredLanguageParsers } from "../../tree-sitter/l
66
import { parseMarkdown } from "../../tree-sitter/markdownParser"
77
import { ICodeParser, CodeBlock } from "../interfaces"
88
import { scannerExtensions } from "../shared/supported-extensions"
9-
import { MAX_BLOCK_CHARS, MIN_BLOCK_CHARS, MIN_CHUNK_REMAINDER_CHARS, MAX_CHARS_TOLERANCE_FACTOR } from "../constants"
9+
import {
10+
MAX_BLOCK_CHARS,
11+
MIN_BLOCK_CHARS,
12+
MIN_CHUNK_REMAINDER_CHARS,
13+
MAX_CHARS_TOLERANCE_FACTOR,
14+
MAX_SWIFT_FILE_SIZE_BYTES,
15+
MEMORY_CHECK_INTERVAL_FILES,
16+
} from "../constants"
17+
import { MemoryMonitor } from "../utils/memoryMonitor"
1018
import { TelemetryService } from "@roo-code/telemetry"
1119
import { TelemetryEventName } from "@roo-code/types"
1220
import { sanitizeErrorMessage } from "../shared/validation-helpers"
@@ -17,6 +25,8 @@ import { sanitizeErrorMessage } from "../shared/validation-helpers"
1725
export class CodeParser implements ICodeParser {
1826
private loadedParsers: LanguageParser = {}
1927
private pendingLoads: Map<string, Promise<LanguageParser>> = new Map()
28+
private memoryMonitor = MemoryMonitor.getInstance()
29+
private filesProcessed = 0
2030
// Markdown files are now supported using the custom markdown parser
2131
// which extracts headers and sections for semantic indexing
2232

@@ -33,6 +43,17 @@ export class CodeParser implements ICodeParser {
3343
fileHash?: string
3444
},
3545
): Promise<CodeBlock[]> {
46+
// Periodic memory monitoring
47+
this.filesProcessed++
48+
if (this.filesProcessed % MEMORY_CHECK_INTERVAL_FILES === 0) {
49+
const isHighMemory = this.memoryMonitor.checkAndCleanup()
50+
if (isHighMemory) {
51+
console.warn(
52+
`High memory usage detected (${this.memoryMonitor.getMemoryUsageMB()}MB) after processing ${this.filesProcessed} files`,
53+
)
54+
}
55+
}
56+
3657
// Get file extension
3758
const ext = path.extname(filePath).toLowerCase()
3859

@@ -50,6 +71,23 @@ export class CodeParser implements ICodeParser {
5071
fileHash = options.fileHash || this.createFileHash(content)
5172
} else {
5273
try {
74+
// Check file size before reading for Swift files
75+
if (ext === ".swift") {
76+
const stats = await readFile(filePath, "utf8")
77+
.then((content) => ({ size: Buffer.byteLength(content, "utf8") }))
78+
.catch(() => null)
79+
if (stats && stats.size > MAX_SWIFT_FILE_SIZE_BYTES) {
80+
console.warn(
81+
`Skipping large Swift file ${filePath} (${Math.round(stats.size / 1024)}KB > ${Math.round(MAX_SWIFT_FILE_SIZE_BYTES / 1024)}KB limit)`,
82+
)
83+
TelemetryService.instance.captureEvent(TelemetryEventName.CODE_INDEX_ERROR, {
84+
error: `Swift file too large: ${stats.size} bytes`,
85+
location: "parseFile:fileSizeCheck",
86+
})
87+
return []
88+
}
89+
}
90+
5391
content = await readFile(filePath, "utf8")
5492
fileHash = this.createFileHash(content)
5593
} catch (error) {
@@ -63,6 +101,14 @@ export class CodeParser implements ICodeParser {
63101
}
64102
}
65103

104+
// Additional memory check before parsing large files
105+
if (content.length > MAX_SWIFT_FILE_SIZE_BYTES && this.memoryMonitor.isMemoryPressure()) {
106+
console.warn(
107+
`Skipping file ${filePath} due to memory pressure (${this.memoryMonitor.getMemoryUsageMB()}MB used)`,
108+
)
109+
return []
110+
}
111+
66112
// Parse the file
67113
return this.parseContent(filePath, content, fileHash)
68114
}
@@ -144,84 +190,122 @@ export class CodeParser implements ICodeParser {
144190
return []
145191
}
146192

147-
const tree = language.parser.parse(content)
193+
let tree: any = null
194+
let captures: any[] = []
148195

149-
// We don't need to get the query string from languageQueries since it's already loaded
150-
// in the language object
151-
const captures = tree ? language.query.captures(tree.rootNode) : []
152-
153-
// Check if captures are empty
154-
if (captures.length === 0) {
155-
if (content.length >= MIN_BLOCK_CHARS) {
156-
// Perform fallback chunking if content is large enough
157-
const blocks = this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes)
158-
return blocks
159-
} else {
160-
// Return empty if content is too small for fallback
196+
try {
197+
// Check memory before parsing
198+
if (this.memoryMonitor.isMemoryPressure()) {
199+
console.warn(`Skipping parsing ${filePath} due to memory pressure`)
161200
return []
162201
}
163-
}
164202

165-
const results: CodeBlock[] = []
203+
tree = language.parser.parse(content)
166204

167-
// Process captures if not empty
168-
const queue: Node[] = Array.from(captures).map((capture) => capture.node)
205+
// We don't need to get the query string from languageQueries since it's already loaded
206+
// in the language object
207+
captures = tree ? language.query.captures(tree.rootNode) : []
169208

170-
while (queue.length > 0) {
171-
const currentNode = queue.shift()!
172-
// const lineSpan = currentNode.endPosition.row - currentNode.startPosition.row + 1 // Removed as per lint error
209+
// Check if captures are empty
210+
if (captures.length === 0) {
211+
if (content.length >= MIN_BLOCK_CHARS) {
212+
// Perform fallback chunking if content is large enough
213+
const blocks = this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes)
214+
return blocks
215+
} else {
216+
// Return empty if content is too small for fallback
217+
return []
218+
}
219+
}
220+
221+
const results: CodeBlock[] = []
222+
223+
// Process captures if not empty
224+
const queue: Node[] = Array.from(captures).map((capture) => capture.node)
225+
let processedNodes = 0
226+
const maxNodesToProcess = 1000 // Limit to prevent excessive memory usage
227+
228+
while (queue.length > 0 && processedNodes < maxNodesToProcess) {
229+
// Periodic memory check during processing
230+
if (processedNodes % 100 === 0 && this.memoryMonitor.isMemoryPressure()) {
231+
console.warn(
232+
`Stopping node processing for ${filePath} due to memory pressure after ${processedNodes} nodes`,
233+
)
234+
break
235+
}
173236

174-
// Check if the node meets the minimum character requirement
175-
if (currentNode.text.length >= MIN_BLOCK_CHARS) {
176-
// If it also exceeds the maximum character limit, try to break it down
177-
if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
178-
if (currentNode.children.filter((child) => child !== null).length > 0) {
179-
// If it has children, process them instead
180-
queue.push(...currentNode.children.filter((child) => child !== null))
237+
const currentNode = queue.shift()!
238+
processedNodes++
239+
240+
// Check if the node meets the minimum character requirement
241+
if (currentNode.text.length >= MIN_BLOCK_CHARS) {
242+
// If it also exceeds the maximum character limit, try to break it down
243+
if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
244+
if (currentNode.children.filter((child) => child !== null).length > 0) {
245+
// If it has children, process them instead (but limit queue growth)
246+
const validChildren = currentNode.children.filter((child) => child !== null)
247+
if (queue.length + validChildren.length < maxNodesToProcess) {
248+
queue.push(...validChildren)
249+
}
250+
} else {
251+
// If it's a leaf node, chunk it
252+
const chunkedBlocks = this._chunkLeafNodeByLines(
253+
currentNode,
254+
filePath,
255+
fileHash,
256+
seenSegmentHashes,
257+
)
258+
results.push(...chunkedBlocks)
259+
}
181260
} else {
182-
// If it's a leaf node, chunk it
183-
const chunkedBlocks = this._chunkLeafNodeByLines(
184-
currentNode,
185-
filePath,
186-
fileHash,
187-
seenSegmentHashes,
188-
)
189-
results.push(...chunkedBlocks)
190-
}
191-
} else {
192-
// Node meets min chars and is within max chars, create a block
193-
const identifier =
194-
currentNode.childForFieldName("name")?.text ||
195-
currentNode.children.find((c) => c?.type === "identifier")?.text ||
196-
null
197-
const type = currentNode.type
198-
const start_line = currentNode.startPosition.row + 1
199-
const end_line = currentNode.endPosition.row + 1
200-
const content = currentNode.text
201-
const contentPreview = content.slice(0, 100)
202-
const segmentHash = createHash("sha256")
203-
.update(`${filePath}-${start_line}-${end_line}-${content.length}-${contentPreview}`)
204-
.digest("hex")
205-
206-
if (!seenSegmentHashes.has(segmentHash)) {
207-
seenSegmentHashes.add(segmentHash)
208-
results.push({
209-
file_path: filePath,
210-
identifier,
211-
type,
212-
start_line,
213-
end_line,
214-
content,
215-
segmentHash,
216-
fileHash,
217-
})
261+
// Node meets min chars and is within max chars, create a block
262+
const identifier =
263+
currentNode.childForFieldName("name")?.text ||
264+
currentNode.children.find((c) => c?.type === "identifier")?.text ||
265+
null
266+
const type = currentNode.type
267+
const start_line = currentNode.startPosition.row + 1
268+
const end_line = currentNode.endPosition.row + 1
269+
const nodeContent = currentNode.text
270+
const contentPreview = nodeContent.slice(0, 100)
271+
const segmentHash = createHash("sha256")
272+
.update(`${filePath}-${start_line}-${end_line}-${nodeContent.length}-${contentPreview}`)
273+
.digest("hex")
274+
275+
if (!seenSegmentHashes.has(segmentHash)) {
276+
seenSegmentHashes.add(segmentHash)
277+
results.push({
278+
file_path: filePath,
279+
identifier,
280+
type,
281+
start_line,
282+
end_line,
283+
content: nodeContent,
284+
segmentHash,
285+
fileHash,
286+
})
287+
}
218288
}
219289
}
290+
// Nodes smaller than minBlockChars are ignored
220291
}
221-
// Nodes smaller than minBlockChars are ignored
222-
}
223292

224-
return results
293+
return results
294+
} finally {
295+
// Clean up tree-sitter resources
296+
if (tree) {
297+
try {
298+
tree.delete?.()
299+
} catch (e) {
300+
// Ignore cleanup errors
301+
}
302+
}
303+
304+
// Force garbage collection for Swift files if available
305+
if (ext === "swift" && global.gc) {
306+
global.gc()
307+
}
308+
}
225309
}
226310

227311
/**

src/services/code-index/processors/scanner.ts

Lines changed: 67 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,19 +17,25 @@ import { t } from "../../../i18n"
1717
import {
1818
QDRANT_CODE_BLOCK_NAMESPACE,
1919
MAX_FILE_SIZE_BYTES,
20+
MAX_SWIFT_FILE_SIZE_BYTES,
2021
MAX_LIST_FILES_LIMIT,
2122
BATCH_SEGMENT_THRESHOLD,
2223
MAX_BATCH_RETRIES,
2324
INITIAL_RETRY_DELAY_MS,
2425
PARSING_CONCURRENCY,
2526
BATCH_PROCESSING_CONCURRENCY,
27+
MEMORY_CHECK_INTERVAL_FILES,
2628
} from "../constants"
29+
import { MemoryMonitor } from "../utils/memoryMonitor"
2730
import { isPathInIgnoredDirectory } from "../../glob/ignore-utils"
2831
import { TelemetryService } from "@roo-code/telemetry"
2932
import { TelemetryEventName } from "@roo-code/types"
3033
import { sanitizeErrorMessage } from "../shared/validation-helpers"
3134

3235
export class DirectoryScanner implements IDirectoryScanner {
36+
private memoryMonitor = MemoryMonitor.getInstance()
37+
private filesProcessed = 0
38+
3339
constructor(
3440
private readonly embedder: IEmbedder,
3541
private readonly qdrantClient: IVectorStore,
@@ -107,9 +113,35 @@ export class DirectoryScanner implements IDirectoryScanner {
107113
const parsePromises = supportedPaths.map((filePath) =>
108114
parseLimiter(async () => {
109115
try {
110-
// Check file size
116+
// Periodic memory monitoring
117+
this.filesProcessed++
118+
if (this.filesProcessed % MEMORY_CHECK_INTERVAL_FILES === 0) {
119+
const isHighMemory = this.memoryMonitor.checkAndCleanup()
120+
if (isHighMemory) {
121+
console.warn(
122+
`High memory usage detected (${this.memoryMonitor.getMemoryUsageMB()}MB) during directory scan after ${this.filesProcessed} files`,
123+
)
124+
}
125+
}
126+
127+
// Check if memory pressure should stop processing
128+
if (this.memoryMonitor.isMemoryPressure()) {
129+
console.warn(
130+
`Skipping file ${filePath} due to memory pressure (${this.memoryMonitor.getMemoryUsageMB()}MB used)`,
131+
)
132+
skippedCount++
133+
return
134+
}
135+
136+
// Check file size with Swift-specific limits
111137
const stats = await stat(filePath)
112-
if (stats.size > MAX_FILE_SIZE_BYTES) {
138+
const ext = path.extname(filePath).toLowerCase()
139+
const maxSize = ext === ".swift" ? MAX_SWIFT_FILE_SIZE_BYTES : MAX_FILE_SIZE_BYTES
140+
141+
if (stats.size > maxSize) {
142+
console.warn(
143+
`Skipping large ${ext} file ${filePath} (${Math.round(stats.size / 1024)}KB > ${Math.round(maxSize / 1024)}KB limit)`,
144+
)
113145
skippedCount++ // Skip large files
114146
return
115147
}
@@ -148,6 +180,34 @@ export class DirectoryScanner implements IDirectoryScanner {
148180
const release = await mutex.acquire()
149181
totalBlockCount += fileBlockCount
150182
try {
183+
// Check memory before adding to batch
184+
if (this.memoryMonitor.isMemoryPressure()) {
185+
console.warn(
186+
`Memory pressure detected, forcing batch processing early (${currentBatchBlocks.length} blocks)`,
187+
)
188+
// Force process current batch before adding more
189+
if (currentBatchBlocks.length > 0) {
190+
const batchBlocks = [...currentBatchBlocks]
191+
const batchTexts = [...currentBatchTexts]
192+
const batchFileInfos = [...currentBatchFileInfos]
193+
currentBatchBlocks = []
194+
currentBatchTexts = []
195+
currentBatchFileInfos = []
196+
197+
const batchPromise = batchLimiter(() =>
198+
this.processBatch(
199+
batchBlocks,
200+
batchTexts,
201+
batchFileInfos,
202+
scanWorkspace,
203+
onError,
204+
onBlocksIndexed,
205+
),
206+
)
207+
activeBatchPromises.push(batchPromise)
208+
}
209+
}
210+
151211
currentBatchBlocks.push(block)
152212
currentBatchTexts.push(trimmedContent)
153213
addedBlocksFromFile = true
@@ -160,8 +220,11 @@ export class DirectoryScanner implements IDirectoryScanner {
160220
})
161221
}
162222

163-
// Check if batch threshold is met
164-
if (currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD) {
223+
// Check if batch threshold is met or memory pressure
224+
if (
225+
currentBatchBlocks.length >= BATCH_SEGMENT_THRESHOLD ||
226+
this.memoryMonitor.isMemoryPressure()
227+
) {
165228
// Copy current batch data and clear accumulators
166229
const batchBlocks = [...currentBatchBlocks]
167230
const batchTexts = [...currentBatchTexts]

0 commit comments

Comments (0)