@@ -6,7 +6,15 @@ import { LanguageParser, loadRequiredLanguageParsers } from "../../tree-sitter/l
 import { parseMarkdown } from "../../tree-sitter/markdownParser"
 import { ICodeParser, CodeBlock } from "../interfaces"
 import { scannerExtensions } from "../shared/supported-extensions"
-import { MAX_BLOCK_CHARS, MIN_BLOCK_CHARS, MIN_CHUNK_REMAINDER_CHARS, MAX_CHARS_TOLERANCE_FACTOR } from "../constants"
+import {
+	MAX_BLOCK_CHARS,
+	MIN_BLOCK_CHARS,
+	MIN_CHUNK_REMAINDER_CHARS,
+	MAX_CHARS_TOLERANCE_FACTOR,
+	MAX_SWIFT_FILE_SIZE_BYTES,
+	MEMORY_CHECK_INTERVAL_FILES,
+} from "../constants"
+import { MemoryMonitor } from "../utils/memoryMonitor"
 import { TelemetryService } from "@roo-code/telemetry"
 import { TelemetryEventName } from "@roo-code/types"
 import { sanitizeErrorMessage } from "../shared/validation-helpers"
@@ -17,6 +25,8 @@ import { sanitizeErrorMessage } from "../shared/validation-helpers"
 export class CodeParser implements ICodeParser {
 	private loadedParsers: LanguageParser = {}
 	private pendingLoads: Map<string, Promise<LanguageParser>> = new Map()
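+	// Shared monitor and file counter drive the periodic memory checks below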
+	private memoryMonitor = MemoryMonitor.getInstance()
+	private filesProcessed = 0
 	// Markdown files are now supported using the custom markdown parser
 	// which extracts headers and sections for semantic indexing
 
@@ -33,6 +43,17 @@ export class CodeParser implements ICodeParser {
 			fileHash?: string
 		},
 	): Promise<CodeBlock[]> {
+		// Periodic memory monitoring
+		this.filesProcessed++
+		if (this.filesProcessed % MEMORY_CHECK_INTERVAL_FILES === 0) {
+			const isHighMemory = this.memoryMonitor.checkAndCleanup()
+			if (isHighMemory) {
+				console.warn(
+					`High memory usage detected (${this.memoryMonitor.getMemoryUsageMB()} MB) after processing ${this.filesProcessed} files`,
+				)
+			}
+		}
+
 		// Get file extension
 		const ext = path.extname(filePath).toLowerCase()
 
@@ -50,6 +71,23 @@ export class CodeParser implements ICodeParser {
 			fileHash = options.fileHash || this.createFileHash(content)
 		} else {
 			try {
+				// Check file size before reading for Swift files.
+				// stat() reads only metadata, so the file is never loaded into
+				// memory just to measure it (assumes `stat` is imported from
+				// "fs/promises" alongside `readFile`).
+				if (ext === ".swift") {
+					const stats = await stat(filePath).catch(() => null)
+					if (stats && stats.size > MAX_SWIFT_FILE_SIZE_BYTES) {
+						console.warn(
+							`Skipping large Swift file ${filePath} (${Math.round(stats.size / 1024)} KB > ${Math.round(MAX_SWIFT_FILE_SIZE_BYTES / 1024)} KB limit)`,
+						)
+						TelemetryService.instance.captureEvent(TelemetryEventName.CODE_INDEX_ERROR, {
+							error: `Swift file too large: ${stats.size} bytes`,
+							location: "parseFile:fileSizeCheck",
+						})
+						return []
+					}
+				}
+
 				content = await readFile(filePath, "utf8")
 				fileHash = this.createFileHash(content)
 			} catch (error) {
@@ -63,6 +101,14 @@ export class CodeParser implements ICodeParser {
 			}
 		}
 
+		// Additional memory check before parsing large files
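+		// (content.length counts UTF-16 code units, a close proxy for the byte limit)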
+		if (content.length > MAX_SWIFT_FILE_SIZE_BYTES && this.memoryMonitor.isMemoryPressure()) {
+			console.warn(
+				`Skipping file ${filePath} due to memory pressure (${this.memoryMonitor.getMemoryUsageMB()} MB used)`,
+			)
+			return []
+		}
+
 		// Parse the file
 		return this.parseContent(filePath, content, fileHash)
 	}
@@ -144,84 +190,122 @@ export class CodeParser implements ICodeParser {
 			return []
 		}
 
-		const tree = language.parser.parse(content)
+		let tree: any = null
+		let captures: any[] = []
 
-		// We don't need to get the query string from languageQueries since it's already loaded
-		// in the language object
-		const captures = tree ? language.query.captures(tree.rootNode) : []
-
-		// Check if captures are empty
-		if (captures.length === 0) {
-			if (content.length >= MIN_BLOCK_CHARS) {
-				// Perform fallback chunking if content is large enough
-				const blocks = this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes)
-				return blocks
-			} else {
-				// Return empty if content is too small for fallback
+		try {
+			// Check memory before parsing
+			if (this.memoryMonitor.isMemoryPressure()) {
+				console.warn(`Skipping parsing ${filePath} due to memory pressure`)
 				return []
 			}
-		}
 
-		const results: CodeBlock[] = []
+			tree = language.parser.parse(content)
 
-		// Process captures if not empty
-		const queue: Node[] = Array.from(captures).map((capture) => capture.node)
+			// We don't need to get the query string from languageQueries since it's already loaded
+			// in the language object
+			captures = tree ? language.query.captures(tree.rootNode) : []
 
-		while (queue.length > 0) {
-			const currentNode = queue.shift()!
-			// const lineSpan = currentNode.endPosition.row - currentNode.startPosition.row + 1 // Removed as per lint error
+			// Check if captures are empty
+			if (captures.length === 0) {
+				if (content.length >= MIN_BLOCK_CHARS) {
+					// Perform fallback chunking if content is large enough
+					const blocks = this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes)
+					return blocks
+				} else {
+					// Return empty if content is too small for fallback
+					return []
+				}
+			}
+
+			const results: CodeBlock[] = []
+
+			// Process captures if not empty
+			const queue: Node[] = Array.from(captures).map((capture) => capture.node)
+			let processedNodes = 0
+			const maxNodesToProcess = 1000 // Limit to prevent excessive memory usage
+
+			while (queue.length > 0 && processedNodes < maxNodesToProcess) {
+				// Periodic memory check during processing
+				if (processedNodes % 100 === 0 && this.memoryMonitor.isMemoryPressure()) {
+					console.warn(
+						`Stopping node processing for ${filePath} due to memory pressure after ${processedNodes} nodes`,
+					)
+					break
+				}
 
-			// Check if the node meets the minimum character requirement
-			if (currentNode.text.length >= MIN_BLOCK_CHARS) {
-				// If it also exceeds the maximum character limit, try to break it down
-				if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
-					if (currentNode.children.filter((child) => child !== null).length > 0) {
-						// If it has children, process them instead
-						queue.push(...currentNode.children.filter((child) => child !== null))
+				const currentNode = queue.shift()!
+				processedNodes++
+
+				// Check if the node meets the minimum character requirement
+				if (currentNode.text.length >= MIN_BLOCK_CHARS) {
+					// If it also exceeds the maximum character limit, try to break it down
+					if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
+						if (currentNode.children.filter((child) => child !== null).length > 0) {
+							// If it has children, process them instead (but limit queue growth)
+							const validChildren = currentNode.children.filter((child) => child !== null)
+							if (queue.length + validChildren.length < maxNodesToProcess) {
+								queue.push(...validChildren)
+							}
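+							// Children beyond the cap are dropped, trading completeness for bounded memory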
+						} else {
+							// If it's a leaf node, chunk it
+							const chunkedBlocks = this._chunkLeafNodeByLines(
+								currentNode,
+								filePath,
+								fileHash,
+								seenSegmentHashes,
+							)
+							results.push(...chunkedBlocks)
+						}
 					} else {
-						// If it's a leaf node, chunk it
-						const chunkedBlocks = this._chunkLeafNodeByLines(
-							currentNode,
-							filePath,
-							fileHash,
-							seenSegmentHashes,
-						)
-						results.push(...chunkedBlocks)
-					}
-				} else {
-					// Node meets min chars and is within max chars, create a block
-					const identifier =
-						currentNode.childForFieldName("name")?.text ||
-						currentNode.children.find((c) => c?.type === "identifier")?.text ||
-						null
-					const type = currentNode.type
-					const start_line = currentNode.startPosition.row + 1
-					const end_line = currentNode.endPosition.row + 1
-					const content = currentNode.text
-					const contentPreview = content.slice(0, 100)
-					const segmentHash = createHash("sha256")
-						.update(`${filePath}-${start_line}-${end_line}-${content.length}-${contentPreview}`)
-						.digest("hex")
-
-					if (!seenSegmentHashes.has(segmentHash)) {
-						seenSegmentHashes.add(segmentHash)
-						results.push({
-							file_path: filePath,
-							identifier,
-							type,
-							start_line,
-							end_line,
-							content,
-							segmentHash,
-							fileHash,
-						})
+						// Node meets min chars and is within max chars, create a block
+						const identifier =
+							currentNode.childForFieldName("name")?.text ||
+							currentNode.children.find((c) => c?.type === "identifier")?.text ||
+							null
+						const type = currentNode.type
+						const start_line = currentNode.startPosition.row + 1
+						const end_line = currentNode.endPosition.row + 1
+						const nodeContent = currentNode.text
+						const contentPreview = nodeContent.slice(0, 100)
+						const segmentHash = createHash("sha256")
+							.update(`${filePath}-${start_line}-${end_line}-${nodeContent.length}-${contentPreview}`)
+							.digest("hex")
+
+						if (!seenSegmentHashes.has(segmentHash)) {
+							seenSegmentHashes.add(segmentHash)
+							results.push({
+								file_path: filePath,
+								identifier,
+								type,
+								start_line,
+								end_line,
+								content: nodeContent,
+								segmentHash,
+								fileHash,
+							})
+						}
 					}
 				}
+				// Nodes smaller than minBlockChars are ignored
 			}
-			// Nodes smaller than minBlockChars are ignored
-		}
 
-		return results
+			return results
+		} finally {
+			// Clean up tree-sitter resources
+			if (tree) {
+				try {
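+					// web-tree-sitter syntax trees hold WASM-heap memory that the
+					// JS garbage collector cannot reclaim, so free it explicitly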
+					tree.delete?.()
+				} catch (e) {
+					// Ignore cleanup errors
+				}
+			}
+
+			// Force garbage collection for Swift files if available
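+			// (global.gc is only defined when Node is launched with --expose-gc)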
+			if (ext === "swift" && global.gc) {
+				global.gc()
+			}
+		}
 	}
 
 	/**
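
The `MemoryMonitor` imported from `../utils/memoryMonitor` is not part of this diff. Below is a minimal sketch of the shape its call sites above assume (`getInstance`, `checkAndCleanup`, `isMemoryPressure`, `getMemoryUsageMB`); the threshold and cleanup strategy here are illustrative assumptions, not the actual implementation:

```typescript
// Hypothetical sketch of the MemoryMonitor singleton, inferred from call sites.
export class MemoryMonitor {
	private static instance: MemoryMonitor
	// Assumed pressure threshold; the real value is defined elsewhere.
	private readonly thresholdBytes = 1024 * 1024 * 1024

	static getInstance(): MemoryMonitor {
		if (!MemoryMonitor.instance) {
			MemoryMonitor.instance = new MemoryMonitor()
		}
		return MemoryMonitor.instance
	}

	// Current heap usage of the process, in whole megabytes.
	getMemoryUsageMB(): number {
		return Math.round(process.memoryUsage().heapUsed / (1024 * 1024))
	}

	// True when heap usage exceeds the threshold.
	isMemoryPressure(): boolean {
		return process.memoryUsage().heapUsed > this.thresholdBytes
	}

	// Reports whether memory is high, attempting a GC pass if one is exposed.
	checkAndCleanup(): boolean {
		const high = this.isMemoryPressure()
		if (high && global.gc) {
			global.gc() // requires Node to be started with --expose-gc
		}
		return high
	}
}
```

A singleton keeps the monitor's state (and any cleanup throttling) consistent across the parser and scanner, which is why the class exposes `getInstance()` rather than a public constructor.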