@@ -6,9 +6,9 @@ import { LanguageParser, loadRequiredLanguageParsers } from "../../tree-sitter/l
66import { ICodeParser , CodeBlock } from "../interfaces"
77import { scannerExtensions } from "../shared/supported-extensions"
88
/** Largest text length, in characters, a syntax node may have before it is broken down or chunked. */
const MAX_BLOCK_CHARS = 1000

/** Smallest text length, in characters, for a node or fallback content to be turned into a block. */
const MIN_BLOCK_CHARS = 100

/** Minimum characters for the *next* chunk after a split. */
const MIN_CHUNK_REMAINDER_CHARS = 200
1212
1313/**
1414 * Implementation of the code parser interface
@@ -28,15 +28,10 @@ export class CodeParser implements ICodeParser {
2828 async parseFile (
2929 filePath : string ,
3030 options ?: {
31- minBlockLines ?: number
32- maxBlockLines ?: number
3331 content ?: string
3432 fileHash ?: string
3533 } ,
3634 ) : Promise < CodeBlock [ ] > {
37- const minBlockLines = options ?. minBlockLines ?? MIN_BLOCK_LINES
38- const maxBlockLines = options ?. maxBlockLines ?? MAX_BLOCK_LINES
39-
4035 // Get file extension
4136 const ext = path . extname ( filePath ) . toLowerCase ( )
4237
@@ -63,7 +58,7 @@ export class CodeParser implements ICodeParser {
6358 }
6459
6560 // Parse the file
66- return this . parseContent ( filePath , content , fileHash , minBlockLines , maxBlockLines )
61+ return this . parseContent ( filePath , content , fileHash )
6762 }
6863
6964 /**
@@ -89,17 +84,9 @@ export class CodeParser implements ICodeParser {
8984 * @param filePath Path to the file
9085 * @param content File content
9186 * @param fileHash File hash
92- * @param minBlockLines Minimum number of lines for a block
93- * @param maxBlockLines Maximum number of lines for a block
9487 * @returns Array of code blocks
9588 */
96- private async parseContent (
97- filePath : string ,
98- content : string ,
99- fileHash : string ,
100- minBlockLines : number ,
101- maxBlockLines : number ,
102- ) : Promise < CodeBlock [ ] > {
89+ private async parseContent ( filePath : string , content : string , fileHash : string ) : Promise < CodeBlock [ ] > {
10390 const ext = path . extname ( filePath ) . slice ( 1 ) . toLowerCase ( )
10491
10592 // Check if we already have the parser loaded
@@ -140,29 +127,46 @@ export class CodeParser implements ICodeParser {
140127 // We don't need to get the query string from languageQueries since it's already loaded
141128 // in the language object
142129 const captures = language . query . captures ( tree . rootNode )
130+ // Check if captures are empty
131+ if ( captures . length === 0 ) {
132+ if ( content . length >= MIN_BLOCK_CHARS ) {
133+ // Perform fallback chunking if content is large enough
134+ return this . _performFallbackChunking ( filePath , content , fileHash , MIN_BLOCK_CHARS , MAX_BLOCK_CHARS )
135+ } else {
136+ // Return empty if content is too small for fallback
137+ return [ ]
138+ }
139+ }
140+
143141 const results : CodeBlock [ ] = [ ]
144142
145- // Process captures
143+ // Process captures if not empty
146144 const queue : treeSitter . SyntaxNode [ ] = captures . map ( ( capture : any ) => capture . node )
147145
148146 while ( queue . length > 0 ) {
149147 const currentNode = queue . shift ( ) !
150- const lineSpan = currentNode . endPosition . row - currentNode . startPosition . row + 1
148+ // const lineSpan = currentNode.endPosition.row - currentNode.startPosition.row + 1 // Removed as per lint error
151149
152- if ( lineSpan >= minBlockLines && lineSpan <= maxBlockLines ) {
150+ // Check if the node meets the minimum character requirement
151+ if ( currentNode . text . length >= MIN_BLOCK_CHARS ) {
152+ // If it also exceeds the maximum character limit, try to break it down
153153 if ( currentNode . text . length > MAX_BLOCK_CHARS ) {
154154 if ( currentNode . children . length > 0 ) {
155+ // If it has children, process them instead
155156 queue . push ( ...currentNode . children )
156157 } else {
158+ // If it's a leaf node, chunk it (passing MIN_BLOCK_CHARS as per Task 1 Step 5)
159+ // Note: _chunkLeafNodeByLines logic might need further adjustment later
157160 const chunkedBlocks = this . _chunkLeafNodeByLines (
158161 currentNode ,
159162 filePath ,
160163 fileHash ,
161- MAX_BLOCK_CHARS ,
164+ MIN_BLOCK_CHARS , // Pass minChars as requested
162165 )
163166 results . push ( ...chunkedBlocks )
164167 }
165168 } else {
169+ // Node meets min chars and is within max chars, create a block
166170 const identifier =
167171 currentNode . childForFieldName ( "name" ) ?. text ||
168172 currentNode . children . find ( ( c ) => c . type === "identifier" ) ?. text ||
@@ -186,78 +190,164 @@ export class CodeParser implements ICodeParser {
186190 fileHash,
187191 } )
188192 }
189- } else if ( lineSpan > maxBlockLines ) {
190- queue . push ( ...currentNode . children )
191193 }
194+ // Nodes smaller than MIN_BLOCK_CHARS are ignored
192195 }
193196
194197 return results
195198 }
196199
/**
 * Splits `lines` into line-aligned chunks of at most `maxChars` characters and
 * returns one block per chunk that reaches `minChars`.
 *
 * Sizes are measured as each line's length plus one for its newline; the final
 * entry of `lines` has no trailing newline and is counted without it.
 *
 * When a split would leave fewer than `minRemainderChars` characters for the
 * rest of the input, the split point is moved to an earlier line so the tail
 * is not a tiny fragment. The content of every emitted block is exactly the
 * lines from `start_line` to `end_line`: lines moved out of a chunk by
 * re-balancing become the beginning of the following chunk.
 *
 * Caveats:
 * - A chunk shorter than `minChars` is dropped, not merged into a neighbour.
 * - A single line longer than `maxChars` is kept whole, so a chunk can exceed
 *   `maxChars` in that case.
 *
 * @param lines Text already split on "\n"
 * @param filePath Path recorded on each block and mixed into its segment hash
 * @param fileHash Hash of the whole file, copied onto each block
 * @param baseStartLine 1-based line number of `lines[0]` in the file
 * @param chunkType Value stored in each block's `type`
 * @param minChars Minimum size for a chunk to be emitted
 * @param maxChars Size at which a chunk is closed before adding another line
 * @param minRemainderChars Minimum size wanted for the text left after a split
 * @returns The emitted blocks, in source order
 */
private _chunkTextByLines(
    lines: string[],
    filePath: string,
    fileHash: string,
    baseStartLine: number, // 1-based start line of the *first* line in the `lines` array
    chunkType: string,
    minChars: number,
    maxChars: number,
    minRemainderChars: number,
): CodeBlock[] {
    const chunks: CodeBlock[] = []
    const lastLineIndex = lines.length - 1

    // sizeBefore[i] is the total size of lines[0..i-1]; every line except the
    // final one contributes an extra character for its newline.
    const sizeBefore: number[] = [0]
    lines.forEach((line, index) => {
        sizeBefore.push(sizeBefore[index] + line.length + (index < lastLineIndex ? 1 : 0))
    })
    // Size of lines[from..to] inclusive; 0 when the range is empty (to === from - 1).
    const sizeOf = (from: number, to: number): number => sizeBefore[to + 1] - sizeBefore[from]

    let chunkStartLineIndex = 0 // 0-based index within `lines` of the open chunk's first line

    // Emits lines[chunkStartLineIndex..endLineIndex] if large enough, then opens
    // the next chunk right after endLineIndex. Content is sliced from `lines`
    // so it always agrees with the reported start/end lines.
    const finalizeChunk = (endLineIndex: number): void => {
        if (endLineIndex >= chunkStartLineIndex && sizeOf(chunkStartLineIndex, endLineIndex) >= minChars) {
            const chunkContent = lines.slice(chunkStartLineIndex, endLineIndex + 1).join("\n")
            const startLine = baseStartLine + chunkStartLineIndex
            const endLine = baseStartLine + endLineIndex
            const segmentHash = createHash("sha256")
                .update(`${filePath}-${startLine}-${endLine}-${chunkContent}`)
                .digest("hex")

            chunks.push({
                file_path: filePath,
                identifier: null, // Identifier is handled at a higher level if available
                type: chunkType,
                start_line: startLine,
                end_line: endLine,
                content: chunkContent,
                segmentHash,
                fileHash,
            })
        }
        chunkStartLineIndex = endLineIndex + 1
    }

    let i = 0
    while (i < lines.length) {
        // The open chunk always consists of lines[chunkStartLineIndex..i-1].
        const currentChunkLength = sizeOf(chunkStartLineIndex, i - 1)
        const lineLength = sizeOf(i, i)

        // The line joins the open chunk when the chunk is empty or the line still fits.
        if (currentChunkLength === 0 || currentChunkLength + lineLength <= maxChars) {
            i++
            continue
        }

        // --- Re-balancing: default split is *before* the current line ---
        let splitIndex = i - 1
        const remainderLength = sizeOf(i, lastLineIndex)

        if (
            currentChunkLength >= minChars &&
            remainderLength < minRemainderChars &&
            i - chunkStartLineIndex > 1
        ) {
            // Walk backwards for the latest split that leaves enough text on both sides.
            for (let k = i - 2; k >= chunkStartLineIndex; k--) {
                const potentialChunkLength = sizeOf(chunkStartLineIndex, k)
                // The tail estimate counts one extra character, matching the
                // original approximate `join("\n").length + 1` threshold.
                const potentialNextChunkLength = sizeOf(k + 1, lastLineIndex) + 1

                if (potentialChunkLength >= minChars && potentialNextChunkLength >= minRemainderChars) {
                    splitIndex = k
                    break
                }
            }
            // If no better split was found, splitIndex remains i - 1.
        }

        finalizeChunk(splitIndex)
        // `i` is intentionally not advanced: line i is re-checked against the new
        // open chunk, which now starts at splitIndex + 1 and may already hold the
        // lines that re-balancing moved out of the finalized chunk. Every split
        // moves chunkStartLineIndex forward, so the loop terminates.
    }

    // Process the last remaining chunk
    if (chunkStartLineIndex < lines.length) {
        finalizeChunk(lastLineIndex)
    }

    return chunks
}
311+
/**
 * Chunks raw file content line by line, without any syntax information.
 *
 * The content is split on "\n" and handed to `_chunkTextByLines` with line
 * numbering starting at 1, the block type "fallback_chunk", and the
 * module-level `MIN_CHUNK_REMAINDER_CHARS` as the minimum remainder size.
 *
 * @param filePath Path to the file
 * @param content File content
 * @param fileHash File hash
 * @param minChars Minimum size for a chunk to be emitted
 * @param maxChars Maximum size of a chunk
 * @returns Array of code blocks
 */
private _performFallbackChunking(
    filePath: string,
    content: string,
    fileHash: string,
    minChars: number,
    maxChars: number,
): CodeBlock[] {
    const firstLineNumber = 1 // the whole file is chunked, so numbering starts at line 1
    const fallbackType = "fallback_chunk"

    return this._chunkTextByLines(
        content.split("\n"),
        filePath,
        fileHash,
        firstLineNumber,
        fallbackType,
        minChars,
        maxChars,
        MIN_CHUNK_REMAINDER_CHARS,
    )
}
331+
/**
 * Chunks the text of a single syntax node line by line.
 *
 * The node's text is split on "\n" and handed to `_chunkTextByLines`, using
 * the node's type as the block type and the module-level `MAX_BLOCK_CHARS`
 * and `MIN_CHUNK_REMAINDER_CHARS` as the upper and remainder limits.
 *
 * @param node Syntax node whose text is chunked
 * @param filePath Path to the file
 * @param fileHash File hash
 * @param minChars Minimum size for a chunk to be emitted
 * @returns Array of code blocks
 */
private _chunkLeafNodeByLines(
    node: treeSitter.SyntaxNode,
    filePath: string,
    fileHash: string,
    minChars: number,
): CodeBlock[] {
    const textLines = node.text.split("\n")
    // `row + 1` gives the 1-based line number that `_chunkTextByLines` expects for the first line.
    const firstLineNumber = node.startPosition.row + 1
    const blockType = node.type

    return this._chunkTextByLines(
        textLines,
        filePath,
        fileHash,
        firstLineNumber,
        blockType,
        minChars,
        MAX_BLOCK_CHARS,
        MIN_CHUNK_REMAINDER_CHARS,
    )
}
261351}
262352
263353// Export a singleton instance for convenience
0 commit comments