@@ -126,75 +126,84 @@ export class CodeParser implements ICodeParser {
126126 // in the language object
127127 const captures = tree ? language . query . captures ( tree . rootNode ) : [ ]
128128
129- // Check if captures are empty
130- if ( captures . length === 0 ) {
131- if ( content . length >= MIN_BLOCK_CHARS ) {
132- // Perform fallback chunking if content is large enough
133- const blocks = this . _performFallbackChunking ( filePath , content , fileHash , seenSegmentHashes )
134- return blocks
135- } else {
136- // Return empty if content is too small for fallback
137- return [ ]
138- }
139- }
140-
141129 const results : CodeBlock [ ] = [ ]
142130
143- // Process captures if not empty
144- const queue : Node [ ] = Array . from ( captures ) . map ( ( capture ) => capture . node )
131+ // Process captures to find function/class definitions
132+ if ( captures . length > 0 ) {
133+ // Group captures by their definition nodes to avoid duplicates
134+ const processedNodes = new Set < Node > ( )
145135
146- while ( queue . length > 0 ) {
147- const currentNode = queue . shift ( ) !
148- // const lineSpan = currentNode.endPosition.row - currentNode.startPosition.row + 1 // Removed as per lint error
136+ for ( const capture of captures ) {
137+ const { node, name } = capture
149138
150- // Check if the node meets the minimum character requirement
151- if ( currentNode . text . length >= MIN_BLOCK_CHARS ) {
152- // If it also exceeds the maximum character limit, try to break it down
153- if ( currentNode . text . length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR ) {
154- if ( currentNode . children . filter ( ( child ) => child !== null ) . length > 0 ) {
155- // If it has children, process them instead
156- queue . push ( ...currentNode . children . filter ( ( child ) => child !== null ) )
157- } else {
158- // If it's a leaf node, chunk it (passing MIN_BLOCK_CHARS as per Task 1 Step 5)
159- // Note: _chunkLeafNodeByLines logic might need further adjustment later
139+ // Find the definition node - this is the node that contains the full function/class
140+ let definitionNode : Node | null = null
141+
142+ if ( name . includes ( "definition." ) ) {
143+ // This capture represents a definition (function, class, method, etc.)
144+ definitionNode = node
145+ } else if ( name . includes ( "name.definition." ) ) {
146+ // This capture represents the name of a definition, get the parent definition
147+ definitionNode = node . parent
148+ }
149+
150+ // Skip if we couldn't find a definition node or already processed it
151+ if ( ! definitionNode || processedNodes . has ( definitionNode ) ) {
152+ continue
153+ }
154+
155+ // Check if the definition meets the minimum character requirement
156+ if ( definitionNode . text . length >= MIN_BLOCK_CHARS ) {
157+ // If it exceeds the maximum character limit, chunk it by lines
158+ if ( definitionNode . text . length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR ) {
160159 const chunkedBlocks = this . _chunkLeafNodeByLines (
161- currentNode ,
160+ definitionNode ,
162161 filePath ,
163162 fileHash ,
164163 seenSegmentHashes ,
165164 )
166165 results . push ( ...chunkedBlocks )
166+ } else {
167+ // Create a block for the entire definition
168+ const identifier =
169+ definitionNode . childForFieldName ( "name" ) ?. text ||
170+ definitionNode . children . find ( ( c ) => c ?. type === "identifier" ) ?. text ||
171+ definitionNode . children . find ( ( c ) => c ?. type === "property_identifier" ) ?. text ||
172+ definitionNode . children . find ( ( c ) => c ?. type === "type_identifier" ) ?. text ||
173+ null
174+
175+ const type = definitionNode . type
176+ const start_line = definitionNode . startPosition . row + 1
177+ const end_line = definitionNode . endPosition . row + 1
178+ const content = definitionNode . text
179+ const segmentHash = createHash ( "sha256" )
180+ . update ( `${ filePath } -${ start_line } -${ end_line } -${ content } ` )
181+ . digest ( "hex" )
182+
183+ if ( ! seenSegmentHashes . has ( segmentHash ) ) {
184+ seenSegmentHashes . add ( segmentHash )
185+ results . push ( {
186+ file_path : filePath ,
187+ identifier,
188+ type,
189+ start_line,
190+ end_line,
191+ content,
192+ segmentHash,
193+ fileHash,
194+ } )
195+ }
167196 }
168- } else {
169- // Node meets min chars and is within max chars, create a block
170- const identifier =
171- currentNode . childForFieldName ( "name" ) ?. text ||
172- currentNode . children . find ( ( c ) => c ?. type === "identifier" ) ?. text ||
173- null
174- const type = currentNode . type
175- const start_line = currentNode . startPosition . row + 1
176- const end_line = currentNode . endPosition . row + 1
177- const content = currentNode . text
178- const segmentHash = createHash ( "sha256" )
179- . update ( `${ filePath } -${ start_line } -${ end_line } -${ content } ` )
180- . digest ( "hex" )
181-
182- if ( ! seenSegmentHashes . has ( segmentHash ) ) {
183- seenSegmentHashes . add ( segmentHash )
184- results . push ( {
185- file_path : filePath ,
186- identifier,
187- type,
188- start_line,
189- end_line,
190- content,
191- segmentHash,
192- fileHash,
193- } )
194- }
197+
198+ processedNodes . add ( definitionNode )
195199 }
196200 }
197- // Nodes smaller than MIN_BLOCK_CHARS are ignored
201+ }
202+
203+ // If no valid definitions were found, fall back to chunking
204+ if ( results . length === 0 && content . length >= MIN_BLOCK_CHARS ) {
205+ const blocks = this . _performFallbackChunking ( filePath , content , fileHash , seenSegmentHashes )
206+ return blocks
198207 }
199208
200209 return results
0 commit comments