feat(phase-3): Task 3.3.5 - implement smart chunking logic

RepairYourTech · RepairYourTech · commit 84e9e871b50f · 2025-12-05T01:32:39.000-05:00
Sub-task 3.3.5: Implement Smart Chunking Logic - Modified parseContent() to use semantic boundary detection - Implemented smart chunking rules for functions and classes - Integrated comment inclusion and import metadata extraction - Applied new size limits (SEMANTIC_MAX_CHARS, ABSOLUTE_MAX_CHARS) Implementation: 1. Extract File-Level Imports: - Call extractFileImports() once per file - Include in all chunks from that file (Rule 4) 2. Semantic Boundary Detection: - Check if node is semantic unit (function/class) - Apply different size rules based on node type 3. Smart Chunking Rules: - Semantic units ≤3000 chars: Keep entire unit (Rule 1 & 2) - Semantic units ≤5000 chars: Still keep together - Semantic units >5000 chars: Fall back to children (TODO: splitAtLogicalBoundaries) - Non-semantic units: Apply standard size limits (1500 chars with tolerance) 4. Comment Inclusion (Rule 3): - Call includeComments() for every chunk - Adjusts start line to include preceding comments - Uses content with comments for chunk 5. Import Metadata (Rule 4): - Include fileImports in every CodeBlock - Provides type and dependency context Key Changes: - Functions up to 3000 chars stay intact (vs 1150 chars before) - Classes up to 3000 chars stay together - All chunks include preceding comments (JSDoc, etc.) - All chunks have import metadata - Semantic completeness prioritized over strict size limits Benefits: - Addresses Issue RooCodeInc#1: Large functions no longer split mid-way - Addresses Issue RooCodeInc#2: Comments always included with code - Addresses Issue RooCodeInc#3: Import context preserved in metadata - Addresses Issue RooCodeInc#5: Decorators stay with targets (tree-sitter includes them) Limitations: - Very large functions (>5000 chars) still fall back to children - TODO: Implement splitAtLogicalBoundaries() for intelligent splitting - TODO: Implement chunkClass() for large classes (>3000 chars) File modified: src/services/code-index/processors/parser.ts (+73 lines, -27 lines) Sub-task 3.3.5 complete! Task 3.3 complete! Next: Task 3.4 - Validate Chunking Improvements
diff --git a/src/services/code-index/processors/parser.ts b/src/services/code-index/processors/parser.ts
@@ -178,6 +178,9 @@ export class CodeParser implements ICodeParser {
 
 		const results: CodeBlock[] = []
 
+		// Phase 3: Extract file-level imports once
+		const fileImports = this.extractFileImports(tree)
+
 		// Process captures if not empty
 		const queue: Node[] = Array.from(captures).map((capture) => capture.node)
 
@@ -187,34 +190,75 @@ export class CodeParser implements ICodeParser {
 
 			// Check if the node meets the minimum character requirement
 			if (currentNode.text.length >= MIN_BLOCK_CHARS) {
-				// If it also exceeds the maximum character limit, try to break it down
-				if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
-					if (currentNode.children.filter((child) => child !== null).length > 0) {
-						// If it has children, process them instead
-						queue.push(...currentNode.children.filter((child) => child !== null))
+				// Phase 3: Smart chunking based on semantic boundaries
+				const isSemanticUnit = this.isSemanticUnit(currentNode)
+
+				// If it's a semantic unit (function/class), apply special rules
+				if (isSemanticUnit) {
+					// Rule 1 & 2: Never split functions/methods, keep classes together when possible
+					if (currentNode.text.length <= SEMANTIC_MAX_CHARS) {
+						// Keep entire semantic unit (even if >MAX_BLOCK_CHARS)
+						// This is the key change: we allow larger chunks for semantic completeness
+						// Will be handled in the "create a block" section below
+					} else if (currentNode.text.length <= ABSOLUTE_MAX_CHARS) {
+						// Between SEMANTIC_MAX and ABSOLUTE_MAX: still keep together
+						// Will be handled in the "create a block" section below
 					} else {
-						// If it's a leaf node, chunk it
-						const chunkedBlocks = this._chunkLeafNodeByLines(
-							currentNode,
-							filePath,
-							fileHash,
-							seenSegmentHashes,
-						)
-						results.push(...chunkedBlocks)
+						// >ABSOLUTE_MAX_CHARS: Need to split, but intelligently
+						// For now, fall back to processing children
+						// TODO: Implement splitAtLogicalBoundaries() for very large functions
+						if (currentNode.children.filter((child) => child !== null).length > 0) {
+							queue.push(...currentNode.children.filter((child) => child !== null))
+						} else {
+							const chunkedBlocks = this._chunkLeafNodeByLines(
+								currentNode,
+								filePath,
+								fileHash,
+								seenSegmentHashes,
+							)
+							results.push(...chunkedBlocks)
+						}
+						continue // Skip the "create a block" section
 					}
 				} else {
-					// Node meets min chars and is within max chars, create a block
+					// Not a semantic unit: apply standard size limits
+					if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
+						if (currentNode.children.filter((child) => child !== null).length > 0) {
+							// If it has children, process them instead
+							queue.push(...currentNode.children.filter((child) => child !== null))
+						} else {
+							// If it's a leaf node, chunk it
+							const chunkedBlocks = this._chunkLeafNodeByLines(
+								currentNode,
+								filePath,
+								fileHash,
+								seenSegmentHashes,
+							)
+							results.push(...chunkedBlocks)
+						}
+						continue // Skip the "create a block" section
+					}
+				}
+
+				// Create a block (for nodes that passed the size checks above)
+				{
+					// Phase 3: Include comments with code (Rule 3)
+					const { content: contentWithComments, startLine: adjustedStartLine } = this.includeComments(
+						currentNode,
+						content,
+					)
+
 					const identifier =
 						currentNode.childForFieldName("name")?.text ||
 						currentNode.children.find((c) => c?.type === "identifier")?.text ||
 						null
 					const type = currentNode.type
-					const start_line = currentNode.startPosition.row + 1
+					const start_line = adjustedStartLine // Use adjusted start line (includes comments)
 					const end_line = currentNode.endPosition.row + 1
-					const content = currentNode.text
-					const contentPreview = content.slice(0, 100)
+					const contentToUse = contentWithComments // Use content with comments
+					const contentPreview = contentToUse.slice(0, 100)
 					const segmentHash = createHash("sha256")
-						.update(`${filePath}-${start_line}-${end_line}-${content.length}-${contentPreview}`)
+						.update(`${filePath}-${start_line}-${end_line}-${contentToUse.length}-${contentPreview}`)
 						.digest("hex")
 
 					if (!seenSegmentHashes.has(segmentHash)) {
@@ -225,7 +269,7 @@ export class CodeParser implements ICodeParser {
 						let documentation = undefined
 						if (ext === "ts" || ext === "tsx" || ext === "js" || ext === "jsx") {
 							try {
-								symbolMetadata = extractSymbolMetadata(currentNode, content) || undefined
+								symbolMetadata = extractSymbolMetadata(currentNode, currentNode.text) || undefined
 								documentation = symbolMetadata?.documentation
 							} catch (error) {
 								// Silently fail metadata extraction - don't break indexing
@@ -239,11 +283,12 @@ export class CodeParser implements ICodeParser {
 							type,
 							start_line,
 							end_line,
-							content,
+							content: contentToUse, // Content with comments
 							segmentHash,
 							fileHash,
 							symbolMetadata,
 							documentation,
+							imports: fileImports.length > 0 ? fileImports : undefined, // Phase 3: Include imports (Rule 4)
 						})
 					}
 				}