Commit 843e2d9

Fix #5247: Correct codebase indexing to properly parse function/class level chunks
- Fixed CodeParser to properly use tree-sitter captures for identifying complete function and class definitions
- Previously, the parser created arbitrary code fragments instead of complete definitions
- Now correctly processes captures to identify definition boundaries and creates chunks for entire functions/classes
- Added logic to avoid duplicate processing of the same definition nodes
- Maintains fallback chunking only when no valid definitions are found
Parent: 3a8ba27
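
For readers skimming the diff below, the heart of the change is how captures from the tree-sitter query are resolved to definition nodes. The following is a minimal sketch of that idea, not the committed code: SyntaxNodeLike and CaptureLike are hypothetical stand-ins for the tree-sitter binding types, and the capture names are assumed to follow the "definition.*" / "name.definition.*" convention of tree-sitter tags queries. The sketch tests the more specific "name.definition." pattern first, since a capture name such as "name.definition.function" also contains the substring "definition.".

// Hypothetical stand-ins for the tree-sitter binding types, for illustration only.
interface SyntaxNodeLike {
	parent: SyntaxNodeLike | null
	text: string
}

interface CaptureLike {
	name: string // e.g. "definition.function" or "name.definition.method"
	node: SyntaxNodeLike
}

// Resolve each capture to the node spanning the whole definition, deduplicated
// so a definition captured more than once (body plus name) is handled once.
function resolveDefinitionNodes(captures: CaptureLike[]): SyntaxNodeLike[] {
	const seen = new Set<SyntaxNodeLike>()
	const definitions: SyntaxNodeLike[] = []

	for (const { name, node } of captures) {
		// "name.definition.*" marks only the identifier, whose parent is the
		// full definition; "definition.*" marks the whole definition node.
		const definition = name.includes("name.definition.")
			? node.parent
			: name.includes("definition.")
				? node
				: null

		if (definition && !seen.has(definition)) {
			seen.add(definition)
			definitions.push(definition)
		}
	}

	return definitions
}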

1 file changed: +65 −56

src/services/code-index/processors/parser.ts

@@ -126,75 +126,84 @@ export class CodeParser implements ICodeParser {
 		// in the language object
 		const captures = tree ? language.query.captures(tree.rootNode) : []
 
-		// Check if captures are empty
-		if (captures.length === 0) {
-			if (content.length >= MIN_BLOCK_CHARS) {
-				// Perform fallback chunking if content is large enough
-				const blocks = this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes)
-				return blocks
-			} else {
-				// Return empty if content is too small for fallback
-				return []
-			}
-		}
-
 		const results: CodeBlock[] = []
 
-		// Process captures if not empty
-		const queue: Node[] = Array.from(captures).map((capture) => capture.node)
+		// Process captures to find function/class definitions
+		if (captures.length > 0) {
+			// Group captures by their definition nodes to avoid duplicates
+			const processedNodes = new Set<Node>()
 
-		while (queue.length > 0) {
-			const currentNode = queue.shift()!
-			// const lineSpan = currentNode.endPosition.row - currentNode.startPosition.row + 1 // Removed as per lint error
+			for (const capture of captures) {
+				const { node, name } = capture
 
-			// Check if the node meets the minimum character requirement
-			if (currentNode.text.length >= MIN_BLOCK_CHARS) {
-				// If it also exceeds the maximum character limit, try to break it down
-				if (currentNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
-					if (currentNode.children.filter((child) => child !== null).length > 0) {
-						// If it has children, process them instead
-						queue.push(...currentNode.children.filter((child) => child !== null))
-					} else {
-						// If it's a leaf node, chunk it (passing MIN_BLOCK_CHARS as per Task 1 Step 5)
-						// Note: _chunkLeafNodeByLines logic might need further adjustment later
+				// Find the definition node - this is the node that contains the full function/class
+				let definitionNode: Node | null = null
+
+				if (name.includes("definition.")) {
+					// This capture represents a definition (function, class, method, etc.)
+					definitionNode = node
+				} else if (name.includes("name.definition.")) {
+					// This capture represents the name of a definition, get the parent definition
+					definitionNode = node.parent
+				}
+
+				// Skip if we couldn't find a definition node or already processed it
+				if (!definitionNode || processedNodes.has(definitionNode)) {
+					continue
+				}
+
+				// Check if the definition meets the minimum character requirement
+				if (definitionNode.text.length >= MIN_BLOCK_CHARS) {
+					// If it exceeds the maximum character limit, chunk it by lines
+					if (definitionNode.text.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
 						const chunkedBlocks = this._chunkLeafNodeByLines(
-							currentNode,
+							definitionNode,
 							filePath,
 							fileHash,
 							seenSegmentHashes,
 						)
 						results.push(...chunkedBlocks)
+					} else {
+						// Create a block for the entire definition
+						const identifier =
+							definitionNode.childForFieldName("name")?.text ||
+							definitionNode.children.find((c) => c?.type === "identifier")?.text ||
+							definitionNode.children.find((c) => c?.type === "property_identifier")?.text ||
+							definitionNode.children.find((c) => c?.type === "type_identifier")?.text ||
+							null
+
+						const type = definitionNode.type
+						const start_line = definitionNode.startPosition.row + 1
+						const end_line = definitionNode.endPosition.row + 1
+						const content = definitionNode.text
+						const segmentHash = createHash("sha256")
+							.update(`${filePath}-${start_line}-${end_line}-${content}`)
+							.digest("hex")
+
+						if (!seenSegmentHashes.has(segmentHash)) {
+							seenSegmentHashes.add(segmentHash)
+							results.push({
+								file_path: filePath,
+								identifier,
+								type,
+								start_line,
+								end_line,
+								content,
+								segmentHash,
+								fileHash,
+							})
+						}
 					}
-				} else {
-					// Node meets min chars and is within max chars, create a block
-					const identifier =
-						currentNode.childForFieldName("name")?.text ||
-						currentNode.children.find((c) => c?.type === "identifier")?.text ||
-						null
-					const type = currentNode.type
-					const start_line = currentNode.startPosition.row + 1
-					const end_line = currentNode.endPosition.row + 1
-					const content = currentNode.text
-					const segmentHash = createHash("sha256")
-						.update(`${filePath}-${start_line}-${end_line}-${content}`)
-						.digest("hex")
-
-					if (!seenSegmentHashes.has(segmentHash)) {
-						seenSegmentHashes.add(segmentHash)
-						results.push({
-							file_path: filePath,
-							identifier,
-							type,
-							start_line,
-							end_line,
-							content,
-							segmentHash,
-							fileHash,
-						})
-					}
+
+					processedNodes.add(definitionNode)
 				}
 			}
-			// Nodes smaller than MIN_BLOCK_CHARS are ignored
+		}
+
+		// If no valid definitions were found, fall back to chunking
+		if (results.length === 0 && content.length >= MIN_BLOCK_CHARS) {
+			const blocks = this._performFallbackChunking(filePath, content, fileHash, seenSegmentHashes)
+			return blocks
 		}
 
 		return results
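
Two details of the new code are worth calling out. First, duplicate suppression keys every chunk on a sha256 of the file path, line span, and content, so the same definition is never indexed twice in a pass. Below is a self-contained sketch of that scheme, assuming Node's built-in crypto module; emitOnce is a hypothetical helper name, not part of the commit.

import { createHash } from "crypto"

const seenSegmentHashes = new Set<string>()

// Returns true the first time a (path, span, content) chunk is seen and
// false on any repeat, mirroring the seenSegmentHashes check in the diff.
function emitOnce(filePath: string, startLine: number, endLine: number, content: string): boolean {
	const segmentHash = createHash("sha256")
		.update(`${filePath}-${startLine}-${endLine}-${content}`)
		.digest("hex")

	if (seenSegmentHashes.has(segmentHash)) {
		return false
	}

	seenSegmentHashes.add(segmentHash)
	return true
}

Second, the fallback condition changed: previously _performFallbackChunking ran only when the query returned no captures at all, whereas now it runs whenever no valid definition blocks were produced, which also covers files whose captured definitions all fall below MIN_BLOCK_CHARS.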
