Skip to content

Commit ee8ec12

Browse files
committed
feat: implement fallback parsing for supported files
1 parent ba6483b commit ee8ec12

File tree

1 file changed

+154
-64
lines changed

1 file changed

+154
-64
lines changed

src/services/code-index/processors/parser.ts

Lines changed: 154 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,9 @@ import { LanguageParser, loadRequiredLanguageParsers } from "../../tree-sitter/l
66
import { ICodeParser, CodeBlock } from "../interfaces"
77
import { scannerExtensions } from "../shared/supported-extensions"
88

9-
const MIN_BLOCK_LINES = 3
10-
const MAX_BLOCK_LINES = 100
11-
const MAX_BLOCK_CHARS = 20000
9+
const MAX_BLOCK_CHARS = 1000
10+
const MIN_BLOCK_CHARS = 100
11+
const MIN_CHUNK_REMAINDER_CHARS = 200 // Minimum characters for the *next* chunk after a split
1212

1313
/**
1414
* Implementation of the code parser interface
@@ -28,15 +28,10 @@ export class CodeParser implements ICodeParser {
2828
async parseFile(
2929
filePath: string,
3030
options?: {
31-
minBlockLines?: number
32-
maxBlockLines?: number
3331
content?: string
3432
fileHash?: string
3533
},
3634
): Promise<CodeBlock[]> {
37-
const minBlockLines = options?.minBlockLines ?? MIN_BLOCK_LINES
38-
const maxBlockLines = options?.maxBlockLines ?? MAX_BLOCK_LINES
39-
4035
// Get file extension
4136
const ext = path.extname(filePath).toLowerCase()
4237

@@ -63,7 +58,7 @@ export class CodeParser implements ICodeParser {
6358
}
6459

6560
// Parse the file
66-
return this.parseContent(filePath, content, fileHash, minBlockLines, maxBlockLines)
61+
return this.parseContent(filePath, content, fileHash)
6762
}
6863

6964
/**
@@ -89,17 +84,9 @@ export class CodeParser implements ICodeParser {
8984
* @param filePath Path to the file
9085
* @param content File content
9186
* @param fileHash File hash
92-
* @param minBlockLines Minimum number of lines for a block
93-
* @param maxBlockLines Maximum number of lines for a block
9487
* @returns Array of code blocks
9588
*/
96-
private async parseContent(
97-
filePath: string,
98-
content: string,
99-
fileHash: string,
100-
minBlockLines: number,
101-
maxBlockLines: number,
102-
): Promise<CodeBlock[]> {
89+
private async parseContent(filePath: string, content: string, fileHash: string): Promise<CodeBlock[]> {
10390
const ext = path.extname(filePath).slice(1).toLowerCase()
10491

10592
// Check if we already have the parser loaded
@@ -140,29 +127,46 @@ export class CodeParser implements ICodeParser {
140127
// We don't need to get the query string from languageQueries since it's already loaded
141128
// in the language object
142129
const captures = language.query.captures(tree.rootNode)
130+
// Check if captures are empty
131+
if (captures.length === 0) {
132+
if (content.length >= MIN_BLOCK_CHARS) {
133+
// Perform fallback chunking if content is large enough
134+
return this._performFallbackChunking(filePath, content, fileHash, MIN_BLOCK_CHARS, MAX_BLOCK_CHARS)
135+
} else {
136+
// Return empty if content is too small for fallback
137+
return []
138+
}
139+
}
140+
143141
const results: CodeBlock[] = []
144142

145-
// Process captures
143+
// Process captures if not empty
146144
const queue: treeSitter.SyntaxNode[] = captures.map((capture: any) => capture.node)
147145

148146
while (queue.length > 0) {
149147
const currentNode = queue.shift()!
150-
const lineSpan = currentNode.endPosition.row - currentNode.startPosition.row + 1
148+
// const lineSpan = currentNode.endPosition.row - currentNode.startPosition.row + 1 // Removed as per lint error
151149

152-
if (lineSpan >= minBlockLines && lineSpan <= maxBlockLines) {
150+
// Check if the node meets the minimum character requirement
151+
if (currentNode.text.length >= MIN_BLOCK_CHARS) {
152+
// If it also exceeds the maximum character limit, try to break it down
153153
if (currentNode.text.length > MAX_BLOCK_CHARS) {
154154
if (currentNode.children.length > 0) {
155+
// If it has children, process them instead
155156
queue.push(...currentNode.children)
156157
} else {
158+
// If it's a leaf node, chunk it (passing MIN_BLOCK_CHARS as per Task 1 Step 5)
159+
// Note: _chunkLeafNodeByLines logic might need further adjustment later
157160
const chunkedBlocks = this._chunkLeafNodeByLines(
158161
currentNode,
159162
filePath,
160163
fileHash,
161-
MAX_BLOCK_CHARS,
164+
MIN_BLOCK_CHARS, // Pass minChars as requested
162165
)
163166
results.push(...chunkedBlocks)
164167
}
165168
} else {
169+
// Node meets min chars and is within max chars, create a block
166170
const identifier =
167171
currentNode.childForFieldName("name")?.text ||
168172
currentNode.children.find((c) => c.type === "identifier")?.text ||
@@ -186,78 +190,164 @@ export class CodeParser implements ICodeParser {
186190
fileHash,
187191
})
188192
}
189-
} else if (lineSpan > maxBlockLines) {
190-
queue.push(...currentNode.children)
191193
}
194+
// Nodes smaller than MIN_BLOCK_CHARS are ignored
192195
}
193196

194197
return results
195198
}
196199

/**
 * Splits `lines` into consecutive line-aligned chunks and returns them as code blocks.
 *
 * Lengths are measured in characters, counting one newline after every line
 * except the last line of `lines`.
 *
 * Behavior:
 * - A chunk is closed before the line that would push it past `maxChars`.
 * - If closing there would leave fewer than `minRemainderChars` characters for
 *   the rest of the input, the split point is moved backwards to the latest
 *   line at which both the closed chunk (>= `minChars`) and the remaining text
 *   (>= `minRemainderChars`) are large enough. If no such line exists the
 *   original split point is kept.
 * - Chunks shorter than `minChars` are dropped, not emitted.
 * - A single line longer than `maxChars` is never split; it becomes a chunk of
 *   its own that exceeds `maxChars`.
 *
 * Each chunk's `content`, `start_line` and `end_line` are all derived from the
 * same inclusive index range, so they always describe the same lines.
 *
 * @param lines Text to chunk, already split on "\n"
 * @param filePath Path of the file the text came from (stored on each block and mixed into its hash)
 * @param fileHash Hash of the whole file (stored on each block)
 * @param baseStartLine 1-based file line number of `lines[0]`
 * @param chunkType Value written to each block's `type`
 * @param minChars Minimum length a chunk must have to be emitted
 * @param maxChars Length limit that triggers a split
 * @param minRemainderChars Minimum length that should be left for the text after a split
 * @returns The emitted chunks, in source order
 */
private _chunkTextByLines(
	lines: string[],
	filePath: string,
	fileHash: string,
	baseStartLine: number, // 1-based start line of the *first* line in the `lines` array
	chunkType: string,
	minChars: number,
	maxChars: number,
	minRemainderChars: number,
): CodeBlock[] {
	const chunks: CodeBlock[] = []
	const lastIndex = lines.length - 1

	// prefixLengths[i] is the total length of lines[0..i-1], where every line
	// except the very last one is counted together with its trailing newline.
	const prefixLengths: number[] = [0]
	for (let i = 0; i < lines.length; i++) {
		prefixLengths.push(prefixLengths[i] + lines[i].length + (i < lastIndex ? 1 : 0))
	}

	// Length of lines[from..to] (both inclusive) in O(1).
	const spanLength = (from: number, to: number): number => prefixLengths[to + 1] - prefixLengths[from]

	// 0-based index (within `lines`) of the first line of the chunk being built.
	let chunkStartLineIndex = 0

	// Emits lines[chunkStartLineIndex..endLineIndex] if it is long enough, then
	// starts the next chunk right after endLineIndex.
	const finalizeChunk = (endLineIndex: number): void => {
		if (endLineIndex >= chunkStartLineIndex && spanLength(chunkStartLineIndex, endLineIndex) >= minChars) {
			const chunkContent = lines.slice(chunkStartLineIndex, endLineIndex + 1).join("\n")
			const startLine = baseStartLine + chunkStartLineIndex
			const endLine = baseStartLine + endLineIndex
			const segmentHash = createHash("sha256")
				.update(`${filePath}-${startLine}-${endLine}-${chunkContent}`)
				.digest("hex")

			chunks.push({
				file_path: filePath,
				identifier: null, // Identifier is handled at a higher level if available
				type: chunkType,
				start_line: startLine,
				end_line: endLine,
				content: chunkContent,
				segmentHash,
				fileHash,
			})
		}
		chunkStartLineIndex = endLineIndex + 1
	}

	for (let i = 0; i < lines.length; i++) {
		const lineLength = spanLength(i, i)

		// Split while the pending chunk (lines before i) is non-empty and cannot
		// take line i. This is a loop because a re-balanced split leaves lines
		// in the pending chunk, which must be checked against the limit again.
		// It terminates because every pass moves chunkStartLineIndex forward.
		while (i > chunkStartLineIndex && spanLength(chunkStartLineIndex, i - 1) + lineLength > maxChars) {
			const currentChunkLength = spanLength(chunkStartLineIndex, i - 1)
			const remainderLength = spanLength(i, lastIndex)
			let splitIndex = i - 1 // Default split is *before* the current line

			// --- Re-balancing: avoid leaving a tiny tail after this chunk ---
			if (
				currentChunkLength >= minChars &&
				remainderLength < minRemainderChars &&
				i - chunkStartLineIndex > 1
			) {
				for (let k = i - 2; k >= chunkStartLineIndex; k--) {
					const headLength = spanLength(chunkStartLineIndex, k)
					// +1 keeps the same approximate tail length the size check has always used
					const tailLength = spanLength(k + 1, lastIndex) + 1

					if (headLength >= minChars && tailLength >= minRemainderChars) {
						splitIndex = k
						break
					}
				}
			}

			// Lines after splitIndex (if any) stay in the pending chunk because
			// the chunk is defined purely by [chunkStartLineIndex, i].
			finalizeChunk(splitIndex)
		}
	}

	// Emit whatever is left after the last split.
	if (chunkStartLineIndex < lines.length) {
		finalizeChunk(lastIndex)
	}

	return chunks
}
311+
/**
 * Line-based chunking of raw text.
 *
 * Splits `content` on "\n" and delegates to `_chunkTextByLines`, numbering the
 * first line as line 1 and tagging every resulting block with the type
 * "fallback_chunk".
 *
 * @param filePath Path of the file the content belongs to
 * @param content Text to chunk
 * @param fileHash Hash of the file
 * @param minChars Minimum chunk size, forwarded unchanged
 * @param maxChars Maximum chunk size, forwarded unchanged
 * @returns The blocks produced by `_chunkTextByLines`
 */
private _performFallbackChunking(
	filePath: string,
	content: string,
	fileHash: string,
	minChars: number,
	maxChars: number,
): CodeBlock[] {
	// The text is numbered from its own first line.
	const firstLineNumber = 1
	const fallbackType = "fallback_chunk"

	return this._chunkTextByLines(
		content.split("\n"),
		filePath,
		fileHash,
		firstLineNumber,
		fallbackType,
		minChars,
		maxChars,
		MIN_CHUNK_REMAINDER_CHARS,
	)
}
331+
/**
 * Line-based chunking of a single syntax node's text.
 *
 * Splits the node's text on "\n" and delegates to `_chunkTextByLines`. Line
 * numbers are offset by the node's start row (converted to 1-based), blocks
 * are tagged with the node's own type, and the module-level `MAX_BLOCK_CHARS`
 * and `MIN_CHUNK_REMAINDER_CHARS` are used as the size limits.
 *
 * @param node Syntax node whose text should be chunked
 * @param filePath Path of the file the node belongs to
 * @param fileHash Hash of the file
 * @param minChars Minimum chunk size (a lower bound, not an upper bound)
 * @returns The blocks produced by `_chunkTextByLines`
 */
private _chunkLeafNodeByLines(
	node: treeSitter.SyntaxNode,
	filePath: string,
	fileHash: string,
	minChars: number,
): CodeBlock[] {
	const { text, startPosition, type } = node

	return this._chunkTextByLines(
		text.split("\n"),
		filePath,
		fileHash,
		startPosition.row + 1, // rows are 0-based, block line numbers are 1-based
		type,
		minChars,
		MAX_BLOCK_CHARS,
		MIN_CHUNK_REMAINDER_CHARS,
	)
}
261351
}
262352

263353
// Export a singleton instance for convenience

0 commit comments

Comments
 (0)