Commit 5e74d28

feat: enhance markdown processing with consistent chunking logic and segment hashing
1 parent: 4eaf223

File tree

1 file changed: +101 -197 lines


src/services/code-index/processors/parser.ts

Lines changed: 101 additions & 197 deletions
```diff
@@ -181,8 +181,9 @@ export class CodeParser implements ICodeParser {
         const start_line = currentNode.startPosition.row + 1
         const end_line = currentNode.endPosition.row + 1
         const content = currentNode.text
+        const contentPreview = content.slice(0, 100)
         const segmentHash = createHash("sha256")
-            .update(`${filePath}-${start_line}-${end_line}-${content}`)
+            .update(`${filePath}-${start_line}-${end_line}-${content.length}-${contentPreview}`)
             .digest("hex")
 
         if (!seenSegmentHashes.has(segmentHash)) {
```
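All three hashing sites in this commit get the same treatment: the SHA-256 key no longer includes the full segment text, only its length plus a 100-character preview (the two hunks below repeat the change for chunked text and for split long lines). As a standalone sketch, assuming Node's built-in crypto module that the `createHash` calls imply; the helper name here is ours, not the file's:

```ts
import { createHash } from "crypto"

// Sketch of the new segment key: path, line range, content length, and a
// 100-character preview stand in for the full text. Hashing cost no longer
// grows with segment size; two segments collide only if all five parts match.
function segmentHashKey(filePath: string, startLine: number, endLine: number, content: string): string {
    const contentPreview = content.slice(0, 100)
    return createHash("sha256")
        .update(`${filePath}-${startLine}-${endLine}-${content.length}-${contentPreview}`)
        .digest("hex")
}
```

The trade-off: two segments at the same path and line range that have equal length but differ only after their first 100 characters now hash identically and would be deduplicated as one.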
```diff
@@ -229,8 +230,9 @@ export class CodeParser implements ICodeParser {
         const chunkContent = currentChunkLines.join("\n")
         const startLine = baseStartLine + chunkStartLineIndex
         const endLine = baseStartLine + endLineIndex
+        const contentPreview = chunkContent.slice(0, 100)
         const segmentHash = createHash("sha256")
-            .update(`${filePath}-${startLine}-${endLine}-${chunkContent}`)
+            .update(`${filePath}-${startLine}-${endLine}-${chunkContent.length}-${contentPreview}`)
             .digest("hex")
 
         if (!seenSegmentHashes.has(segmentHash)) {
```
```diff
@@ -253,8 +255,11 @@ export class CodeParser implements ICodeParser {
         }
 
         const createSegmentBlock = (segment: string, originalLineNumber: number, startCharIndex: number) => {
+            const segmentPreview = segment.slice(0, 100)
             const segmentHash = createHash("sha256")
-                .update(`${filePath}-${originalLineNumber}-${originalLineNumber}-${startCharIndex}-${segment}`)
+                .update(
+                    `${filePath}-${originalLineNumber}-${originalLineNumber}-${startCharIndex}-${segment.length}-${segmentPreview}`,
+                )
                 .digest("hex")
 
             if (!seenSegmentHashes.has(segmentHash)) {
```
```diff
@@ -379,6 +384,67 @@ export class CodeParser implements ICodeParser {
         )
     }
 
+    /**
+     * Helper method to process markdown content sections with consistent chunking logic
+     */
+    private processMarkdownSection(
+        lines: string[],
+        filePath: string,
+        fileHash: string,
+        type: string,
+        seenSegmentHashes: Set<string>,
+        startLine: number,
+        identifier: string | null = null,
+    ): CodeBlock[] {
+        const content = lines.join("\n")
+
+        if (content.trim().length < MIN_BLOCK_CHARS) {
+            return []
+        }
+
+        // Check if content needs chunking (either total size or individual line size)
+        const needsChunking =
+            content.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR ||
+            lines.some((line) => line.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR)
+
+        if (needsChunking) {
+            // Apply chunking for large content or oversized lines
+            const chunks = this._chunkTextByLines(lines, filePath, fileHash, type, seenSegmentHashes, startLine)
+            // Preserve identifier in all chunks if provided
+            if (identifier) {
+                chunks.forEach((chunk) => {
+                    chunk.identifier = identifier
+                })
+            }
+            return chunks
+        }
+
+        // Create a single block for normal-sized content with no oversized lines
+        const endLine = startLine + lines.length - 1
+        const contentPreview = content.slice(0, 100)
+        const segmentHash = createHash("sha256")
+            .update(`${filePath}-${startLine}-${endLine}-${content.length}-${contentPreview}`)
+            .digest("hex")
+
+        if (!seenSegmentHashes.has(segmentHash)) {
+            seenSegmentHashes.add(segmentHash)
+            return [
+                {
+                    file_path: filePath,
+                    identifier,
+                    type,
+                    start_line: startLine,
+                    end_line: endLine,
+                    content,
+                    segmentHash,
+                    fileHash,
+                },
+            ]
+        }
+
+        return []
+    }
+
     private parseMarkdownContent(
         filePath: string,
         content: string,
```
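The helper's control flow is a three-way decision: drop sections below the minimum size, chunk sections whose total size or any single line exceeds the tolerance-adjusted maximum, and otherwise emit a single block. A minimal sketch of that predicate in isolation, with placeholder constant values (the real MIN_BLOCK_CHARS, MAX_BLOCK_CHARS, and MAX_CHARS_TOLERANCE_FACTOR are defined elsewhere in the repository and may differ):

```ts
// Placeholder values, assumed for illustration only.
const MIN_BLOCK_CHARS = 50
const MAX_BLOCK_CHARS = 1000
const MAX_CHARS_TOLERANCE_FACTOR = 1.15

type SectionFate = "skip" | "chunk" | "single-block"

// Mirrors processMarkdownSection's decision: too small -> skip; too large in
// total, or any one line too long -> chunk; otherwise a single block.
function classifySection(lines: string[]): SectionFate {
    const content = lines.join("\n")
    if (content.trim().length < MIN_BLOCK_CHARS) return "skip"
    const limit = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
    const tooLarge = content.length > limit || lines.some((line) => line.length > limit)
    return tooLarge ? "chunk" : "single-block"
}

console.log(classifySection(["# tiny"])) // "skip"
console.log(classifySection(["x".repeat(500)])) // "single-block"
console.log(classifySection(["y".repeat(2000)])) // "chunk"
```

Folding the per-line oversize check into the same predicate is what lets this one method replace the nested hasOversizedLine branches at every call site below.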
```diff
@@ -389,53 +455,8 @@ export class CodeParser implements ICodeParser {
         const markdownCaptures = parseMarkdown(content) || []
 
         if (markdownCaptures.length === 0) {
-            // No headers found, check if content needs chunking
-            if (content.length >= MIN_BLOCK_CHARS) {
-                // Check if content exceeds maximum size and needs chunking
-                if (content.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
-                    // Apply chunking for large header-less markdown files
-                    return this._chunkTextByLines(lines, filePath, fileHash, "markdown_content", seenSegmentHashes, 1)
-                } else {
-                    // Check if any individual line is oversized before creating a single block
-                    const hasOversizedLine = lines.some(
-                        (line) => line.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR,
-                    )
-
-                    if (hasOversizedLine) {
-                        // Apply chunking if there's an oversized line
-                        return this._chunkTextByLines(
-                            lines,
-                            filePath,
-                            fileHash,
-                            "markdown_content",
-                            seenSegmentHashes,
-                            1,
-                        )
-                    } else {
-                        // Create a single block for normal-sized content with no oversized lines
-                        const segmentHash = createHash("sha256")
-                            .update(`${filePath}-1-${lines.length}-${content}`)
-                            .digest("hex")
-
-                        if (!seenSegmentHashes.has(segmentHash)) {
-                            seenSegmentHashes.add(segmentHash)
-                            return [
-                                {
-                                    file_path: filePath,
-                                    identifier: null,
-                                    type: "markdown_content",
-                                    start_line: 1,
-                                    end_line: lines.length,
-                                    content: content,
-                                    segmentHash,
-                                    fileHash,
-                                },
-                            ]
-                        }
-                    }
-                }
-            }
-            return []
+            // No headers found, process entire content
+            return this.processMarkdownSection(lines, filePath, fileHash, "markdown_content", seenSegmentHashes, 1)
         }
 
         const results: CodeBlock[] = []
```
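For orientation, the object literals in this diff imply roughly the following block shape. This is a reconstruction from the fields visible above, not the repository's authoritative CodeBlock definition, which may carry more fields:

```ts
interface CodeBlock {
    file_path: string
    identifier: string | null // header text for markdown_header_h* blocks, else null
    type: string // e.g. "markdown_content" or "markdown_header_h2"
    start_line: number // 1-based, inclusive
    end_line: number // 1-based, inclusive
    content: string
    segmentHash: string // SHA-256 hex digest of the segment key
    fileHash: string
}
```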
```diff
@@ -446,179 +467,62 @@ export class CodeParser implements ICodeParser {
             const firstHeaderLine = markdownCaptures[0].node.startPosition.row
             if (firstHeaderLine > 0) {
                 const preHeaderLines = lines.slice(0, firstHeaderLine)
-                const preHeaderContent = preHeaderLines.join("\n")
-                if (preHeaderContent.trim().length >= MIN_BLOCK_CHARS) {
-                    // Check if content exceeds maximum size and needs chunking
-                    if (preHeaderContent.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
-                        // Apply chunking for large pre-header content
-                        const chunks = this._chunkTextByLines(
-                            preHeaderLines,
-                            filePath,
-                            fileHash,
-                            "markdown_content",
-                            seenSegmentHashes,
-                            1,
-                        )
-                        results.push(...chunks)
-                    } else {
-                        // Check if any individual line is oversized before creating a single block
-                        const hasOversizedLine = preHeaderLines.some(
-                            (line) => line.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR,
-                        )
-
-                        if (hasOversizedLine) {
-                            // Apply chunking if there's an oversized line
-                            const chunks = this._chunkTextByLines(
-                                preHeaderLines,
-                                filePath,
-                                fileHash,
-                                "markdown_content",
-                                seenSegmentHashes,
-                                1,
-                            )
-                            results.push(...chunks)
-                        } else {
-                            // Create a single block for normal-sized pre-header content with no oversized lines
-                            const segmentHash = createHash("sha256")
-                                .update(`${filePath}-1-${firstHeaderLine}-${preHeaderContent}`)
-                                .digest("hex")
-
-                            if (!seenSegmentHashes.has(segmentHash)) {
-                                seenSegmentHashes.add(segmentHash)
-                                results.push({
-                                    file_path: filePath,
-                                    identifier: null,
-                                    type: "markdown_content",
-                                    start_line: 1,
-                                    end_line: firstHeaderLine,
-                                    content: preHeaderContent,
-                                    segmentHash,
-                                    fileHash,
-                                })
-                            }
-                        }
-                    }
-                }
+                const preHeaderBlocks = this.processMarkdownSection(
+                    preHeaderLines,
+                    filePath,
+                    fileHash,
+                    "markdown_content",
+                    seenSegmentHashes,
+                    1,
+                )
+                results.push(...preHeaderBlocks)
             }
         }
 
         // Process markdown captures (headers and sections)
         for (let i = 0; i < markdownCaptures.length; i += 2) {
             const nameCapture = markdownCaptures[i]
+            // Ensure we don't go out of bounds when accessing the next capture
+            if (i + 1 >= markdownCaptures.length) break
             const definitionCapture = markdownCaptures[i + 1]
 
             if (!definitionCapture) continue
 
             const startLine = definitionCapture.node.startPosition.row + 1
             const endLine = definitionCapture.node.endPosition.row + 1
             const sectionLines = lines.slice(startLine - 1, endLine)
-            const sectionContent = sectionLines.join("\n")
 
             // Extract header level for type classification
             const headerMatch = nameCapture.name.match(/\.h(\d)$/)
             const headerLevel = headerMatch ? parseInt(headerMatch[1]) : 1
             const headerText = nameCapture.node.text
 
-            // Check if section needs chunking
-            if (sectionContent.length >= MIN_BLOCK_CHARS) {
-                if (sectionContent.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
-                    // Apply chunking for large sections
-                    const chunks = this._chunkTextByLines(
-                        sectionLines,
-                        filePath,
-                        fileHash,
-                        `markdown_header_h${headerLevel}`,
-                        seenSegmentHashes,
-                        startLine,
-                    )
-                    // Preserve header information in all chunks
-                    chunks.forEach((chunk) => {
-                        chunk.identifier = headerText
-                    })
-                    results.push(...chunks)
-                } else {
-                    // Create a single block for normal-sized sections
-                    const segmentHash = createHash("sha256")
-                        .update(`${filePath}-${startLine}-${endLine}-${sectionContent}`)
-                        .digest("hex")
-
-                    if (!seenSegmentHashes.has(segmentHash)) {
-                        seenSegmentHashes.add(segmentHash)
-
-                        results.push({
-                            file_path: filePath,
-                            identifier: headerText,
-                            type: `markdown_header_h${headerLevel}`,
-                            start_line: startLine,
-                            end_line: endLine,
-                            content: sectionContent,
-                            segmentHash,
-                            fileHash,
-                        })
-                    }
-                }
-            }
-            // Sections smaller than MIN_BLOCK_CHARS are ignored
+            const sectionBlocks = this.processMarkdownSection(
+                sectionLines,
+                filePath,
+                fileHash,
+                `markdown_header_h${headerLevel}`,
+                seenSegmentHashes,
+                startLine,
+                headerText,
+            )
+            results.push(...sectionBlocks)
 
             lastProcessedLine = endLine
         }
 
         // Process any remaining content after the last header section
         if (lastProcessedLine < lines.length) {
             const remainingLines = lines.slice(lastProcessedLine)
-            const remainingContent = remainingLines.join("\n")
-            if (remainingContent.trim().length >= MIN_BLOCK_CHARS) {
-                // Check if content exceeds maximum size and needs chunking
-                if (remainingContent.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
-                    // Apply chunking for large post-header content
-                    const chunks = this._chunkTextByLines(
-                        remainingLines,
-                        filePath,
-                        fileHash,
-                        "markdown_content",
-                        seenSegmentHashes,
-                        lastProcessedLine + 1,
-                    )
-                    results.push(...chunks)
-                } else {
-                    // Check if any individual line is oversized before creating a single block
-                    const hasOversizedLine = remainingLines.some(
-                        (line) => line.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR,
-                    )
-
-                    if (hasOversizedLine) {
-                        // Apply chunking if there's an oversized line
-                        const chunks = this._chunkTextByLines(
-                            remainingLines,
-                            filePath,
-                            fileHash,
-                            "markdown_content",
-                            seenSegmentHashes,
-                            lastProcessedLine + 1,
-                        )
-                        results.push(...chunks)
-                    } else {
-                        // Create a single block for normal-sized post-header content with no oversized lines
-                        const segmentHash = createHash("sha256")
-                            .update(`${filePath}-${lastProcessedLine + 1}-${lines.length}-${remainingContent}`)
-                            .digest("hex")
-
-                        if (!seenSegmentHashes.has(segmentHash)) {
-                            seenSegmentHashes.add(segmentHash)
-                            results.push({
-                                file_path: filePath,
-                                identifier: null,
-                                type: "markdown_content",
-                                start_line: lastProcessedLine + 1,
-                                end_line: lines.length,
-                                content: remainingContent,
-                                segmentHash,
-                                fileHash,
-                            })
-                        }
-                    }
-                }
-            }
+            const remainingBlocks = this.processMarkdownSection(
+                remainingLines,
+                filePath,
+                fileHash,
+                "markdown_content",
+                seenSegmentHashes,
+                lastProcessedLine + 1,
+            )
+            results.push(...remainingBlocks)
         }
 
         return results
```
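Beyond routing through the helper, the one behavioral addition in this final hunk is the bounds guard in the capture loop. parseMarkdown's captures are consumed as (name, definition) pairs, so an odd-length array previously made markdownCaptures[i + 1] read past the end (yielding undefined, which the existing !definitionCapture check then skipped); the explicit break makes the invariant visible and avoids the dead iteration. A sketch of the pattern, with an assumed Capture shape rather than the parser's actual type:

```ts
interface Capture {
    name: string // e.g. "name.definition.header.h2" (assumed naming)
    node: { text: string; startPosition: { row: number }; endPosition: { row: number } }
}

// Consume captures pairwise, tolerating a trailing name with no definition.
function* capturePairs(captures: Capture[]): Generator<[Capture, Capture]> {
    for (let i = 0; i < captures.length; i += 2) {
        if (i + 1 >= captures.length) break // avoid reading past the end
        yield [captures[i], captures[i + 1]]
    }
}
```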
