@@ -181,8 +181,9 @@ export class CodeParser implements ICodeParser {
 				const start_line = currentNode.startPosition.row + 1
 				const end_line = currentNode.endPosition.row + 1
 				const content = currentNode.text
+				const contentPreview = content.slice(0, 100)
 				const segmentHash = createHash("sha256")
-					.update(`${filePath}-${start_line}-${end_line}-${content}`)
+					.update(`${filePath}-${start_line}-${end_line}-${content.length}-${contentPreview}`)
 					.digest("hex")

 				if (!seenSegmentHashes.has(segmentHash)) {
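Aside from the diff itself: this change swaps the full segment text out of the hash preimage for its length plus a 100-character preview. A minimal standalone sketch of the revised scheme, using Node's built-in `crypto` module (the helper name `segmentHashOf` is ours for illustration, not part of the PR):

```ts
import { createHash } from "crypto"

// Hypothetical helper mirroring the new preimage layout: hashing the length
// plus a short preview stays cheap for very large blocks, and including the
// length keeps two segments with identical previews but different sizes
// from colliding.
function segmentHashOf(filePath: string, startLine: number, endLine: number, content: string): string {
	const contentPreview = content.slice(0, 100)
	return createHash("sha256")
		.update(`${filePath}-${startLine}-${endLine}-${content.length}-${contentPreview}`)
		.digest("hex")
}
```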
@@ -229,8 +230,9 @@ export class CodeParser implements ICodeParser {
 			const chunkContent = currentChunkLines.join("\n")
 			const startLine = baseStartLine + chunkStartLineIndex
 			const endLine = baseStartLine + endLineIndex
+			const contentPreview = chunkContent.slice(0, 100)
 			const segmentHash = createHash("sha256")
-				.update(`${filePath}-${startLine}-${endLine}-${chunkContent}`)
+				.update(`${filePath}-${startLine}-${endLine}-${chunkContent.length}-${contentPreview}`)
 				.digest("hex")

 			if (!seenSegmentHashes.has(segmentHash)) {
@@ -253,8 +255,11 @@ export class CodeParser implements ICodeParser {
 		}

 		const createSegmentBlock = (segment: string, originalLineNumber: number, startCharIndex: number) => {
+			const segmentPreview = segment.slice(0, 100)
 			const segmentHash = createHash("sha256")
-				.update(`${filePath}-${originalLineNumber}-${originalLineNumber}-${startCharIndex}-${segment}`)
+				.update(
+					`${filePath}-${originalLineNumber}-${originalLineNumber}-${startCharIndex}-${segment.length}-${segmentPreview}`,
+				)
 				.digest("hex")

 			if (!seenSegmentHashes.has(segmentHash)) {
@@ -379,6 +384,67 @@ export class CodeParser implements ICodeParser {
 		)
 	}

+	/**
+	 * Helper method to process markdown content sections with consistent chunking logic
+	 */
+	private processMarkdownSection(
+		lines: string[],
+		filePath: string,
+		fileHash: string,
+		type: string,
+		seenSegmentHashes: Set<string>,
+		startLine: number,
+		identifier: string | null = null,
+	): CodeBlock[] {
+		const content = lines.join("\n")
+
+		if (content.trim().length < MIN_BLOCK_CHARS) {
+			return []
+		}
+
+		// Check if content needs chunking (either total size or individual line size)
+		const needsChunking =
+			content.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR ||
+			lines.some((line) => line.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR)
+
+		if (needsChunking) {
+			// Apply chunking for large content or oversized lines
+			const chunks = this._chunkTextByLines(lines, filePath, fileHash, type, seenSegmentHashes, startLine)
+			// Preserve identifier in all chunks if provided
+			if (identifier) {
+				chunks.forEach((chunk) => {
+					chunk.identifier = identifier
+				})
+			}
+			return chunks
+		}
+
+		// Create a single block for normal-sized content with no oversized lines
+		const endLine = startLine + lines.length - 1
+		const contentPreview = content.slice(0, 100)
+		const segmentHash = createHash("sha256")
+			.update(`${filePath}-${startLine}-${endLine}-${content.length}-${contentPreview}`)
+			.digest("hex")
+
+		if (!seenSegmentHashes.has(segmentHash)) {
+			seenSegmentHashes.add(segmentHash)
+			return [
+				{
+					file_path: filePath,
+					identifier,
+					type,
+					start_line: startLine,
+					end_line: endLine,
+					content,
+					segmentHash,
+					fileHash,
+				},
+			]
+		}
+
+		return []
+	}
+
 	private parseMarkdownContent(
 		filePath: string,
 		content: string,
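The new helper centralizes the size checks that were previously duplicated at every call site. A self-contained sketch of its decision logic follows; the threshold values here are assumptions for illustration only, since the real `MIN_BLOCK_CHARS`, `MAX_BLOCK_CHARS`, and `MAX_CHARS_TOLERANCE_FACTOR` are defined elsewhere in the parser:

```ts
// Assumed values for illustration; the parser imports the real constants.
const MIN_BLOCK_CHARS = 50
const MAX_BLOCK_CHARS = 1000
const MAX_CHARS_TOLERANCE_FACTOR = 1.15

// Mirrors processMarkdownSection's three outcomes: drop tiny sections,
// chunk oversized ones (by total size or any single long line), and emit
// everything else as a single block.
function classifySection(lines: string[]): "skip" | "chunk" | "single" {
	const content = lines.join("\n")
	if (content.trim().length < MIN_BLOCK_CHARS) return "skip"
	const limit = MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR
	const needsChunking = content.length > limit || lines.some((line) => line.length > limit)
	return needsChunking ? "chunk" : "single"
}
```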
@@ -389,53 +455,8 @@ export class CodeParser implements ICodeParser {
 		const markdownCaptures = parseMarkdown(content) || []

 		if (markdownCaptures.length === 0) {
-			// No headers found, check if content needs chunking
-			if (content.length >= MIN_BLOCK_CHARS) {
-				// Check if content exceeds maximum size and needs chunking
-				if (content.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
-					// Apply chunking for large header-less markdown files
-					return this._chunkTextByLines(lines, filePath, fileHash, "markdown_content", seenSegmentHashes, 1)
-				} else {
-					// Check if any individual line is oversized before creating a single block
-					const hasOversizedLine = lines.some(
-						(line) => line.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR,
-					)
-
-					if (hasOversizedLine) {
-						// Apply chunking if there's an oversized line
-						return this._chunkTextByLines(
-							lines,
-							filePath,
-							fileHash,
-							"markdown_content",
-							seenSegmentHashes,
-							1,
-						)
-					} else {
-						// Create a single block for normal-sized content with no oversized lines
-						const segmentHash = createHash("sha256")
-							.update(`${filePath}-1-${lines.length}-${content}`)
-							.digest("hex")
-
-						if (!seenSegmentHashes.has(segmentHash)) {
-							seenSegmentHashes.add(segmentHash)
-							return [
-								{
-									file_path: filePath,
-									identifier: null,
-									type: "markdown_content",
-									start_line: 1,
-									end_line: lines.length,
-									content: content,
-									segmentHash,
-									fileHash,
-								},
-							]
-						}
-					}
-				}
-			}
-			return []
+			// No headers found, process entire content
+			return this.processMarkdownSection(lines, filePath, fileHash, "markdown_content", seenSegmentHashes, 1)
 		}

 		const results: CodeBlock[] = []
@@ -446,179 +467,62 @@ export class CodeParser implements ICodeParser {
 			const firstHeaderLine = markdownCaptures[0].node.startPosition.row
 			if (firstHeaderLine > 0) {
 				const preHeaderLines = lines.slice(0, firstHeaderLine)
-				const preHeaderContent = preHeaderLines.join("\n")
-				if (preHeaderContent.trim().length >= MIN_BLOCK_CHARS) {
-					// Check if content exceeds maximum size and needs chunking
-					if (preHeaderContent.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
-						// Apply chunking for large pre-header content
-						const chunks = this._chunkTextByLines(
-							preHeaderLines,
-							filePath,
-							fileHash,
-							"markdown_content",
-							seenSegmentHashes,
-							1,
-						)
-						results.push(...chunks)
-					} else {
-						// Check if any individual line is oversized before creating a single block
-						const hasOversizedLine = preHeaderLines.some(
-							(line) => line.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR,
-						)
-
-						if (hasOversizedLine) {
-							// Apply chunking if there's an oversized line
-							const chunks = this._chunkTextByLines(
-								preHeaderLines,
-								filePath,
-								fileHash,
-								"markdown_content",
-								seenSegmentHashes,
-								1,
-							)
-							results.push(...chunks)
-						} else {
-							// Create a single block for normal-sized pre-header content with no oversized lines
-							const segmentHash = createHash("sha256")
-								.update(`${filePath}-1-${firstHeaderLine}-${preHeaderContent}`)
-								.digest("hex")
-
-							if (!seenSegmentHashes.has(segmentHash)) {
-								seenSegmentHashes.add(segmentHash)
-								results.push({
-									file_path: filePath,
-									identifier: null,
-									type: "markdown_content",
-									start_line: 1,
-									end_line: firstHeaderLine,
-									content: preHeaderContent,
-									segmentHash,
-									fileHash,
-								})
-							}
-						}
-					}
-				}
+				const preHeaderBlocks = this.processMarkdownSection(
+					preHeaderLines,
+					filePath,
+					fileHash,
+					"markdown_content",
+					seenSegmentHashes,
+					1,
+				)
+				results.push(...preHeaderBlocks)
 			}
 		}

 		// Process markdown captures (headers and sections)
 		for (let i = 0; i < markdownCaptures.length; i += 2) {
 			const nameCapture = markdownCaptures[i]
+			// Ensure we don't go out of bounds when accessing the next capture
+			if (i + 1 >= markdownCaptures.length) break
 			const definitionCapture = markdownCaptures[i + 1]

 			if (!definitionCapture) continue

 			const startLine = definitionCapture.node.startPosition.row + 1
 			const endLine = definitionCapture.node.endPosition.row + 1
 			const sectionLines = lines.slice(startLine - 1, endLine)
-			const sectionContent = sectionLines.join("\n")

 			// Extract header level for type classification
 			const headerMatch = nameCapture.name.match(/\.h(\d)$/)
 			const headerLevel = headerMatch ? parseInt(headerMatch[1]) : 1
 			const headerText = nameCapture.node.text

-			// Check if section needs chunking
-			if (sectionContent.length >= MIN_BLOCK_CHARS) {
-				if (sectionContent.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
-					// Apply chunking for large sections
-					const chunks = this._chunkTextByLines(
-						sectionLines,
-						filePath,
-						fileHash,
-						`markdown_header_h${headerLevel}`,
-						seenSegmentHashes,
-						startLine,
-					)
-					// Preserve header information in all chunks
-					chunks.forEach((chunk) => {
-						chunk.identifier = headerText
-					})
-					results.push(...chunks)
-				} else {
-					// Create a single block for normal-sized sections
-					const segmentHash = createHash("sha256")
-						.update(`${filePath}-${startLine}-${endLine}-${sectionContent}`)
-						.digest("hex")
-
-					if (!seenSegmentHashes.has(segmentHash)) {
-						seenSegmentHashes.add(segmentHash)
-
-						results.push({
-							file_path: filePath,
-							identifier: headerText,
-							type: `markdown_header_h${headerLevel}`,
-							start_line: startLine,
-							end_line: endLine,
-							content: sectionContent,
-							segmentHash,
-							fileHash,
-						})
-					}
-				}
-			}
-			// Sections smaller than MIN_BLOCK_CHARS are ignored
+			const sectionBlocks = this.processMarkdownSection(
+				sectionLines,
+				filePath,
+				fileHash,
+				`markdown_header_h${headerLevel}`,
+				seenSegmentHashes,
+				startLine,
+				headerText,
+			)
+			results.push(...sectionBlocks)

 			lastProcessedLine = endLine
 		}

 		// Process any remaining content after the last header section
 		if (lastProcessedLine < lines.length) {
 			const remainingLines = lines.slice(lastProcessedLine)
-			const remainingContent = remainingLines.join("\n")
-			if (remainingContent.trim().length >= MIN_BLOCK_CHARS) {
-				// Check if content exceeds maximum size and needs chunking
-				if (remainingContent.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR) {
-					// Apply chunking for large post-header content
-					const chunks = this._chunkTextByLines(
-						remainingLines,
-						filePath,
-						fileHash,
-						"markdown_content",
-						seenSegmentHashes,
-						lastProcessedLine + 1,
-					)
-					results.push(...chunks)
-				} else {
-					// Check if any individual line is oversized before creating a single block
-					const hasOversizedLine = remainingLines.some(
-						(line) => line.length > MAX_BLOCK_CHARS * MAX_CHARS_TOLERANCE_FACTOR,
-					)
-
-					if (hasOversizedLine) {
-						// Apply chunking if there's an oversized line
-						const chunks = this._chunkTextByLines(
-							remainingLines,
-							filePath,
-							fileHash,
-							"markdown_content",
-							seenSegmentHashes,
-							lastProcessedLine + 1,
-						)
-						results.push(...chunks)
-					} else {
-						// Create a single block for normal-sized post-header content with no oversized lines
-						const segmentHash = createHash("sha256")
-							.update(`${filePath}-${lastProcessedLine + 1}-${lines.length}-${remainingContent}`)
-							.digest("hex")
-
-						if (!seenSegmentHashes.has(segmentHash)) {
-							seenSegmentHashes.add(segmentHash)
-							results.push({
-								file_path: filePath,
-								identifier: null,
-								type: "markdown_content",
-								start_line: lastProcessedLine + 1,
-								end_line: lines.length,
-								content: remainingContent,
-								segmentHash,
-								fileHash,
-							})
-						}
-					}
-				}
-			}
+			const remainingBlocks = this.processMarkdownSection(
+				remainingLines,
+				filePath,
+				fileHash,
+				"markdown_content",
+				seenSegmentHashes,
+				lastProcessedLine + 1,
+			)
+			results.push(...remainingBlocks)
 		}

 		return results
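The new bounds guard in the capture loop assumes `parseMarkdown` yields captures in name/definition pairs, so a trailing name capture with no matching definition now ends the loop instead of reading past the array. A hedged sketch of that pairing contract; the `Capture` shape is abbreviated from the fields the loop actually touches:

```ts
// Abbreviated capture shape, inferred from the surrounding code.
type Capture = {
	name: string
	node: { text: string; startPosition: { row: number }; endPosition: { row: number } }
}

// Walk captures two at a time, as the loop above does, stopping cleanly when
// a name capture has no matching definition capture.
function pairCaptures(captures: Capture[]): Array<[name: Capture, definition: Capture]> {
	const pairs: Array<[Capture, Capture]> = []
	for (let i = 0; i + 1 < captures.length; i += 2) {
		pairs.push([captures[i], captures[i + 1]])
	}
	return pairs
}
```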