22
33import { CodeParser , codeParser } from "../parser"
44import { loadRequiredLanguageParsers } from "../../../tree-sitter/languageParser"
5+ import { parseMarkdown } from "../../../tree-sitter/markdownParser"
56import { readFile } from "fs/promises"
67import { Node } from "web-tree-sitter"
78
@@ -23,6 +24,7 @@ vi.mock("fs/promises", () => ({
2324} ) )
2425
2526vi . mock ( "../../../tree-sitter/languageParser" )
27+ vi . mock ( "../../../tree-sitter/markdownParser" )
2628
2729const mockLanguageParser = {
2830 js : {
@@ -242,4 +244,274 @@ describe("CodeParser", () => {
242244 expect ( result2 ) . toBeDefined ( )
243245 } )
244246 } )
247+
248+ describe ( "Markdown Support" , ( ) => {
beforeEach(() => {
	// Reset recorded mock calls before every test so call assertions such as
	// toHaveBeenCalledWith(...) only observe the current test's invocations.
	vi.clearAllMocks()
})
252+
it("should detect markdown files by extension", async () => {
	// Two header sections; the test expects each to come back as its own block.
	const markdownContent = `# Header 1
This is a long section with enough content to meet the minimum character requirements for indexing.
It contains multiple lines and detailed information about the topic.
This ensures the section will be included in the code blocks.

## Header 2
Another substantial section with comprehensive content that exceeds the minimum character threshold.
This section provides detailed explanations and examples to ensure proper indexing.`

	// The mocked markdown parser reports two captures per header, in this order:
	// a "name.definition.header.hN" capture followed by a "definition.header.hN" capture.
	const headers = [
		{ level: 1, title: "Header 1", startRow: 0, endRow: 4 },
		{ level: 2, title: "Header 2", startRow: 5, endRow: 7 },
	]
	const captures = headers.flatMap(({ level, title, startRow, endRow }) =>
		[`name.definition.header.h${level}`, `definition.header.h${level}`].map((name) => ({
			node: { startPosition: { row: startRow }, endPosition: { row: endRow }, text: title },
			name,
			patternIndex: 0,
		})),
	)
	vi.mocked(parseMarkdown).mockReturnValue(captures as any)

	const blocks = await parser.parseFile("test.md", { content: markdownContent })

	// A ".md" path must be routed to the markdown parser with the raw content.
	expect(parseMarkdown).toHaveBeenCalledWith(markdownContent)
	expect(blocks).toHaveLength(2)
	const [firstBlock, secondBlock] = blocks
	expect(firstBlock.type).toBe("markdown_header_h1")
	expect(secondBlock.type).toBe("markdown_header_h2")
})
293+
it("should parse markdown headers into code blocks", async () => {
	const markdownContent = `# Introduction
This is a comprehensive introduction section that provides detailed background information.
It contains multiple paragraphs with substantial content to ensure it meets the minimum character requirements.
The section covers important concepts and sets the foundation for the rest of the document.

## Getting Started
This section provides step-by-step instructions for getting started with the project.
It includes detailed explanations, code examples, and troubleshooting tips.
The content is substantial enough to warrant inclusion in the search index.`

	// Each header is described once and expanded into its name/definition capture pair.
	const headers = [
		{ level: 1, title: "Introduction", startRow: 0, endRow: 4 },
		{ level: 2, title: "Getting Started", startRow: 5, endRow: 8 },
	]
	const captures = headers.flatMap(({ level, title, startRow, endRow }) =>
		[`name.definition.header.h${level}`, `definition.header.h${level}`].map((name) => ({
			node: { startPosition: { row: startRow }, endPosition: { row: endRow }, text: title },
			name,
			patternIndex: 0,
		})),
	)
	vi.mocked(parseMarkdown).mockReturnValue(captures as any)

	const blocks = await parser.parseFile("test.md", { content: markdownContent })

	expect(blocks).toHaveLength(2)
	const [intro, gettingStarted] = blocks

	// Mocked rows are 0-based; the expected start_line/end_line values are 1-based.
	expect(intro.identifier).toBe("Introduction")
	expect(intro.type).toBe("markdown_header_h1")
	expect(intro.start_line).toBe(1)
	expect(intro.end_line).toBe(5)

	expect(gettingStarted.identifier).toBe("Getting Started")
	expect(gettingStarted.type).toBe("markdown_header_h2")
	expect(gettingStarted.start_line).toBe(6)
	expect(gettingStarted.end_line).toBe(9)
})
341+
it("should handle markdown files with no headers using fallback chunking", async () => {
	// Header-free content; the test expects it to be indexed as a single fallback chunk.
	const headerlessContent = `This is a markdown file without any headers but with substantial content.
It contains multiple paragraphs and detailed information that should be indexed.
The content is long enough to meet the minimum character requirements for fallback chunking.
This ensures that even headerless markdown files can be properly indexed and searched.
Additional content to ensure we exceed the minimum block size requirements for proper indexing.`

	// No captures: the mocked markdown parser finds no headers at all.
	vi.mocked(parseMarkdown).mockReturnValue([])

	const blocks = await parser.parseFile("test.md", { content: headerlessContent })

	expect(parseMarkdown).toHaveBeenCalledWith(headerlessContent)
	expect(blocks).toHaveLength(1)
	const [onlyBlock] = blocks
	expect(onlyBlock.type).toBe("fallback_chunk")
})
357+
it("should respect minimum block size requirements", async () => {
	// Both sections are deliberately tiny.
	const markdownContent = `# Short
Small content.

## Another Short
Also small.`

	const headers = [
		{ level: 1, title: "Short", startRow: 0, endRow: 1 },
		{ level: 2, title: "Another Short", startRow: 3, endRow: 4 },
	]
	// Expand every header into its name/definition capture pair, preserving order.
	const captures = headers.flatMap(({ level, title, startRow, endRow }) =>
		[`name.definition.header.h${level}`, `definition.header.h${level}`].map((name) => ({
			node: { startPosition: { row: startRow }, endPosition: { row: endRow }, text: title },
			name,
			patternIndex: 0,
		})),
	)
	vi.mocked(parseMarkdown).mockReturnValue(captures as any)

	const blocks = await parser.parseFile("test.md", { content: markdownContent })

	// Both sections are too small
	expect(blocks).toHaveLength(0)
})
392+
it("should generate unique segment hashes for markdown sections", async () => {
	// Two sections are required to say anything about uniqueness: with a single
	// section the test could only check the hash format, not that hashes differ.
	const markdownContent = `# Unique Section
This is a unique section with substantial content that meets the minimum character requirements.
It contains detailed information and multiple paragraphs to ensure proper indexing.
The content is comprehensive and provides valuable information for search functionality.

## Another Section
This is a second section with different but equally substantial content for the indexer.
It exists so the test can compare two segment hashes produced from the same file.
The content is long enough to meet the minimum character requirements on its own.`

	// Each header is reported by the mocked parser as a name capture followed by
	// a definition capture that share the same row range and text.
	const headers = [
		{ level: 1, title: "Unique Section", startRow: 0, endRow: 4 },
		{ level: 2, title: "Another Section", startRow: 5, endRow: 8 },
	]
	const captures = headers.flatMap(({ level, title, startRow, endRow }) =>
		[`name.definition.header.h${level}`, `definition.header.h${level}`].map((name) => ({
			node: { startPosition: { row: startRow }, endPosition: { row: endRow }, text: title },
			name,
			patternIndex: 0,
		})),
	)
	vi.mocked(parseMarkdown).mockReturnValue(captures as any)

	const result = await parser.parseFile("test.md", { content: markdownContent })

	expect(result).toHaveLength(2)

	// SHA-256 hex format
	const sha256Hex = /^[a-f0-9]{64}$/
	for (const block of result) {
		expect(block.segmentHash).toMatch(sha256Hex)
		expect(block.fileHash).toMatch(sha256Hex)
	}

	// The property the test name promises: distinct sections get distinct hashes.
	expect(result[0].segmentHash).not.toBe(result[1].segmentHash)
})
418+
it("should handle .markdown extension", async () => {
	const markdownContent = `# Documentation
This is comprehensive documentation with substantial content for proper indexing.
It includes detailed explanations, examples, and best practices.
The content is designed to be searchable and useful for developers.`

	// One h1 header, reported as a name capture followed by a definition capture.
	const captures = ["name.definition.header.h1", "definition.header.h1"].map((name) => ({
		node: { startPosition: { row: 0 }, endPosition: { row: 3 }, text: "Documentation" },
		name,
		patternIndex: 0,
	}))
	vi.mocked(parseMarkdown).mockReturnValue(captures as any)

	// Same flow as for ".md", but with the long-form ".markdown" extension.
	const blocks = await parser.parseFile("test.markdown", { content: markdownContent })

	expect(parseMarkdown).toHaveBeenCalledWith(markdownContent)
	expect(blocks).toHaveLength(1)
	const [docBlock] = blocks
	expect(docBlock.type).toBe("markdown_header_h1")
})
444+
it("should handle empty markdown files", async () => {
	// An empty document: the mocked parser reports no captures.
	const noCaptures: ReturnType<typeof parseMarkdown> = []
	vi.mocked(parseMarkdown).mockReturnValue(noCaptures)

	const blocks = await parser.parseFile("test.md", { content: "" })

	// Nothing to index, so no blocks are expected.
	expect(blocks).toHaveLength(0)
})
452+
it("should handle markdown files with malformed content", async () => {
	// A single short line with no headers. NOTE(review): the string is plain text
	// rather than syntactically broken markdown, so this case mainly exercises the
	// "too small" path — confirm whether a genuinely malformed input is wanted here.
	const malformedContent = "Some content without proper structure"

	vi.mocked(parseMarkdown).mockReturnValue([])

	const blocks = await parser.parseFile("test.md", { content: malformedContent })

	// Too small for fallback chunking
	expect(blocks).toHaveLength(0)
})
462+
it("should extract correct header levels", async () => {
	// Non-consecutive header levels (h1, h3, h6) so each level has to be read
	// from its capture name rather than from its position in the document.
	const markdownContent = `# H1 Header
Content for H1 with substantial text to meet minimum requirements.
This section provides comprehensive information about the main topic.

### H3 Header
Content for H3 with detailed explanations and examples.
This subsection covers specific aspects of the topic in depth.

###### H6 Header
Content for H6 with focused information on a particular detail.
This section provides specific technical information for advanced users.`

	const headers = [
		{ level: 1, title: "H1 Header", startRow: 0, endRow: 3 },
		{ level: 3, title: "H3 Header", startRow: 4, endRow: 7 },
		{ level: 6, title: "H6 Header", startRow: 8, endRow: 10 },
	]
	// Expand every header into its name/definition capture pair, preserving order.
	const captures = headers.flatMap(({ level, title, startRow, endRow }) =>
		[`name.definition.header.h${level}`, `definition.header.h${level}`].map((name) => ({
			node: { startPosition: { row: startRow }, endPosition: { row: endRow }, text: title },
			name,
			patternIndex: 0,
		})),
	)
	vi.mocked(parseMarkdown).mockReturnValue(captures as any)

	const blocks = await parser.parseFile("test.md", { content: markdownContent })

	expect(blocks).toHaveLength(3)
	const [h1Block, h3Block, h6Block] = blocks
	expect(h1Block.type).toBe("markdown_header_h1")
	expect(h3Block.type).toBe("markdown_header_h3")
	expect(h6Block.type).toBe("markdown_header_h6")
})
516+ } )
245517} )
0 commit comments