Skip to content

Commit 28745d1

Browse files
committed
feat: add markdown support to codebase indexing (#4660)
1 parent a57b0b6 commit 28745d1

File tree

5 files changed

+489
-4
lines changed

5 files changed

+489
-4
lines changed

src/services/code-index/processors/__tests__/parser.spec.ts

Lines changed: 272 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import { CodeParser, codeParser } from "../parser"
44
import { loadRequiredLanguageParsers } from "../../../tree-sitter/languageParser"
5+
import { parseMarkdown } from "../../../tree-sitter/markdownParser"
56
import { readFile } from "fs/promises"
67
import { Node } from "web-tree-sitter"
78

@@ -23,6 +24,7 @@ vi.mock("fs/promises", () => ({
2324
}))
2425

2526
vi.mock("../../../tree-sitter/languageParser")
27+
vi.mock("../../../tree-sitter/markdownParser")
2628

2729
const mockLanguageParser = {
2830
js: {
@@ -242,4 +244,274 @@ describe("CodeParser", () => {
242244
expect(result2).toBeDefined()
243245
})
244246
})
247+
248+
describe("Markdown Support", () => {
	/** One header section as reported by the mocked `parseMarkdown`. */
	interface HeaderSection {
		/** Header depth, used as the `hN` suffix of the capture name. */
		level: number
		/** Header text, exposed as the capture node's `text`. */
		title: string
		/** Zero-based `[startRow, endRow]` of the section. */
		rows: [number, number]
	}

	/**
	 * Makes the mocked `parseMarkdown` return captures for the given sections.
	 *
	 * Each section yields two captures over the same row range, in this order:
	 * `name.definition.header.hN` followed by `definition.header.hN`.
	 * Calling it with no sections makes the mock return an empty list.
	 */
	function stubHeaderCaptures(...sections: HeaderSection[]): void {
		const captures = sections.flatMap(({ level, title, rows: [startRow, endRow] }) =>
			["name.definition.header", "definition.header"].map((captureKind) => ({
				node: { startPosition: { row: startRow }, endPosition: { row: endRow }, text: title },
				name: `${captureKind}.h${level}`,
				patternIndex: 0,
			})),
		)
		vi.mocked(parseMarkdown).mockReturnValue(captures as any)
	}

	// Reset call history so `toHaveBeenCalledWith` only sees the current test's calls.
	beforeEach(() => {
		vi.clearAllMocks()
	})

	it("should detect markdown files by extension", async () => {
		const source = `# Header 1
This is a long section with enough content to meet the minimum character requirements for indexing.
It contains multiple lines and detailed information about the topic.
This ensures the section will be included in the code blocks.

## Header 2
Another substantial section with comprehensive content that exceeds the minimum character threshold.
This section provides detailed explanations and examples to ensure proper indexing.`

		stubHeaderCaptures(
			{ level: 1, title: "Header 1", rows: [0, 4] },
			{ level: 2, title: "Header 2", rows: [5, 7] },
		)

		const blocks = await parser.parseFile("test.md", { content: source })

		// A `.md` path must be routed through the markdown parser.
		expect(parseMarkdown).toHaveBeenCalledWith(source)
		expect(blocks).toHaveLength(2)
		expect(blocks.map((block) => block.type)).toEqual(["markdown_header_h1", "markdown_header_h2"])
	})

	it("should parse markdown headers into code blocks", async () => {
		const source = `# Introduction
This is a comprehensive introduction section that provides detailed background information.
It contains multiple paragraphs with substantial content to ensure it meets the minimum character requirements.
The section covers important concepts and sets the foundation for the rest of the document.

## Getting Started
This section provides step-by-step instructions for getting started with the project.
It includes detailed explanations, code examples, and troubleshooting tips.
The content is substantial enough to warrant inclusion in the search index.`

		stubHeaderCaptures(
			{ level: 1, title: "Introduction", rows: [0, 4] },
			{ level: 2, title: "Getting Started", rows: [5, 8] },
		)

		const blocks = await parser.parseFile("test.md", { content: source })

		expect(blocks).toHaveLength(2)
		const [intro, gettingStarted] = blocks

		// Reported line numbers are one-based, while the mocked rows are zero-based.
		expect(intro.identifier).toBe("Introduction")
		expect(intro.type).toBe("markdown_header_h1")
		expect(intro.start_line).toBe(1)
		expect(intro.end_line).toBe(5)

		expect(gettingStarted.identifier).toBe("Getting Started")
		expect(gettingStarted.type).toBe("markdown_header_h2")
		expect(gettingStarted.start_line).toBe(6)
		expect(gettingStarted.end_line).toBe(9)
	})

	it("should handle markdown files with no headers using fallback chunking", async () => {
		const source = `This is a markdown file without any headers but with substantial content.
It contains multiple paragraphs and detailed information that should be indexed.
The content is long enough to meet the minimum character requirements for fallback chunking.
This ensures that even headerless markdown files can be properly indexed and searched.
Additional content to ensure we exceed the minimum block size requirements for proper indexing.`

		// The markdown parser finds no headers at all.
		stubHeaderCaptures()

		const blocks = await parser.parseFile("test.md", { content: source })

		expect(parseMarkdown).toHaveBeenCalledWith(source)
		expect(blocks).toHaveLength(1)
		expect(blocks[0].type).toBe("fallback_chunk")
	})

	it("should respect minimum block size requirements", async () => {
		const source = `# Short
Small content.

## Another Short
Also small.`

		stubHeaderCaptures(
			{ level: 1, title: "Short", rows: [0, 1] },
			{ level: 2, title: "Another Short", rows: [3, 4] },
		)

		const blocks = await parser.parseFile("test.md", { content: source })

		expect(blocks).toHaveLength(0) // Both sections are too small
	})

	it("should generate unique segment hashes for markdown sections", async () => {
		const source = `# Unique Section
This is a unique section with substantial content that meets the minimum character requirements.
It contains detailed information and multiple paragraphs to ensure proper indexing.
The content is comprehensive and provides valuable information for search functionality.`

		stubHeaderCaptures({ level: 1, title: "Unique Section", rows: [0, 3] })

		const blocks = await parser.parseFile("test.md", { content: source })

		expect(blocks).toHaveLength(1)
		const [section] = blocks
		expect(section.segmentHash).toMatch(/^[a-f0-9]{64}$/) // SHA-256 hex format
		expect(section.fileHash).toMatch(/^[a-f0-9]{64}$/)
	})

	it("should handle .markdown extension", async () => {
		const source = `# Documentation
This is comprehensive documentation with substantial content for proper indexing.
It includes detailed explanations, examples, and best practices.
The content is designed to be searchable and useful for developers.`

		stubHeaderCaptures({ level: 1, title: "Documentation", rows: [0, 3] })

		const blocks = await parser.parseFile("test.markdown", { content: source })

		expect(parseMarkdown).toHaveBeenCalledWith(source)
		expect(blocks).toHaveLength(1)
		expect(blocks[0].type).toBe("markdown_header_h1")
	})

	it("should handle empty markdown files", async () => {
		stubHeaderCaptures()

		const blocks = await parser.parseFile("test.md", { content: "" })

		expect(blocks).toHaveLength(0)
	})

	it("should handle markdown files with malformed content", async () => {
		const malformedContent = "Some content without proper structure"

		stubHeaderCaptures()

		const blocks = await parser.parseFile("test.md", { content: malformedContent })

		expect(blocks).toHaveLength(0) // Too small for fallback chunking
	})

	it("should extract correct header levels", async () => {
		const source = `# H1 Header
Content for H1 with substantial text to meet minimum requirements.
This section provides comprehensive information about the main topic.

### H3 Header
Content for H3 with detailed explanations and examples.
This subsection covers specific aspects of the topic in depth.

###### H6 Header
Content for H6 with focused information on a particular detail.
This section provides specific technical information for advanced users.`

		stubHeaderCaptures(
			{ level: 1, title: "H1 Header", rows: [0, 3] },
			{ level: 3, title: "H3 Header", rows: [4, 7] },
			{ level: 6, title: "H6 Header", rows: [8, 10] },
		)

		const blocks = await parser.parseFile("test.md", { content: source })

		expect(blocks).toHaveLength(3)
		expect(blocks.map((block) => block.type)).toEqual([
			"markdown_header_h1",
			"markdown_header_h3",
			"markdown_header_h6",
		])
	})
})
245517
})

0 commit comments

Comments
 (0)