@@ -318,36 +318,64 @@ describe("DirectoryScanner", () => {
318318 expect ( result . stats . processed ) . toBe ( 3 )
319319 } )
320320
321- it ( "should handle large markdown documentation folders efficiently " , async ( ) => {
321+ it ( "should generate unique point IDs for each block from the same file " , async ( ) => {
322322 const { listFiles } = await import ( "../../../glob/list-files" )
323+ vi . mocked ( listFiles ) . mockResolvedValue ( [ [ "test/large-doc.md" ] , false ] )
323324
324- // Simulate a large documentation folder with many markdown files
325- const markdownFiles = Array . from ( { length : 50 } , ( _ , i ) => `docs/section-${ i } .md` )
326- vi . mocked ( listFiles ) . mockResolvedValue ( [ markdownFiles , false ] )
327-
328- const mockMarkdownBlock : any = {
329- file_path : "docs/section-0.md" ,
330- content : "# Section Header\nDetailed content..." ,
331- start_line : 1 ,
332- end_line : 5 ,
333- identifier : "Section Header" ,
334- type : "markdown_header_h1" ,
335- fileHash : "section-hash" ,
336- segmentHash : "section-segment-hash" ,
337- }
325+ // Mock multiple blocks from the same file with different segmentHash values
326+ const mockBlocks : any [ ] = [
327+ {
328+ file_path : "test/large-doc.md" ,
329+ content : "# Introduction\nThis is the intro section..." ,
330+ start_line : 1 ,
331+ end_line : 10 ,
332+ identifier : "Introduction" ,
333+ type : "markdown_header_h1" ,
334+ fileHash : "same-file-hash" ,
335+ segmentHash : "unique-segment-hash-1" ,
336+ } ,
337+ {
338+ file_path : "test/large-doc.md" ,
339+ content : "## Getting Started\nHere's how to begin..." ,
340+ start_line : 11 ,
341+ end_line : 20 ,
342+ identifier : "Getting Started" ,
343+ type : "markdown_header_h2" ,
344+ fileHash : "same-file-hash" ,
345+ segmentHash : "unique-segment-hash-2" ,
346+ } ,
347+ {
348+ file_path : "test/large-doc.md" ,
349+ content : "## Advanced Topics\nFor advanced users..." ,
350+ start_line : 21 ,
351+ end_line : 30 ,
352+ identifier : "Advanced Topics" ,
353+ type : "markdown_header_h2" ,
354+ fileHash : "same-file-hash" ,
355+ segmentHash : "unique-segment-hash-3" ,
356+ } ,
357+ ]
338358
339- ; ( mockCodeParser . parseFile as any ) . mockResolvedValue ( [ mockMarkdownBlock ] )
359+ ; ( mockCodeParser . parseFile as any ) . mockResolvedValue ( mockBlocks )
340360
341- const result = await scanner . scanDirectory ( "/test" )
361+ await scanner . scanDirectory ( "/test" )
342362
343- // Verify all markdown files were processed
344- expect ( mockCodeParser . parseFile ) . toHaveBeenCalledTimes ( 50 )
345- expect ( result . stats . processed ) . toBe ( 50 )
346- expect ( result . codeBlocks ) . toHaveLength ( 50 )
363+ // Verify that upsertPoints was called with unique IDs for each block
364+ expect ( mockVectorStore . upsertPoints ) . toHaveBeenCalledTimes ( 1 )
365+ const upsertCall = mockVectorStore . upsertPoints . mock . calls [ 0 ]
366+ const points = upsertCall [ 0 ]
347367
348- // Verify embeddings were created for all markdown content
349- expect ( mockEmbedder . createEmbeddings ) . toHaveBeenCalled ( )
350- expect ( mockVectorStore . upsertPoints ) . toHaveBeenCalled ( )
368+ // Extract the IDs from the points
369+ const pointIds = points . map ( ( point : any ) => point . id )
370+
371+ // Verify all IDs are unique
372+ expect ( pointIds ) . toHaveLength ( 3 )
373+ expect ( new Set ( pointIds ) . size ) . toBe ( 3 ) // All IDs should be unique
374+
375+ // Verify that each point has the correct payload
376+ expect ( points [ 0 ] . payload . segmentHash ) . toBe ( "unique-segment-hash-1" )
377+ expect ( points [ 1 ] . payload . segmentHash ) . toBe ( "unique-segment-hash-2" )
378+ expect ( points [ 2 ] . payload . segmentHash ) . toBe ( "unique-segment-hash-3" )
351379 } )
352380 } )
353381} )
0 commit comments