@@ -26,15 +26,22 @@ export async function listFiles(dirPath: string, recursive: boolean, limit: numb
2626 // Get ripgrep path
2727 const rgPath = await getRipgrepPath ( )
2828
29- // Get files using ripgrep
30- const files = await listFilesWithRipgrep ( rgPath , dirPath , recursive , limit )
31-
32- // Get directories with proper filtering
29+ // Get directories with proper filtering first to ensure we capture the structure
3330 const gitignorePatterns = await parseGitignoreFile ( dirPath , recursive )
3431 const directories = await listFilteredDirectories ( dirPath , recursive , gitignorePatterns )
3532
33+ // Get files using ripgrep with a higher limit to avoid early termination
34+ // We'll use a higher internal limit and then apply balanced sampling
35+ const internalLimit = Math . max ( limit * 3 , 1000 ) // Use 3x the requested limit or 1000, whichever is higher
36+ const files = await listFilesWithRipgrep ( rgPath , dirPath , recursive , internalLimit )
37+
38+ // Apply balanced sampling to ensure fair representation across directories
39+ // Reserve some space for directories in the limit
40+ const filesLimit = Math . max ( limit - directories . length , Math . floor ( limit * 0.8 ) )
41+ const balancedFiles = applyBalancedSampling ( files , directories , filesLimit )
42+
3643 // Combine and format the results
37- return formatAndCombineResults ( files , directories , limit )
44+ return formatAndCombineResults ( balancedFiles , directories , limit )
3845}
3946
4047/**
@@ -304,6 +311,91 @@ function isIgnoredByGitignore(dirName: string, gitignorePatterns: string[]): boo
304311 return false
305312}
306313
314+ /**
315+ * Apply balanced sampling to ensure fair representation across directories
316+ * This prevents one large directory from dominating the file list
317+ */
318+ function applyBalancedSampling ( files : string [ ] , directories : string [ ] , limit : number ) : string [ ] {
319+ if ( files . length <= limit ) {
320+ return files
321+ }
322+
323+ // Group files by their parent directory
324+ const filesByDirectory = new Map < string , string [ ] > ( )
325+
326+ for ( const file of files ) {
327+ const dir = path . dirname ( file )
328+ if ( ! filesByDirectory . has ( dir ) ) {
329+ filesByDirectory . set ( dir , [ ] )
330+ }
331+ filesByDirectory . get ( dir ) ! . push ( file )
332+ }
333+
334+ // Improved balanced sampling algorithm
335+ const dirEntries = Array . from ( filesByDirectory . entries ( ) )
336+ dirEntries . sort ( ( [ a ] , [ b ] ) => a . localeCompare ( b ) )
337+
338+ const result : string [ ] = [ ]
339+ const dirCount = dirEntries . length
340+
341+ // Ensure each directory gets at least a minimum number of files
342+ const minFilesPerDir = Math . max ( 3 , Math . floor ( limit / ( dirCount * 4 ) ) ) // At least 3 files per dir, or 1/4 of fair share
343+ const maxFilesPerDir = Math . floor ( limit / dirCount ) + 10 // Allow some directories to have more files
344+
345+ // First pass: give each directory its minimum allocation
346+ let remainingLimit = limit
347+ const dirAllocations = new Map < string , number > ( )
348+
349+ for ( const [ dir , dirFiles ] of dirEntries ) {
350+ const allocation = Math . min ( minFilesPerDir , dirFiles . length , remainingLimit )
351+ dirAllocations . set ( dir , allocation )
352+ remainingLimit -= allocation
353+ }
354+
355+ // Second pass: distribute remaining slots proportionally to directory sizes
356+ if ( remainingLimit > 0 ) {
357+ const totalFiles = dirEntries . reduce ( ( sum , [ , dirFiles ] ) => sum + dirFiles . length , 0 )
358+
359+ for ( const [ dir , dirFiles ] of dirEntries ) {
360+ const currentAllocation = dirAllocations . get ( dir ) !
361+ const proportion = dirFiles . length / totalFiles
362+ const additionalSlots = Math . min (
363+ Math . floor ( remainingLimit * proportion ) ,
364+ maxFilesPerDir - currentAllocation ,
365+ dirFiles . length - currentAllocation ,
366+ )
367+
368+ if ( additionalSlots > 0 ) {
369+ dirAllocations . set ( dir , currentAllocation + additionalSlots )
370+ remainingLimit -= additionalSlots
371+ }
372+ }
373+ }
374+
375+ // Third pass: distribute any remaining slots to directories that can take them
376+ for ( const [ dir , dirFiles ] of dirEntries ) {
377+ if ( remainingLimit <= 0 ) break
378+
379+ const currentAllocation = dirAllocations . get ( dir ) !
380+ const canTakeMore = Math . min ( remainingLimit , dirFiles . length - currentAllocation )
381+
382+ if ( canTakeMore > 0 ) {
383+ dirAllocations . set ( dir , currentAllocation + canTakeMore )
384+ remainingLimit -= canTakeMore
385+ }
386+ }
387+
388+ // Collect the files based on allocations
389+ for ( const [ dir , dirFiles ] of dirEntries ) {
390+ const allocation = dirAllocations . get ( dir ) !
391+ dirFiles . sort ( ) // Ensure consistent ordering
392+ const selectedFiles = dirFiles . slice ( 0 , allocation )
393+ result . push ( ...selectedFiles )
394+ }
395+
396+ return result
397+ }
398+
307399/**
308400 * Combine file and directory results and format them properly
309401 */
@@ -338,23 +430,20 @@ async function execRipgrep(rgPath: string, args: string[], limit: number): Promi
338430 let output = ""
339431 let results : string [ ] = [ ]
340432
341- // Set timeout to avoid hanging
433+ // Set timeout to avoid hanging - increased to allow more complete traversal
342434 const timeoutId = setTimeout ( ( ) => {
343435 rgProcess . kill ( )
344436 console . warn ( "ripgrep timed out, returning partial results" )
345- resolve ( results . slice ( 0 , limit ) )
346- } , 10_000 )
437+ resolve ( results ) // Don't slice here either
438+ } , 15_000 )
347439
348440 // Process stdout data as it comes in
349441 rgProcess . stdout . on ( "data" , ( data ) => {
350442 output += data . toString ( )
351443 processRipgrepOutput ( )
352444
353- // Kill the process if we've reached the limit
354- if ( results . length >= limit ) {
355- rgProcess . kill ( )
356- clearTimeout ( timeoutId ) // Clear the timeout when we kill the process due to reaching the limit
357- }
445+ // Don't kill the process early - let it complete to get full directory structure
446+ // The balanced sampling will be applied later in applyBalancedSampling
358447 } )
359448
360449 // Process stderr but don't fail on non-zero exit codes
@@ -375,7 +464,7 @@ async function execRipgrep(rgPath: string, args: string[], limit: number): Promi
375464 console . warn ( `ripgrep process exited with code ${ code } , returning partial results` )
376465 }
377466
378- resolve ( results . slice ( 0 , limit ) )
467+ resolve ( results ) // Don't slice here - let balanced sampling handle the limit
379468 } )
380469
381470 // Handle process errors
@@ -396,12 +485,10 @@ async function execRipgrep(rgPath: string, args: string[], limit: number): Promi
396485 output = ""
397486 }
398487
399- // Process each complete line
488+ // Process each complete line - don't limit here, let balanced sampling handle it
400489 for ( const line of lines ) {
401- if ( line . trim ( ) && results . length < limit ) {
490+ if ( line . trim ( ) ) {
402491 results . push ( line )
403- } else if ( results . length >= limit ) {
404- break
405492 }
406493 }
407494 }
0 commit comments