Skip to content

Commit aa77ada

Browse files
committed
Fix #5301: Prevent early termination in list-files when directory has many files
- Remove early process termination in execRipgrep that was causing incomplete directory traversal - Implement balanced sampling algorithm to ensure fair representation across all directories - Increase timeout values to allow complete directory scanning before applying limits - Add applyBalancedSampling function to distribute file selection evenly across directories - Maintain existing 200-file limit while ensuring all directories are represented This fixes the issue where having 200+ files in one directory (e.g., 'a/') would cause other directories (e.g., 'b/') to be completely ignored in the file listing results.
1 parent 3a8ba27 commit aa77ada

File tree

219 files changed

+381
-18
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

219 files changed

+381
-18
lines changed

src/services/glob/list-files.ts

Lines changed: 105 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,22 @@ export async function listFiles(dirPath: string, recursive: boolean, limit: numb
2626
// Get ripgrep path
2727
const rgPath = await getRipgrepPath()
2828

29-
// Get files using ripgrep
30-
const files = await listFilesWithRipgrep(rgPath, dirPath, recursive, limit)
31-
32-
// Get directories with proper filtering
29+
// Get directories with proper filtering first to ensure we capture the structure
3330
const gitignorePatterns = await parseGitignoreFile(dirPath, recursive)
3431
const directories = await listFilteredDirectories(dirPath, recursive, gitignorePatterns)
3532

33+
// Get files using ripgrep with a higher limit to avoid early termination
34+
// We'll use a higher internal limit and then apply balanced sampling
35+
const internalLimit = Math.max(limit * 3, 1000) // Use 3x the requested limit or 1000, whichever is higher
36+
const files = await listFilesWithRipgrep(rgPath, dirPath, recursive, internalLimit)
37+
38+
// Apply balanced sampling to ensure fair representation across directories
39+
// Reserve some space for directories in the limit
40+
const filesLimit = Math.max(limit - directories.length, Math.floor(limit * 0.8))
41+
const balancedFiles = applyBalancedSampling(files, directories, filesLimit)
42+
3643
// Combine and format the results
37-
return formatAndCombineResults(files, directories, limit)
44+
return formatAndCombineResults(balancedFiles, directories, limit)
3845
}
3946

4047
/**
@@ -304,6 +311,91 @@ function isIgnoredByGitignore(dirName: string, gitignorePatterns: string[]): boo
304311
return false
305312
}
306313

314+
/**
315+
* Apply balanced sampling to ensure fair representation across directories
316+
* This prevents one large directory from dominating the file list
317+
*/
318+
function applyBalancedSampling(files: string[], directories: string[], limit: number): string[] {
319+
if (files.length <= limit) {
320+
return files
321+
}
322+
323+
// Group files by their parent directory
324+
const filesByDirectory = new Map<string, string[]>()
325+
326+
for (const file of files) {
327+
const dir = path.dirname(file)
328+
if (!filesByDirectory.has(dir)) {
329+
filesByDirectory.set(dir, [])
330+
}
331+
filesByDirectory.get(dir)!.push(file)
332+
}
333+
334+
// Improved balanced sampling algorithm
335+
const dirEntries = Array.from(filesByDirectory.entries())
336+
dirEntries.sort(([a], [b]) => a.localeCompare(b))
337+
338+
const result: string[] = []
339+
const dirCount = dirEntries.length
340+
341+
// Ensure each directory gets at least a minimum number of files
342+
const minFilesPerDir = Math.max(3, Math.floor(limit / (dirCount * 4))) // At least 3 files per dir, or 1/4 of fair share
343+
const maxFilesPerDir = Math.floor(limit / dirCount) + 10 // Allow some directories to have more files
344+
345+
// First pass: give each directory its minimum allocation
346+
let remainingLimit = limit
347+
const dirAllocations = new Map<string, number>()
348+
349+
for (const [dir, dirFiles] of dirEntries) {
350+
const allocation = Math.min(minFilesPerDir, dirFiles.length, remainingLimit)
351+
dirAllocations.set(dir, allocation)
352+
remainingLimit -= allocation
353+
}
354+
355+
// Second pass: distribute remaining slots proportionally to directory sizes
356+
if (remainingLimit > 0) {
357+
const totalFiles = dirEntries.reduce((sum, [, dirFiles]) => sum + dirFiles.length, 0)
358+
359+
for (const [dir, dirFiles] of dirEntries) {
360+
const currentAllocation = dirAllocations.get(dir)!
361+
const proportion = dirFiles.length / totalFiles
362+
const additionalSlots = Math.min(
363+
Math.floor(remainingLimit * proportion),
364+
maxFilesPerDir - currentAllocation,
365+
dirFiles.length - currentAllocation,
366+
)
367+
368+
if (additionalSlots > 0) {
369+
dirAllocations.set(dir, currentAllocation + additionalSlots)
370+
remainingLimit -= additionalSlots
371+
}
372+
}
373+
}
374+
375+
// Third pass: distribute any remaining slots to directories that can take them
376+
for (const [dir, dirFiles] of dirEntries) {
377+
if (remainingLimit <= 0) break
378+
379+
const currentAllocation = dirAllocations.get(dir)!
380+
const canTakeMore = Math.min(remainingLimit, dirFiles.length - currentAllocation)
381+
382+
if (canTakeMore > 0) {
383+
dirAllocations.set(dir, currentAllocation + canTakeMore)
384+
remainingLimit -= canTakeMore
385+
}
386+
}
387+
388+
// Collect the files based on allocations
389+
for (const [dir, dirFiles] of dirEntries) {
390+
const allocation = dirAllocations.get(dir)!
391+
dirFiles.sort() // Ensure consistent ordering
392+
const selectedFiles = dirFiles.slice(0, allocation)
393+
result.push(...selectedFiles)
394+
}
395+
396+
return result
397+
}
398+
307399
/**
308400
* Combine file and directory results and format them properly
309401
*/
@@ -338,23 +430,20 @@ async function execRipgrep(rgPath: string, args: string[], limit: number): Promi
338430
let output = ""
339431
let results: string[] = []
340432

341-
// Set timeout to avoid hanging
433+
// Set timeout to avoid hanging - increased to allow more complete traversal
342434
const timeoutId = setTimeout(() => {
343435
rgProcess.kill()
344436
console.warn("ripgrep timed out, returning partial results")
345-
resolve(results.slice(0, limit))
346-
}, 10_000)
437+
resolve(results) // Don't slice here either
438+
}, 15_000)
347439

348440
// Process stdout data as it comes in
349441
rgProcess.stdout.on("data", (data) => {
350442
output += data.toString()
351443
processRipgrepOutput()
352444

353-
// Kill the process if we've reached the limit
354-
if (results.length >= limit) {
355-
rgProcess.kill()
356-
clearTimeout(timeoutId) // Clear the timeout when we kill the process due to reaching the limit
357-
}
445+
// Don't kill the process early - let it complete to get full directory structure
446+
// The balanced sampling will be applied later in applyBalancedSampling
358447
})
359448

360449
// Process stderr but don't fail on non-zero exit codes
@@ -375,7 +464,7 @@ async function execRipgrep(rgPath: string, args: string[], limit: number): Promi
375464
console.warn(`ripgrep process exited with code ${code}, returning partial results`)
376465
}
377466

378-
resolve(results.slice(0, limit))
467+
resolve(results) // Don't slice here - let balanced sampling handle the limit
379468
})
380469

381470
// Handle process errors
@@ -396,12 +485,10 @@ async function execRipgrep(rgPath: string, args: string[], limit: number): Promi
396485
output = ""
397486
}
398487

399-
// Process each complete line
488+
// Process each complete line - don't limit here, let balanced sampling handle it
400489
for (const line of lines) {
401-
if (line.trim() && results.length < limit) {
490+
if (line.trim()) {
402491
results.push(line)
403-
} else if (results.length >= limit) {
404-
break
405492
}
406493
}
407494
}

test-issue-5301/a/file001.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Content of file 1

test-issue-5301/a/file002.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Content of file 2

test-issue-5301/a/file003.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Content of file 3

test-issue-5301/a/file004.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Content of file 4

test-issue-5301/a/file005.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Content of file 5

test-issue-5301/a/file006.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Content of file 6

test-issue-5301/a/file007.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Content of file 7

test-issue-5301/a/file008.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Content of file 8

test-issue-5301/a/file009.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Content of file 9

0 commit comments

Comments
 (0)