Skip to content

Commit 038d0d2

Browse files
committed
feat: optimize large file scanning by pre-filtering git-lfs patterns
- Modified getLargeFileAutoExcludePatterns to accept git-lfs patterns as a parameter
- Use ripgrep exclusion flags to pre-filter git-lfs managed files before size checking
- This avoids unnecessary file system operations on already-ignored files
- Added comprehensive unit tests for the optimization

As suggested by @adamhill, this leverages the existing git-lfs filter to improve performance.
1 parent 24d887b commit 038d0d2

File tree

2 files changed

+153
-5
lines changed

2 files changed

+153
-5
lines changed

src/services/checkpoints/__tests__/excludes.spec.ts

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,14 @@
33
import { join } from "path"
44
import fs from "fs/promises"
55
import { fileExistsAtPath } from "../../../utils/fs"
6-
import { getExcludePatterns } from "../excludes"
6+
import { getExcludePatterns, getExcludePatternsWithStats } from "../excludes"
7+
import { executeRipgrep } from "../../search/file-search"
78

89
// Mock fs/promises
910
vi.mock("fs/promises", () => ({
1011
default: {
1112
readFile: vi.fn(),
13+
stat: vi.fn(),
1214
},
1315
}))
1416

@@ -17,6 +19,12 @@ vi.mock("../../../utils/fs", () => ({
1719
fileExistsAtPath: vi.fn(),
1820
}))
1921

22+
// Mock executeRipgrep
23+
vi.mock("../../search/file-search", () => ({
24+
executeRipgrep: vi.fn(),
25+
executeRipgrepForFiles: vi.fn(),
26+
}))
27+
2028
describe("getExcludePatterns", () => {
2129
const testWorkspacePath = "/test/workspace"
2230

@@ -152,4 +160,108 @@ readme.md text
152160
expect(excludePatterns).toContain("*.log") // log
153161
})
154162
})
163+
164+
describe("getLargeFileAutoExcludePatterns with LFS pre-filtering", () => {
165+
it("should pre-filter git-lfs patterns when scanning for large files", async () => {
166+
// Mock .gitattributes file exists with LFS patterns
167+
vi.mocked(fileExistsAtPath).mockResolvedValue(true)
168+
const gitAttributesContent = `*.psd filter=lfs diff=lfs merge=lfs -text
169+
*.zip filter=lfs diff=lfs merge=lfs -text
170+
*.mp4 filter=lfs diff=lfs merge=lfs -text
171+
`
172+
vi.mocked(fs.readFile).mockResolvedValue(gitAttributesContent)
173+
174+
// Mock executeRipgrep to return some files
175+
vi.mocked(executeRipgrep).mockResolvedValue([
176+
{ path: "file1.txt", type: "file", label: "file1.txt" },
177+
{ path: "large.bin", type: "file", label: "large.bin" },
178+
{ path: "code.js", type: "file", label: "code.js" },
179+
])
180+
181+
// Mock file stats
182+
vi.mocked(fs.stat).mockImplementation(async (path) => {
183+
const pathStr = path.toString()
184+
if (pathStr.includes("large.bin")) {
185+
return { size: 20 * 1024 * 1024 } as any // 20MB
186+
}
187+
return { size: 1024 } as any // 1KB
188+
})
189+
190+
// Get exclude patterns with stats
191+
const result = await getExcludePatternsWithStats(testWorkspacePath)
192+
193+
// Verify executeRipgrep was called with LFS patterns as exclusions
194+
expect(executeRipgrep).toHaveBeenCalledWith(
195+
expect.objectContaining({
196+
args: expect.arrayContaining(["-g", "!*.psd", "-g", "!*.zip", "-g", "!*.mp4"]),
197+
workspacePath: testWorkspacePath,
198+
}),
199+
)
200+
201+
// Verify large.bin was detected and reported as an excluded large file
202+
expect(result.stats.largeFilesExcluded).toBe(1)
203+
expect(result.stats.sample).toContain("large.bin")
204+
})
205+
206+
it("should handle empty LFS patterns gracefully", async () => {
207+
// Mock no .gitattributes file
208+
vi.mocked(fileExistsAtPath).mockResolvedValue(false)
209+
210+
// Mock executeRipgrep to return some files
211+
vi.mocked(executeRipgrep).mockResolvedValue([
212+
{ path: "file1.txt", type: "file", label: "file1.txt" },
213+
{ path: "large.bin", type: "file", label: "large.bin" },
214+
])
215+
216+
// Mock file stats
217+
vi.mocked(fs.stat).mockImplementation(async (path) => {
218+
const pathStr = path.toString()
219+
if (pathStr.includes("large.bin")) {
220+
return { size: 20 * 1024 * 1024 } as any // 20MB
221+
}
222+
return { size: 1024 } as any // 1KB
223+
})
224+
225+
// Get exclude patterns with stats
226+
const result = await getExcludePatternsWithStats(testWorkspacePath)
227+
228+
// Verify executeRipgrep was called without LFS patterns
229+
expect(executeRipgrep).toHaveBeenCalledWith(
230+
expect.objectContaining({
231+
args: expect.not.arrayContaining(["-g", "!*.psd", "-g", "!*.zip", "-g", "!*.mp4"]),
232+
workspacePath: testWorkspacePath,
233+
}),
234+
)
235+
236+
// Verify large file was still detected
237+
expect(result.stats.largeFilesExcluded).toBe(1)
238+
expect(result.stats.sample).toContain("large.bin")
239+
})
240+
241+
it("should not exclude code files even if they are large", async () => {
242+
// Mock no .gitattributes file
243+
vi.mocked(fileExistsAtPath).mockResolvedValue(false)
244+
245+
// Mock executeRipgrep to return some files including large code files
246+
vi.mocked(executeRipgrep).mockResolvedValue([
247+
{ path: "huge.js", type: "file", label: "huge.js" },
248+
{ path: "large.bin", type: "file", label: "large.bin" },
249+
{ path: "big.ts", type: "file", label: "big.ts" },
250+
])
251+
252+
// Mock file stats - all files are large
253+
vi.mocked(fs.stat).mockImplementation(async () => {
254+
return { size: 20 * 1024 * 1024 } as any // 20MB
255+
})
256+
257+
// Get exclude patterns with stats
258+
const result = await getExcludePatternsWithStats(testWorkspacePath)
259+
260+
// Verify only the non-code file was excluded
261+
expect(result.stats.largeFilesExcluded).toBe(1)
262+
expect(result.stats.sample).toContain("large.bin")
263+
expect(result.stats.sample).not.toContain("huge.js")
264+
expect(result.stats.sample).not.toContain("big.ts")
265+
})
266+
})
155267
})

src/services/checkpoints/excludes.ts

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ import fs from "fs/promises"
22
import * as path from "path"
33

44
import { fileExistsAtPath } from "../../utils/fs"
5-
import { executeRipgrepForFiles } from "../search/file-search"
5+
import { executeRipgrepForFiles, executeRipgrep } from "../search/file-search"
66

77
const DEFAULT_LARGE_FILE_THRESHOLD_BYTES = 10 * 1024 * 1024 // 10 MB
88

@@ -268,14 +268,42 @@ const getGameEnginePatterns = () => [
268268

269269
/**
270270
* Scan the workspace for very large non-code files and exclude them automatically.
271+
* Pre-filters out git-lfs managed files to avoid unnecessary file system operations.
271272
* Uses ripgrep for fast file listing, then fs.stat for sizes.
272273
*/
273274
async function getLargeFileAutoExcludePatterns(
274275
workspacePath: string,
275276
thresholdBytes: number = DEFAULT_LARGE_FILE_THRESHOLD_BYTES,
277+
lfsPatterns: string[] = [],
276278
): Promise<string[]> {
277279
try {
278-
const items = await executeRipgrepForFiles(workspacePath, 50000)
280+
// Create a custom ripgrep execution that excludes git-lfs patterns
281+
const args = [
282+
"--files",
283+
"--follow",
284+
"--hidden",
285+
"-g",
286+
"!**/node_modules/**",
287+
"-g",
288+
"!**/.git/**",
289+
"-g",
290+
"!**/out/**",
291+
"-g",
292+
"!**/dist/**",
293+
]
294+
295+
// Add git-lfs patterns as exclusions to ripgrep
296+
// This pre-filters files before we check their sizes
297+
for (const pattern of lfsPatterns) {
298+
// Convert git-lfs patterns to ripgrep glob patterns
299+
// ripgrep treats a "!"-prefixed glob as an exclusion, so "*.psd" becomes "!*.psd"
300+
const rgPattern = pattern.startsWith("!") ? pattern.substring(1) : `!${pattern}`
301+
args.push("-g", rgPattern)
302+
}
303+
304+
args.push(workspacePath)
305+
306+
const items = await executeRipgrep({ args, workspacePath, limit: 50000 })
279307
const large: string[] = []
280308

281309
for (const item of items) {
@@ -311,6 +339,9 @@ export async function getExcludePatternsWithStats(workspacePath: string): Promis
311339
patterns: string[]
312340
stats: { largeFilesExcluded: number; thresholdBytes: number; sample: string[] }
313341
}> {
342+
// Get git-lfs patterns first
343+
const lfsPatterns = await getLfsPatterns(workspacePath)
344+
314345
const base = [
315346
".git/",
316347
...getBuildArtifactPatterns(),
@@ -322,10 +353,15 @@ export async function getExcludePatternsWithStats(workspacePath: string): Promis
322353
...getGeospatialPatterns(),
323354
...getLogFilePatterns(),
324355
...getGameEnginePatterns(),
325-
...(await getLfsPatterns(workspacePath)),
356+
...lfsPatterns,
326357
]
327358

328-
const dynamicLarge = await getLargeFileAutoExcludePatterns(workspacePath)
359+
// Pass lfs patterns to the large file scanner to pre-filter them
360+
const dynamicLarge = await getLargeFileAutoExcludePatterns(
361+
workspacePath,
362+
DEFAULT_LARGE_FILE_THRESHOLD_BYTES,
363+
lfsPatterns,
364+
)
329365

330366
const patterns = Array.from(new Set([...base, ...dynamicLarge]))
331367

0 commit comments

Comments
 (0)