diff --git a/src/services/checkpoints/ShadowCheckpointService.ts b/src/services/checkpoints/ShadowCheckpointService.ts index 03e019ed60..d0b9990f9f 100644 --- a/src/services/checkpoints/ShadowCheckpointService.ts +++ b/src/services/checkpoints/ShadowCheckpointService.ts @@ -11,7 +11,7 @@ import { fileExistsAtPath } from "../../utils/fs" import { executeRipgrep } from "../../services/search/file-search" import { CheckpointDiff, CheckpointResult, CheckpointEventMap } from "./types" -import { getExcludePatterns } from "./excludes" +import { getExcludePatterns, getExcludePatternsWithStats } from "./excludes" export abstract class ShadowCheckpointService extends EventEmitter { public readonly taskId: string @@ -95,7 +95,12 @@ export abstract class ShadowCheckpointService extends EventEmitter { ) } - await this.writeExcludeFile() + // Only regenerate exclude file if it doesn't exist + const excludePath = path.join(this.dotGitDir, "info", "exclude") + if (!(await fileExistsAtPath(excludePath))) { + this.log(`[${this.constructor.name}#initShadowGit] exclude file missing, regenerating`) + await this.writeExcludeFile() + } this.baseHash = await git.revparse(["HEAD"]) } else { this.log(`[${this.constructor.name}#initShadowGit] creating shadow git repo at ${this.checkpointsDir}`) @@ -104,7 +109,8 @@ export abstract class ShadowCheckpointService extends EventEmitter { await git.addConfig("commit.gpgSign", "false") // Disable commit signing for shadow repo. 
await git.addConfig("user.name", "Roo Code") await git.addConfig("user.email", "noreply@example.com") - await this.writeExcludeFile() + // Force write exclude file on initial creation (git creates a default one) + await this.writeExcludeFile(true) await this.stageAll(git) const { commit } = await git.commit("initial commit", { "--allow-empty": null }) this.baseHash = commit @@ -137,10 +143,44 @@ export abstract class ShadowCheckpointService extends EventEmitter { // .git/info/exclude is local to the shadow git repo, so it's not // shared with the main repo - and won't conflict with user's // .gitignore. - protected async writeExcludeFile() { + // Note: This is only called on initial creation or when the exclude file is missing + // to avoid expensive scans on every initialization. + protected async writeExcludeFile(forceRefresh: boolean = false) { + // Skip if exclude file exists and not forcing refresh + if (!forceRefresh) { + const excludePath = path.join(this.dotGitDir, "info", "exclude") + if (await fileExistsAtPath(excludePath)) { + this.log(`[${this.constructor.name}#writeExcludeFile] exclude file exists, skipping regeneration`) + return + } + } + await fs.mkdir(path.join(this.dotGitDir, "info"), { recursive: true }) - const patterns = await getExcludePatterns(this.workspaceDir) + const { patterns, stats } = await getExcludePatternsWithStats(this.workspaceDir) await fs.writeFile(path.join(this.dotGitDir, "info", "exclude"), patterns.join("\n")) + + const mb = Math.round(stats.thresholdBytes / (1024 * 1024)) + + if (stats?.largeFilesExcluded && stats.largeFilesExcluded > 0) { + this.log( + `[${this.constructor.name}#writeExcludeFile] auto-excluding ${stats.largeFilesExcluded} large files (>= ${mb}MB) from checkpoints. 
Sample: ${stats.sample.join(", ")}`, + ) + } + + if (stats?.errorCounts && (stats.errorCounts.ripgrepErrors > 0 || stats.errorCounts.fsStatErrors > 0)) { + this.log( + `[${this.constructor.name}#writeExcludeFile] auto-exclude encountered errors (ripgrepErrors=${stats.errorCounts.ripgrepErrors}, fsStatErrors=${stats.errorCounts.fsStatErrors}). Check environment and filesystem permissions.`, + ) + } + } + + // Public method to allow manual refresh of exclude patterns if needed + public async refreshExcludePatterns() { + if (!this.git) { + throw new Error("Shadow git repo not initialized") + } + this.log(`[${this.constructor.name}#refreshExcludePatterns] manually refreshing exclude patterns`) + await this.writeExcludeFile(true) } private async stageAll(git: SimpleGit) { diff --git a/src/services/checkpoints/__tests__/ShadowCheckpointService.spec.ts b/src/services/checkpoints/__tests__/ShadowCheckpointService.spec.ts index 4bf2529d59..678b614547 100644 --- a/src/services/checkpoints/__tests__/ShadowCheckpointService.spec.ts +++ b/src/services/checkpoints/__tests__/ShadowCheckpointService.spec.ts @@ -295,6 +295,11 @@ describe.each([[RepoPerTaskCheckpointService, "RepoPerTaskCheckpointService"]])( }) it("does not create a checkpoint for ignored files", async () => { + // Verify that the exclude file was created during initialization + const excludesPath = path.join(service.checkpointsDir, ".git", "info", "exclude") + const excludeContent = await fs.readFile(excludesPath, "utf-8") + expect(excludeContent).toContain("*.log") + // Create a file that matches an ignored pattern (e.g., .log file). 
const ignoredFile = path.join(service.workspaceDir, "ignored.log") await fs.writeFile(ignoredFile, "Initial ignored content") @@ -315,10 +320,12 @@ describe.each([[RepoPerTaskCheckpointService, "RepoPerTaskCheckpointService"]])( const gitattributesPath = path.join(service.workspaceDir, ".gitattributes") await fs.writeFile(gitattributesPath, "*.lfs filter=lfs diff=lfs merge=lfs -text") + // Delete the exclude file to force regeneration with new LFS patterns + const excludesPath = path.join(service.checkpointsDir, ".git", "info", "exclude") + await fs.unlink(excludesPath).catch(() => {}) // Ignore error if file doesn't exist + // Re-initialize the service to trigger a write to .git/info/exclude. service = new klass(service.taskId, service.checkpointsDir, service.workspaceDir, () => {}) - const excludesPath = path.join(service.checkpointsDir, ".git", "info", "exclude") - expect((await fs.readFile(excludesPath, "utf-8")).split("\n")).not.toContain("*.lfs") await service.initShadowGit() expect((await fs.readFile(excludesPath, "utf-8")).split("\n")).toContain("*.lfs") diff --git a/src/services/checkpoints/__tests__/excludes.spec.ts b/src/services/checkpoints/__tests__/excludes.spec.ts index 923b3d478e..3a41c346ef 100644 --- a/src/services/checkpoints/__tests__/excludes.spec.ts +++ b/src/services/checkpoints/__tests__/excludes.spec.ts @@ -3,12 +3,14 @@ import { join } from "path" import fs from "fs/promises" import { fileExistsAtPath } from "../../../utils/fs" -import { getExcludePatterns } from "../excludes" +import { getExcludePatterns, getExcludePatternsWithStats } from "../excludes" +import { executeRipgrep } from "../../search/file-search" // Mock fs/promises vi.mock("fs/promises", () => ({ default: { readFile: vi.fn(), + stat: vi.fn(), }, })) @@ -17,6 +19,12 @@ vi.mock("../../../utils/fs", () => ({ fileExistsAtPath: vi.fn(), })) +// Mock executeRipgrep +vi.mock("../../search/file-search", () => ({ + executeRipgrep: vi.fn(), + executeRipgrepForFiles: vi.fn(), +})) + 
describe("getExcludePatterns", () => { const testWorkspacePath = "/test/workspace" @@ -151,5 +159,193 @@ readme.md text expect(excludePatterns).toContain("*.shp") // geospatial expect(excludePatterns).toContain("*.log") // log }) + it("should include Windows Thumbs.db cache pattern", async () => { + // Mock .gitattributes file doesn't exist + vi.mocked(fileExistsAtPath).mockResolvedValue(false) + + // Get exclude patterns + const excludePatterns = await getExcludePatterns(testWorkspacePath) + + // Verify Windows cache file pattern is included + expect(excludePatterns).toContain("Thumbs.db") + }) + }) + + describe("getLargeFileAutoExcludePatterns with LFS pre-filtering", () => { + it("should pre-filter git-lfs patterns when scanning for large files", async () => { + // Mock .gitattributes file exists with LFS patterns + vi.mocked(fileExistsAtPath).mockResolvedValue(true) + const gitAttributesContent = `*.psd filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.mp4 filter=lfs diff=lfs merge=lfs -text +` + vi.mocked(fs.readFile).mockResolvedValue(gitAttributesContent) + + // Mock executeRipgrep to return some files + vi.mocked(executeRipgrep).mockResolvedValue([ + { path: "file1.txt", type: "file", label: "file1.txt" }, + { path: "large.bin", type: "file", label: "large.bin" }, + { path: "code.js", type: "file", label: "code.js" }, + ]) + + // Mock file stats + vi.mocked(fs.stat).mockImplementation(async (path) => { + const pathStr = path.toString() + if (pathStr.includes("large.bin")) { + return { size: 20 * 1024 * 1024 } as any // 20MB + } + return { size: 1024 } as any // 1KB + }) + + // Get exclude patterns with stats + const result = await getExcludePatternsWithStats(testWorkspacePath) + + // Verify executeRipgrep was called with LFS patterns as exclusions + expect(executeRipgrep).toHaveBeenCalledWith( + expect.objectContaining({ + args: expect.arrayContaining(["-g", "!*.psd", "-g", "!*.zip", "-g", "!*.mp4"]), + workspacePath: 
testWorkspacePath, + }), + ) + + // Verify large.bin was detected and included + expect(result.stats.largeFilesExcluded).toBe(1) + expect(result.stats.sample).toContain("large.bin") + }) + + it("should handle empty LFS patterns gracefully", async () => { + // Mock no .gitattributes file + vi.mocked(fileExistsAtPath).mockResolvedValue(false) + + // Mock executeRipgrep to return some files + vi.mocked(executeRipgrep).mockResolvedValue([ + { path: "file1.txt", type: "file", label: "file1.txt" }, + { path: "large.bin", type: "file", label: "large.bin" }, + ]) + + // Mock file stats + vi.mocked(fs.stat).mockImplementation(async (path) => { + const pathStr = path.toString() + if (pathStr.includes("large.bin")) { + return { size: 20 * 1024 * 1024 } as any // 20MB + } + return { size: 1024 } as any // 1KB + }) + + // Get exclude patterns with stats + const result = await getExcludePatternsWithStats(testWorkspacePath) + + // Verify executeRipgrep was called without LFS patterns + expect(executeRipgrep).toHaveBeenCalledWith( + expect.objectContaining({ + args: expect.not.arrayContaining(["-g", "!*.psd", "-g", "!*.zip", "-g", "!*.mp4"]), + workspacePath: testWorkspacePath, + }), + ) + + // Verify large file was still detected + expect(result.stats.largeFilesExcluded).toBe(1) + expect(result.stats.sample).toContain("large.bin") + }) + + it("should not exclude code files even if they are large", async () => { + // Mock no .gitattributes file + vi.mocked(fileExistsAtPath).mockResolvedValue(false) + + // Mock executeRipgrep to return some files including large code files + vi.mocked(executeRipgrep).mockResolvedValue([ + { path: "huge.js", type: "file", label: "huge.js" }, + { path: "large.bin", type: "file", label: "large.bin" }, + { path: "big.ts", type: "file", label: "big.ts" }, + ]) + + // Mock file stats - all files are large + vi.mocked(fs.stat).mockImplementation(async () => { + return { size: 20 * 1024 * 1024 } as any // 20MB + }) + + // Get exclude patterns with stats + 
const result = await getExcludePatternsWithStats(testWorkspacePath) + + // Verify only non-code file was excluded + expect(result.stats.largeFilesExcluded).toBe(1) + expect(result.stats.sample).toContain("large.bin") + expect(result.stats.sample).not.toContain("huge.js") + expect(result.stats.sample).not.toContain("big.ts") + }) + }) + + describe("configurable threshold and error reporting", () => { + it("respects ROO_CHECKPOINTS_LARGE_FILE_THRESHOLD_MB override", async () => { + // Ensure no LFS patterns + vi.mocked(fileExistsAtPath).mockResolvedValue(false) + + // Set threshold to 1 MB + const prev = process.env.ROO_CHECKPOINTS_LARGE_FILE_THRESHOLD_MB + process.env.ROO_CHECKPOINTS_LARGE_FILE_THRESHOLD_MB = "1" + + try { + // Mock file listing + vi.mocked(executeRipgrep).mockResolvedValue([ + { path: "large.bin", type: "file", label: "large.bin" }, + { path: "code.js", type: "file", label: "code.js" }, + ]) + + // Mock sizes: 2MB for large.bin, 2MB for code.js (but code is allowlisted) + vi.mocked(fs.stat).mockImplementation(async (p) => { + const s = p.toString() + if (s.includes("large.bin") || s.includes("code.js")) { + return { size: 2 * 1024 * 1024 } as any + } + return { size: 1024 } as any + }) + + const result = await getExcludePatternsWithStats(testWorkspacePath) + + expect(result.stats.thresholdBytes).toBe(1 * 1024 * 1024) + expect(result.stats.largeFilesExcluded).toBe(1) + expect(result.stats.sample).toContain("large.bin") + // code.js should never be excluded even if large + expect(result.stats.sample).not.toContain("code.js") + } finally { + // cleanup + if (prev === undefined) { + delete process.env.ROO_CHECKPOINTS_LARGE_FILE_THRESHOLD_MB + } else { + process.env.ROO_CHECKPOINTS_LARGE_FILE_THRESHOLD_MB = prev + } + } + }) + + it("records ripgrep failures without breaking pattern generation", async () => { + vi.mocked(fileExistsAtPath).mockResolvedValue(false) + // Force executeRipgrep to throw + vi.mocked(executeRipgrep).mockRejectedValue(new 
Error("ripgrep failed")) + + const result = await getExcludePatternsWithStats(testWorkspacePath) + + // No dynamic large files because ripgrep failed + expect(result.stats.largeFilesExcluded).toBe(0) + expect(result.stats.sample.length).toBe(0) + // Error counts should reflect one ripgrep error + expect(result.stats.errorCounts?.ripgrepErrors).toBe(1) + expect(result.stats.errorCounts?.fsStatErrors).toBe(0) + // Base patterns should still include .git/ + expect(result.patterns).toContain(".git/") + }) + + it("counts fs.stat errors for diagnostics", async () => { + vi.mocked(fileExistsAtPath).mockResolvedValue(false) + vi.mocked(executeRipgrep).mockResolvedValue([{ path: "mystery.bin", type: "file", label: "mystery.bin" }]) + // Make stat fail + vi.mocked(fs.stat).mockRejectedValue(new Error("stat failure")) + + const result = await getExcludePatternsWithStats(testWorkspacePath) + + expect(result.stats.largeFilesExcluded).toBe(0) + expect(result.stats.sample.length).toBe(0) + expect(result.stats.errorCounts?.ripgrepErrors).toBe(0) + expect(result.stats.errorCounts?.fsStatErrors).toBe(1) + }) }) }) diff --git a/src/services/checkpoints/excludes.ts b/src/services/checkpoints/excludes.ts index 382e400f18..0867073e40 100644 --- a/src/services/checkpoints/excludes.ts +++ b/src/services/checkpoints/excludes.ts @@ -1,7 +1,61 @@ import fs from "fs/promises" -import { join } from "path" +import * as path from "path" import { fileExistsAtPath } from "../../utils/fs" +import { executeRipgrep } from "../search/file-search" + +const DEFAULT_LARGE_FILE_THRESHOLD_BYTES = 10 * 1024 * 1024 // 10 MB + +function getConfiguredLargeFileThresholdBytes(): number { + // Allow override via environment variable (in MB), e.g. ROO_CHECKPOINTS_LARGE_FILE_THRESHOLD_MB=25 + const env = process.env.ROO_CHECKPOINTS_LARGE_FILE_THRESHOLD_MB + const parsed = env ? 
Number(env) : NaN
+	if (Number.isFinite(parsed) && parsed > 0) {
+		return Math.round(parsed * 1024 * 1024)
+	}
+	return DEFAULT_LARGE_FILE_THRESHOLD_BYTES
+}
+
+// Common code/text extensions that should not be auto-excluded by size
+const CODE_EXT_ALLOWLIST: Set<string> = new Set([
+	".ts",
+	".tsx",
+	".js",
+	".jsx",
+	".json",
+	".md",
+	".txt",
+	".py",
+	".java",
+	".cs",
+	".cpp",
+	".c",
+	".h",
+	".hpp",
+	".go",
+	".rb",
+	".rs",
+	".kt",
+	".swift",
+	".m",
+	".mm",
+	".php",
+	".html",
+	".css",
+	".scss",
+	".less",
+	".xml",
+	".yml",
+	".yaml",
+	".toml",
+	".ini",
+	".gradle",
+	".csproj",
+	".sln",
+	".vue",
+	".svelte",
+	".astro",
+])
 
 const getBuildArtifactPatterns = () => [
 	".gradle/",
@@ -96,7 +150,7 @@ const getCacheFilePatterns = () => [
 	"*.swp",
 	"*.temp",
 	"*.tmp",
-	"*.Thumbs.db",
+	"Thumbs.db",
 ]
 
 const getConfigFilePatterns = () => ["*.env*", "*.local", "*.development", "*.production"]
@@ -185,7 +239,7 @@ const getLogFilePatterns = () => [
 
 const getLfsPatterns = async (workspacePath: string) => {
 	try {
-		const attributesPath = join(workspacePath, ".gitattributes")
+		const attributesPath = path.join(workspacePath, ".gitattributes")
 
 		if (await fileExistsAtPath(attributesPath)) {
 			return (await fs.readFile(attributesPath, "utf8"))
@@ -198,15 +252,159 @@ const getLfsPatterns = async (workspacePath: string) => {
 	return []
 }
 
-export const getExcludePatterns = async (workspacePath: string) => [
-	".git/",
-	...getBuildArtifactPatterns(),
-	...getMediaFilePatterns(),
-	...getCacheFilePatterns(),
-	...getConfigFilePatterns(),
-	...getLargeDataFilePatterns(),
-	...getDatabaseFilePatterns(),
-	...getGeospatialPatterns(),
-	...getLogFilePatterns(),
-	...(await getLfsPatterns(workspacePath)),
+/**
+ * Additional patterns for common game engines and large asset-heavy projects (Unity, Unreal, etc.)
+ * This helps avoid checkpointing huge binary assets by default.
+ */ +const getGameEnginePatterns = () => [ + // Unity + "Library/", + "Temp/", + "Build/", + "Builds/", + "Logs/", + "UserSettings/", + "*.unity", + "*.prefab", + "*.asset", + "*.fbx", + "*.blend", + "*.obj", + "*.unitypackage", + // Unreal + "*.uasset", + "*.umap", ] + +/** + * Scan the workspace for very large non-code files and exclude them automatically. + * Pre-filters out git-lfs managed files to avoid unnecessary file system operations. + * Uses ripgrep for fast file listing, then fs.stat for sizes. + */ +async function getLargeFileAutoExcludePatterns( + workspacePath: string, + thresholdBytes: number, + lfsPatterns: string[] = [], +): Promise<{ patterns: string[]; errorCounts: { ripgrepErrors: number; fsStatErrors: number } }> { + // Build ripgrep args with common ignores + const args = [ + "--files", + "--follow", + "--hidden", + "-g", + "!**/node_modules/**", + "-g", + "!**/.git/**", + "-g", + "!**/out/**", + "-g", + "!**/dist/**", + ] + + // Pre-filter git-lfs patterns at ripgrep level + for (const pattern of lfsPatterns) { + const rgPattern = pattern.startsWith("!") ? pattern.substring(1) : `!${pattern}` + args.push("-g", rgPattern) + } + + args.push(workspacePath) + + let items: Array<{ path: string; type: string }> = [] + let ripgrepErrors = 0 + let fsStatErrors = 0 + + try { + const rgResult = await executeRipgrep({ args, workspacePath, limit: 50000 }) + items = Array.isArray(rgResult) ? 
rgResult : [] + } catch { + // If ripgrep fails, record error and continue with empty items to avoid breaking checkpoints + ripgrepErrors = 1 + items = [] + } + + const large: string[] = [] + + for (const item of items) { + if ((item as any).type !== "file") continue + + const rel = (item as any).path + const ext = path.extname(rel).toLowerCase() + + // Keep code/text files even if large + if (CODE_EXT_ALLOWLIST.has(ext)) continue + + try { + const stat = await fs.stat(path.join(workspacePath, rel)) + if (stat.size >= thresholdBytes) { + // Normalize to forward slashes for git exclude + large.push(rel.replace(/\\/g, "/")) + } + } catch { + // Count stat errors for diagnostics + fsStatErrors++ + } + } + + return { + patterns: Array.from(new Set(large)), + errorCounts: { ripgrepErrors, fsStatErrors }, + } +} + +/** + * Returns exclude patterns and statistics used for logging/UX decisions. + */ +export async function getExcludePatternsWithStats(workspacePath: string): Promise<{ + patterns: string[] + stats: { + largeFilesExcluded: number + thresholdBytes: number + sample: string[] + errorCounts?: { ripgrepErrors: number; fsStatErrors: number } + } +}> { + // Get git-lfs patterns first + const lfsPatterns = await getLfsPatterns(workspacePath) + + const base = [ + ".git/", + ...getBuildArtifactPatterns(), + ...getMediaFilePatterns(), + ...getCacheFilePatterns(), + ...getConfigFilePatterns(), + ...getLargeDataFilePatterns(), + ...getDatabaseFilePatterns(), + ...getGeospatialPatterns(), + ...getLogFilePatterns(), + ...getGameEnginePatterns(), + ...lfsPatterns, + ] + + // Determine threshold (env override supported) + const thresholdBytes = getConfiguredLargeFileThresholdBytes() + + // Pass lfs patterns to the large file scanner to pre-filter them + const { patterns: dynamicLarge, errorCounts } = await getLargeFileAutoExcludePatterns( + workspacePath, + thresholdBytes, + lfsPatterns, + ) + + const patterns = Array.from(new Set([...base, ...dynamicLarge])) + + return { + 
patterns, + stats: { + largeFilesExcluded: dynamicLarge.length, + thresholdBytes, + sample: dynamicLarge.slice(0, 10), + errorCounts, + }, + } +} + +/** + * Backwards-compatible helper used by existing callers/tests. + */ +export const getExcludePatterns = async (workspacePath: string) => + (await getExcludePatternsWithStats(workspacePath)).patterns