diff --git a/src/services/checkpoints/ShadowCheckpointService.ts b/src/services/checkpoints/ShadowCheckpointService.ts index d6e53980cb..cc7916cf67 100644 --- a/src/services/checkpoints/ShadowCheckpointService.ts +++ b/src/services/checkpoints/ShadowCheckpointService.ts @@ -25,6 +25,10 @@ export abstract class ShadowCheckpointService extends EventEmitter { protected readonly dotGitDir: string protected git?: SimpleGit protected readonly log: (message: string) => void + // GC related properties + private gcCounter: number = 0 + private readonly GC_CHECKPOINT_THRESHOLD: number = Number(process.env.GC_CHECKPOINT_THRESHOLD) || 20 // Run gc every 20 checkpoints + private _nestedGitDirPaths: string[] | null = null // Cache for relative paths like "submodule/.git" protected shadowGitConfigWorktree?: string public get baseHash() { @@ -85,6 +89,13 @@ export abstract class ShadowCheckpointService extends EventEmitter { await this.writeExcludeFile() this.baseHash = await git.revparse(["HEAD"]) + + // --- STAGE 1: Run GC on init for existing repo --- + this.log(`[${this.constructor.name}#initShadowGit] Existing shadow repo found. Running garbage collection.`) + const gcStartTime = Date.now() + + const isWindows = process.platform === "win32" + await this.tryRepack(git, isWindows, gcStartTime) } else { this.log(`[${this.constructor.name}#initShadowGit] creating shadow git repo at ${this.checkpointsDir}`) await git.init() @@ -109,6 +120,10 @@ export abstract class ShadowCheckpointService extends EventEmitter { await onInit?.() + // Warm up the nested git paths cache + // This ensures the potentially slow scan happens once during initialization. + await this.findAndCacheNestedGitRepoPaths() + this.emit("initialize", { type: "initialize", workspaceDir: this.workspaceDir, @@ -131,15 +146,125 @@ export abstract class ShadowCheckpointService extends EventEmitter { await fs.writeFile(path.join(this.dotGitDir, "info", "exclude"), patterns.join("\n")) } + private async tryRepack(git: SimpleGit, isWindows: boolean, gcStartTime: number) { + // Define commands to try based on platform + const commands = isWindows + ? [ + // Try the more thorough repack command on Git 2.49.0 for Windows + { + args: ["repack", "-a", "-d", "-f", "--path-walk", "--quiet"], + desc: "advanced repack with --path-walk", + }, + { args: ["repack", "-a", "-d", "-f", "--window=250", "--quiet"], desc: "repack with --window=250" }, + ] + : [{ args: ["repack", "-a", "-d", "-f", "--window=250", "--quiet"], desc: "repack with --window=250" }] + + // Try each command until one succeeds + for (const command of commands) { + try { + this.log(`[${this.constructor.name}#initShadowGit] Running ${command.desc}...`) + await git.raw(command.args) + this.log( + `[${this.constructor.name}#initShadowGit] Repository repack completed in ${Date.now() - gcStartTime}ms.`, + ) + return // Success - exit early + } catch (error) { + this.log( + `[${this.constructor.name}#initShadowGit] ${command.desc} failed: ${error instanceof Error ? error.message : String(error)}`, + ) + } + } + + this.log(`[${this.constructor.name}#initShadowGit] Repository repack failed, continuing without optimization.`) + } + + // This method scans for nested .git directories once and caches their paths + // to avoid repeated expensive scans by ripgrep. + private async findAndCacheNestedGitRepoPaths(): Promise { + if (this._nestedGitDirPaths === null) { + this.log(`[${this.constructor.name}#findAndCacheNestedGitRepoPaths] Scanning for nested .git directories.`) + // Ripgrep for .git/HEAD files, excluding the root .git/HEAD of the workspace itself. + // Note: The original `renameNestedGitRepos` looked for `**/${gitDir}/HEAD`. + // We'll look for `**/.git/HEAD` and then `renameNestedGitRepos` will handle the suffix. + const ripGrepArgs = [ + "--files", + "--hidden", + "--follow", + "-g", + `**/${path.join(".git", "HEAD")}`, + this.workspaceDir, + ] + const headFileEntries = await executeRipgrep({ args: ripGrepArgs, workspacePath: this.workspaceDir }) + + this._nestedGitDirPaths = headFileEntries + .filter( + ({ type, path: p }) => + type === "file" && + p.endsWith(path.join(".git", "HEAD")) && // Ensure it's a HEAD file + p !== path.join(".git", "HEAD"), // Exclude the main .git/HEAD if workspaceDir is a git repo + ) + .map((entry) => path.dirname(entry.path)) // Get the .git directory path (relative to workspaceDir) + + this.log( + `[${this.constructor.name}#findAndCacheNestedGitRepoPaths] Found ${this._nestedGitDirPaths.length} nested .git directories.`, + ) + } + return this._nestedGitDirPaths + } + + // Instead of `git add .`, this uses `git status` to find specific changes + // and then `git add ` and `git rm `. private async stageAll(git: SimpleGit) { await this.renameNestedGitRepos(true) try { - await git.add(".") + // Use git status to find changed/new/deleted files and stage them specifically + const status = await git.status(["--porcelain=v1", "-uall"]) // -uall includes all untracked files + const filesToAdd: string[] = [] + const filesToRemove: string[] = [] + + if (status && status.files && status.files.length > 0) { + // simple-git's status.files is an array of objects like: + // { path: 'file.txt', index: 'M', working_dir: ' ' } + // index: Status of the index + // working_dir: Status of the working directory + status.files.forEach((file) => { + const filePath = file.path + // Determine if file needs to be added or removed + // 'D' in index or working_dir means deleted + if (file.index === "D" || file.working_dir === "D") { + filesToRemove.push(filePath) + } else if (file.index === "?" || file.working_dir === "?") { + // Untracked + filesToAdd.push(filePath) + } else if (file.index === "A" || file.working_dir === "A") { + // Added + filesToAdd.push(filePath) + } else if (file.index === "M" || file.working_dir === "M") { + // Modified + filesToAdd.push(filePath) + } else if (file.index.startsWith("R") || file.working_dir.startsWith("R")) { + // Renamed + filesToAdd.push(filePath) // Add the new path + } else if (file.index.startsWith("C") || file.working_dir.startsWith("C")) { + // Copied + filesToAdd.push(filePath) // Add the new path + } + // Other statuses like 'U' (unmerged) might need specific handling if relevant + }) + } + + if (filesToRemove.length > 0) { + await git.rm(filesToRemove) + } + if (filesToAdd.length > 0) { + await git.add(filesToAdd) + } } catch (error) { this.log( - `[${this.constructor.name}#stageAll] failed to add files to git: ${error instanceof Error ? error.message : String(error)}`, + `[${this.constructor.name}#stageAll] failed to process git status or stage/remove files: ${error instanceof Error ? error.message : String(error)}`, ) + throw error } finally { await this.renameNestedGitRepos(false) } @@ -148,55 +273,39 @@ export abstract class ShadowCheckpointService extends EventEmitter { // Since we use git to track checkpoints, we need to temporarily disable // nested git repos to work around git's requirement of using submodules for // nested repos. + // This method now uses the pre-scanned and cached list of nested .git directories. private async renameNestedGitRepos(disable: boolean) { - try { - // Find all .git directories that are not at the root level. - const gitDir = ".git" + (disable ? "" : GIT_DISABLED_SUFFIX) - const args = ["--files", "--hidden", "--follow", "-g", `**/${gitDir}/HEAD`, this.workspaceDir] - - const gitPaths = await ( - await executeRipgrep({ args, workspacePath: this.workspaceDir }) - ).filter(({ type, path }) => type === "folder" && path.includes(".git") && !path.startsWith(".git")) - - // For each nested .git directory, rename it based on operation. - for (const gitPath of gitPaths) { - if (gitPath.path.startsWith(".git")) { - continue - } + const nestedGitDirPaths = await this.findAndCacheNestedGitRepoPaths() // Uses cache after first scan - const currentPath = path.join(this.workspaceDir, gitPath.path) - let newPath: string + for (const relativeGitDirPath of nestedGitDirPaths) { + // e.g., "submoduleA/.git" + const originalGitPath = path.join(this.workspaceDir, relativeGitDirPath) // Becomes absolute path + const disabledGitPath = originalGitPath + GIT_DISABLED_SUFFIX + try { if (disable) { - newPath = !currentPath.endsWith(GIT_DISABLED_SUFFIX) - ? currentPath + GIT_DISABLED_SUFFIX - : currentPath + // Check if the original .git directory exists and is not already disabled + if ((await fileExistsAtPath(originalGitPath)) && !originalGitPath.endsWith(GIT_DISABLED_SUFFIX)) { + await fs.rename(originalGitPath, disabledGitPath) + this.log( + `[${this.constructor.name}#renameNestedGitRepos] disabled nested git repo ${originalGitPath}`, + ) + } } else { - newPath = currentPath.endsWith(GIT_DISABLED_SUFFIX) - ? currentPath.slice(0, -GIT_DISABLED_SUFFIX.length) - : currentPath - } - - if (currentPath === newPath) { - continue - } - - try { - await fs.rename(currentPath, newPath) - - this.log( - `[${this.constructor.name}#renameNestedGitRepos] ${disable ? "disabled" : "enabled"} nested git repo ${currentPath}`, - ) - } catch (error) { - this.log( - `[${this.constructor.name}#renameNestedGitRepos] failed to ${disable ? "disable" : "enable"} nested git repo ${currentPath}: ${error instanceof Error ? error.message : String(error)}`, - ) + // Check if the disabled version exists + if (await fileExistsAtPath(disabledGitPath)) { + await fs.rename(disabledGitPath, originalGitPath) + this.log( + `[${this.constructor.name}#renameNestedGitRepos] enabled nested git repo ${originalGitPath}`, + ) + } } + } catch (error) { + // Log specific error for this rename operation but continue with others + this.log( + `[${this.constructor.name}#renameNestedGitRepos] failed to ${disable ? "disable" : "enable"} ${originalGitPath}: ${error instanceof Error ? error.message : String(error)}`, + ) } - } catch (error) { - this.log( - `[${this.constructor.name}#renameNestedGitRepos] failed to ${disable ? "disable" : "enable"} nested git repos: ${error instanceof Error ? error.message : String(error)}`, - ) } } @@ -239,6 +348,29 @@ export abstract class ShadowCheckpointService extends EventEmitter { this.log( `[${this.constructor.name}#saveCheckpoint] checkpoint saved in ${duration}ms -> ${result.commit}`, ) + + // --- STAGE 2: Periodically run GC after saving checkpoints --- + this.gcCounter++ + if (this.gcCounter >= this.GC_CHECKPOINT_THRESHOLD) { + this.log( + `[${this.constructor.name}#saveCheckpoint] Reached gc threshold (${this.GC_CHECKPOINT_THRESHOLD}). Running background gc.`, + ) + this.gcCounter = 0 // Reset counter + + // Run gc asynchronously (fire and forget) to avoid blocking the save operation. + this.git + .raw(["gc"]) + .then(() => { + this.log( + `[${this.constructor.name}#saveCheckpoint] Background garbage collection completed.`, + ) + }) + .catch((gcError: any) => { + this.log( + `[${this.constructor.name}#saveCheckpoint] Background garbage collection failed: ${gcError instanceof Error ? gcError.message : String(gcError)}`, + ) + }) + } return result } else { this.log(`[${this.constructor.name}#saveCheckpoint] found no changes to commit in ${duration}ms`) @@ -261,6 +393,12 @@ export abstract class ShadowCheckpointService extends EventEmitter { } const start = Date.now() + + // If `git clean` or `git reset` could be affected by nested .git repos + // (e.g., if they try to operate on them as submodules despite core.worktree), + // you might need to wrap this section with renameNestedGitRepos(true/false). + // However, `core.worktree` usually makes Git operate on the workspace files directly. + // For now, assuming it's not strictly needed here for performance, but it's a thought. await this.git.clean("f", ["-d", "-f"]) await this.git.reset(["--hard", commitHash]) @@ -287,32 +425,117 @@ export abstract class ShadowCheckpointService extends EventEmitter { throw new Error("Shadow git repo not initialized") } - const result = [] - if (!from) { - from = (await this.git.raw(["rev-list", "--max-parents=0", "HEAD"])).trim() + // Get the initial commit hash if 'from' is not provided + const revListOutput = await this.git.raw(["rev-list", "--max-parents=0", "HEAD"]) + from = revListOutput ? revListOutput.trim() : undefined + if (!from) { + this.log(`[${this.constructor.name}#getDiff] Could not determine initial commit (baseHash).`) + return [] // Or throw an error, depending on desired behavior + } + } + + // Stage changes only if diffing against the working directory (when 'to' is undefined). + // This ensures untracked files are considered by `git diff `. + // `git diff ..` doesn't need this staging of the working dir. + if (!to) { + await this.stageAll(this.git) } - // Stage all changes so that untracked files appear in diff summary. - await this.stageAll(this.git) + this.log( + `[${this.constructor.name}#getDiff] diffing ${to ? `${from}..${to}` : `${from}..HEAD (working directory)`}`, + ) - this.log(`[${this.constructor.name}#getDiff] diffing ${to ? `${from}..${to}` : `${from}..HEAD`}`) - const { files } = to ? await this.git.diffSummary([`${from}..${to}`]) : await this.git.diffSummary([from]) + // Use `git diff --name-status` for a more precise list of changes (Added, Modified, Deleted, Renamed). + const diffArgs = ["--name-status"] + if (to) { + diffArgs.push(`${from}..${to}`) + } else { + diffArgs.push(from) // Diff commit 'from' against the (staged) working tree + } + + const nameStatusOutput = (await this.git.diff(diffArgs)).trim() + const fileInfos: Array<{ path: string; status: string; origPath?: string }> = [] + + if (nameStatusOutput) { + nameStatusOutput.split("\n").forEach((line) => { + if (!line.trim()) return + const parts = line.split("\t") + const status = parts[0][0] // First char of status, e.g., 'A', 'M', 'D', 'R', 'C' + let filePath = parts[1] + let origPath: string | undefined + + if ((status === "R" || status === "C") && parts.length > 2) { + // Renamed or Copied + origPath = parts[1] // The original path + filePath = parts[2] // The new path + } + fileInfos.push({ path: filePath, status, origPath }) + }) + } const cwdPath = (await this.getShadowGitConfigWorktree(this.git)) || this.workspaceDir || "" + const result: CheckpointDiff[] = [] - for (const file of files) { - const relPath = file.file + for (const info of fileInfos) { + const relPath = info.path const absPath = path.join(cwdPath, relPath) - const before = await this.git.show([`${from}:${relPath}`]).catch(() => "") + let beforeContent = "" + let afterContent = "" - const after = to - ? await this.git.show([`${to}:${relPath}`]).catch(() => "") - : await fs.readFile(absPath, "utf8").catch(() => "") + // Fetch 'before' content if the file was not Added (i.e., it existed in 'from' or was renamed/copied from an existing file) + if (info.status !== "A") { + const pathForBefore = info.origPath || relPath // Use original path for renames/copies + try { + beforeContent = await this.git.show([`${from}:${pathForBefore}`]) + } catch (showError) { + // File might not exist in 'from' if it was, for example, deleted and then re-added differently, + // or if it's binary and show fails. Log or handle as needed. + this.log( + `[${this.constructor.name}#getDiff] Could not git.show ${from}:${pathForBefore}. Error: ${showError}`, + ) + beforeContent = "" // Default to empty string + } + } - result.push({ paths: { relative: relPath, absolute: absPath }, content: { before, after } }) + // Fetch 'after' content if the file was not Deleted + if (info.status !== "D") { + if (to) { + // Diffing between two commits + try { + afterContent = await this.git.show([`${to}:${relPath}`]) + } catch (showError) { + this.log( + `[${this.constructor.name}#getDiff] Could not git.show ${to}:${relPath}. Error: ${showError}`, + ) + afterContent = "" + } + } else { + // Diffing against working directory + try { + // Ensure file exists before reading, especially if it's newly added in working dir but not yet committed. + if (await fileExistsAtPath(absPath)) { + afterContent = await fs.readFile(absPath, "utf8") + } else { + // This case should be rare if stageAll correctly added new files. + this.log( + `[${this.constructor.name}#getDiff] File ${absPath} not found in working directory for 'after' content.`, + ) + afterContent = "" + } + } catch (readError) { + this.log( + `[${this.constructor.name}#getDiff] Could not fs.readFile ${absPath}. Error: ${readError}`, + ) + afterContent = "" + } + } + } + result.push({ + paths: { relative: relPath, absolute: absPath }, + content: { before: beforeContent, after: afterContent }, + }) } - return result } @@ -358,32 +581,62 @@ export abstract class ShadowCheckpointService extends EventEmitter { return path.join(globalStorageDir, "checkpoints", this.hashWorkspaceDir(workspaceDir)) } - public static async deleteTask({ - taskId, - globalStorageDir, - workspaceDir, - }: { - taskId: string - globalStorageDir: string - workspaceDir: string - }) { + public static async deleteTask( + { + taskId, + globalStorageDir, + workspaceDir, + }: { + taskId: string + globalStorageDir: string + workspaceDir: string + }, + log: (message: string) => void = console.log, + ) { const workspaceRepoDir = this.workspaceRepoDir({ globalStorageDir, workspaceDir }) const branchName = `roo-${taskId}` const git = simpleGit(workspaceRepoDir) - const success = await this.deleteBranch(git, branchName) + const success = await this.deleteBranch(git, branchName, log) if (success) { - console.log(`[${this.name}#deleteTask.${taskId}] deleted branch ${branchName}`) + log(`[${this.name}#deleteTask.${taskId}] deleted branch ${branchName}`) } else { - console.error(`[${this.name}#deleteTask.${taskId}] failed to delete branch ${branchName}`) + log(`[${this.name}#deleteTask.${taskId}] ERROR: failed to delete branch ${branchName}`) } } - public static async deleteBranch(git: SimpleGit, branchName: string) { + private static async runBackgroundGC( + git: SimpleGit, + branchName: string, + log: (message: string) => void, + ): Promise { + try { + log(`[${this.name}#deleteBranch] Running gc --prune=now after deleting branch ${branchName}`) + // Using raw for --prune=now as simple-git's gc() doesn't directly support it. + // Fire-and-forget + git.raw(["gc", "--prune=now", "--quiet"]) + .then(() => { + log(`[${this.name}#deleteBranch] Background gc --prune=now completed for branch ${branchName}`) + }) + .catch((gcError) => { + log( + `[${this.name}#deleteBranch] ERROR: Background gc after deleting branch ${branchName} failed: ${gcError instanceof Error ? gcError.message : String(gcError)}`, + ) + }) + } catch (e) { + // This catch is for synchronous errors in *initiating* the gc, not for gc runtime errors. + // Runtime errors of gc are handled by the .catch() above. + log( + `[${this.name}#deleteBranch] ERROR: Failed to initiate gc: ${e instanceof Error ? e.message : String(e)}`, + ) + } + } + + public static async deleteBranch(git: SimpleGit, branchName: string, log: (message: string) => void = console.log) { const branches = await git.branchLocal() if (!branches.all.includes(branchName)) { - console.error(`[${this.constructor.name}#deleteBranch] branch ${branchName} does not exist`) + log(`[${this.name}#deleteBranch] ERROR: branch ${branchName} does not exist`) return false } @@ -408,10 +661,13 @@ export abstract class ShadowCheckpointService extends EventEmitter { ) await git.branch(["-D", branchName]) + // --- STAGE 3: Run GC after deleting a branch --- + await this.runBackgroundGC(git, branchName, log) + return true } catch (error) { - console.error( - `[${this.constructor.name}#deleteBranch] failed to delete branch ${branchName}: ${error instanceof Error ? error.message : String(error)}`, + log( + `[${this.name}#deleteBranch] ERROR: failed to delete branch ${branchName}: ${error instanceof Error ? error.message : String(error)}`, ) return false @@ -422,6 +678,9 @@ export abstract class ShadowCheckpointService extends EventEmitter { } } else { await git.branch(["-D", branchName]) + // --- STAGE 3: Run GC after deleting a branch --- + await this.runBackgroundGC(git, branchName, log) + return true } } diff --git a/src/services/checkpoints/__tests__/ShadowCheckpointService.test.ts b/src/services/checkpoints/__tests__/ShadowCheckpointService.test.ts index 84589c5fd2..de4691169a 100644 --- a/src/services/checkpoints/__tests__/ShadowCheckpointService.test.ts +++ b/src/services/checkpoints/__tests__/ShadowCheckpointService.test.ts @@ -11,9 +11,16 @@ import { fileExistsAtPath } from "../../../utils/fs" import * as fileSearch from "../../../services/search/file-search" import { RepoPerTaskCheckpointService } from "../RepoPerTaskCheckpointService" - jest.setTimeout(10_000) +// Mock ripgrep service +jest.mock("../../../services/ripgrep") + +// Mock file search service +jest.mock("../../../services/search/file-search", () => ({ + executeRipgrep: jest.fn().mockResolvedValue([]), +})) + const tmpDir = path.join(os.tmpdir(), "CheckpointService") const initWorkspaceRepo = async ({ @@ -423,30 +430,52 @@ describe.each([[RepoPerTaskCheckpointService, "RepoPerTaskCheckpointService"]])( const renameSpy = jest.spyOn(fs, "rename") - jest.spyOn(fileSearch, "executeRipgrep").mockImplementation(({ args }) => { - const searchPattern = args[4] - - if (searchPattern.includes(".git/HEAD")) { - return Promise.resolve([ - { - path: path.relative(workspaceDir, nestedGitDir), - type: "folder", - label: ".git", - }, - ]) - } else { + // Mock executeRipgrep to simulate finding the nested .git/HEAD file. + // nestedGitDir is an absolute path to the .git directory of the nested repo. + // workspaceDir is an absolute path to the main workspace. + jest.spyOn(fileSearch, "executeRipgrep").mockImplementation( + async ({ args: rgArgs, workspacePath: rgInvocationWorkspacePath }) => { + // ShadowCheckpointService calls ripgrep with: + // args: ["--files", "--hidden", "--follow", "-g", "**/.git/HEAD", workspaceDir] + // workspacePath: workspaceDir + + const globPattern = rgArgs[4] // e.g., "**/.git/HEAD" + const searchBaseDir = rgArgs[5] // The directory ripgrep is told to search within. + + if ( + globPattern.includes(path.join(".git", "HEAD")) && + searchBaseDir === workspaceDir && + rgInvocationWorkspacePath === workspaceDir + ) { + // Construct the relative path of the nested .git/HEAD file with respect to workspaceDir + const nestedHeadFileAbsPath = path.join(nestedGitDir, "HEAD") + const relativeNestedHeadFilePath = path.relative(workspaceDir, nestedHeadFileAbsPath) // e.g., "nested-project/.git/HEAD" + + return Promise.resolve([ + { + type: "file", // Must be 'file' for the service's filter + path: relativeNestedHeadFilePath, // Must end with '.git/HEAD' and be relative to workspaceDir + }, + ]) + } return Promise.resolve([]) - } - }) + }, + ) const service = new klass(taskId, shadowDir, workspaceDir, () => {}) await service.initShadowGit() - // Verify rename was called with correct paths. - expect(renameSpy.mock.calls).toHaveLength(1) + // Verify rename was called with correct paths - both disable and restore + expect(renameSpy.mock.calls).toHaveLength(2) + + // First call should rename .git to .git_disabled expect(renameSpy.mock.calls[0][0]).toBe(nestedGitDir) expect(renameSpy.mock.calls[0][1]).toBe(nestedGitDisabledDir) + // Second call should rename .git_disabled back to .git + expect(renameSpy.mock.calls[1][0]).toBe(nestedGitDisabledDir) + expect(renameSpy.mock.calls[1][1]).toBe(nestedGitDir) + jest.spyOn(require("../../../utils/fs"), "fileExistsAtPath").mockImplementation((path) => { if (path === nestedGitDir) { return Promise.resolve(true)