Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions src/services/checkpoints/ShadowCheckpointService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ import { fileExistsAtPath } from "../../utils/fs"
import { executeRipgrep } from "../../services/search/file-search"

import { CheckpointDiff, CheckpointResult, CheckpointEventMap } from "./types"
import { getExcludePatterns } from "./excludes"
import { getExcludePatterns, getExcludePatternsWithStats } from "./excludes"

export abstract class ShadowCheckpointService extends EventEmitter {
public readonly taskId: string
Expand Down Expand Up @@ -139,8 +139,15 @@ export abstract class ShadowCheckpointService extends EventEmitter {
// .gitignore.
protected async writeExcludeFile() {
await fs.mkdir(path.join(this.dotGitDir, "info"), { recursive: true })
const patterns = await getExcludePatterns(this.workspaceDir)
const { patterns, stats } = await getExcludePatternsWithStats(this.workspaceDir)
await fs.writeFile(path.join(this.dotGitDir, "info", "exclude"), patterns.join("\n"))

if (stats?.largeFilesExcluded && stats.largeFilesExcluded > 0) {
Copy link

Copilot AI Aug 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

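[nitpick] The condition can be simplified to if (stats.largeFilesExcluded > 0): getExcludePatternsWithStats declares stats as a required property of its return type, so the optional chaining is unnecessary, and the > 0 comparison already rejects zero, which makes the separate truthiness check redundant.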

Suggested change
if (stats?.largeFilesExcluded && stats.largeFilesExcluded > 0) {
if (stats.largeFilesExcluded > 0) {

Copilot uses AI. Check for mistakes.
const mb = Math.round(stats.thresholdBytes / (1024 * 1024))
this.log(
`[${this.constructor.name}#writeExcludeFile] auto-excluding ${stats.largeFilesExcluded} large files (>= ${mb}MB) from checkpoints. Sample: ${stats.sample.join(", ")}`,
)
}
}

private async stageAll(git: SimpleGit) {
Expand Down
160 changes: 147 additions & 13 deletions src/services/checkpoints/excludes.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,51 @@
import fs from "fs/promises"
import { join } from "path"
import * as path from "path"

import { fileExistsAtPath } from "../../utils/fs"
import { executeRipgrepForFiles } from "../search/file-search"

// Size, in bytes, at or above which a non-allowlisted file is auto-excluded.
const DEFAULT_LARGE_FILE_THRESHOLD_BYTES = 10 * 1024 * 1024 // 10 MB

// Lower-case extensions (with leading dot) of code/text files. Files with one of
// these extensions are never auto-excluded by size.
const CODE_EXT_ALLOWLIST: Set<string> = new Set<string>([
	// Scripts, data and plain text
	".ts", ".tsx", ".js", ".jsx", ".json", ".md", ".txt",
	// General-purpose languages
	".py", ".java", ".cs", ".cpp", ".c", ".h", ".hpp", ".go", ".rb", ".rs", ".kt", ".swift", ".m", ".mm", ".php",
	// Markup and stylesheets
	".html", ".css", ".scss", ".less", ".xml",
	// Configuration and project files
	".yml", ".yaml", ".toml", ".ini", ".gradle", ".csproj", ".sln",
	// Component formats
	".vue", ".svelte", ".astro",
])

const getBuildArtifactPatterns = () => [
".gradle/",
Expand Down Expand Up @@ -185,7 +229,7 @@ const getLogFilePatterns = () => [

const getLfsPatterns = async (workspacePath: string) => {
try {
const attributesPath = join(workspacePath, ".gitattributes")
const attributesPath = path.join(workspacePath, ".gitattributes")

if (await fileExistsAtPath(attributesPath)) {
return (await fs.readFile(attributesPath, "utf8"))
Expand All @@ -198,15 +242,105 @@ const getLfsPatterns = async (workspacePath: string) => {
return []
}

export const getExcludePatterns = async (workspacePath: string) => [
".git/",
...getBuildArtifactPatterns(),
...getMediaFilePatterns(),
...getCacheFilePatterns(),
...getConfigFilePatterns(),
...getLargeDataFilePatterns(),
...getDatabaseFilePatterns(),
...getGeospatialPatterns(),
...getLogFilePatterns(),
...(await getLfsPatterns(workspacePath)),
/**
 * Exclude patterns for common game engines and asset-heavy projects
 * (Unity, Unreal), so that their binary assets and generated directories are
 * not checkpointed by default.
 *
 * @returns The Unity patterns followed by the Unreal patterns.
 */
const getGameEnginePatterns = () => {
	// Unity
	const unityPatterns = [
		"Library/",
		"Temp/",
		"Build/",
		"Builds/",
		"Logs/",
		"UserSettings/",
		"*.unity",
		"*.prefab",
		"*.asset",
		"*.fbx",
		"*.blend",
		"*.obj",
		"*.unitypackage",
	]

	// Unreal
	const unrealPatterns = ["*.uasset", "*.umap"]

	return [...unityPatterns, ...unrealPatterns]
}

/**
 * Scan the workspace for very large non-code files and build git exclude
 * entries for them.
 *
 * Files are listed with `executeRipgrepForFiles` (called with 50000 as its
 * second argument, presumably a result limit) and sized with `fs.stat`. Files
 * whose extension is in `CODE_EXT_ALLOWLIST` are kept regardless of size.
 *
 * Every returned entry is a literal, root-anchored pattern: it starts with "/"
 * and has glob metacharacters and trailing spaces backslash-escaped. Without
 * the leading slash, an entry for a root-level file (no "/" in its path) would
 * be matched by git at every directory level and hide unrelated files that
 * merely share its name.
 *
 * @param workspacePath - Directory the listed relative paths are resolved against.
 * @param thresholdBytes - Minimum size, in bytes, at which a file is excluded.
 * @returns De-duplicated exclude entries in listing order, or an empty array
 *   when the listing itself fails.
 */
async function getLargeFileAutoExcludePatterns(
	workspacePath: string,
	thresholdBytes: number = DEFAULT_LARGE_FILE_THRESHOLD_BYTES,
): Promise<string[]> {
	// Number of fs.stat calls kept in flight at once; bounds open requests
	// while avoiding one-at-a-time latency over tens of thousands of files.
	const statBatchSize = 64

	// Turn a workspace-relative path into an entry that matches exactly that file.
	const toExcludeEntry = (relPath: string): string => {
		// Normalize to forward slashes for git exclude
		const normalized = relPath.replace(/\\/g, "/").replace(/^\.\//, "")
		const escaped = normalized
			// "*", "?" and "[" are glob syntax in exclude files; "\" quotes them.
			.replace(/[*?[\]]/g, "\\$&")
			// Unquoted trailing spaces are dropped by git when it reads the entry.
			.replace(/ +$/, (spaces) => "\\ ".repeat(spaces.length))
		return `/${escaped}`
	}

	try {
		const items = await executeRipgrepForFiles(workspacePath, 50000)

		const candidates: string[] = []
		for (const item of items) {
			if (item.type !== "file") continue

			// Keep code/text files even if large
			if (CODE_EXT_ALLOWLIST.has(path.extname(item.path).toLowerCase())) continue

			candidates.push(item.path)
		}

		const large: string[] = []
		for (let start = 0; start < candidates.length; start += statBatchSize) {
			const batch = candidates.slice(start, start + statBatchSize)
			const sizes = await Promise.all(
				batch.map(async (rel) => {
					try {
						return (await fs.stat(path.join(workspacePath, rel))).size
					} catch {
						// Ignore stat errors for individual files
						return undefined
					}
				}),
			)

			// Walk the batch in order so the result keeps the listing order.
			batch.forEach((rel, index) => {
				const size = sizes[index]
				if (size !== undefined && size >= thresholdBytes) {
					large.push(toExcludeEntry(rel))
				}
			})
		}

		return Array.from(new Set(large))
	} catch {
		// Best effort: a failed scan must not prevent the static excludes from being written.
		return []
	}
}

/**
 * Builds the full exclude list for a workspace together with statistics about
 * the large files that were auto-excluded, for logging/UX decisions.
 *
 * @param workspacePath - Workspace directory to read `.gitattributes` from and to scan for large files.
 * @returns `patterns`: the static patterns, LFS patterns and large-file entries,
 *   de-duplicated in first-seen order. `stats`: the number of large-file entries,
 *   the default size threshold in bytes, and up to the first 10 entries as a sample.
 */
export async function getExcludePatternsWithStats(workspacePath: string): Promise<{
	patterns: string[]
	stats: { largeFilesExcluded: number; thresholdBytes: number; sample: string[] }
}> {
	const staticPatterns = [
		".git/",
		...getBuildArtifactPatterns(),
		...getMediaFilePatterns(),
		...getCacheFilePatterns(),
		...getConfigFilePatterns(),
		...getLargeDataFilePatterns(),
		...getDatabaseFilePatterns(),
		...getGeospatialPatterns(),
		...getLogFilePatterns(),
		...getGameEnginePatterns(),
	]
	const lfsPatterns = await getLfsPatterns(workspacePath)
	const largeFileEntries = await getLargeFileAutoExcludePatterns(workspacePath)

	// A Set drops duplicates while keeping the first occurrence's position.
	const uniquePatterns = new Set([...staticPatterns, ...lfsPatterns, ...largeFileEntries])

	const stats = {
		largeFilesExcluded: largeFileEntries.length,
		thresholdBytes: DEFAULT_LARGE_FILE_THRESHOLD_BYTES,
		sample: largeFileEntries.slice(0, 10),
	}

	return { patterns: Array.from(uniquePatterns), stats }
}

/**
 * Backwards-compatible helper for existing callers/tests that only need the
 * pattern list and not the accompanying statistics.
 *
 * @param workspacePath - Workspace directory passed through to `getExcludePatternsWithStats`.
 * @returns The `patterns` array produced by `getExcludePatternsWithStats`.
 */
export const getExcludePatterns = async (workspacePath: string) => {
	const { patterns } = await getExcludePatternsWithStats(workspacePath)
	return patterns
}
Loading