Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 0 additions & 4 deletions apps/cli/src/commands/prepare/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,6 @@ interface RepoPin {
readonly path?: string;
readonly repo?: string;
readonly commit?: string;
readonly baseCommit?: string;
readonly ancestor?: number;
readonly sparse?: readonly string[];
}
Expand Down Expand Up @@ -89,7 +88,6 @@ interface RepoPinWire {
readonly path?: string;
readonly repo?: string;
readonly commit?: string;
readonly base_commit?: string;
readonly ancestor?: number;
readonly sparse?: readonly string[];
}
Expand Down Expand Up @@ -129,7 +127,6 @@ function toRepoPins(pins: readonly PreparedWorkspaceRepoPin[]): readonly RepoPin
...(pin.path !== undefined && { path: pin.path }),
...(pin.repo !== undefined && { repo: pin.repo }),
...(pin.commit !== undefined && { commit: pin.commit }),
...(pin.baseCommit !== undefined && { baseCommit: pin.baseCommit }),
...(pin.ancestor !== undefined && { ancestor: pin.ancestor }),
...(pin.sparse !== undefined && { sparse: pin.sparse }),
}));
Expand Down Expand Up @@ -242,7 +239,6 @@ function toManifestWire(result: PrepareResult): PrepareManifestWire {
...(pin.path !== undefined && { path: pin.path }),
...(pin.repo !== undefined && { repo: pin.repo }),
...(pin.commit !== undefined && { commit: pin.commit }),
...(pin.baseCommit !== undefined && { base_commit: pin.baseCommit }),
...(pin.ancestor !== undefined && { ancestor: pin.ancestor }),
...(pin.sparse !== undefined && { sparse: pin.sparse }),
})),
Expand Down
4 changes: 2 additions & 2 deletions apps/web/src/content/docs/docs/next/evaluation/eval-cases.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -189,14 +189,14 @@ tests:
repos:
- path: ./repo
repo: sympy/sympy
base_commit: "abc123def"
commit: "abc123def"
hooks:
before_each:
command: ["python", "apply_test_patch.py"]
```

The `metadata` field is included in the stdin JSON passed to lifecycle commands as `case_metadata`.
Operational checkout state belongs under `workspace.repos[].base_commit`; matching metadata fields such as `source_commit` are informational only.
Operational checkout state belongs under `workspace.repos[].commit`; matching metadata fields such as `source_commit` are informational only.
For historical repo-state evals, pin the checkout under `workspace.repos[]`
instead of only mentioning the SHA in prompt prose:

Expand Down
5 changes: 2 additions & 3 deletions apps/web/src/content/docs/docs/next/evaluation/eval-files.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -138,9 +138,8 @@ checks, scope, and repo provenance in `workspace`. Put lifecycle setup that
does not acquire repos in `extensions`.

For historical or repo-state evals, put the checkout under
`workspace.repos[].commit` or `workspace.repos[].base_commit`. A commit SHA in
the prompt or metadata is useful context, but it does not materialize a repo for
the agent to inspect.
`workspace.repos[].commit`. A commit SHA in the prompt or metadata is useful
context, but it does not materialize a repo for the agent to inspect.

### Prompts, Vars, and Target Expansion

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ eval start
| 3. Repo materialization | For each workspace.repos entry:
| a. resolve acquisition | - registered project, configured mirror,
| b. git clone/fetch | AgentV cache, or remote fallback
| c. git checkout <ref> | - check out commit/base_commit/HEAD
| c. git checkout <ref> | - check out commit or HEAD
+---------------------------+
|
v
Expand Down Expand Up @@ -85,19 +85,15 @@ Supported repo fields:
| `path` | Directory inside the workspace where the repo is materialized |
| `repo` | Repository identity: full clone URL or GitHub `org/name` shorthand |
| `commit` | Branch, tag, or SHA to check out after clone |
| `base_commit` | Alias for `commit`, useful for SWE-bench-style datasets |
| `sparse` | Optional sparse-checkout paths |
| `ancestor` | Walk N parents back after resolving `commit` / `base_commit` |
| `resolver` | Optional `repo_resolvers[].name` override from AgentV config |
| `ancestor` | Walk N parents back after resolving `commit` |

`commit` is the canonical AgentV checkout pin. `base_commit` exists only as a
SWE-Bench-friendly alias for the same value; when both fields are present they
must match. Prefer `commit` in new AgentV-authored evals unless preserving an
upstream dataset column name makes the eval easier to audit.
`commit` is the AgentV checkout pin.

`source`, `type`, `checkout`, `checkout.resolve`, and `clone` are not part of
the repo schema. Acquisition settings are deliberately outside eval YAML so the
same benchmark can run against the same repository identity on every machine
`source`, `type`, `checkout`, `checkout.resolve`, `clone`, `resolve`, and
`resolver` are not part of the repo schema. Acquisition settings are
deliberately outside eval YAML so the same benchmark can run against the same
repository identity on every machine
while each harness uses the fastest safe local source available.

## Native workspace boundary
Expand Down
7 changes: 3 additions & 4 deletions apps/web/src/content/docs/docs/next/targets/configuration.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ case context:
"test_id": "case-01",
"eval_run_id": "run-123",
"case_input": "Fix the bug",
"case_metadata": { "repo": "sympy/sympy", "base_commit": "abc123" }
"case_metadata": { "repo": "sympy/sympy", "source_commit": "abc123" }
}
```

Expand Down Expand Up @@ -192,7 +192,6 @@ workspace:
| `repos[].path` | Directory within the workspace to clone into |
| `repos[].repo` | Repository identity: full clone URL or GitHub `org/name` shorthand |
| `repos[].commit` | Branch, tag, or SHA to check out (default: `HEAD`) |
| `repos[].base_commit` | Alias for `commit`, useful for SWE-bench-style datasets |
| `repos[].ancestor` | Walk N commits back from the checked-out ref (e.g., `1` for parent) |
| `repos[].sparse` | Sparse checkout paths |
| `hooks.after_each.reset` | Reset policy after each test: `none`, `fast`, `strict` |
Expand Down Expand Up @@ -227,12 +226,12 @@ workspace:
after_each:
reset: fast

# GitHub shorthand with a base_commit alias
# GitHub shorthand with a pinned commit
workspace:
repos:
- path: ./repo
repo: org/repo
base_commit: abc123def
commit: abc123def
```

### Cleanup Behavior
Expand Down
4 changes: 2 additions & 2 deletions apps/web/src/content/docs/docs/next/tools/import.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ uv run scripts/import-huggingface.py \
Each instance becomes an EVAL.yaml with:
- `input` — the problem statement
- `workspace.docker.image` — the pre-built SWE-bench Docker image (`ghcr.io/epoch-research/swe-bench.eval.x86_64.<instance_id>:latest`)
- `workspace.repos[].base_commit` — the commit to reset to before the agent runs
- `workspace.repos[].commit` — the commit to reset to before the agent runs
- `assertions` — `script` tasks that run `FAIL_TO_PASS` and `PASS_TO_PASS` pytest suites inside the container

Run an imported SWE-bench eval against any coding agent target:
Expand All @@ -234,4 +234,4 @@ uv run scripts/import-huggingface.py \
agentv eval /tmp/swebench-eval/*.EVAL.yaml --target codex
```

The Docker workspace spins up the pre-built SWE-bench image, checks out `base_commit`, runs the agent to apply a patch, then grades by running the test suite inside the container.
The Docker workspace spins up the pre-built SWE-bench image, checks out the imported `commit`, runs the agent to apply a patch, then grades by running the test suite inside the container.
4 changes: 2 additions & 2 deletions examples/features/docker-workspace/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,15 +37,15 @@ workspace:
cpus: 2 # optional Docker CPU limit
```

For evals that need a repo pinned to a dataset snapshot, use `workspace.repos[].base_commit`:
For evals that need a repo pinned to a dataset snapshot, use `workspace.repos[].commit`:

```yaml
workspace:
docker:
image: swebench/sweb.eval.x86_64.django__django-15180
repos:
- path: /testbed
base_commit: abc123def
commit: abc123def
```

Repos defined without `repo` are assumed to already exist inside the container (e.g., SWE-bench prebuilt images).
Expand Down
4 changes: 2 additions & 2 deletions examples/showcase/bug-fix-benchmark/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Compares four configurations on identical bug fix tasks:

```
┌─────────────────────────────────────────────────────────────────┐
│ 1. Clone public repo at base_commit (broken state) │
│ 1. Clone public repo at commit (broken state) │
│ 2. Run target before_each hook (install plugin config) │
│ 3. Agent receives issue description │
│ 4. Agent diagnoses and writes fix │
Expand Down Expand Up @@ -81,7 +81,7 @@ The `setup-variant.sh` hook copies these files into the workspace before each te
## Adding New Test Cases

1. Find a bug fix from GitHub issues/PRs
2. Note the `base_commit` (before the fix)
2. Note the commit SHA from just before the fix; this becomes the `commit` pin
3. Copy the issue description as the test `input`
4. Add assertions to verify the fix
5. Add to `evals/bug-fixes.eval.yaml`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ workspace:
repos:
- path: ./repo
repo: https://github.com/EntityProcess/agentv
base_commit: "6e446b722627e9df017b22e391fa63320362d8c7"
commit: "6e446b722627e9df017b22e391fa63320362d8c7"
hooks:
before_each:
reset: fast
Expand Down
2 changes: 0 additions & 2 deletions packages/core/src/evaluation/prepared-workspace.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,6 @@ export interface PreparedWorkspaceRepoPin {
readonly path?: string;
readonly repo?: string;
readonly commit?: string;
readonly baseCommit?: string;
readonly ancestor?: number;
readonly sparse?: readonly string[];
}
Expand Down Expand Up @@ -140,7 +139,6 @@ function toRepoPins(repos: readonly RepoConfig[] | undefined): readonly Prepared
...(repo.path !== undefined && { path: repo.path }),
...(repo.repo !== undefined && { repo: repo.repo }),
...(repo.commit !== undefined && { commit: repo.commit }),
...(repo.base_commit !== undefined && { baseCommit: repo.base_commit }),
...(repo.ancestor !== undefined && { ancestor: repo.ancestor }),
...(repo.sparse !== undefined && { sparse: repo.sparse }),
}));
Expand Down
4 changes: 1 addition & 3 deletions packages/core/src/evaluation/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -244,9 +244,7 @@ export type RepoConfig = {
readonly repo?: string;
/** Commit, branch, or tag to check out after materialization. */
readonly commit?: string;
/** SWE-bench-friendly alias for commit when pinning a dataset snapshot commit. */
readonly base_commit?: string;
/** Walk this many ancestors back after checking out commit/base_commit. */
/** Walk this many ancestors back after checking out commit. */
readonly ancestor?: number;
/** Optional sparse-checkout paths. */
readonly sparse?: readonly string[];
Expand Down
6 changes: 1 addition & 5 deletions packages/core/src/evaluation/validation/eval-file.schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -386,14 +386,10 @@ const RepoSchema = z
path: z.string().optional(),
repo: z.string().min(1).optional(),
commit: z.string().min(1).optional(),
base_commit: z.string().min(1).optional(),
ancestor: z.number().int().min(0).optional(),
sparse: z.array(z.string()).optional(),
})
.strict()
.refine((repo) => !repo.commit || !repo.base_commit || repo.commit === repo.base_commit, {
message: 'commit and base_commit must match when both are set',
});
.strict();

const WorkspaceHookSchema = z
.object({
Expand Down
28 changes: 14 additions & 14 deletions packages/core/src/evaluation/validation/eval-validator.ts
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,8 @@ const KNOWN_TEST_FIELDS = new Set([
'window_size',
]);

const SUPPORTED_WORKSPACE_REPO_FIELDS = new Set(['path', 'repo', 'commit', 'ancestor', 'sparse']);

/** Removed test-level fields with migration hints. */
const REMOVED_TEST_FIELDS = new Map<string, string>([]);

Expand Down Expand Up @@ -1633,7 +1635,7 @@ function validateWorkspaceRepoConfig(
filePath,
location: `${location}.repos[path=${repo.path ?? '(none)'}]`,
message:
'workspace.repos[].checkout has been removed. Use top-level commit, base_commit, and ancestor.',
'workspace.repos[].checkout has been removed. Use top-level commit and ancestor.',
});
}

Expand Down Expand Up @@ -1674,6 +1676,17 @@ function validateWorkspaceRepoConfig(
});
}

for (const key of Object.keys(repo)) {
if (!SUPPORTED_WORKSPACE_REPO_FIELDS.has(key)) {
errors.push({
severity: 'error',
filePath,
location: `${location}.repos[path=${repo.path ?? '(none)'}]`,
message: `workspace.repos[].${key} is not supported. Supported fields: path, repo, commit, ancestor, sparse.`,
});
}
}

if (!repo.repo && !isObject(docker)) {
errors.push({
severity: 'error',
Expand All @@ -1684,19 +1697,6 @@ function validateWorkspaceRepoConfig(
'Repo-less entries are only valid when workspace.docker is configured.',
});
}

if (
typeof repo.commit === 'string' &&
typeof repo.base_commit === 'string' &&
repo.commit !== repo.base_commit
) {
errors.push({
severity: 'error',
filePath,
location: `${location}.repos[path=${repo.path ?? '(none)'}]`,
message: 'repos[].commit and repos[].base_commit must match when both are set.',
});
}
}
}

Expand Down
4 changes: 2 additions & 2 deletions packages/core/src/evaluation/workspace/repo-checkout.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@ export interface RepoCheckoutTarget {
}

export function getRepoCheckoutRef(repo: RepoConfig | undefined): string {
return repo?.commit ?? repo?.base_commit ?? 'HEAD';
return repo?.commit ?? 'HEAD';
}

export function getRepoCheckoutTargets(
repos: readonly RepoConfig[] | undefined,
): RepoCheckoutTarget[] {
if (!repos) return [];
return repos
.filter((repo) => repo.commit || repo.base_commit)
.filter((repo) => repo.commit)
.map((repo) => ({
path: repo.path,
ref: getRepoCheckoutRef(repo),
Expand Down
27 changes: 15 additions & 12 deletions packages/core/src/evaluation/workspace/repo-config-parser.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
/**
* Shared parser for eval workspace repo entries.
*
* Repo entries are provenance-only: `repo` names the canonical repository,
* `commit` pins the checkout, and `base_commit` is a SWE-bench-friendly alias
* for that pin. Acquisition details such as local mirrors, clone depth, filters,
* and source type are resolved by the workspace harness, not the eval YAML.
* Repo entries are provenance-only: `repo` names the canonical repository, and
* `commit` pins the checkout. Acquisition details such as local mirrors, clone
* depth, filters, and source type are resolved by the workspace harness, not
* the eval YAML.
*/
import type { RepoConfig } from '../types.js';
import { isJsonObject } from '../types.js';

const SUPPORTED_REPO_FIELDS = new Set(['path', 'repo', 'commit', 'ancestor', 'sparse']);

function readString(obj: Record<string, unknown>, key: string): string | undefined {
const value = obj[key];
return typeof value === 'string' && value.trim().length > 0 ? value : undefined;
Expand All @@ -30,7 +32,7 @@ export function parseRepoConfig(raw: unknown): RepoConfig | undefined {
}
if ('checkout' in obj) {
throw new Error(
'workspace.repos[].checkout has been removed. Use top-level commit, base_commit, and ancestor.',
'workspace.repos[].checkout has been removed. Use top-level commit and ancestor.',
);
}
if ('clone' in obj) {
Expand All @@ -49,27 +51,28 @@ export function parseRepoConfig(raw: unknown): RepoConfig | undefined {
'workspace.repos[].resolver has been removed. Configure repo_resolvers.repos patterns instead.',
);
}
for (const key of Object.keys(obj)) {
if (!SUPPORTED_REPO_FIELDS.has(key)) {
throw new Error(
`workspace.repos[].${key} is not supported. Supported fields: path, repo, commit, ancestor, sparse.`,
);
}
}

const repoPath = readString(obj, 'path');
const repo = readString(obj, 'repo');
const commit = readString(obj, 'commit');
const baseCommit = readString(obj, 'base_commit');
const ancestor = typeof obj.ancestor === 'number' ? obj.ancestor : undefined;
const sparse = readStringArray(obj, 'sparse');

if (commit !== undefined && baseCommit !== undefined && commit !== baseCommit) {
throw new Error('workspace.repos[].commit and workspace.repos[].base_commit must match.');
}

if (!repoPath && !repo && !commit && !baseCommit && ancestor === undefined && !sparse) {
if (!repoPath && !repo && !commit && ancestor === undefined && !sparse) {
return undefined;
}

return {
...(repoPath !== undefined && { path: repoPath }),
...(repo !== undefined && { repo }),
...(commit !== undefined && { commit }),
...(baseCommit !== undefined && { base_commit: baseCommit }),
...(ancestor !== undefined && { ancestor }),
...(sparse !== undefined && { sparse }),
};
Expand Down
Loading
Loading