EntityProcess · christso · Jul 3, 2026 · Jul 2, 2026 · Jul 2, 2026 · Jul 3, 2026
diff --git a/.agents/conventions.md b/.agents/conventions.md
@@ -136,16 +136,16 @@ Before adding a new pointer family, verify that the artifact is large enough or
 
 Grader types use kebab-case everywhere.
 
-- YAML config: `type: llm-grader`, `type: is-json`, `type: execution-metrics`
+- YAML config: `type: llm-grader`, `type: llm-rubric`, `type: script`, `type: is-json`
 - Internal TypeScript: `EvaluatorKind = 'llm-grader' | 'is-json' | ...`
 - Output `scores[].type`: `"llm-grader"`, `"is-json"`
 - Registry keys: `registry.register('llm-grader', ...)`
 
-Source of truth: `EVALUATOR_KIND_VALUES` in `packages/core/src/evaluation/types.ts`.
+Source of truth: `GRADER_KIND_VALUES` in `packages/core/src/evaluation/types.ts`.
 
 Backward compatibility:
 
-- Snake_case is accepted in YAML by `normalizeGraderType()` in `grader-parser.ts`, for example `llm_judge` -> `llm-grader`.
+- Snake_case is accepted in YAML by `normalizeGraderType()` in `grader-parser.ts`, for example `llm_rubric` -> `llm-rubric`.
 - Single-word types such as `contains`, `equals`, `regex`, `latency`, and `cost` are unchanged.
 
 Two type definitions exist and must stay in sync:

diff --git a/.agents/product-boundary.md b/.agents/product-boundary.md
@@ -40,7 +40,7 @@ AgentV's core should remain minimal. Complex or domain-specific logic belongs in
 Prefer these extension points before adding a built-in:
 
 - `script` graders for custom executable evaluation logic
-- plain assertion strings or `g-eval` for structured rubric criteria
+- plain assertion strings or `llm-rubric` with a criteria-list `value` for structured rubric criteria
 - `llm-rubric` for promptfoo-compatible free-form rubric checks
 - `llm-grader` only when a custom prompt, custom grader target, or preprocessing is needed
 - CLI wrappers that consume AgentV JSON or JSONL output for post-processing such as aggregation, comparison, or reporting

diff --git a/README.md b/README.md
@@ -18,33 +18,10 @@ Test AI targets on real repo tasks and measure what actually works.
 - **Category** is derived from where the eval lives, such as folder path and file name. Use paths to organize the corpus instead of repeating category labels in every eval.
 - **Workspace / fixtures / graders** are task-owned context: repos, setup scripts, files, fixtures, isolation, deterministic checks, and LLM grading prompts.
 - **Target** is the system under test: an agent, provider, gateway, replay target, CLI wrapper, transcript provider, or future app/service wrapper. Each eval selects one `target`, either by label from `targets.yaml` or with an eval-local target object.
-- **Experiment** is the run/result grouping label being measured over that corpus, such as `with-skills` or `without-skills`. Keep suite/category and target/model names out of this label.
+- **Tags** are run/result grouping labels. `tags.experiment` is the default experiment namespace, with values such as `with-skills` or `without-skills`; keep suite/category and target/model names out of that tag.
 - **Evaluate options** configure runner-level behavior such as repeat policy, optional timeouts, and `max_concurrency` under `evaluate_options`.
 - **Default test** configures inherited per-test defaults such as score `threshold`.
-- **Run** is one concrete execution of an experiment against a resolved target that writes portable artifacts for readers such as Dashboard, compare, and trend.
-
-```mermaid
-flowchart LR
-  corpus["Eval suite / imports / tests<br/>task corpus"]
-  category["Category<br/>path-derived grouping"]
-  context["Workspace / fixtures / graders<br/>task-owned context"]
-  experiment["Experiment<br/>named run condition"]
-  target["Target<br/>system under test"]
-  controls["Run controls<br/>execution + gates"]
-  run["Run<br/>concrete execution"]
-  artifacts["Run artifacts<br/>summary.json + index.jsonl + sidecars"]
-  readers["Dashboard / compare / trend<br/>derived readers"]
-
-  corpus --> category
-  corpus --> run
-  context --> run
-  category --> run
-  experiment --> run
-  target --> run
-  controls --> run
-  run --> artifacts
-  artifacts --> readers
-```
+- **Run** is one concrete execution of a tagged eval against a resolved target that writes portable artifacts for readers such as Dashboard, compare, and trend.
 
 ## Quick start
 
@@ -80,7 +57,7 @@ default_test:
   threshold: 0.8
 
 workspace:
-  isolation: per_case
+  scope: attempt
   repos:
     - path: ./fixture
       repo: EntityProcess/agentv-contract-fixture
@@ -89,25 +66,25 @@ workspace:
 tests:
   - id: fizzbuzz
     input: Write FizzBuzz in Python
-    assertions:
+    assert:
       - type: contains
         value: "fizz"
       - Implements correct FizzBuzz logic for multiples of 3, 5, and 15
       - type: script
         command: ["python3", "./validators/check_syntax.py"]
-      - type: g-eval
-        criteria:
+      - type: llm-rubric
+        value:
           - outcome: Solution is simple and idiomatic Python
             weight: 0.5
           - outcome: Handles the 3, 5, and 15 branches correctly
             weight: 1.5
 ```
 
 Plain assertion strings are short-form rubric criteria: AgentV groups them into
-`g-eval` and writes each criterion to `grading.json.assertion_results` for the
-Dashboard. Use explicit `type: g-eval` when you need weights, required flags, or
-`score_ranges`; use `type: llm-rubric` for promptfoo-compatible free-form rubric
-assertions; use `type: llm-grader` only when you need a custom grader prompt,
+an `llm-rubric` assertion and writes each criterion to `grading.json.assertion_results` for the
+Dashboard. Use explicit `type: llm-rubric` when you need weights, required flags, or
+`score_ranges`; use a string `value` for promptfoo-compatible free-form rubric
+checks; use `type: llm-grader` only when you need a custom grader prompt,
 grader target, or preprocessing. Executable graders use `type: script`.
 
 The target can be an eval-local object when this eval needs target settings of its own:
@@ -134,7 +111,20 @@ tests:
 
 `target: copilot-sdk` resolves the target label from `.agentv/targets.yaml` or `targets.yaml` and uses its default provider, model, hooks, and provider settings. The object form above starts from `copilot-sdk`, then applies the eval-local fields for this eval. If `extends` is omitted, the object defines the full target inline and must include enough provider configuration to run. AgentV records the resolved target information in run artifacts so results can be audited and replayed. The `tags.experiment` label stays `with-skills` because the condition is unchanged; the model/provider variation belongs to the resolved target metadata.
 
-Use `default_test.threshold` for the inherited per-test pass cutoff. Existing eval files with a top-level `threshold` still load during migration, and `--threshold` on the CLI still overrides YAML thresholds for a run.
+Use `default_test.threshold` for the inherited per-test pass cutoff. `default_test` can also point at a shared file, matching promptfoo's external defaults pattern:
+
+```yaml
+default_test: file://{{ env.AGENTV_REPO_ROOT }}/.agentv/default-test.yaml
+```
+
+AgentV makes `AGENTV_REPO_ROOT` available during eval/config interpolation. Projects that prefer a short name can define their own reference in `.agentv/config.yaml`; `global-default` below is just an example key:
+
+```yaml
+refs:
+  global-default: file://{{ env.AGENTV_REPO_ROOT }}/.agentv/default-test.yaml
+```
+
+Then eval files in that project can use `default_test: ref://global-default`.
 
 **4. Run it:**
 ```bash
@@ -195,11 +185,11 @@ const { results, summary } = await evaluate({
     {
       id: 'fizzbuzz',
       input: 'Write FizzBuzz in Python',
-      assertions: [
+      assert: [
         { type: 'contains', value: 'fizz' },
         'Implements correct FizzBuzz logic for multiples of 3, 5, and 15',
         { type: 'script', command: ['python3', './validators/check_syntax.py'] },
-        { type: 'g-eval', criteria: ['Solution is simple and idiomatic Python'] },
+        { type: 'llm-rubric', value: ['Solution is simple and idiomatic Python'] },
       ],
     },
   ],
@@ -227,7 +217,7 @@ export default defineEval({
   },
   threshold: 0.8,
   workspace: {
-    isolation: 'per_case',
+    scope: 'attempt',
     repos: [
       {
         path: './fixture',
@@ -240,11 +230,11 @@ export default defineEval({
     {
       id: 'fizzbuzz',
       input: 'Write FizzBuzz in Python',
-      assertions: [
+      assert: [
         { type: 'contains', value: 'fizz' },
         'Implements correct FizzBuzz logic for multiples of 3, 5, and 15',
         { type: 'script', command: ['python3', './validators/check_syntax.py'] },
-        { type: 'g-eval', criteria: ['Solution is simple and idiomatic Python'] },
+        { type: 'llm-rubric', value: ['Solution is simple and idiomatic Python'] },
       ],
     },
   ],

diff --git a/STRATEGY.md b/STRATEGY.md
@@ -28,7 +28,7 @@ AgentV stays repo-native and workspace-native: it runs or imports evaluations ar
 
 ### Workspace-native evaluation
 
-Make real repository workflows first-class: repo acquisition, hooks, pooled workspaces, replay/import paths, and reuse of existing harnesses.
+Make real repository workflows first-class: repo acquisition, hooks, suite/attempt workspaces, replay/import paths, and reuse of existing harnesses.
 
 _Why it serves the approach:_ This keeps AgentV attached to the actual work the agent is being judged on instead of collapsing it into a synthetic runner.
 

diff --git a/apps/cli/src/commands/convert/index.ts b/apps/cli/src/commands/convert/index.ts
@@ -72,9 +72,9 @@ export function convertEvalsJsonToYaml(inputPath: string): string {
   lines.push('# AgentV features you can add:');
   lines.push('#   - type: is-json, contains, regex for deterministic graders');
   lines.push('#   - type: script for custom scoring scripts');
-  lines.push('#   - type: g-eval criteria with weights and score ranges for rubrics');
+  lines.push('#   - type: llm-rubric value arrays with weights and score ranges for rubrics');
   lines.push('#   - Multi-turn conversations via input message arrays');
-  lines.push('#   - Multiple assertions with weighted scoring');
+  lines.push('#   - Multiple assert entries with weighted scoring');
   lines.push('#   - Workspace isolation with repos and hooks');
   lines.push('');
 
@@ -125,10 +125,10 @@ export function convertEvalsJsonToYaml(inputPath: string): string {
         '    # Promoted from evals.json expected_output, assertions[], and expectations[]',
       );
       lines.push('    # Replace with type: is-json, contains, or regex for deterministic checks');
-      lines.push('    assertions:');
-      lines.push('      - name: agent-skills-criteria');
-      lines.push('        type: g-eval');
-      lines.push('        criteria:');
+      lines.push('    assert:');
+      lines.push('      - metric: agent-skills-criteria');
+      lines.push('        type: llm-rubric');
+      lines.push('        value:');
       for (const criterion of test.criteria) {
         lines.push(`          - id: ${quoteYamlString(criterion.id)}`);
         lines.push(`            outcome: ${quoteYamlString(criterion.outcome)}`);

diff --git a/apps/cli/src/commands/create/commands.ts b/apps/cli/src/commands/create/commands.ts
@@ -34,36 +34,35 @@ export default defineAssertion(({ output }) => {
 
 const EVAL_TEMPLATES: Record<string, (name: string) => string> = {
   default: (name: string) => `description: ${name} evaluation suite
-execution:
-  target: default
+target: default
 
 tests:
   - id: sample-test
     criteria: Agent responds correctly
     input: "Hello, how are you?"
     expected_output: "I'm doing well"
-    assertions:
+    assert:
       - type: contains
         value: "well"
 `,
   rubric: (name: string) => `description: ${name} evaluation suite
-execution:
-  target: default
+target: default
 
 tests:
   - id: sample-test
     criteria: Agent responds correctly and completely
     input: "Hello, how are you?"
     expected_output: "I'm doing well, thank you for asking!"
-    assertions:
-      - type: llm-grader
-        rubric:
-          accuracy:
+    assert:
+      - metric: response-quality
+        type: llm-rubric
+        value:
+          - id: accuracy
+            outcome: Response is factually correct
             weight: 0.6
-            criteria: Response is factually correct
-          completeness:
+          - id: completeness
+            outcome: Response addresses all parts of the question
             weight: 0.4
-            criteria: Response addresses all parts of the question
 `,
 };
 
@@ -128,7 +127,7 @@ export const createAssertionCommand = command({
     await mkdir(dir, { recursive: true });
     await writeFile(filePath, content);
     console.log(`Created ${path.relative(process.cwd(), filePath)} (template: ${templateName})`);
-    console.log(`\nUse in EVAL.yaml:\n  assertions:\n    - type: ${name}`);
+    console.log(`\nUse in EVAL.yaml:\n  assert:\n    - type: ${name}`);
   },
 });
 

diff --git a/apps/cli/src/commands/eval/commands/assert.ts b/apps/cli/src/commands/eval/commands/assert.ts
@@ -7,7 +7,7 @@ import { buildTraceFromMessages, executeScript } from '@agentv/core';
 
 export const evalAssertCommand = command({
   name: 'assert',
-  description: 'Run a single code-grader assertion from .agentv/graders/ and print the score',
+  description: 'Run a single script grader assertion from .agentv/graders/ and print the score',
   args: {
     graderName: positional({
       type: string,
@@ -62,8 +62,7 @@ export const evalAssertCommand = command({
       process.exit(1);
     }
 
-    // Build payload matching CodeGrader's expected format (snake_case).
-    // Include all fields that defineCodeGrader validates as required.
+    // Build payload matching the script grader protocol (snake_case).
     const messages = [{ role: 'assistant' as const, content: resolvedOutput }];
     const inputMessages = [{ role: 'user' as const, content: resolvedInput }];
     const trace = buildTraceFromMessages({

diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts
@@ -119,15 +119,10 @@ export const evalRunCommand = command({
       long: 'verbose',
       description: 'Enable verbose logging',
     }),
-    workspaceMode: option({
-      type: optional(string),
-      long: 'workspace-mode',
-      description: "Workspace mode: 'temp' (default), 'pooled', or 'static'",
-    }),
     workspacePath: option({
       type: optional(string),
       long: 'workspace-path',
-      description: 'Static workspace directory path (used when workspace mode is static)',
+      description: 'Static workspace directory path to reuse for this run',
     }),
     keepWorkspaces: flag({
       long: 'keep-workspaces',
@@ -271,7 +266,6 @@ export const evalRunCommand = command({
       cachePath: args.cachePath,
       noCache: args.noCache,
       verbose: args.verbose,
-      workspaceMode: args.workspaceMode,
       workspacePath: args.workspacePath,
       keepWorkspaces: args.keepWorkspaces,
       trace: false,

diff --git a/apps/cli/src/commands/eval/commands/vitest.ts b/apps/cli/src/commands/eval/commands/vitest.ts
@@ -9,7 +9,7 @@ function parseCommand(value: string | undefined): readonly string[] | undefined
 
 export const evalVitestCommand = command({
   name: 'vitest',
-  description: 'Run Vitest workspace verifier files as an AgentV code-grader protocol adapter',
+  description: 'Run Vitest workspace verifier files as an AgentV script grader adapter',
   args: {
     testFiles: restPositionals({
       type: string,

diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts
@@ -294,7 +294,6 @@ interface NormalizedOptions {
   readonly resume: boolean;
   readonly rerunFailed: boolean;
   readonly rerunFailedSource?: string;
-  readonly workspaceMode?: 'pooled' | 'temp' | 'static';
   readonly workspacePath?: string;
   readonly keepWorkspaces: boolean;
   /** Removed: use --output instead */
@@ -380,10 +379,6 @@ function normalizeOptionalNumber(value: unknown): number | undefined {
   return undefined;
 }
 
-function normalizeWorkspaceMode(value: unknown): 'pooled' | 'temp' | 'static' | undefined {
-  return value === 'pooled' || value === 'temp' || value === 'static' ? value : undefined;
-}
-
 function normalizeStringArray(value: unknown): readonly string[] {
   if (Array.isArray(value)) {
     return value.filter((v): v is string => typeof v === 'string' && v.trim().length > 0);
@@ -696,21 +691,7 @@ function normalizeOptions(
   const configOutputDir = normalizeString(config?.output?.dir);
   const cliWorkspacePath = normalizeString(rawOptions.workspacePath);
   const configWorkspacePath = normalizeString(yamlExecution?.workspace_path);
-  const cliWorkspaceModeRaw = normalizeString(rawOptions.workspaceMode);
-  const cliWorkspaceMode = normalizeWorkspaceMode(rawOptions.workspaceMode);
-  if (cliWorkspacePath && cliWorkspaceModeRaw && cliWorkspaceMode !== 'static') {
-    throw new Error('--workspace-path requires --workspace-mode=static (or omit --workspace-mode)');
-  }
-  const configWorkspaceMode = normalizeWorkspaceMode(yamlExecution?.workspace_mode);
-  if (configWorkspacePath && configWorkspaceMode && configWorkspaceMode !== 'static') {
-    throw new Error(
-      'execution.workspace_path requires execution.workspace_mode: static when both are provided',
-    );
-  }
-  const useConfigWorkspacePath = cliWorkspaceMode === undefined || cliWorkspaceMode === 'static';
-  const workspacePath =
-    cliWorkspacePath ?? (useConfigWorkspacePath ? configWorkspacePath : undefined);
-  const workspaceMode = workspacePath ? 'static' : (cliWorkspaceMode ?? configWorkspaceMode);
+  const workspacePath = cliWorkspacePath ?? configWorkspacePath;
   const resultsRepo = normalizeString(rawOptions.resultsRepo);
   const resultsPush = normalizeBoolean(rawOptions.resultsPush);
   const resultsNoPush = normalizeBoolean(rawOptions.noResultsPush);
@@ -772,7 +753,6 @@ function normalizeOptions(
       normalizeBoolean(rawOptions.resume) || normalizeString(rawOptions.rerunFailed) !== undefined,
     rerunFailed: normalizeString(rawOptions.rerunFailed) !== undefined,
     rerunFailedSource: normalizeString(rawOptions.rerunFailed),
-    workspaceMode,
     workspacePath,
     // Precedence: CLI > YAML config > TS config
     keepWorkspaces:
@@ -865,7 +845,6 @@ const CLI_RUNTIME_SOURCE_OPTION_KEYS = [
   'recordReplay',
   'recordReplayVariant',
   'workspacePath',
-  'workspaceMode',
 ] as const;
 
 function hasCliRuntimeSource(rawOptions: Record<string, unknown>): boolean {
@@ -1031,7 +1010,6 @@ function applyExperimentOptions(
     ...options,
     target: options.target,
     agentTimeoutSeconds: options.agentTimeoutSeconds ?? experiment.timeoutSeconds,
-    workspaceMode: options.workspaceMode,
     workspacePath: options.workspacePath,
     budgetUsd: options.budgetUsd ?? experiment.budgetUsd,
     threshold: options.threshold ?? experiment.threshold,
@@ -1760,7 +1738,6 @@ async function runSingleEvalFile(params: {
     evalCases: testCases,
     verbose: options.verbose,
     maxConcurrency: resolvedWorkers,
-    workspaceMode: options.workspaceMode,
     workspacePath: options.workspacePath,
     keepWorkspaces: options.keepWorkspaces,
     trials: trialsConfig,
@@ -2830,9 +2807,7 @@ export async function runEvalCommand(
     }
 
     // Hint about --keep-workspaces when workspaces were used but some cleaned up
-    const usedWorkspaces =
-      resultsWithWorkspaces.length > 0 ||
-      (options.workspaceMode && options.workspaceMode !== 'static');
+    const usedWorkspaces = resultsWithWorkspaces.length > 0;
     if (!options.keepWorkspaces && usedWorkspaces) {
       console.log('Use --keep-workspaces to preserve all workspaces for inspection.');
     }