Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 24 additions & 8 deletions apps/web/astro.config.mjs
Original file line number Diff line number Diff line change
@@ -1,11 +1,27 @@
import { readFileSync } from 'node:fs';
import { fileURLToPath } from 'node:url';
import starlight from '@astrojs/starlight';
import { defineConfig } from 'astro/config';

// Static builds can't redirect an open-ended `/docs/[...slug]` wildcard to
// v4.42.4 (that requires enumerable paths), so generate one concrete
// redirect per known v4.42.4 route from its route manifest instead.
const v4RoutesPath = fileURLToPath(new URL('./src/data/docs-v4.42.4-routes.json', import.meta.url));
const v4Routes = JSON.parse(readFileSync(v4RoutesPath, 'utf8'));
// Prefix shared by every archived v4.42.4 route in the manifest.
const v4RoutePrefix = '/docs/v4.42.4/';

// Build one `{ '/docs/<page>': '/docs/v4.42.4/<page>/' }` redirect per archived
// route. Entries that are not under the archive prefix are skipped: rewriting
// them would leave the path unchanged and emit a redirect that targets itself.
const v4Redirects = Object.fromEntries(
  v4Routes
    .filter((route) => typeof route === 'string' && route.startsWith(v4RoutePrefix))
    .map((route) => {
      // Strip the version segment by position so only the leading prefix is
      // rewritten, never a later occurrence inside the path.
      const bareRoute = `/docs/${route.slice(v4RoutePrefix.length)}`;
      // Redirect sources are keyed without a trailing slash; the archive root
      // maps from the bare `/docs` path.
      const from = bareRoute === '/docs/' ? '/docs' : bareRoute.replace(/\/$/, '');
      return [from, route];
    }),
);

export default defineConfig({
site: 'https://agentv.dev',
image: { service: { entrypoint: 'astro/assets/services/noop' } },
redirects: {
'/docs/v4': '/docs/v4.42.4/',
...v4Redirects,
},
integrations: [
starlight({
Expand Down Expand Up @@ -48,14 +64,14 @@ export default defineConfig({
{ icon: 'github', label: 'GitHub', href: 'https://github.com/EntityProcess/agentv' },
],
sidebar: [
{ label: 'Getting Started', autogenerate: { directory: 'docs/getting-started' } },
{ label: 'Evaluation', autogenerate: { directory: 'docs/evaluation' } },
{ label: 'Graders', autogenerate: { directory: 'docs/graders' } },
{ label: 'Targets', autogenerate: { directory: 'docs/targets' } },
{ label: 'Tools', autogenerate: { directory: 'docs/tools' } },
{ label: 'Guides', autogenerate: { directory: 'docs/guides' } },
{ label: 'Integrations', autogenerate: { directory: 'docs/integrations' } },
{ label: 'Reference', autogenerate: { directory: 'docs/reference' } },
{ label: 'Getting Started', autogenerate: { directory: 'docs/next/getting-started' } },
{ label: 'Evaluation', autogenerate: { directory: 'docs/next/evaluation' } },
{ label: 'Graders', autogenerate: { directory: 'docs/next/graders' } },
{ label: 'Targets', autogenerate: { directory: 'docs/next/targets' } },
{ label: 'Tools', autogenerate: { directory: 'docs/next/tools' } },
{ label: 'Guides', autogenerate: { directory: 'docs/next/guides' } },
{ label: 'Integrations', autogenerate: { directory: 'docs/next/integrations' } },
{ label: 'Reference', autogenerate: { directory: 'docs/next/reference' } },
],
editLink: {
baseUrl: 'https://github.com/EntityProcess/agentv/edit/main/apps/web/',
Expand Down
18 changes: 11 additions & 7 deletions apps/web/src/components/VersionSelect.astro
Original file line number Diff line number Diff line change
@@ -1,21 +1,25 @@
---
const versions = [
{ label: 'Canary', base: '/docs' },
{ label: 'Next', base: '/docs/next' },
{ label: 'v4.42.4', base: '/docs/v4.42.4' },
];

// Longest base first so more specific versions match before shorter prefixes.
const versionsByBaseLength = [...versions].sort((a, b) => b.base.length - a.base.length);

const pathname = Astro.url.pathname.replace(/\/$/, '') || '/';

function getCurrentVersion(path) {
if (path === '/docs/v4.42.4' || path.startsWith('/docs/v4.42.4/')) return versions[1];
return versions[0];
return (
versionsByBaseLength.find(
(version) => path === version.base || path.startsWith(`${version.base}/`),
) ?? versions[0]
);
}

function getVersionSuffix(path) {
if (path === '/docs' || path === '/docs/v4.42.4') return '';
if (path.startsWith('/docs/v4.42.4/')) return path.slice('/docs/v4.42.4'.length);
if (path.startsWith('/docs/')) return path.slice('/docs'.length);
return '';
const current = getCurrentVersion(path);
return path === current.base ? '' : path.slice(current.base.length);
}

function withTrailingSlash(path) {
Expand Down
36 changes: 24 additions & 12 deletions apps/web/src/components/VersionedSidebar.astro
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,28 @@ import MobileMenuFooter from 'virtual:starlight/components/MobileMenuFooter';
import SidebarPersister from '@astrojs/starlight/components/SidebarPersister.astro';
import SidebarSublist from '@astrojs/starlight/components/SidebarSublist.astro';
import type { SidebarEntry } from '@astrojs/starlight/utils/routing/types';
import archiveRoutes from '../data/docs-v4.42.4-routes.json';
import v4Routes from '../data/docs-v4.42.4-routes.json';

// The Starlight sidebar config autogenerates from docs/next/*, so the base
// sidebar's hrefs already point at the live /docs/next/ tree unmodified.
// Only genuinely archived versions below need their hrefs remapped.
const LIVE_PREFIX = '/docs/next/';
const ARCHIVED_VERSIONS = [{ slug: 'v4.42.4', routes: v4Routes }];

const ARCHIVE_PREFIX = '/docs/v4.42.4/';
const { sidebar } = Astro.locals.starlightRoute;
const pathname = withTrailingSlash(Astro.url.pathname);
const routeSet = new Set(archiveRoutes);
const renderedSidebar = isArchivePath(pathname) ? toArchiveSidebar(sidebar) : sidebar;
const archiveVersion = ARCHIVED_VERSIONS.find((version) => isArchivePath(pathname, version.slug));
const renderedSidebar = archiveVersion ? toArchiveSidebar(sidebar, archiveVersion) : sidebar;

function toArchiveSidebar(
entries: SidebarEntry[],
archiveVersion: (typeof ARCHIVED_VERSIONS)[number],
): SidebarEntry[] {
const routeSet = new Set(archiveVersion.routes);

function toArchiveSidebar(entries: SidebarEntry[]): SidebarEntry[] {
return entries.flatMap((entry) => {
if (entry.type === 'link') {
const archiveHref = toArchiveHref(entry.href);
const archiveHref = toArchiveHref(entry.href, archiveVersion.slug);
if (!routeSet.has(stripHash(archiveHref))) return [];

return [
Expand All @@ -26,7 +36,7 @@ function toArchiveSidebar(entries: SidebarEntry[]): SidebarEntry[] {
];
}

const childEntries = toArchiveSidebar(entry.entries);
const childEntries = toArchiveSidebar(entry.entries, archiveVersion);
if (!childEntries.length) return [];

return [
Expand All @@ -38,17 +48,19 @@ function toArchiveSidebar(entries: SidebarEntry[]): SidebarEntry[] {
});
}

function toArchiveHref(href: string) {
if (!href.startsWith('/docs/') || href.startsWith(ARCHIVE_PREFIX)) return href;
return href.replace('/docs/', ARCHIVE_PREFIX);
/**
 * Rewrites a live-docs href so that it points into the archived version
 * identified by `slug`. Hrefs that are outside the live tree, or that already
 * sit under the archive prefix, are returned unchanged.
 */
function toArchiveHref(href: string, slug: string) {
  const target = '/docs/' + slug + '/';
  const underLiveTree = href.startsWith(LIVE_PREFIX);
  const alreadyArchived = href.startsWith(target);

  if (underLiveTree && !alreadyArchived) {
    return href.replace(LIVE_PREFIX, target);
  }
  return href;
}

/** Returns `path` unchanged when it already ends in "/", otherwise appends one. */
function withTrailingSlash(path: string) {
  if (path.endsWith('/')) {
    return path;
  }
  return path + '/';
}

function isArchivePath(path: string) {
return path === ARCHIVE_PREFIX || path.startsWith(ARCHIVE_PREFIX);
/**
 * Reports whether `path` is the root of, or a page under, the
 * `/docs/<slug>/` archive tree. The prefix ends in a slash, so a path
 * without a trailing slash does not match the archive root.
 */
function isArchivePath(path: string, slug: string) {
  const root = '/docs/' + slug + '/';
  // An exact match is also a prefix match, so one startsWith covers both cases.
  return path.startsWith(root);
}

function stripHash(href: string) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ tests:
- Output contains the transformed spreadsheet text including the revenue rows
```

See [`examples/features/preprocessors/`](../../../../examples/features/preprocessors/) for a runnable end-to-end example with a file-producing target and custom grader target.
See [`examples/features/preprocessors/`](../../../../../examples/features/preprocessors/) for a runnable end-to-end example with a file-producing target and custom grader target.

## Tool Trajectory

Expand Down Expand Up @@ -189,7 +189,7 @@ assert:
prompt: ../prompts/grader-pass-fail-v1.md
```

See [`examples/showcase/offline-grader-benchmark/`](../../../../examples/showcase/offline-grader-benchmark/) for the full workflow, replay target, export contract, scoring script, and A/B compare commands.
See [`examples/showcase/offline-grader-benchmark/`](../../../../../examples/showcase/offline-grader-benchmark/) for the full workflow, replay target, export contract, scoring script, and A/B compare commands.

## Static Trace

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ Resolution order:
- if no preprocessor matches, AgentV falls back to a UTF-8 text read
- if the fallback read looks binary or invalid, the grader receives a warning note instead of failing the test run

See [`examples/features/preprocessors/`](../../../../examples/features/preprocessors/) for a runnable example with a file-producing target and a custom preprocessor script.
See [`examples/features/preprocessors/`](../../../../../examples/features/preprocessors/) for a runnable example with a file-producing target and a custom preprocessor script.

## Available Context Fields

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ sidebar:
---

import { Image } from 'astro:assets';
import trajectoryChart from '../../../../assets/screenshots/autoresearch-trajectory.png';
import trajectoryChart from '../../../../../assets/screenshots/autoresearch-trajectory.png';

Autoresearch is an unattended optimization loop that **automatically improves your agent skills** through repeated eval cycles. It runs the same evaluate → analyze → improve loop described in the [Skill Improvement Workflow](/docs/next/guides/skill-improvement-workflow/), but does it hands-free — no human review between cycles.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,20 @@ sidebar:
---

import { Image } from 'astro:assets';
import studioRuns from '../../../../assets/screenshots/studio-runs.png';
import studioRunDetail from '../../../../assets/screenshots/studio-run-detail.png';
import studioExperiments from '../../../../assets/screenshots/studio-experiments.png';
import studioProjects from '../../../../assets/screenshots/studio-projects.png';
import studioProjectsMulti from '../../../../assets/screenshots/studio-projects-multi.png';
import studioCompareAggregated from '../../../../assets/screenshots/studio-compare-aggregated.png';
import studioComparePerRun from '../../../../assets/screenshots/studio-compare-per-run.png';
import studioCompareSideBySide from '../../../../assets/screenshots/studio-compare-side-by-side.png';
import studioRunsBench from '../../../../assets/screenshots/studio-runs-bench.png';
import studioAnalyticsAggregated from '../../../../assets/screenshots/studio-analytics-aggregated.png';
import studioAnalyticsCharts from '../../../../assets/screenshots/studio-analytics-charts.png';
import studioAnalyticsTrend from '../../../../assets/screenshots/studio-analytics-trend.png';
import studioRemoteResultsBeforeSync from '../../../../assets/screenshots/studio-remote-results-before-sync.png';
import studioRemoteResultsAfterSync from '../../../../assets/screenshots/studio-remote-results-after-sync.png';
import studioRuns from '../../../../../assets/screenshots/studio-runs.png';
import studioRunDetail from '../../../../../assets/screenshots/studio-run-detail.png';
import studioExperiments from '../../../../../assets/screenshots/studio-experiments.png';
import studioProjects from '../../../../../assets/screenshots/studio-projects.png';
import studioProjectsMulti from '../../../../../assets/screenshots/studio-projects-multi.png';
import studioCompareAggregated from '../../../../../assets/screenshots/studio-compare-aggregated.png';
import studioComparePerRun from '../../../../../assets/screenshots/studio-compare-per-run.png';
import studioCompareSideBySide from '../../../../../assets/screenshots/studio-compare-side-by-side.png';
import studioRunsBench from '../../../../../assets/screenshots/studio-runs-bench.png';
import studioAnalyticsAggregated from '../../../../../assets/screenshots/studio-analytics-aggregated.png';
import studioAnalyticsCharts from '../../../../../assets/screenshots/studio-analytics-charts.png';
import studioAnalyticsTrend from '../../../../../assets/screenshots/studio-analytics-trend.png';
import studioRemoteResultsBeforeSync from '../../../../../assets/screenshots/studio-remote-results-before-sync.png';
import studioRemoteResultsAfterSync from '../../../../../assets/screenshots/studio-remote-results-after-sync.png';

The `dashboard` command launches a web-based dashboard for browsing evaluation runs, inspecting individual test results, and reviewing scores. It shows both local runs and runs synced from a remote results repository.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ sidebar:
---

import { Image } from 'astro:assets';
import resultsReportOverview from '../../../../assets/screenshots/results-report-overview.png';
import resultsReportDetails from '../../../../assets/screenshots/results-report-details.png';
import resultsReportOverview from '../../../../../assets/screenshots/results-report-overview.png';
import resultsReportDetails from '../../../../../assets/screenshots/results-report-details.png';

The `results` command family works on existing local AgentV run workspaces and `index.jsonl` manifests. Use it after an eval run to inspect failures, validate manifests, export artifact layouts, combine/delete local run workspaces, or generate a shareable HTML report.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/evaluation/batch-cli
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

Batch CLI evaluation handles tools that process multiple inputs at once — bulk classifiers, screening engines, or any runner that reads all tests and outputs results in one pass.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/evaluation/eval-cases
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

Tests are individual test entries within an evaluation file. Each test defines input messages, expected outcomes, and optional grader overrides.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/evaluation/eval-files
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

Evaluation files define the test cases, targets, and graders for an evaluation run. AgentV supports two formats: YAML and JSONL.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/evaluation/examples
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

This page collects complete eval file examples you can copy and adapt. Each demonstrates a different AgentV pattern.
Expand Down
4 changes: 0 additions & 4 deletions apps/web/src/content/docs/docs/v4.42.4/evaluation/rubrics.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/evaluation/rubrics
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

Rubrics are defined with `assertions` entries and support binary checklist grading and score-range analytic grading.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/evaluation/running-evals
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

## Run an Evaluation
Expand Down
4 changes: 0 additions & 4 deletions apps/web/src/content/docs/docs/v4.42.4/evaluation/sdk.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/evaluation/sdk
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

YAML remains AgentV's canonical, portable eval format. The SDK surfaces below are for cases where you want to generate YAML-shaped definitions in code, embed eval runs inside another application, or write executable graders and prompt templates. For authoring helpers, `@agentv/sdk` is AgentV's public lightweight SDK package.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/getting-started/installation
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

## Prerequisites
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/getting-started/quickstart
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

Follow these steps to create and run your first evaluation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/graders/code-graders
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

Code graders are scripts that evaluate agent responses deterministically. Write them in any language — Python, TypeScript, Node, or any executable.
Expand Down
4 changes: 0 additions & 4 deletions apps/web/src/content/docs/docs/v4.42.4/graders/composite.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/graders/composite
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

Composite graders combine multiple graders and aggregate their results into a single score. This enables sophisticated evaluation patterns like safety gates, weighted scoring, and conflict resolution.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/graders/custom-assertions
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

Custom assertions let you add evaluation logic that goes beyond built-in types. Define a TypeScript function, drop it in `.agentv/assertions/`, and reference it by name in your YAML eval files.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/graders/custom-graders
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

AgentV supports multiple grader types that can be combined for comprehensive evaluation.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/graders/execution-metrics
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

AgentV provides built-in graders for checking execution metrics against thresholds. These are useful for enforcing efficiency constraints without writing custom code.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,6 @@ sidebar:
slug: docs/v4.42.4/graders/llm-graders
editUrl: false
pagefind: false
banner:
content: |
You are viewing the frozen v4.42.4 docs. Use <a href="/docs/">Canary docs</a> for the current development version.

---

LLM graders use a language model to evaluate agent responses against custom criteria defined in a prompt file.
Expand Down
Loading
Loading