From 4ea8b690e17446f0556fb27653878f1ecdbb0397 Mon Sep 17 00:00:00 2001 From: Theo Barber-Bany Date: Thu, 27 Nov 2025 12:19:04 +0100 Subject: [PATCH 1/2] Basic evals for /api-review command This builds a basic go test suite that uses claude as a judge to review the output of the /api-review command. See tests/eval/DESIGN.md for more details. --- tests/eval/DESIGN.md | 165 ++++++++++++ tests/eval/eval_test.go | 252 ++++++++++++++++++ .../missing-optional-doc/expected.txt | 4 + .../testdata/missing-optional-doc/patch.diff | 15 ++ .../testdata/undocumented-enum/expected.txt | 2 + .../testdata/undocumented-enum/patch.diff | 16 ++ .../testdata/valid-api-change/expected.txt | 0 .../eval/testdata/valid-api-change/patch.diff | 19 ++ 8 files changed, 473 insertions(+) create mode 100644 tests/eval/DESIGN.md create mode 100644 tests/eval/eval_test.go create mode 100644 tests/eval/testdata/missing-optional-doc/expected.txt create mode 100644 tests/eval/testdata/missing-optional-doc/patch.diff create mode 100644 tests/eval/testdata/undocumented-enum/expected.txt create mode 100644 tests/eval/testdata/undocumented-enum/patch.diff create mode 100644 tests/eval/testdata/valid-api-change/expected.txt create mode 100644 tests/eval/testdata/valid-api-change/patch.diff diff --git a/tests/eval/DESIGN.md b/tests/eval/DESIGN.md new file mode 100644 index 00000000000..98850f3f3aa --- /dev/null +++ b/tests/eval/DESIGN.md @@ -0,0 +1,165 @@ +# API Review Command Eval Test Suite + +Design document for a Go/Ginkgo-based evaluation framework to test the `/api-review` Claude command against known API review scenarios. + +## Overview + +This test suite validates that the `/api-review` Claude command correctly identifies API documentation issues. Each test case consists of a patch file and expected issues. The suite applies patches to a clean clone of the repository, runs the API review command, and verifies the output matches expectations exactly (no missing issues, no hallucinated issues). + +## Directory Structure + +``` +tests/eval/ +├── eval_test.go # Main Ginkgo test suite +├── DESIGN.md # This file +├── testdata/ +│ ├── missing-optional-doc/ +│ │ ├── patch.diff # Git patch to apply +│ │ └── expected.txt # Expected issues (one per line) +│ ├── undocumented-enum/ +│ │ ├── patch.diff +│ │ └── expected.txt +│ └── valid-api-change/ +│ ├── patch.diff +│ └── expected.txt # Empty file = no issues expected +``` + +## Test Case Format + +### patch.diff + +Standard git diff format: + +```diff +diff --git a/config/v1/types.go b/config/v1/types.go +--- a/config/v1/types.go ++++ b/config/v1/types.go +@@ -10,0 +11,5 @@ ++// MyField does something ++// +optional ++// +kubebuilder:validation:Enum=Foo;Bar ++MyField string `json:"myField"` +``` + +### expected.txt + +One expected issue per line: + +``` +enum values Foo and Bar not documented in comment +optional field does not explain behavior when omitted +``` + +Empty file means the API change should pass review with no issues. + +**Note**: Order of issues in `expected.txt` does not matter. Comparison uses semantic matching, not exact string matching. + +## Test Flow + +``` +┌─────────────────────────────────────────────────────────────┐ +│ 1. Pre-flight: │ +│ a. Verify local AGENTS.md and │ +│ .claude/commands/api-review.md exist │ +│ b. These will be copied to temp dir after clone │ +│ (ensures local changes are tested) │ +├─────────────────────────────────────────────────────────────┤ +│ 2. Setup (once): │ +│ a. Shallow clone openshift/api to temp dir │ +│ b. Copy local AGENTS.md and .claude/ to temp dir │ +├─────────────────────────────────────────────────────────────┤ +│ 3. For each test case (sequential): │ +│ a. Reset repo to clean state │ +│ b. Apply patch.diff │ +│ c. Run claude api-review on changed files │ +│ d. Run claude to compare output vs expected.txt │ +│ e. Parse true/false response, assert │ +├─────────────────────────────────────────────────────────────┤ +│ 4. Teardown: Remove temp dir │ +└─────────────────────────────────────────────────────────────┘ +``` + +## Reset Between Tests + +```bash +git reset --hard origin/master && git clean -fd +``` + +- `git reset --hard origin/master`: Resets all tracked files to match the remote master branch, discarding any local commits and staged/unstaged changes +- `git clean -f`: Force remove untracked files (files not in git) +- `git clean -d`: Also remove untracked directories + +After reset, re-copy `AGENTS.md` and `.claude/` from local source to preserve local modifications being tested. + +**Remote origin handling**: The shallow clone creates `origin` automatically. Before reset, verify remote exists: + +```bash +git remote get-url origin || git remote add origin https://github.com/openshift/api.git +``` + +## Claude Invocations + +### Step 1 - Run API Review + +```bash +claude --print -p "/api-review" --allowedTools "Bash,Read,Grep,Glob,Task" +``` + +### Step 2 - Compare Results + +```bash +claude --print -p "Given this API review output: + + +Expected issues (one per line): + + +Compare the review output against the expected issues list. +The result is 'true' ONLY if: +1. ALL expected issues are identified in the output +2. NO additional issues are reported beyond what is expected + +If any expected issue is missing, reply 'false'. +If any issue is reported that is NOT in the expected list, reply 'false'. + +Reply with exactly 'true' or 'false' (no other text)." +``` + +Parse response, trim whitespace, check for `true` or `false`. + +## Pre-flight Check + +Before cloning, verify these local files exist in the source repo: + +- `AGENTS.md` +- `.claude/commands/api-review.md` + +These are copied into the temp clone so that any local modifications to the review command are tested, not the remote versions. + +## Configuration + +| Setting | Value | +|---------|-------| +| Timeout per Claude call | 5 minutes | +| Execution mode | Sequential | +| Clone depth | 1 (shallow) | +| Clone source | `https://github.com/openshift/api.git` | +| Reset between tests | Verify origin remote exists, `git reset --hard origin/master && git clean -fd`, re-copy local AGENTS.md and .claude/ | + +--- + +## Phase 2 (Future Work) + +### Patch Stability + +Patches may fail to apply as `origin/master` evolves over time. Need a strategy to handle this (e.g., pinning to a specific commit). + +### Error Handling + +Current design does not address failure scenarios: + +- Patch application failures +- Claude CLI timeouts or crashes +- Empty or malformed output from Claude +- Authentication failures +- Resource cleanup on test failures \ No newline at end of file diff --git a/tests/eval/eval_test.go b/tests/eval/eval_test.go new file mode 100644 index 00000000000..5eec55c381b --- /dev/null +++ b/tests/eval/eval_test.go @@ -0,0 +1,252 @@ +package eval + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "os" + "os/exec" + "path/filepath" + "strings" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" +) + +const ( + claudeTimeout = 5 * time.Minute + cloneURL = "https://github.com/openshift/api.git" + testdataDir = "testdata" + patchFileName = "patch.diff" + expectedFileName = "expected.txt" +) + +var ( + tempDir string + localRepoRoot string + testCases []string +) + +func TestEval(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "API Review Eval Suite") +} + +var _ = BeforeSuite(func() { + var err error + localRepoRoot, err = filepath.Abs(filepath.Join("..", "..")) + Expect(err).NotTo(HaveOccurred()) + + By("verifying local AGENTS.md exists") + _, err = os.Stat(filepath.Join(localRepoRoot, "AGENTS.md")) + Expect(err).NotTo(HaveOccurred(), "AGENTS.md must exist in repository root") + + By("verifying local .claude/commands/api-review.md exists") + _, err = os.Stat(filepath.Join(localRepoRoot, ".claude", "commands", "api-review.md")) + Expect(err).NotTo(HaveOccurred(), ".claude/commands/api-review.md must exist") + + By("creating temp directory for clone") + tempDir, err = os.MkdirTemp("", "api-review-eval-*") + Expect(err).NotTo(HaveOccurred()) + + By("shallow cloning openshift/api") + cmd := exec.Command("git", "clone", "--depth", "1", cloneURL, tempDir) + output, err := cmd.CombinedOutput() + Expect(err).NotTo(HaveOccurred(), "git clone failed: %s", string(output)) + + copyLocalFiles() + + testdataPath := filepath.Join(localRepoRoot, "tests", "eval", testdataDir) + testCases, err = discoverTestCases(testdataPath) + Expect(err).NotTo(HaveOccurred()) + Expect(testCases).NotTo(BeEmpty(), "no test cases found in testdata directory") +}) + +var _ = AfterSuite(func() { + if tempDir != "" { + By("cleaning up temp directory") + os.RemoveAll(tempDir) + } +}) + +func copyLocalFiles() { + By("copying local AGENTS.md to temp clone") + src := filepath.Join(localRepoRoot, "AGENTS.md") + dst := filepath.Join(tempDir, "AGENTS.md") + data, err := os.ReadFile(src) + Expect(err).NotTo(HaveOccurred()) + err = os.WriteFile(dst, data, 0644) + Expect(err).NotTo(HaveOccurred()) + + By("copying local .claude directory to temp clone") + srcClaudeDir := filepath.Join(localRepoRoot, ".claude") + dstClaudeDir := filepath.Join(tempDir, ".claude") + os.RemoveAll(dstClaudeDir) + claudeFS := os.DirFS(srcClaudeDir) + err = os.CopyFS(dstClaudeDir, claudeFS) + Expect(err).NotTo(HaveOccurred()) +} + +func resetRepo() { + By("verifying origin remote exists") + cmd := exec.Command("git", "remote", "get-url", "origin") + cmd.Dir = tempDir + if err := cmd.Run(); err != nil { + addCmd := exec.Command("git", "remote", "add", "origin", cloneURL) + addCmd.Dir = tempDir + output, err := addCmd.CombinedOutput() + Expect(err).NotTo(HaveOccurred(), "failed to add origin remote: %s", string(output)) + } + + By("resetting repo to clean state") + cmd = exec.Command("git", "reset", "--hard", "origin/master") + cmd.Dir = tempDir + output, err := cmd.CombinedOutput() + Expect(err).NotTo(HaveOccurred(), "git reset failed: %s", string(output)) + + cmd = exec.Command("git", "clean", "-fd") + cmd.Dir = tempDir + output, err = cmd.CombinedOutput() + Expect(err).NotTo(HaveOccurred(), "git clean failed: %s", string(output)) + + copyLocalFiles() +} + +func discoverTestCases(testdataPath string) ([]string, error) { + entries, err := os.ReadDir(testdataPath) + if err != nil { + return nil, fmt.Errorf("failed to read testdata directory: %w", err) + } + + var cases []string + for _, entry := range entries { + if entry.IsDir() { + patchPath := filepath.Join(testdataPath, entry.Name(), patchFileName) + expectedPath := filepath.Join(testdataPath, entry.Name(), expectedFileName) + + if _, err := os.Stat(patchPath); err != nil { + return nil, fmt.Errorf("patch.diff missing in %s: %w", entry.Name(), err) + } + if _, err := os.Stat(expectedPath); err != nil { + return nil, fmt.Errorf("expected.txt missing in %s: %w", entry.Name(), err) + } + + cases = append(cases, entry.Name()) + } + } + return cases, nil +} + +func loadTableEntries() []TableEntry { + cwd, err := os.Getwd() + Expect(err).NotTo(HaveOccurred()) + + testdataPath := filepath.Join(cwd, testdataDir) + cases, err := discoverTestCases(testdataPath) + Expect(err).NotTo(HaveOccurred()) + + var entries []TableEntry + for _, tc := range cases { + entries = append(entries, Entry(tc, tc)) + } + return entries +} + +var _ = Describe("API Review Evaluation", func() { + tableEntries := loadTableEntries() + + DescribeTable("should correctly review", + func(tc string) { + resetRepo() + + testCaseDir := filepath.Join(localRepoRoot, "tests", "eval", testdataDir, tc) + patchPath := filepath.Join(testCaseDir, patchFileName) + expectedPath := filepath.Join(testCaseDir, expectedFileName) + + By("reading patch file") + patchContent, err := os.ReadFile(patchPath) + Expect(err).NotTo(HaveOccurred()) + + By("applying patch") + cmd := exec.Command("git", "apply", "-") + cmd.Dir = tempDir + cmd.Stdin = bytes.NewReader(patchContent) + output, err := cmd.CombinedOutput() + Expect(err).NotTo(HaveOccurred(), "git apply failed: %s", string(output)) + + By("reading expected issues") + expectedContent, err := os.ReadFile(expectedPath) + Expect(err).NotTo(HaveOccurred()) + + By("running API review via Claude") + ctx, cancel := context.WithTimeout(context.Background(), claudeTimeout) + defer cancel() + + reviewCmd := exec.CommandContext(ctx, "claude", + "--print", + "--dangerously-skip-permissions", + "-p", "/api-review", + "--allowedTools", "Bash,Read,Grep,Glob,Task", + ) + reviewCmd.Dir = tempDir + + reviewOutput, err := reviewCmd.CombinedOutput() + Expect(err).NotTo(HaveOccurred(), "claude command failed: %s", string(reviewOutput)) + + reviewResult := string(reviewOutput) + + By("comparing results with Claude") + expectedStr := strings.TrimSpace(string(expectedContent)) + + comparePrompt := `You are a judge evaluating an API review output against expected issues. + +API review output: +` + reviewResult + ` + +Expected issues (one per line): +` + expectedStr + ` + +Compare using SEMANTIC matching - focus on whether the same fundamental problems were identified, not exact wording or action item counts. + +You should return pass=true ONLY if BOTH conditions are met: +1. ALL expected issues are semantically covered in the output (the same core problem is identified, even if described differently or split into sub-items) +2. NO unrelated issues are reported - if the review identifies a problem that is NOT semantically related to any expected issue, you should return pass=false + +Expanding on an expected issue is OK (e.g., "missing FeatureGate" expanding to include "register in features.go"). +Reporting an entirely different issue is NOT OK (e.g., if "missing length validation" is not in expected list, you should return pass=false). + +Examples of semantic matches: +- "missing FeatureGate" matches "needs FeatureGate and must register it in features.go" +- "optional field missing omitted behavior" matches "field does not document what happens when not specified" + +You should respond with ONLY a JSON object in this exact format (no markdown, no other text): +{"pass": true, "reason": "Brief summary of matched issues"} +or +{"pass": false, "reason": "Explanation of what was missing or what unexpected issue was found"}` + + ctx2, cancel2 := context.WithTimeout(context.Background(), claudeTimeout) + defer cancel2() + + compareCmd := exec.CommandContext(ctx2, "claude", "--print", "--dangerously-skip-permissions", "-p", comparePrompt) + compareCmd.Dir = tempDir + + compareOutput, err := compareCmd.CombinedOutput() + Expect(err).NotTo(HaveOccurred(), "claude compare command failed: %s", string(compareOutput)) + + var judgeResult struct { + Pass bool `json:"pass"` + Reason string `json:"reason"` + } + compareStr := strings.TrimSpace(string(compareOutput)) + err = json.Unmarshal([]byte(compareStr), &judgeResult) + Expect(err).NotTo(HaveOccurred(), "failed to parse judge response as JSON: %s", compareStr) + + GinkgoWriter.Printf("Judge result: pass=%v, reason=%s\n", judgeResult.Pass, judgeResult.Reason) + Expect(judgeResult.Pass).To(BeTrue(), "API review did not match expected issues.\nJudge reason: %s\nReview output:\n%s\nExpected issues:\n%s", judgeResult.Reason, reviewResult, expectedStr) + }, + tableEntries, + ) +}) diff --git a/tests/eval/testdata/missing-optional-doc/expected.txt b/tests/eval/testdata/missing-optional-doc/expected.txt new file mode 100644 index 00000000000..cd688369f48 --- /dev/null +++ b/tests/eval/testdata/missing-optional-doc/expected.txt @@ -0,0 +1,4 @@ +optional field customLogoURL does not explain behavior when omitted +missing URL validation pattern for customLogoURL +missing FeatureGate for new field on stable API +missing length validation for customLogoURL diff --git a/tests/eval/testdata/missing-optional-doc/patch.diff b/tests/eval/testdata/missing-optional-doc/patch.diff new file mode 100644 index 00000000000..ded8aac9385 --- /dev/null +++ b/tests/eval/testdata/missing-optional-doc/patch.diff @@ -0,0 +1,15 @@ +diff --git a/config/v1/types_console.go b/config/v1/types_console.go +--- a/config/v1/types_console.go ++++ b/config/v1/types_console.go +@@ -33,7 +33,11 @@ type ConsoleSpec struct { + // ConsoleSpec is the specification of the desired behavior of the Console. + type ConsoleSpec struct { + // +optional + Authentication ConsoleAuthentication `json:"authentication"` ++ ++ // customLogoURL specifies a URL for a custom logo image. ++ // +optional ++ CustomLogoURL string `json:"customLogoURL,omitempty"` + } + + // ConsoleStatus defines the observed status of the Console. diff --git a/tests/eval/testdata/undocumented-enum/expected.txt b/tests/eval/testdata/undocumented-enum/expected.txt new file mode 100644 index 00000000000..f3b2cf342f1 --- /dev/null +++ b/tests/eval/testdata/undocumented-enum/expected.txt @@ -0,0 +1,2 @@ +enum values Light and Dark not documented in comment +optional field theme does not explain behavior when omitted diff --git a/tests/eval/testdata/undocumented-enum/patch.diff b/tests/eval/testdata/undocumented-enum/patch.diff new file mode 100644 index 00000000000..0f5e90511ce --- /dev/null +++ b/tests/eval/testdata/undocumented-enum/patch.diff @@ -0,0 +1,16 @@ +diff --git a/config/v1/types_console.go b/config/v1/types_console.go +--- a/config/v1/types_console.go ++++ b/config/v1/types_console.go +@@ -33,7 +33,12 @@ type ConsoleSpec struct { + // ConsoleSpec is the specification of the desired behavior of the Console. + type ConsoleSpec struct { + // +optional + Authentication ConsoleAuthentication `json:"authentication"` ++ ++ // theme specifies the console color theme. ++ // +optional ++ // +kubebuilder:validation:Enum=Light;Dark ++ Theme string `json:"theme,omitempty"` + } + + // ConsoleStatus defines the observed status of the Console. diff --git a/tests/eval/testdata/valid-api-change/expected.txt b/tests/eval/testdata/valid-api-change/expected.txt new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/eval/testdata/valid-api-change/patch.diff b/tests/eval/testdata/valid-api-change/patch.diff new file mode 100644 index 00000000000..92fe9cc2927 --- /dev/null +++ b/tests/eval/testdata/valid-api-change/patch.diff @@ -0,0 +1,19 @@ +diff --git a/config/v1/types_console.go b/config/v1/types_console.go +--- a/config/v1/types_console.go ++++ b/config/v1/types_console.go +@@ -33,7 +33,15 @@ type ConsoleSpec struct { + // ConsoleSpec is the specification of the desired behavior of the Console. + type ConsoleSpec struct { + // +optional + Authentication ConsoleAuthentication `json:"authentication"` ++ ++ // bannerText is an optional field that specifies a custom banner message ++ // to display at the top of the console. Valid values are "Info", "Warning", ++ // and "Error" which control the banner styling. When omitted, no banner ++ // is displayed. ++ // +optional ++ // +kubebuilder:validation:Enum=Info;Warning;Error ++ BannerText string `json:"bannerText,omitempty"` + } + + // ConsoleStatus defines the observed status of the Console. From 0c769f001519c05c06c67b48a46e242cc8321bc2 Mon Sep 17 00:00:00 2001 From: Theo Barber-Bany Date: Thu, 27 Nov 2025 17:13:13 +0100 Subject: [PATCH 2/2] WIP: Refactor + golden/integration + different models --- tests/eval/DESIGN.md | 122 +++++++-- tests/eval/eval_test.go | 250 ++++++++++++------ .../golden/missing-optional-doc/expected.txt | 1 + .../golden/missing-optional-doc/patch.diff | 19 ++ .../golden/undocumented-enum/expected.txt | 1 + .../{ => golden}/undocumented-enum/patch.diff | 3 +- .../valid-api-change/expected.txt | 0 .../{ => golden}/valid-api-change/patch.diff | 0 .../new-field-multiple-issues}/expected.txt | 0 .../new-field-multiple-issues}/patch.diff | 0 .../testdata/undocumented-enum/expected.txt | 2 - 11 files changed, 295 insertions(+), 103 deletions(-) create mode 100644 tests/eval/testdata/golden/missing-optional-doc/expected.txt create mode 100644 tests/eval/testdata/golden/missing-optional-doc/patch.diff create mode 100644 tests/eval/testdata/golden/undocumented-enum/expected.txt rename tests/eval/testdata/{ => golden}/undocumented-enum/patch.diff (85%) rename tests/eval/testdata/{ => golden}/valid-api-change/expected.txt (100%) rename tests/eval/testdata/{ => golden}/valid-api-change/patch.diff (100%) rename tests/eval/testdata/{missing-optional-doc => integration/new-field-multiple-issues}/expected.txt (100%) rename tests/eval/testdata/{missing-optional-doc => integration/new-field-multiple-issues}/patch.diff (100%) delete mode 100644 tests/eval/testdata/undocumented-enum/expected.txt diff --git a/tests/eval/DESIGN.md b/tests/eval/DESIGN.md index 98850f3f3aa..ca1c0bc5a88 100644 --- a/tests/eval/DESIGN.md +++ b/tests/eval/DESIGN.md @@ -13,15 +13,20 @@ tests/eval/ ├── eval_test.go # Main Ginkgo test suite ├── DESIGN.md # This file ├── testdata/ -│ ├── missing-optional-doc/ -│ │ ├── patch.diff # Git patch to apply -│ │ └── expected.txt # Expected issues (one per line) -│ ├── undocumented-enum/ -│ │ ├── patch.diff -│ │ └── expected.txt -│ └── valid-api-change/ -│ ├── patch.diff -│ └── expected.txt # Empty file = no issues expected +│ ├── golden/ # Single-issue tests +│ │ ├── missing-optional-doc/ +│ │ │ ├── patch.diff +│ │ │ └── expected.txt +│ │ ├── undocumented-enum/ +│ │ │ ├── patch.diff +│ │ │ └── expected.txt +│ │ └── valid-api-change/ +│ │ ├── patch.diff +│ │ └── expected.txt # Empty file = no issues expected +│ └── integration/ # Multi-issue tests +│ └── new-field-multiple-issues/ +│ ├── patch.diff +│ └── expected.txt ``` ## Test Case Format @@ -148,18 +153,105 @@ These are copied into the temp clone so that any local modifications to the revi --- -## Phase 2 (Future Work) +## Phase 2 + +### Cost Tracking + +Use `--output-format json` to capture `total_cost_usd` from each Claude invocation. Accumulate across all calls (review + judge) and print the total in `AfterSuite`. + +### Test Structure Reorganization ✅ IMPLEMENTED + +Reorganize `testdata/` into two categories: + +``` +tests/eval/testdata/ +├── golden/ # Base truth tests - single isolated issues +│ ├── missing-optional-doc/ +│ │ ├── patch.diff # Triggers ONLY missing-optional-doc +│ │ └── expected.txt +│ ├── undocumented-enum/ +│ │ ├── patch.diff # Triggers ONLY undocumented-enum +│ │ └── expected.txt +│ ├── missing-featuregate/ +│ │ ├── patch.diff # Triggers ONLY missing-featuregate +│ │ └── expected.txt +│ └── valid-api-change/ +│ ├── patch.diff # Triggers NO issues +│ └── expected.txt +└── integration/ # Complex scenarios - multiple issues + ├── new-field-all-issues/ + │ ├── patch.diff # Triggers multiple issues together + │ └── expected.txt + └── partial-documentation/ + ├── patch.diff + └── expected.txt +``` + +**Golden tests**: Each patch is carefully crafted to trigger exactly one issue type. These validate that the review command correctly identifies individual issue categories in isolation. + +**Integration tests**: Patches that trigger multiple issues, testing the review command's ability to identify combinations of problems in realistic scenarios. + +### Model Selection ✅ IMPLEMENTED + +Each test tier has a default model, overridable via environment variable: + +| Test Type | Default Model | Override Env Var | +|-----------|---------------|------------------| +| Golden tests | Sonnet | `EVAL_GOLDEN_MODEL` | +| Integration tests | Opus | `EVAL_INTEGRATION_MODEL` | +| Judge LLM | Haiku | `EVAL_JUDGE_MODEL` | + +The test suite reads these at startup and applies per-tier: + +```go +goldenModel := getEnvOrDefault("EVAL_GOLDEN_MODEL", "claude-sonnet-4-5@20250929") +integrationModel := getEnvOrDefault("EVAL_INTEGRATION_MODEL", "claude-opus-4-5@20251101") +judgeModel := getEnvOrDefault("EVAL_JUDGE_MODEL", "claude-haiku-4-5-20251001") +``` + +Usage: +```bash +# Use defaults +go test ./tests/eval/... + +# Override golden tests to use Haiku +EVAL_GOLDEN_MODEL=claude-3-haiku-20240307 go test ./tests/eval/... + +# Override all models +EVAL_GOLDEN_MODEL=claude-3-haiku-20240307 \ +EVAL_INTEGRATION_MODEL=claude-sonnet-4-20250514 \ +go test ./tests/eval/... +``` ### Patch Stability -Patches may fail to apply as `origin/master` evolves over time. Need a strategy to handle this (e.g., pinning to a specific commit). +Patches may fail to apply as `origin/master` evolves over time. Strategies: + +- Pin to a specific commit SHA in the clone step +- Use `git apply --3way` for better conflict handling +- Periodic patch refresh CI job ### Error Handling Current design does not address failure scenarios: - Patch application failures -- Claude CLI timeouts or crashes -- Empty or malformed output from Claude -- Authentication failures -- Resource cleanup on test failures \ No newline at end of file +- Resource cleanup on test failures + +Using `--output-format json` also enables better error handling in future phases: + +- Claude CLI timeouts or crashes (detect via JSON parse failure or missing fields) +- Empty or malformed output (validate JSON structure) +- Authentication failures (check for error fields in JSON response) + +### Performance Optimizations + +The API review step is the slowest part of the eval suite. Options to improve: + +1. **Skip linting by default** - Update api-review command to skip `make lint` unless explicitly requested. Linting adds significant time. + +2. **Cache review outputs** - For development, cache the review output keyed by patch hash. Skip re-running if cached result exists. Clear cache on command changes. + +3. **Parallel test execution** - Run golden tests in parallel (requires separate repo clones per test). + +4. **Smaller/faster model for development** - Use Haiku for rapid iteration, Sonnet/Opus for CI validation. \ No newline at end of file diff --git a/tests/eval/eval_test.go b/tests/eval/eval_test.go index 5eec55c381b..cfccbc884b2 100644 --- a/tests/eval/eval_test.go +++ b/tests/eval/eval_test.go @@ -20,14 +20,49 @@ const ( claudeTimeout = 5 * time.Minute cloneURL = "https://github.com/openshift/api.git" testdataDir = "testdata" + goldenDir = "golden" + integrationDir = "integration" patchFileName = "patch.diff" expectedFileName = "expected.txt" + + defaultGoldenModel = "claude-sonnet-4-5@20250929" + defaultIntegrationModel = "claude-opus-4-5@20251101" + defaultJudgeModel = "claude-haiku-4-5@20251001" + + judgePromptTemplate = `You are a judge evaluating an API review output against expected issues. + +API review output: +%s + +Expected issues (one per line): +%s + +Compare using SEMANTIC matching - focus on whether the same fundamental problems were identified, not exact wording or action item counts. + +You should return pass=true ONLY if BOTH conditions are met: +1. ALL expected issues are semantically covered in the output (the same core problem is identified, even if described differently or split into sub-items) +2. NO unrelated issues are reported - if the review identifies a problem that is NOT semantically related to any expected issue, you should return pass=false + +Expanding on an expected issue is OK (e.g., "missing FeatureGate" expanding to include "register in features.go"). +Reporting an entirely different issue is NOT OK (e.g., if "missing length validation" is not in expected list, you should return pass=false). + +Examples of semantic matches: +- "missing FeatureGate" matches "needs FeatureGate and must register it in features.go" +- "optional field missing omitted behavior" matches "field does not document what happens when not specified" + +You MUST respond with ONLY a raw JSON object. Do NOT wrap in markdown code blocks. Do NOT include any other text. +{"pass": true, "reason": "Brief summary of matched issues"} +or +{"pass": false, "reason": "Explanation of what was missing or what unexpected issue was found"}` ) var ( - tempDir string - localRepoRoot string - testCases []string + tempDir string + localRepoRoot string + testCases []string + goldenModel string + integrationModel string + judgeModel string ) func TestEval(t *testing.T) { @@ -35,7 +70,18 @@ func TestEval(t *testing.T) { RunSpecs(t, "API Review Eval Suite") } +func envOrDefault(key, defaultVal string) string { + if val, ok := os.LookupEnv(key); ok { + return val + } + return defaultVal +} + var _ = BeforeSuite(func() { + goldenModel = envOrDefault("EVAL_GOLDEN_MODEL", defaultGoldenModel) + integrationModel = envOrDefault("EVAL_INTEGRATION_MODEL", defaultIntegrationModel) + judgeModel = envOrDefault("EVAL_JUDGE_MODEL", defaultJudgeModel) + var err error localRepoRoot, err = filepath.Abs(filepath.Join("..", "..")) Expect(err).NotTo(HaveOccurred()) @@ -59,10 +105,10 @@ var _ = BeforeSuite(func() { copyLocalFiles() - testdataPath := filepath.Join(localRepoRoot, "tests", "eval", testdataDir) - testCases, err = discoverTestCases(testdataPath) + goldenPath := filepath.Join(localRepoRoot, "tests", "eval", testdataDir, goldenDir) + testCases, err = discoverTestCases(goldenPath) Expect(err).NotTo(HaveOccurred()) - Expect(testCases).NotTo(BeEmpty(), "no test cases found in testdata directory") + Expect(testCases).NotTo(BeEmpty(), "no test cases found in testdata/golden directory") }) var _ = AfterSuite(func() { @@ -140,12 +186,12 @@ func discoverTestCases(testdataPath string) ([]string, error) { return cases, nil } -func loadTableEntries() []TableEntry { +func loadGoldenEntries() []TableEntry { cwd, err := os.Getwd() Expect(err).NotTo(HaveOccurred()) - testdataPath := filepath.Join(cwd, testdataDir) - cases, err := discoverTestCases(testdataPath) + goldenPath := filepath.Join(cwd, testdataDir, goldenDir) + cases, err := discoverTestCases(goldenPath) Expect(err).NotTo(HaveOccurred()) var entries []TableEntry @@ -155,98 +201,132 @@ func loadTableEntries() []TableEntry { return entries } -var _ = Describe("API Review Evaluation", func() { - tableEntries := loadTableEntries() - - DescribeTable("should correctly review", - func(tc string) { - resetRepo() - - testCaseDir := filepath.Join(localRepoRoot, "tests", "eval", testdataDir, tc) - patchPath := filepath.Join(testCaseDir, patchFileName) - expectedPath := filepath.Join(testCaseDir, expectedFileName) - - By("reading patch file") - patchContent, err := os.ReadFile(patchPath) - Expect(err).NotTo(HaveOccurred()) - - By("applying patch") - cmd := exec.Command("git", "apply", "-") - cmd.Dir = tempDir - cmd.Stdin = bytes.NewReader(patchContent) - output, err := cmd.CombinedOutput() - Expect(err).NotTo(HaveOccurred(), "git apply failed: %s", string(output)) +func loadIntegrationEntries() []TableEntry { + cwd, err := os.Getwd() + Expect(err).NotTo(HaveOccurred()) - By("reading expected issues") - expectedContent, err := os.ReadFile(expectedPath) - Expect(err).NotTo(HaveOccurred()) + integrationPath := filepath.Join(cwd, testdataDir, integrationDir) + cases, err := discoverTestCases(integrationPath) + if err != nil || len(cases) == 0 { + return nil + } - By("running API review via Claude") - ctx, cancel := context.WithTimeout(context.Background(), claudeTimeout) - defer cancel() + var entries []TableEntry + for _, tc := range cases { + entries = append(entries, Entry(tc, tc)) + } + return entries +} - reviewCmd := exec.CommandContext(ctx, "claude", - "--print", - "--dangerously-skip-permissions", - "-p", "/api-review", - "--allowedTools", "Bash,Read,Grep,Glob,Task", - ) - reviewCmd.Dir = tempDir +type evalResult struct { + Pass bool `json:"pass"` + Reason string `json:"reason"` +} - reviewOutput, err := reviewCmd.CombinedOutput() - Expect(err).NotTo(HaveOccurred(), "claude command failed: %s", string(reviewOutput)) +func stripMarkdownCodeBlock(s string) string { + s = strings.TrimSpace(s) + s = strings.TrimPrefix(s, "```json") + s = strings.TrimPrefix(s, "```") + s = strings.TrimSuffix(s, "```") + return strings.TrimSpace(s) +} - reviewResult := string(reviewOutput) +func readAndApplyPatch(patchPath string) { + By("reading and applying patch") + patchContent, err := os.ReadFile(patchPath) + Expect(err).NotTo(HaveOccurred()) - By("comparing results with Claude") - expectedStr := strings.TrimSpace(string(expectedContent)) + cmd := exec.Command("git", "apply", "-") + cmd.Dir = tempDir + cmd.Stdin = bytes.NewReader(patchContent) + output, err := cmd.CombinedOutput() + Expect(err).NotTo(HaveOccurred(), "git apply failed: %s", string(output)) +} - comparePrompt := `You are a judge evaluating an API review output against expected issues. +// runAPIReview and runJudge can probably share some common code. +func runAPIReview(model string) string { + By(fmt.Sprintf("running API review via Claude (%s)", model)) + ctx, cancel := context.WithTimeout(context.Background(), claudeTimeout) + defer cancel() + + cmd := exec.CommandContext(ctx, "claude", + "--print", + "--dangerously-skip-permissions", + "--model", model, + "-p", "/api-review", + "--allowedTools", "Bash,Read,Grep,Glob,Task", + ) + cmd.Dir = tempDir -API review output: -` + reviewResult + ` + output, err := cmd.CombinedOutput() + Expect(err).NotTo(HaveOccurred(), "claude command failed: %s", string(output)) + return string(output) +} -Expected issues (one per line): -` + expectedStr + ` +func runJudge(model, reviewOutput, expectedIssues string) evalResult { + By(fmt.Sprintf("comparing results with Claude judge (%s)", model)) + ctx, cancel := context.WithTimeout(context.Background(), claudeTimeout) + defer cancel() + + prompt := fmt.Sprintf(judgePromptTemplate, reviewOutput, expectedIssues) + cmd := exec.CommandContext(ctx, "claude", + "--print", + "--dangerously-skip-permissions", + "--model", model, + "-p", prompt, + ) + cmd.Dir = tempDir -Compare using SEMANTIC matching - focus on whether the same fundamental problems were identified, not exact wording or action item counts. + output, err := cmd.CombinedOutput() + Expect(err).NotTo(HaveOccurred(), "claude judge command failed: %s", string(output)) -You should return pass=true ONLY if BOTH conditions are met: -1. ALL expected issues are semantically covered in the output (the same core problem is identified, even if described differently or split into sub-items) -2. NO unrelated issues are reported - if the review identifies a problem that is NOT semantically related to any expected issue, you should return pass=false + var result evalResult + jsonStr := stripMarkdownCodeBlock(string(output)) + err = json.Unmarshal([]byte(jsonStr), &result) + Expect(err).NotTo(HaveOccurred(), "failed to parse judge response as JSON: %s", string(output)) + return result +} -Expanding on an expected issue is OK (e.g., "missing FeatureGate" expanding to include "register in features.go"). -Reporting an entirely different issue is NOT OK (e.g., if "missing length validation" is not in expected list, you should return pass=false). +func runTestCase(tier, tc, reviewModel, judgeModelName string) { + resetRepo() -Examples of semantic matches: -- "missing FeatureGate" matches "needs FeatureGate and must register it in features.go" -- "optional field missing omitted behavior" matches "field does not document what happens when not specified" + testCaseDir := filepath.Join(localRepoRoot, "tests", "eval", testdataDir, tier, tc) + readAndApplyPatch(filepath.Join(testCaseDir, patchFileName)) -You should respond with ONLY a JSON object in this exact format (no markdown, no other text): -{"pass": true, "reason": "Brief summary of matched issues"} -or -{"pass": false, "reason": "Explanation of what was missing or what unexpected issue was found"}` + expectedContent, err := os.ReadFile(filepath.Join(testCaseDir, expectedFileName)) + Expect(err).NotTo(HaveOccurred()) + expectedIssues := strings.TrimSpace(string(expectedContent)) - ctx2, cancel2 := context.WithTimeout(context.Background(), claudeTimeout) - defer cancel2() + reviewOutput := runAPIReview(reviewModel) + result := runJudge(judgeModelName, reviewOutput, expectedIssues) - compareCmd := exec.CommandContext(ctx2, "claude", "--print", "--dangerously-skip-permissions", "-p", comparePrompt) - compareCmd.Dir = tempDir + GinkgoWriter.Printf("Judge result: pass=%v, reason=%s\n", result.Pass, result.Reason) + Expect(result.Pass).To(BeTrue(), "API review did not match expected issues.\nJudge reason: %s\nReview output:\n%s\nExpected issues:\n%s", result.Reason, reviewOutput, expectedIssues) +} - compareOutput, err := compareCmd.CombinedOutput() - Expect(err).NotTo(HaveOccurred(), "claude compare command failed: %s", string(compareOutput)) +var _ = Describe("API Review Evaluation", func() { + Context("Golden Tests", func() { + goldenEntries := loadGoldenEntries() + + DescribeTable("should correctly identify single issues", + func(tc string) { + runTestCase(goldenDir, tc, goldenModel, judgeModel) + }, + goldenEntries, + ) + }) + + Context("Integration Tests", func() { + integrationEntries := loadIntegrationEntries() + if len(integrationEntries) == 0 { + return + } - var judgeResult struct { - Pass bool `json:"pass"` - Reason string `json:"reason"` - } - compareStr := strings.TrimSpace(string(compareOutput)) - err = json.Unmarshal([]byte(compareStr), &judgeResult) - Expect(err).NotTo(HaveOccurred(), "failed to parse judge response as JSON: %s", compareStr) - - GinkgoWriter.Printf("Judge result: pass=%v, reason=%s\n", judgeResult.Pass, judgeResult.Reason) - Expect(judgeResult.Pass).To(BeTrue(), "API review did not match expected issues.\nJudge reason: %s\nReview output:\n%s\nExpected issues:\n%s", judgeResult.Reason, reviewResult, expectedStr) - }, - tableEntries, - ) + DescribeTable("should correctly identify multiple issues", + func(tc string) { + runTestCase(integrationDir, tc, integrationModel, judgeModel) + }, + integrationEntries, + ) + }) }) diff --git a/tests/eval/testdata/golden/missing-optional-doc/expected.txt b/tests/eval/testdata/golden/missing-optional-doc/expected.txt new file mode 100644 index 00000000000..05fdebf7d9d --- /dev/null +++ b/tests/eval/testdata/golden/missing-optional-doc/expected.txt @@ -0,0 +1 @@ +optional field does not explain behavior when omitted diff --git a/tests/eval/testdata/golden/missing-optional-doc/patch.diff b/tests/eval/testdata/golden/missing-optional-doc/patch.diff new file mode 100644 index 00000000000..0158a228f4a --- /dev/null +++ b/tests/eval/testdata/golden/missing-optional-doc/patch.diff @@ -0,0 +1,19 @@ +diff --git a/config/v1/types_console.go b/config/v1/types_console.go +--- a/config/v1/types_console.go ++++ b/config/v1/types_console.go +@@ -33,7 +33,15 @@ type ConsoleSpec struct { + // ConsoleSpec is the specification of the desired behavior of the Console. + type ConsoleSpec struct { + // +optional + Authentication ConsoleAuthentication `json:"authentication"` ++ ++ // customLogoURL specifies a URL for a custom logo image. ++ // The URL must be a valid HTTPS URL and cannot exceed 2048 characters. ++ // +optional ++ // +openshift:enable:FeatureGate=CustomConsoleLogo ++ // +kubebuilder:validation:Pattern=`^$|^https://[^\s]+$` ++ // +kubebuilder:validation:MaxLength=2048 ++ CustomLogoURL string `json:"customLogoURL,omitempty"` + } + + // ConsoleStatus defines the observed status of the Console. diff --git a/tests/eval/testdata/golden/undocumented-enum/expected.txt b/tests/eval/testdata/golden/undocumented-enum/expected.txt new file mode 100644 index 00000000000..3a6cd6f7a13 --- /dev/null +++ b/tests/eval/testdata/golden/undocumented-enum/expected.txt @@ -0,0 +1 @@ +enum values Light and Dark not documented in comment diff --git a/tests/eval/testdata/undocumented-enum/patch.diff b/tests/eval/testdata/golden/undocumented-enum/patch.diff similarity index 85% rename from tests/eval/testdata/undocumented-enum/patch.diff rename to tests/eval/testdata/golden/undocumented-enum/patch.diff index 0f5e90511ce..13f28656c00 100644 --- a/tests/eval/testdata/undocumented-enum/patch.diff +++ b/tests/eval/testdata/golden/undocumented-enum/patch.diff @@ -1,7 +1,7 @@ diff --git a/config/v1/types_console.go b/config/v1/types_console.go --- a/config/v1/types_console.go +++ b/config/v1/types_console.go -@@ -33,7 +33,12 @@ type ConsoleSpec struct { +@@ -33,7 +33,13 @@ type ConsoleSpec struct { // ConsoleSpec is the specification of the desired behavior of the Console. type ConsoleSpec struct { // +optional @@ -10,6 +10,7 @@ diff --git a/config/v1/types_console.go b/config/v1/types_console.go + // theme specifies the console color theme. + // +optional + // +kubebuilder:validation:Enum=Light;Dark ++ // When omitted the default theme is used. + Theme string `json:"theme,omitempty"` } diff --git a/tests/eval/testdata/valid-api-change/expected.txt b/tests/eval/testdata/golden/valid-api-change/expected.txt similarity index 100% rename from tests/eval/testdata/valid-api-change/expected.txt rename to tests/eval/testdata/golden/valid-api-change/expected.txt diff --git a/tests/eval/testdata/valid-api-change/patch.diff b/tests/eval/testdata/golden/valid-api-change/patch.diff similarity index 100% rename from tests/eval/testdata/valid-api-change/patch.diff rename to tests/eval/testdata/golden/valid-api-change/patch.diff diff --git a/tests/eval/testdata/missing-optional-doc/expected.txt b/tests/eval/testdata/integration/new-field-multiple-issues/expected.txt similarity index 100% rename from tests/eval/testdata/missing-optional-doc/expected.txt rename to tests/eval/testdata/integration/new-field-multiple-issues/expected.txt diff --git a/tests/eval/testdata/missing-optional-doc/patch.diff b/tests/eval/testdata/integration/new-field-multiple-issues/patch.diff similarity index 100% rename from tests/eval/testdata/missing-optional-doc/patch.diff rename to tests/eval/testdata/integration/new-field-multiple-issues/patch.diff diff --git a/tests/eval/testdata/undocumented-enum/expected.txt b/tests/eval/testdata/undocumented-enum/expected.txt deleted file mode 100644 index f3b2cf342f1..00000000000 --- a/tests/eval/testdata/undocumented-enum/expected.txt +++ /dev/null @@ -1,2 +0,0 @@ -enum values Light and Dark not documented in comment -optional field theme does not explain behavior when omitted