Skip to content

Commit 4bc19e1

Browse files
committed
Add matrix filtering, retry, resume, cleanup, and HTML reports to harness
- Golem state cleanup uses `golem server clean` + restart between scenarios
- Interactive TTY confirmation when not running on CI
- `--no-cleanup` flag to skip cleanup
- HTML summary report generation
2 parents 2a11e49 + 30f42f8 commit 4bc19e1

38 files changed

+3147
-2086
lines changed

.github/workflows/skills-test.yaml

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,10 @@ name: Skills Test Harness
33
on:
44
push:
55
paths:
6-
- 'skills/**'
7-
- 'tests/harness/**'
6+
- 'golem-skills/**'
87
pull_request:
98
paths:
10-
- 'skills/**'
11-
- 'tests/harness/**'
9+
- 'golem-skills/**'
1210
schedule:
1311
- cron: '0 6 * * 1' # Weekly Monday 6am UTC
1412
workflow_dispatch:
@@ -30,17 +28,17 @@ jobs:
3028
with:
3129
node-version: 20
3230
cache: 'npm'
33-
cache-dependency-path: tests/harness/package.json
31+
cache-dependency-path: golem-skills/tests/harness/package.json
3432

3533
- name: Install and Build
3634
run: |
37-
cd tests/harness
35+
cd golem-skills/tests/harness
3836
npm install
3937
npm run build
4038
4139
- name: Run Unit Tests
4240
run: |
43-
cd tests/harness
41+
cd golem-skills/tests/harness
4442
npm test
4543
4644
integration-test:
@@ -50,7 +48,7 @@ jobs:
5048
strategy:
5149
fail-fast: false
5250
matrix:
53-
agent: [claude-code, gemini, opencode]
51+
agent: [claude-code, gemini, opencode, codex]
5452
language: [ts]
5553
exclude:
5654
- agent: claude-code
@@ -63,7 +61,7 @@ jobs:
6361
with:
6462
node-version: 20
6563
cache: 'npm'
66-
cache-dependency-path: tests/harness/package.json
64+
cache-dependency-path: golem-skills/tests/harness/package.json
6765

6866
- name: Install Rust (with wasm32-wasip2)
6967
uses: dtolnay/rust-toolchain@stable
@@ -97,28 +95,54 @@ jobs:
9795

9896
- name: Install Gemini CLI
9997
if: matrix.agent == 'gemini'
100-
run: npm install -g @google/gemini-cli
98+
run: |
99+
npm install -g @google/gemini-cli
100+
mkdir -p ~/.gemini
101101
102102
- name: Install OpenCode
103103
if: matrix.agent == 'opencode'
104104
run: npm install -g opencode-ai
105105

106+
- name: Install Codex CLI
107+
if: matrix.agent == 'codex'
108+
env:
109+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
110+
run: |
111+
npm install -g @openai/codex
112+
printenv OPENAI_API_KEY | codex login --with-api-key
113+
106114
- name: Install Harness Dependencies
107115
run: |
108-
cd tests/harness
116+
cd golem-skills/tests/harness
109117
npm install
118+
npm run build
119+
npm install -g tsx
120+
121+
- name: Setup isolated test directory
122+
run: |
123+
mkdir -p /tmp/harness-run
124+
cp -r golem-skills/tests/harness/dist /tmp/harness-run/dist
125+
cp -r golem-skills/tests/harness/node_modules /tmp/harness-run/node_modules
126+
cp -r golem-skills/tests/harness/src /tmp/harness-run/src
127+
cp -r golem-skills/tests/harness/scenarios /tmp/harness-run/scenarios
128+
cp golem-skills/tests/harness/package.json /tmp/harness-run/
129+
cp golem-skills/tests/harness/tsconfig.json /tmp/harness-run/
130+
cp -r golem-skills/skills /tmp/harness-run/skills
131+
cd /tmp/harness-run && git init -b main && git config user.email "ci@golem.cloud" && git config user.name "CI" && git add -A && git commit -m "init"
110132
111133
- name: Run Skill Tests
112134
env:
113135
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
114-
GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }}
136+
GEMINI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
137+
GOOGLE_GENERATIVE_AI_API_KEY: ${{ secrets.GEMINI_API_KEY }}
138+
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
115139
run: |
116-
cd tests/harness
117-
npx tsx src/run.ts --agent ${{ matrix.agent }} --language ${{ matrix.language }} --scenarios scenarios
140+
cd /tmp/harness-run
141+
tsx src/run.ts --agent ${{ matrix.agent }} --language ${{ matrix.language }} --scenarios scenarios --skills ./skills
118142
119143
- name: Upload Results
120144
if: always()
121145
uses: actions/upload-artifact@v4
122146
with:
123147
name: results-${{ matrix.agent }}-${{ matrix.language }}
124-
path: tests/harness/results/
148+
path: /tmp/harness-run/results/

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ logs
2121
data
2222
sdks/**/node_modules/
2323
sdks/**/dist/
24+
golem-skills/**/node_modules/
25+
golem-skills/**/dist/
2426
.DS_Store
2527

2628
# locally kept custom dev files

AGENTS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,5 +99,6 @@ All crate dependencies must have their versions specified in the root workspace
9999
- `golem-rib/` - Rib language implementation
100100
- `cli/` - CLI tools (golem-cli, golem)
101101
- `sdks/` - Language-specific SDKs (Rust, TypeScript) - **not part of main build flow, see SDK-specific AGENTS.md**
102+
- `golem-skills/` - Skill definitions and skill testing harness
102103
- `integration-tests/` - Integration test suite
103104
- `test-components/` - Test WASM components

skills/golem-new-project/SKILL.md renamed to golem-skills/skills/golem-new-project/SKILL.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,10 @@ description: "Creating a new Golem application project. Use when scaffolding a n
55

66
# Creating a New Golem Project
77

8-
Use the `golem new` CLI command to scaffold a new Golem application with the desired language template.
8+
**Important: Do not try to build golem from scratch or install it manually.**
9+
10+
Assume the `golem` or `golem-cli` binary is already installed and available on PATH.
11+
Try `golem --version` to check if it exists. If not, try `golem-cli --version`. Every command below works with either `golem` or `golem-cli`.
912

1013
## Step 1: Run `golem new`
1114

@@ -39,6 +42,7 @@ After running `golem new`, verify the following:
3942
## Step 3: Build the Project
4043

4144
```shell
45+
cd <APPLICATION_NAME>
4246
golem build
4347
```
4448
## Checklist
File renamed without changes.
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
name: "golem-new-project-ts"
2+
settings:
3+
timeout_per_subprompt: 300
4+
steps:
5+
- id: "create-project"
6+
prompt: "Create a new Golem application called test-app with TypeScript"
7+
expectedSkills:
8+
- "golem-new-project"
9+
verify:
10+
build: true
11+
12+
- id: "check-project-files"
13+
shell:
14+
command: "ls"
15+
args: ["test-app/golem.yaml"]
16+
expect:
17+
exit_code: 0
18+
19+
# Deploy requires components to be added to the project first;
20+
# golem-new-project only scaffolds the app structure.

tests/harness/src/assertions.ts renamed to golem-skills/tests/harness/src/assertions.ts

Lines changed: 27 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
import { JSONPath } from 'jsonpath-plus';
2-
import { z } from 'zod';
1+
import { JSONPath } from "jsonpath-plus";
2+
import { z } from "zod";
33

44
const ResultJsonAssertionSchema = z.object({
55
path: z.string(),
@@ -35,23 +35,27 @@ export interface AssertionResult {
3535
message: string;
3636
}
3737

38-
export function evaluate(context: AssertionContext, expect: ExpectSpec): AssertionResult[] {
38+
export function evaluate(
39+
context: AssertionContext,
40+
expect: ExpectSpec,
41+
): AssertionResult[] {
3942
const results: AssertionResult[] = [];
4043

4144
if (expect.exit_code !== undefined) {
4245
results.push({
43-
assertion: 'exit_code',
46+
assertion: "exit_code",
4447
passed: context.exitCode === expect.exit_code,
45-
message: context.exitCode === expect.exit_code
46-
? `exit code is ${expect.exit_code}`
47-
: `expected exit code ${expect.exit_code}, got ${context.exitCode}`,
48+
message:
49+
context.exitCode === expect.exit_code
50+
? `exit code is ${expect.exit_code}`
51+
: `expected exit code ${expect.exit_code}, got ${context.exitCode}`,
4852
});
4953
}
5054

5155
if (expect.stdout_contains !== undefined) {
5256
const passed = context.stdout.includes(expect.stdout_contains);
5357
results.push({
54-
assertion: 'stdout_contains',
58+
assertion: "stdout_contains",
5559
passed,
5660
message: passed
5761
? `stdout contains "${expect.stdout_contains}"`
@@ -62,7 +66,7 @@ export function evaluate(context: AssertionContext, expect: ExpectSpec): Asserti
6266
if (expect.stdout_not_contains !== undefined) {
6367
const passed = !context.stdout.includes(expect.stdout_not_contains);
6468
results.push({
65-
assertion: 'stdout_not_contains',
69+
assertion: "stdout_not_contains",
6670
passed,
6771
message: passed
6872
? `stdout does not contain "${expect.stdout_not_contains}"`
@@ -74,7 +78,7 @@ export function evaluate(context: AssertionContext, expect: ExpectSpec): Asserti
7478
const regex = new RegExp(expect.stdout_matches);
7579
const passed = regex.test(context.stdout);
7680
results.push({
77-
assertion: 'stdout_matches',
81+
assertion: "stdout_matches",
7882
passed,
7983
message: passed
8084
? `stdout matches /${expect.stdout_matches}/`
@@ -85,7 +89,7 @@ export function evaluate(context: AssertionContext, expect: ExpectSpec): Asserti
8589
if (expect.status !== undefined) {
8690
const passed = context.status === expect.status;
8791
results.push({
88-
assertion: 'status',
92+
assertion: "status",
8993
passed,
9094
message: passed
9195
? `status is ${expect.status}`
@@ -94,10 +98,10 @@ export function evaluate(context: AssertionContext, expect: ExpectSpec): Asserti
9498
}
9599

96100
if (expect.body_contains !== undefined) {
97-
const body = context.body ?? '';
101+
const body = context.body ?? "";
98102
const passed = body.includes(expect.body_contains);
99103
results.push({
100-
assertion: 'body_contains',
104+
assertion: "body_contains",
101105
passed,
102106
message: passed
103107
? `body contains "${expect.body_contains}"`
@@ -106,11 +110,11 @@ export function evaluate(context: AssertionContext, expect: ExpectSpec): Asserti
106110
}
107111

108112
if (expect.body_matches !== undefined) {
109-
const body = context.body ?? '';
113+
const body = context.body ?? "";
110114
const regex = new RegExp(expect.body_matches);
111115
const passed = regex.test(body);
112116
results.push({
113-
assertion: 'body_matches',
117+
assertion: "body_matches",
114118
passed,
115119
message: passed
116120
? `body matches /${expect.body_matches}/`
@@ -120,10 +124,15 @@ export function evaluate(context: AssertionContext, expect: ExpectSpec): Asserti
120124

121125
if (expect.result_json && expect.result_json.length > 0) {
122126
for (const jsonAssert of expect.result_json) {
123-
const pathResults = JSONPath({ path: jsonAssert.path, json: context.resultJson as object });
127+
const pathResults = JSONPath({
128+
path: jsonAssert.path,
129+
json: context.resultJson as object,
130+
});
124131

125132
if (jsonAssert.equals !== undefined) {
126-
const passed = pathResults.length > 0 && JSON.stringify(pathResults[0]) === JSON.stringify(jsonAssert.equals);
133+
const passed =
134+
pathResults.length > 0 &&
135+
JSON.stringify(pathResults[0]) === JSON.stringify(jsonAssert.equals);
127136
results.push({
128137
assertion: `result_json[${jsonAssert.path}].equals`,
129138
passed,
@@ -134,7 +143,7 @@ export function evaluate(context: AssertionContext, expect: ExpectSpec): Asserti
134143
}
135144

136145
if (jsonAssert.contains !== undefined) {
137-
const value = pathResults.length > 0 ? String(pathResults[0]) : '';
146+
const value = pathResults.length > 0 ? String(pathResults[0]) : "";
138147
const passed = value.includes(jsonAssert.contains);
139148
results.push({
140149
assertion: `result_json[${jsonAssert.path}].contains`,

0 commit comments

Comments
 (0)