Skip to content

Commit 30f42f8

Browse files
authored
Add template variables, conditional steps, dry-run, abort handling, and GitHub summary to harness (#2960)
- Template variable substitution ({{agent}}, {{language}}, {{workspace}}, {{scenario}}) in step fields
- Conditional step execution with only_if/skip_if on agent, language, and os
- --dry-run flag to validate scenarios and print step summaries without executing
- Graceful Ctrl+C handling with partial result writing via AbortController
- GitHub Actions job summary markdown output via GITHUB_STEP_SUMMARY
- Remove issue number references from comments
1 parent f632aae commit 30f42f8

File tree

8 files changed

+969
-5
lines changed

8 files changed

+969
-5
lines changed

golem-skills/tests/harness/src/executor.ts

Lines changed: 147 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@ import { SkillWatcher } from "./watcher.js";
88
import { evaluate, ExpectSchema, type AssertionContext } from "./assertions.js";
99

1010
export const DEFAULT_STEP_TIMEOUT_SECONDS = 300;
11-
1211
// --- Schemas ---
1312

1413
const InvokeSchema = z.object({
@@ -39,6 +38,12 @@ const DeleteAgentSchema = z.object({
3938
name: z.string(),
4039
});
4140

41+
// Zod schema for the condition objects accepted by a step's `only_if` and
// `skip_if` fields. Every field is an optional string, so an empty object is a
// valid (always-passing / never-skipping) condition.
const StepConditionSchema = z.object({
42+
agent: z.string().optional(),
43+
language: z.string().optional(),
44+
os: z.string().optional(),
45+
});
46+
4247
const ACTION_FIELDS = [
4348
"prompt",
4449
"invoke",
@@ -71,6 +76,8 @@ const StepSpecSchema = z
7176
trigger: TriggerSchema.optional(),
7277
create_agent: CreateAgentSchema.optional(),
7378
delete_agent: DeleteAgentSchema.optional(),
79+
only_if: StepConditionSchema.optional(),
80+
skip_if: StepConditionSchema.optional(),
7481
})
7582
.refine(
7683
(step) => {
@@ -94,7 +101,6 @@ const SettingsSchema = z
94101
cleanup: z.boolean().optional(),
95102
})
96103
.optional();
97-
98104
const PrerequisitesSchema = z
99105
.object({
100106
env: z.record(z.string()).optional(),
@@ -120,6 +126,8 @@ interface StepCommon {
120126
deploy?: boolean;
121127
};
122128
expect?: z.infer<typeof ExpectSchema>;
129+
only_if?: StepCondition;
130+
skip_if?: StepCondition;
123131
}
124132

125133
type InvokeSpec = { agent: string; function: string; args?: string };
@@ -248,6 +256,57 @@ export interface ScenarioRunResult {
248256

249257
/**
 * Optional settings passed to the `ScenarioExecutor` constructor.
 * All fields may be omitted; the executor falls back to `{}`.
 */
export interface ScenarioExecutorOptions {
250258
globalTimeoutSeconds?: number; // NOTE(review): presumably the per-step timeout override in seconds — confirm against the timeout handling in execute()
259+
agent?: string; // exposed to steps as the `{{agent}}` template variable and matched by `only_if` / `skip_if`
260+
language?: string; // exposed to steps as the `{{language}}` template variable and matched by `only_if` / `skip_if`
261+
abortSignal?: AbortSignal; // checked before each step; once aborted, no further steps are started
262+
}
263+
264+
// --- Template variable substitution ---
265+
266+
/**
 * Expands `{{name}}` placeholders in `text` using `variables`.
 *
 * A placeholder is `{{` + one or more word characters + `}}`. Placeholders
 * whose name is not an own key of `variables` are left untouched, so unknown
 * variables stay visible in the output instead of silently disappearing.
 *
 * @param text - The template string to expand.
 * @param variables - Map of variable name to replacement value.
 * @returns `text` with every known placeholder replaced.
 */
export function substituteVariables(
  text: string,
  variables: Record<string, string>,
): string {
  return text.replace(/\{\{(\w+)\}\}/g, (placeholder, name: string) => {
    // Only consult own properties: a plain-object lookup would otherwise
    // resolve names such as `constructor`, `toString` or `__proto__` through
    // Object.prototype and splice a stringified function/object into the text.
    const value = Object.prototype.hasOwnProperty.call(variables, name)
      ? variables[name]
      : undefined;
    return value ?? placeholder;
  });
}
274+
275+
// --- Conditional step execution ---
276+
277+
/**
 * Condition attached to a step through `only_if` or `skip_if`.
 * Fields that are omitted (or empty) are not checked.
 */
export interface StepCondition {
278+
agent?: string; // compared with the agent the executor was configured with
279+
language?: string; // compared with the language the executor was configured with
280+
os?: string; // compared with the current platform name after normalization (`darwin` -> `macos`, `win32` -> `windows`)
281+
}
282+
283+
/**
 * Maps an OS identifier to the canonical lowercase name used when comparing
 * step conditions: `darwin` becomes `macos`, `win32` becomes `windows`, and
 * anything else is returned lowercased and trimmed.
 */
function normalizePlatform(platform: string): string {
  const name = platform.trim().toLowerCase();
  if (name === "darwin") return "macos";
  if (name === "win32") return "windows";
  return name;
}

/**
 * Decides whether a step should run in the given context.
 *
 * - `only_if`: every field that is set must match the context, otherwise the
 *   step is skipped.
 * - `skip_if`: the step is skipped as soon as any field that is set matches
 *   the context.
 *
 * The OS is compared after normalizing BOTH sides, so a scenario may write
 * either the friendly name (`macos`, `windows`) or the Node.js platform id
 * (`darwin`, `win32`), in any letter case. Previously only the runtime value
 * was normalized, so a condition such as `os: darwin` could never match.
 *
 * @param step - The step whose `only_if` / `skip_if` conditions are evaluated.
 * @param context - Current agent, language and OS (e.g. `process.platform`).
 * @returns `true` when the step should be executed.
 */
export function shouldRunStep(
  step: StepSpec,
  context: { agent?: string; language?: string; os: string },
): boolean {
  const currentOs = normalizePlatform(context.os);
  const osMatches = (wanted: string): boolean =>
    normalizePlatform(wanted) === currentOs;

  const onlyIf = step.only_if;
  if (onlyIf) {
    if (onlyIf.agent && onlyIf.agent !== context.agent) return false;
    if (onlyIf.language && onlyIf.language !== context.language) return false;
    if (onlyIf.os && !osMatches(onlyIf.os)) return false;
  }

  const skipIf = step.skip_if;
  if (skipIf) {
    if (skipIf.agent && skipIf.agent === context.agent) return false;
    if (skipIf.language && skipIf.language === context.language) return false;
    if (skipIf.os && osMatches(skipIf.os)) return false;
  }

  return true;
}
}
252311

253312
export class ScenarioExecutor {
@@ -271,6 +330,64 @@ export class ScenarioExecutor {
271330
this.options = options ?? {};
272331
}
273332

333+
/**
 * Assembles the template variables available to a scenario's steps.
 * `workspace` and `scenario` are always present; `agent` and `language` are
 * included only when the corresponding executor option is set (non-empty).
 */
private buildVariables(scenarioName: string): Record<string, string> {
  const { agent, language } = this.options;
  return {
    workspace: this.workspace,
    scenario: scenarioName,
    ...(agent ? { agent } : {}),
    ...(language ? { language } : {}),
  };
}
342+
343+
/**
 * Returns a copy of `step` with `{{name}}` placeholders expanded in its
 * string fields: `prompt`, `shell` (command, args, cwd), `invoke` and
 * `trigger` (agent, function, args), and the `name` of `create_agent` /
 * `delete_agent`. The input step is not mutated.
 *
 * Every nested object is spread before its substituted fields are applied, so
 * fields this method does not know about are carried over unchanged. The
 * earlier version rebuilt `shell`, `invoke` and `trigger` from a fixed field
 * list and dropped anything else, unlike `create_agent` / `delete_agent`.
 *
 * @param step - The step as loaded from the scenario file.
 * @param variables - Template variables to substitute.
 * @returns A new step with placeholders expanded.
 */
private substituteStepVariables(
  step: StepSpec,
  variables: Record<string, string>,
): StepSpec {
  // Empty or missing strings are passed through unchanged.
  const sub = (s: string | undefined) =>
    s ? substituteVariables(s, variables) : s;
  const subArr = (arr: string[] | undefined) =>
    arr?.map((s) => substituteVariables(s, variables));

  // `invoke` and `trigger` share the same agent/function/args shape.
  const subCall = <T extends { agent: string; function: string; args?: string }>(
    call: T | undefined,
  ) =>
    call
      ? {
          ...call,
          agent: substituteVariables(call.agent, variables),
          function: substituteVariables(call.function, variables),
          args: sub(call.args),
        }
      : call;

  // Agent lifecycle actions only carry a templated `name`.
  const subNamed = <T extends { name: string }>(target: T | undefined) =>
    target
      ? { ...target, name: substituteVariables(target.name, variables) }
      : target;

  return {
    ...step,
    prompt: sub(step.prompt),
    shell: step.shell
      ? {
          ...step.shell,
          command: substituteVariables(step.shell.command, variables),
          args: subArr(step.shell.args),
          cwd: sub(step.shell.cwd),
        }
      : step.shell,
    invoke: subCall(step.invoke),
    trigger: subCall(step.trigger),
    create_agent: subNamed(step.create_agent),
    delete_agent: subNamed(step.delete_agent),
  } as StepSpec;
}
390+
274391
async execute(spec: ScenarioSpec): Promise<ScenarioRunResult> {
275392
const results: StepResult[] = [];
276393
const savedEnv: Record<string, string | undefined> = {};
@@ -304,11 +421,37 @@ export class ScenarioExecutor {
304421

305422
// Build extra env for commands from settings
306423
const commandEnv = this.buildCommandEnv(spec);
424+
const variables = this.buildVariables(spec.name);
425+
const conditionContext = {
426+
agent: this.options.agent,
427+
language: this.options.language,
428+
os: process.platform,
429+
};
307430

308431
const startTime = Date.now();
309432
let isFirstPrompt = true;
310433
try {
311-
for (const step of spec.steps) {
434+
for (const originalStep of spec.steps) {
435+
// Check abort signal
436+
if (this.options.abortSignal?.aborted) break;
437+
438+
// Substitute template variables
439+
const step = this.substituteStepVariables(originalStep, variables);
440+
441+
// Conditional execution
442+
if (!shouldRunStep(step, conditionContext)) {
443+
console.log(
444+
`Step ${step.id ?? "(unnamed)"}: skipped (condition not met)`,
445+
);
446+
results.push({
447+
step: originalStep,
448+
success: true,
449+
durationSeconds: 0,
450+
expectedSkills: step.expectedSkills ?? [],
451+
activatedSkills: [],
452+
});
453+
continue;
454+
}
312455
const stepStartTime = Date.now();
313456
let stepSuccess = true;
314457
const stepErrors: string[] = [];
@@ -601,7 +744,7 @@ export class ScenarioExecutor {
601744
}
602745

603746
results.push({
604-
step,
747+
step: originalStep,
605748
success: stepSuccess,
606749
durationSeconds: (Date.now() - stepStartTime) / 1000,
607750
expectedSkills: step.expectedSkills ?? [],

golem-skills/tests/harness/src/run.ts

Lines changed: 108 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ async function main() {
6060
timeout: { type: "string" },
6161
skills: { type: "string", default: "../../skills" },
6262
help: { type: "boolean", short: "h", default: false },
63+
"dry-run": { type: "boolean", default: false },
6364
},
6465
});
6566

@@ -70,6 +71,7 @@ async function main() {
7071
timeout,
7172
skills: skillsDirRel,
7273
help,
74+
"dry-run": dryRun,
7375
} = values;
7476
const agentArg = values.agent ?? "all";
7577
const languageArg = values.language ?? "all";
@@ -89,6 +91,7 @@ Options:
8991
--output <dir> Results output directory (default: ./results)
9092
--timeout <seconds> Global timeout per scenario step in seconds (default: ${DEFAULT_STEP_TIMEOUT_SECONDS})
9193
--skills <dir> Path to skills directory (default: ../../skills)
94+
--dry-run Validate scenarios and print step summaries without executing
9295
-h, --help Show this help message
9396
`.trim();
9497

@@ -145,6 +148,64 @@ Options:
145148
(f) => f.endsWith(".yaml") || f.endsWith(".yml"),
146149
);
147150

151+
// Dry-run mode: validate and print step summaries, then exit
152+
if (dryRun) {
153+
console.log(chalk.bold("=== Dry Run ==="));
154+
for (const file of scenarioFiles) {
155+
const spec = await ScenarioLoader.load(path.join(scenariosDir, file));
156+
if (scenarioFilter && spec.name !== scenarioFilter) continue;
157+
158+
console.log(chalk.blue(`\nScenario: ${spec.name}`));
159+
console.log(` Steps: ${spec.steps.length}`);
160+
for (let i = 0; i < spec.steps.length; i++) {
161+
const step = spec.steps[i];
162+
const label = step.id ?? `step-${i + 1}`;
163+
const promptPreview = step.prompt
164+
? step.prompt.length > 60
165+
? step.prompt.slice(0, 57) + "..."
166+
: step.prompt
167+
: "(no prompt)";
168+
const skills = step.expectedSkills?.join(", ") || "(none)";
169+
const timeoutVal =
170+
step.timeout ?? spec.settings?.timeout_per_subprompt ?? "default";
171+
const conditions: string[] = [];
172+
if (step.only_if) {
173+
conditions.push(`only_if: ${JSON.stringify(step.only_if)}`);
174+
}
175+
if (step.skip_if) {
176+
conditions.push(`skip_if: ${JSON.stringify(step.skip_if)}`);
177+
}
178+
console.log(` [${label}] ${promptPreview}`);
179+
console.log(
180+
` skills: ${skills} | timeout: ${typeof timeoutVal === "number" ? `${timeoutVal}s` : timeoutVal}`,
181+
);
182+
if (conditions.length > 0) {
183+
console.log(` conditions: ${conditions.join(", ")}`);
184+
}
185+
}
186+
}
187+
console.log(chalk.green("\nAll scenarios validated successfully."));
188+
return;
189+
}
190+
191+
// Set up graceful Ctrl+C handling
192+
const abortController = new AbortController();
193+
let interrupted = false;
194+
195+
process.on("SIGINT", () => {
196+
if (interrupted) {
197+
console.log(chalk.red("\nForce exit."));
198+
process.exit(130);
199+
}
200+
interrupted = true;
201+
console.log(
202+
chalk.yellow(
203+
"\nInterrupted. Finishing current step and writing partial results... (press Ctrl+C again to force exit)",
204+
),
205+
);
206+
abortController.abort();
207+
});
208+
148209
const scenarioReports: ScenarioReport[] = [];
149210
let hasFailures = false;
150211

@@ -159,6 +220,14 @@ Options:
159220
);
160221

161222
for (const file of scenarioFiles) {
223+
// Check if interrupted before starting next scenario
224+
if (interrupted) {
225+
console.log(
226+
chalk.yellow(`Skipping remaining scenarios due to interruption.`),
227+
);
228+
break;
229+
}
230+
162231
const spec = await ScenarioLoader.load(path.join(scenariosDir, file));
163232

164233
if (scenarioFilter && spec.name !== scenarioFilter) continue;
@@ -178,7 +247,12 @@ Options:
178247
watcher,
179248
workspace,
180249
skillsDir,
181-
{ globalTimeoutSeconds },
250+
{
251+
globalTimeoutSeconds,
252+
agent: currentAgent,
253+
language: currentLanguage,
254+
abortSignal: abortController.signal,
255+
},
182256
);
183257

184258
const scenarioResult = await executor.execute(spec);
@@ -265,6 +339,39 @@ Options:
265339
const summaryPath = path.join(resultsDir, "summary.json");
266340
await fs.writeFile(summaryPath, JSON.stringify(summary, null, 2));
267341

342+
// GitHub Actions job summary
343+
const ghSummaryPath = process.env["GITHUB_STEP_SUMMARY"];
344+
if (ghSummaryPath) {
345+
const lines: string[] = [];
346+
lines.push("## Skill Test Results");
347+
lines.push("");
348+
lines.push("| Scenario | Agent | Language | Status | Duration |");
349+
lines.push("|----------|-------|----------|--------|----------|");
350+
for (const r of scenarioReports) {
351+
const icon = r.status === "pass" ? "\u2705" : "\u274c";
352+
lines.push(
353+
`| ${r.scenario} | ${r.matrix.agent} | ${r.matrix.language} | ${icon} ${r.status} | ${r.durationSeconds.toFixed(1)}s |`,
354+
);
355+
}
356+
lines.push("");
357+
lines.push(
358+
`**Total:** ${totalScenarios} | **Passed:** ${passed} | **Failed:** ${failed} | **Duration:** ${totalDuration.toFixed(1)}s`,
359+
);
360+
361+
if (worstFailures.length > 0) {
362+
lines.push("");
363+
lines.push("### Failures");
364+
for (const f of worstFailures) {
365+
const truncatedError =
366+
f.error.length > 200 ? f.error.slice(0, 197) + "..." : f.error;
367+
lines.push(`- **${f.scenario}**: ${truncatedError}`);
368+
}
369+
}
370+
lines.push("");
371+
372+
await fs.appendFile(ghSummaryPath, lines.join("\n"));
373+
}
374+
268375
// Print summary
269376
console.log("");
270377
console.log(chalk.bold("=== Test Summary ==="));

0 commit comments

Comments
 (0)