Skip to content

Commit dfcb7ba

Browse files
Autonomy tests added grid
Added three testing layers for FlowPilot autonomy: - Unit tests for token tracking, soul prompt builder, and system prompt compilation (unit-tests.test.ts) - Edge-function integration tests for agent-execute and flowpilot-heartbeat (integration-autonomy.test.ts) - Scenario eval suite with seeded DB checks for heartbeat state, atomic checkout, memory isolation, token logging, and CMS schema counts (scenario-eval-suite.test.ts) These tests cover deterministic functions, API endpoints, and end-to-end scenario benchmarks to validate claims and guard against race conditions during autonomous improvements. X-Lovable-Edit-ID: edt-73b0757f-7a16-4a0b-bbe8-67b32b442946 Co-authored-by: magnusfroste <38864257+magnusfroste@users.noreply.github.com>
2 parents 9c7d012 + ab8ff55 commit dfcb7ba

File tree

3 files changed

+692
-0
lines changed

3 files changed

+692
-0
lines changed
Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
/**
2+
* Layer 1: Unit Tests — Autonomy Functions
3+
*
4+
* Tests pure, deterministic functions from the autonomy layer:
5+
* - buildSystemPrompt (prompt compiler)
6+
* - extractTokenUsage / accumulateTokens / isOverBudget
7+
* - buildSoulPrompt
8+
*
9+
* These don't need DB or network — fast and reliable.
10+
*/
11+
import { describe, it, expect } from "vitest";
12+
13+
// We re-implement the pure functions here since they live in Deno edge functions.
14+
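// This mirrors the logic from agent-reason.ts for testability. Keep both copies in sync: these tests exercise the copy below, not the deployed edge-function code.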
15+
16+
// ─── Token Tracking ───────────────────────────────────────────────────────────
17+
18+
/** Token counts for a single AI completion, or a running total across several. */
interface TokenUsage {
  prompt_tokens: number;
  completion_tokens: number;
  total_tokens: number;
}

/**
 * Normalizes a raw usage field to a token count.
 *
 * Anything that is not a finite, positive number (missing, null, NaN, a
 * string, a negative value) counts as 0, so totals never end up as NaN or as
 * a string concatenation.
 */
function toTokenCount(value: unknown): number {
  return typeof value === 'number' && Number.isFinite(value) && value > 0 ? value : 0;
}

/**
 * Extracts token usage from an AI provider response.
 *
 * @param aiData Raw provider response; may be null/undefined or lack `usage`.
 * @returns Prompt and completion counts, with `total_tokens` computed as their
 *          sum (a `total_tokens` field on the payload is not read).
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- provider payload shape is not controlled here
function extractTokenUsage(aiData: any): TokenUsage {
  // `?.` / `??` keep a null or usage-less payload from throwing.
  const usage = aiData?.usage ?? {};
  const promptTokens = toTokenCount(usage.prompt_tokens);
  const completionTokens = toTokenCount(usage.completion_tokens);
  return {
    prompt_tokens: promptTokens,
    completion_tokens: completionTokens,
    total_tokens: promptTokens + completionTokens,
  };
}

/**
 * Adds `incoming` usage onto `current` field by field.
 *
 * @returns A new object; neither argument is mutated.
 */
function accumulateTokens(current: TokenUsage, incoming: TokenUsage): TokenUsage {
  return {
    prompt_tokens: current.prompt_tokens + incoming.prompt_tokens,
    completion_tokens: current.completion_tokens + incoming.completion_tokens,
    total_tokens: current.total_tokens + incoming.total_tokens,
  };
}

/**
 * Reports whether the accumulated usage has reached the budget.
 *
 * The comparison is inclusive: landing exactly on the budget counts as over.
 */
function isOverBudget(usage: TokenUsage, budget: number): boolean {
  return usage.total_tokens >= budget;
}
44+
45+
// ─── Soul Prompt Builder ──────────────────────────────────────────────────────
46+
47+
/**
 * Renders the agent's identity and soul records as a prompt fragment.
 *
 * The IDENTITY section is emitted only when `identity.name` or `identity.role`
 * is truthy; whichever of the two is missing falls back to 'FlowPilot' /
 * 'CMS operator'. Capabilities and boundaries are only rendered inside that
 * section.
 *
 * NOTE: the "SOUL:" header is tied to `soul.purpose`. Values, tone and
 * philosophy are appended whenever present, even without a purpose (and thus
 * without the header).
 *
 * @param soul     Soul record (purpose, values, tone, philosophy); may be null/undefined.
 * @param identity Identity record (name, role, capabilities, boundaries); may be null/undefined.
 * @returns The prompt fragment, or '' when nothing is set.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- records come from loosely typed storage
function buildSoulPrompt(soul: any, identity: any): string {
  // Treat a missing record like an empty one instead of throwing on property access.
  const essence = soul ?? {};
  const who = identity ?? {};

  // Only real, non-empty arrays are joined; a stray string or object is skipped
  // rather than crashing on a missing `.join`.
  const hasItems = (value: unknown): value is unknown[] =>
    Array.isArray(value) && value.length > 0;

  let prompt = '';

  if (who.name || who.role) {
    prompt += `\n\nIDENTITY:\nName: ${who.name || 'FlowPilot'}\nRole: ${who.role || 'CMS operator'}`;
    if (hasItems(who.capabilities)) prompt += `\nCapabilities: ${who.capabilities.join(', ')}`;
    if (hasItems(who.boundaries)) prompt += `\nBoundaries: ${who.boundaries.join('; ')}`;
  }

  if (essence.purpose) prompt += `\n\nSOUL:\nPurpose: ${essence.purpose}`;
  if (hasItems(essence.values)) prompt += `\nValues: ${essence.values.join('; ')}`;
  if (essence.tone) prompt += `\nTone: ${essence.tone}`;
  if (essence.philosophy) prompt += `\nPhilosophy: ${essence.philosophy}`;

  return prompt;
}
60+
61+
// ─── Prompt Compiler (simplified) ─────────────────────────────────────────────
62+
63+
/** Which runtime path the compiled system prompt is for. */
type PromptMode = 'operate' | 'heartbeat' | 'chat';

/** Everything buildSystemPrompt can draw on; optional sections are skipped when empty. */
interface PromptCompilerInput {
  mode: PromptMode;
  soulPrompt: string;
  memoryContext: string;
  objectiveContext: string;
  activityContext?: string;
  statsContext?: string;
  automationContext?: string;
  healingReport?: string;
  cmsSchemaContext?: string;
  heartbeatState?: string;
  tokenBudget?: number;
  maxIterations?: number;
  chatSystemPrompt?: string;
}

/**
 * Compiles the system prompt for the given mode (simplified test mirror).
 *
 * - `chat` with a non-empty `chatSystemPrompt` returns that prompt verbatim.
 * - `heartbeat` emits the heartbeat intro, soul, optional CMS schema, core
 *   instructions, then a CONTEXT section (memory, objectives, automation,
 *   activity, stats, healing report, heartbeat state, token budget) followed
 *   by the heartbeat protocol and the iteration cap.
 * - Every other case (including `chat` without an override) emits the
 *   operate prompt: intro, soul, optional CMS schema, core instructions,
 *   memory, and the OBJECTIVES section. Heartbeat-only inputs are ignored.
 *
 * Empty or missing sections are dropped; the rest are joined with newlines.
 *
 * @param input Prompt sections and limits.
 * @returns The assembled system prompt.
 */
function buildSystemPrompt(input: PromptCompilerInput): string {
  const { mode, soulPrompt, memoryContext, objectiveContext } = input;

  if (mode === 'chat' && input.chatSystemPrompt) {
    return input.chatSystemPrompt;
  }

  const isHeartbeat = mode === 'heartbeat';

  // Optional sections go in as-is; the final filter(Boolean) drops the
  // undefined/empty ones, so no per-field `if` is needed.
  const parts: Array<string | undefined> = [
    isHeartbeat
      ? `You are FlowPilot running in AUTONOMOUS HEARTBEAT mode. No human is watching.`
      : `You are FlowPilot — an autonomous, self-improving AI agent that operates a CMS platform.`,
    soulPrompt,
    input.cmsSchemaContext,
    // Core instructions (abbreviated for test)
    'CORE_INSTRUCTIONS',
  ];

  if (isHeartbeat) {
    parts.push(
      `\nCONTEXT:`,
      memoryContext,
      objectiveContext,
      input.automationContext,
      input.activityContext,
      input.statsContext,
      input.healingReport,
      input.heartbeatState,
    );
    // A budget of 0 (or none) emits no budget line.
    if (input.tokenBudget) {
      parts.push(`\nTOKEN BUDGET: ${input.tokenBudget} tokens max. Be efficient — stop early if approaching the limit.`);
    }
    parts.push('HEARTBEAT_PROTOCOL');
    // `||` is deliberate: a maxIterations of 0 falls back to the default of 8.
    parts.push(`\n- Max ${input.maxIterations || 8} tool iterations per heartbeat`);
  } else {
    parts.push(
      memoryContext,
      `\nOBJECTIVES:\n- After executing skills that contribute to an objective, update progress.\n- When all success_criteria are met, mark as complete.`,
      objectiveContext,
    );
  }

  return parts.filter(Boolean).join('\n');
}
128+
129+
// ═══════════════════════════════════════════════════════════════════════════════
130+
// TESTS
131+
// ═══════════════════════════════════════════════════════════════════════════════
132+
133+
// Token accounting helpers: extraction from provider payloads, accumulation
// across iterations, and the inclusive budget check.
describe("Token Tracking", () => {
  it("extracts token usage from AI response", () => {
    const aiData = { usage: { prompt_tokens: 100, completion_tokens: 50 } };
    const result = extractTokenUsage(aiData);
    expect(result).toEqual({ prompt_tokens: 100, completion_tokens: 50, total_tokens: 150 });
  });

  it("handles missing usage gracefully", () => {
    expect(extractTokenUsage({})).toEqual({ prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 });
    expect(extractTokenUsage({ usage: {} })).toEqual({ prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 });
  });

  it("accumulates tokens correctly across iterations", () => {
    const a: TokenUsage = { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150 };
    const b: TokenUsage = { prompt_tokens: 200, completion_tokens: 80, total_tokens: 280 };
    const result = accumulateTokens(a, b);
    expect(result).toEqual({ prompt_tokens: 300, completion_tokens: 130, total_tokens: 430 });
  });

  it("accumulates from zero", () => {
    const zero: TokenUsage = { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
    const incoming: TokenUsage = { prompt_tokens: 500, completion_tokens: 100, total_tokens: 600 };
    expect(accumulateTokens(zero, incoming)).toEqual(incoming);
  });

  it("detects budget exceeded", () => {
    // Reaching the budget exactly already counts as exceeded (>=).
    expect(isOverBudget({ prompt_tokens: 40000, completion_tokens: 10000, total_tokens: 50000 }, 50000)).toBe(true);
    expect(isOverBudget({ prompt_tokens: 40000, completion_tokens: 10001, total_tokens: 50001 }, 50000)).toBe(true);
  });

  it("allows within budget", () => {
    expect(isOverBudget({ prompt_tokens: 30000, completion_tokens: 10000, total_tokens: 40000 }, 50000)).toBe(false);
    expect(isOverBudget({ prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 }, 50000)).toBe(false);
  });
});

// Identity/soul rendering, including which sections are omitted when data is absent.
describe("Soul Prompt Builder", () => {
  it("builds prompt with full soul and identity", () => {
    const soul = { purpose: "Help grow the business", values: ["Honesty", "Growth"], tone: "professional", philosophy: "Always add value" };
    const identity = { name: "Aria", role: "Digital consultant", capabilities: ["SEO", "Content"], boundaries: ["No spam"] };
    const prompt = buildSoulPrompt(soul, identity);

    expect(prompt).toContain("Name: Aria");
    expect(prompt).toContain("Role: Digital consultant");
    expect(prompt).toContain("Capabilities: SEO, Content");
    expect(prompt).toContain("Boundaries: No spam");
    expect(prompt).toContain("Purpose: Help grow the business");
    expect(prompt).toContain("Values: Honesty; Growth");
    expect(prompt).toContain("Tone: professional");
    expect(prompt).toContain("Philosophy: Always add value");
  });

  it("returns an empty prompt when identity fields are null and soul is empty", () => {
    const prompt = buildSoulPrompt({}, { name: null, role: null });
    // No identity or soul sections when empty
    expect(prompt).toBe('');
  });

  it("uses defaults for a missing identity field when the other is set", () => {
    // The IDENTITY section appears as soon as either field is set; the
    // missing one falls back to its default.
    expect(buildSoulPrompt({}, { name: "Aria" })).toContain("Role: CMS operator");
    expect(buildSoulPrompt({}, { role: "Editor" })).toContain("Name: FlowPilot");
  });

  it("handles partial soul", () => {
    const prompt = buildSoulPrompt({ purpose: "Grow traffic" }, {});
    expect(prompt).toContain("Purpose: Grow traffic");
    expect(prompt).not.toContain("IDENTITY");
  });
});

// Mode-specific assembly of the system prompt.
describe("Prompt Compiler (buildSystemPrompt)", () => {
  const baseInput: PromptCompilerInput = {
    mode: 'operate',
    soulPrompt: 'SOUL: Test',
    memoryContext: 'MEMORY: user likes blue',
    objectiveContext: 'OBJ: Grow traffic by 20%',
  };

  it("operate mode includes objectives section", () => {
    const prompt = buildSystemPrompt(baseInput);
    expect(prompt).toContain("autonomous, self-improving AI agent");
    expect(prompt).toContain("SOUL: Test");
    expect(prompt).toContain("MEMORY: user likes blue");
    expect(prompt).toContain("OBJECTIVES:");
    expect(prompt).toContain("OBJ: Grow traffic by 20%");
    expect(prompt).not.toContain("HEARTBEAT");
  });

  it("heartbeat mode includes protocol and context", () => {
    const prompt = buildSystemPrompt({
      ...baseInput,
      mode: 'heartbeat',
      activityContext: 'Recent: blog post created',
      statsContext: 'Views: 500',
      automationContext: 'Cron: daily-report',
      maxIterations: 5,
    });
    expect(prompt).toContain("AUTONOMOUS HEARTBEAT mode");
    expect(prompt).toContain("CONTEXT:");
    expect(prompt).toContain("Recent: blog post created");
    expect(prompt).toContain("Views: 500");
    expect(prompt).toContain("Cron: daily-report");
    expect(prompt).toContain("HEARTBEAT_PROTOCOL");
    expect(prompt).toContain("Max 5 tool iterations");
  });

  it("injects CMS schema when provided", () => {
    const prompt = buildSystemPrompt({
      ...baseInput,
      cmsSchemaContext: 'CMS: 10 pages, 5 products, Stripe active',
    });
    expect(prompt).toContain("CMS: 10 pages, 5 products, Stripe active");
  });

  it("injects heartbeat state in heartbeat mode", () => {
    const prompt = buildSystemPrompt({
      ...baseInput,
      mode: 'heartbeat',
      heartbeatState: 'Last run: 2026-03-14, 3 objectives advanced',
    });
    expect(prompt).toContain("Last run: 2026-03-14, 3 objectives advanced");
  });

  it("includes token budget warning in heartbeat mode", () => {
    const prompt = buildSystemPrompt({
      ...baseInput,
      mode: 'heartbeat',
      tokenBudget: 50000,
    });
    expect(prompt).toContain("TOKEN BUDGET: 50000 tokens max");
  });

  it("chat mode uses chatSystemPrompt override", () => {
    const prompt = buildSystemPrompt({
      ...baseInput,
      mode: 'chat',
      chatSystemPrompt: 'You are a helpful assistant.',
    });
    expect(prompt).toBe('You are a helpful assistant.');
  });

  it("does not leak heartbeat state into operate mode", () => {
    const prompt = buildSystemPrompt({
      ...baseInput,
      mode: 'operate',
      heartbeatState: 'SHOULD NOT APPEAR',
    });
    expect(prompt).not.toContain("SHOULD NOT APPEAR");
  });
});
Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/**
2+
* Layer 2: Edge Function Integration Tests — Autonomy
3+
*
4+
* Tests real edge function endpoints with controlled inputs.
5+
* Validates: response shapes, error handling, checkout logic, goal-aware execution.
6+
*
7+
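* Run with: supabase--test_edge_functions, or `deno test --allow-net --allow-env --allow-read` (network for fetch, env for the Supabase settings, read for the .env file).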
8+
*/
9+
import "https://deno.land/std@0.224.0/dotenv/load.ts";
10+
import { assertEquals, assertExists } from "https://deno.land/std@0.224.0/assert/mod.ts";
11+
12+
/**
 * Returns the first non-empty environment variable among `names`.
 *
 * Fails fast with a message naming every accepted variable, instead of letting
 * a missing value surface later as a request to "undefined/functions/v1/..."
 * or an "Authorization: Bearer undefined" header.
 *
 * @throws Error when none of the variables is set to a non-empty value.
 */
function requireEnv(...names: string[]): string {
  for (const name of names) {
    const value = Deno.env.get(name);
    if (value) return value;
  }
  throw new Error(`Missing required environment variable: set one of ${names.join(", ")}`);
}

// The VITE_-prefixed name is tried first, then the plain Supabase name.
const SUPABASE_URL = requireEnv("VITE_SUPABASE_URL", "SUPABASE_URL");
const SUPABASE_ANON_KEY = requireEnv("VITE_SUPABASE_PUBLISHABLE_KEY", "SUPABASE_ANON_KEY");

// Shared request headers: JSON bodies, authenticated with the anon/publishable key.
const headers = {
  "Content-Type": "application/json",
  "Authorization": `Bearer ${SUPABASE_ANON_KEY}`,
};
19+
20+
// ─── Helper ───────────────────────────────────────────────────────────────────
21+
22+
/**
 * POSTs a JSON body to the named edge function and returns status plus payload.
 *
 * The response body is read as text first so that non-JSON replies (plain-text
 * or HTML error pages) are still returned, wrapped as `{ raw: <text> }`,
 * instead of making the helper throw.
 *
 * @param name Edge function name, appended to `/functions/v1/`.
 * @param body Request payload; serialized with JSON.stringify.
 * @returns The HTTP status and the parsed JSON body (or `{ raw }` fallback).
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- test helper: payloads are inspected ad hoc by each test
async function callEdgeFunction(name: string, body: any): Promise<{ status: number; data: any }> {
  const response = await fetch(`${SUPABASE_URL}/functions/v1/${name}`, {
    method: "POST",
    headers,
    body: JSON.stringify(body),
  });

  // Reading the text also fully consumes the body.
  const text = await response.text();

  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- see above
  let data: any;
  try {
    data = JSON.parse(text);
  } catch {
    // Deliberate fallback: keep the raw text so assertions can still inspect it.
    data = { raw: text };
  }

  return { status: response.status, data };
}
33+
34+
// ═══════════════════════════════════════════════════════════════════════════════
35+
// TESTS
36+
// ═══════════════════════════════════════════════════════════════════════════════
37+
38+
// Request validation: a call without skill_id/skill_name must be rejected with 400.
Deno.test("agent-execute: rejects missing skill_name/skill_id", async () => {
  const { status, data } = await callEdgeFunction("agent-execute", {
    arguments: {},
    agent_type: "flowpilot",
  });
  assertEquals(status, 400);
  assertEquals(data.error, "skill_id or skill_name required");
});

// Unknown skills must produce a 404 with an error payload.
Deno.test("agent-execute: returns 404 for nonexistent skill", async () => {
  const { status, data } = await callEdgeFunction("agent-execute", {
    skill_name: "nonexistent_skill_xyz_99",
    arguments: {},
    agent_type: "flowpilot",
  });
  assertEquals(status, 404);
  assertExists(data.error);
});

Deno.test("agent-execute: accepts objective_context without error", async () => {
  // This tests that the goal-aware execution path doesn't crash.
  // Even if the skill doesn't exist, it should 404 not 500.
  const { status } = await callEdgeFunction("agent-execute", {
    skill_name: "nonexistent_skill_xyz_99",
    arguments: { test: true },
    agent_type: "flowpilot",
    objective_context: {
      goal: "Increase traffic by 20%",
      step: "Step 1: Audit current SEO",
      why: "Testing goal-aware execution",
    },
  });
  // Should be 404 (skill not found), NOT 500 (crash)
  assertEquals(status, 404);
});

Deno.test("agent-execute: blocks external scope from chat", async () => {
  // This tests scope validation. We need a skill that's internal-only.
  // Since we can't guarantee a specific skill exists, we just verify
  // the endpoint responds correctly for valid input shapes.
  const { status } = await callEdgeFunction("agent-execute", {
    skill_name: "manage_site_settings",
    arguments: { action: "get", section: "general" },
    agent_type: "chat", // Chat trying to access internal skill
  });
  // Either 403 (blocked) or 404 (skill not found) are acceptable
  const isAcceptable = status === 403 || status === 404;
  assertEquals(isAcceptable, true, `Expected 403 or 404, got ${status}`);
});

Deno.test("flowpilot-heartbeat: OPTIONS preflight returns CORS headers", async () => {
  // A full heartbeat might fail due to no AI key in the test env, so only
  // the CORS preflight is checked here.
  const response = await fetch(`${SUPABASE_URL}/functions/v1/flowpilot-heartbeat`, {
    method: "OPTIONS",
    headers,
  });
  // Consume the body so the test does not end with an unread response body.
  await response.text();
  // OPTIONS should return 200 with CORS headers
  assertEquals(response.status, 200);
  assertExists(response.headers.get("access-control-allow-origin"));
});

Deno.test("heartbeat: POST returns structured response or AI error", async () => {
  const { status, data } = await callEdgeFunction("flowpilot-heartbeat", {});
  // Either success (200 with status/actions/token_usage) or expected error (500 with message)
  if (status === 200) {
    assertExists(data.status);
    assertExists(data.actions);
    assertExists(data.token_usage);
    assertEquals(typeof data.duration_ms, "number");
  } else {
    // AI provider not configured is expected in test environments
    assertExists(data.error);
  }
});

0 commit comments

Comments
 (0)