Skip to content

Commit 8d7b481

Browse files
authored
Merge pull request #32 from tikalk/add_arch_evals
add eval for arch, clarify and trace
2 parents 4dac832 + fc8ab4e commit 8d7b481

15 files changed

+1770
-91
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ docs/dev
5959
.specify/extensions/*/local-config.yml
6060

6161
# Evaluation artifacts
62+
eval-results/
6263
eval-results*.json
6364
*.backup
6465
.promptfoo/
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
// PromptFoo configuration for Architecture Template tests
2+
module.exports = {
3+
description: 'Architecture Template Quality Evaluation',
4+
5+
// Rate limiting to avoid 429 errors
6+
maxConcurrency: 1,
7+
delay: 2000, // 2 second delay between tests
8+
9+
// Architecture prompt
10+
prompts: ['file://../prompts/arch-prompt.txt'],
11+
12+
// Configure LLM provider using OpenAI-compatible endpoint
13+
providers: [
14+
{
15+
id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
16+
label: `${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`,
17+
config: {
18+
apiBaseUrl: process.env.LLM_BASE_URL,
19+
apiKey: process.env.LLM_AUTH_TOKEN,
20+
temperature: 0.7,
21+
max_tokens: 6000,
22+
},
23+
env: {
24+
OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,
25+
OPENAI_BASE_URL: process.env.LLM_BASE_URL,
26+
},
27+
},
28+
],
29+
30+
defaultTest: {
31+
options: {
32+
provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
33+
},
34+
},
35+
36+
tests: [
37+
// Test 1: Architecture Init Quality - Structure Validation
38+
{
39+
description: 'Architecture: Init produces valid Rozanski & Woods structure',
40+
vars: {
41+
user_input:
42+
'Create an architecture description for an e-commerce platform with a web frontend, REST API backend, PostgreSQL database, and Redis cache. The system handles user authentication, product catalog, shopping cart, and order processing.',
43+
},
44+
assert: [
45+
{ type: 'icontains', value: 'context view' },
46+
{ type: 'icontains', value: 'functional view' },
47+
{ type: 'icontains', value: 'deployment view' },
48+
{ type: 'icontains', value: 'stakeholder' },
49+
{
50+
type: 'python',
51+
value: 'file://../graders/custom_graders.py:check_arch_structure',
52+
},
53+
],
54+
},
55+
56+
// Test 2: Blackbox Context View
57+
{
58+
description: 'Architecture: Context View enforces blackbox system representation',
59+
vars: {
60+
user_input:
61+
'Create an architecture description for a SaaS project management tool that integrates with GitHub, Slack, and Google Calendar. Users access it via web browser. An admin manages team settings.',
62+
},
63+
assert: [
64+
{
65+
type: 'python',
66+
value: 'file://../graders/custom_graders.py:check_blackbox_context_view',
67+
},
68+
{
69+
type: 'llm-rubric',
70+
value:
71+
'Check if the Context View section treats the system as a single blackbox.\n' +
72+
'The Context View should:\n' +
73+
'1. Show the system as ONE unified node (not broken into internal services)\n' +
74+
'2. Show external actors (users, admins) interacting with the system\n' +
75+
'3. Show external systems (GitHub, Slack, Google Calendar) as separate nodes\n' +
76+
'4. NOT show internal databases, caches, queues, or microservices in this view\n' +
77+
'Return 1.0 if blackbox constraint is followed, 0.5 if partially, 0.0 if internal details are exposed.',
78+
threshold: 0.7,
79+
},
80+
],
81+
},
82+
83+
// Test 3: Architecture Simplicity for Simple Systems
84+
{
85+
description: 'Architecture: Simple app gets simple architecture (no over-engineering)',
86+
vars: {
87+
user_input:
88+
'Create an architecture description for a simple personal blog with basic CRUD for posts, a SQLite database, and static file serving. Single developer, no team.',
89+
},
90+
assert: [
91+
{
92+
type: 'python',
93+
value: 'file://../graders/custom_graders.py:check_arch_simplicity',
94+
},
95+
{
96+
type: 'llm-rubric',
97+
value:
98+
'Is the architecture appropriately simple for a personal blog?\n' +
99+
'Check for:\n' +
100+
'- No microservices architecture for a blog\n' +
101+
'- No Kubernetes or complex orchestration\n' +
102+
'- No message queues or event sourcing\n' +
103+
'- Simple deployment (single server or basic hosting)\n' +
104+
'- Monolith or simple client-server is appropriate\n' +
105+
'Return 1.0 if appropriately simple, 0.5 if somewhat over-engineered, 0.0 if heavily over-engineered.',
106+
threshold: 0.7,
107+
},
108+
],
109+
},
110+
111+
// Test 4: ADR Quality
112+
{
113+
description: 'Architecture: ADRs follow template structure with required sections',
114+
vars: {
115+
user_input:
116+
'Create an architecture description for a real-time chat application with WebSocket support, message persistence, user presence tracking, and file sharing. The system must handle 10,000 concurrent users.',
117+
},
118+
assert: [
119+
{ type: 'icontains', value: 'adr' },
120+
{
121+
type: 'python',
122+
value: 'file://../graders/custom_graders.py:check_adr_quality',
123+
},
124+
{
125+
type: 'llm-rubric',
126+
value:
127+
'Grade the ADR quality in this architecture document (0-1):\n' +
128+
'1. Does each ADR have a clear Status (Proposed/Accepted/Deprecated/Discovered)?\n' +
129+
'2. Does each ADR have a Context section explaining why the decision was needed?\n' +
130+
'3. Does each ADR have a clear Decision statement?\n' +
131+
'4. Does each ADR document Consequences (positive, negative, risks)?\n' +
132+
'5. Are alternatives documented with neutral trade-offs (not "rejected because")?\n' +
133+
'Return average score 0-1.',
134+
threshold: 0.7,
135+
},
136+
],
137+
},
138+
],
139+
};
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
// PromptFoo configuration for Clarify Command tests
2+
module.exports = {
3+
description: 'Clarify Command Quality Evaluation',
4+
5+
// Rate limiting to avoid 429 errors
6+
maxConcurrency: 1,
7+
delay: 2000, // 2 second delay between tests
8+
9+
// Clarify prompt
10+
prompts: ['file://../prompts/clarify-prompt.txt'],
11+
12+
// Configure LLM provider using OpenAI-compatible endpoint
13+
providers: [
14+
{
15+
id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
16+
label: `${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`,
17+
config: {
18+
apiBaseUrl: process.env.LLM_BASE_URL,
19+
apiKey: process.env.LLM_AUTH_TOKEN,
20+
temperature: 0.3,
21+
max_tokens: 4000,
22+
},
23+
env: {
24+
OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,
25+
OPENAI_BASE_URL: process.env.LLM_BASE_URL,
26+
},
27+
},
28+
],
29+
30+
defaultTest: {
31+
options: {
32+
provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
33+
// Strip any preamble/thinking before the actual content
34+
transform: 'output.replace(/^.*?(?=## 1\\.\\s+Ambiguity Analysis)/s, "").trim()',
35+
},
36+
},
37+
38+
tests: [
39+
// Test 1: Clarify identifies gaps in a deliberately vague spec
40+
{
41+
description: 'Clarify: Identifies ambiguities in a vague specification',
42+
vars: {
43+
user_input:
44+
'Build a notification system. It should be fast and support multiple channels. Users should be able to configure their preferences. The system needs to handle high volumes.',
45+
},
46+
assert: [
47+
{
48+
type: 'llm-rubric',
49+
value:
50+
'Grade the clarification quality (0-1):\n' +
51+
'1. Does it identify that "fast" is vague and needs quantification?\n' +
52+
'2. Does it ask what "multiple channels" means (email, SMS, push, webhook)?\n' +
53+
'3. Does it question what "high volumes" means with specific numbers?\n' +
54+
'4. Does it ask about preference configuration scope (per-channel, per-event, schedules)?\n' +
55+
'5. Are questions specific and actionable (not generic)?\n' +
56+
'Return average score 0-1.',
57+
threshold: 0.7,
58+
},
59+
{ type: 'icontains', value: 'clarification' },
60+
],
61+
},
62+
63+
// Test 2: Architect Clarify focuses on architectural concerns
64+
{
65+
description: 'Clarify: Focuses on architectural concerns for system-level spec',
66+
vars: {
67+
user_input:
68+
'We have an existing monolith handling 500 req/s. We want to add real-time features (live updates, presence indicators) and eventually support 50,000 concurrent users. The team has 3 backend developers. Current stack is Django + PostgreSQL.',
69+
},
70+
assert: [
71+
{
72+
type: 'llm-rubric',
73+
value:
74+
'Grade the architectural focus of clarification questions (0-1):\n' +
75+
'1. Does it ask about the WebSocket/SSE approach for real-time (architecture decision)?\n' +
76+
'2. Does it question scaling strategy (horizontal vs vertical, breaking the monolith)?\n' +
77+
'3. Does it address data flow for real-time updates (pub/sub, polling, change data capture)?\n' +
78+
'4. Does it consider team size vs complexity (3 devs vs microservices risk)?\n' +
79+
'5. Does it focus on ARCHITECTURE concerns rather than feature details?\n' +
80+
'Return average score 0-1.',
81+
threshold: 0.7,
82+
},
83+
],
84+
},
85+
],
86+
};
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
// PromptFoo configuration for Extension System tests
2+
module.exports = {
3+
description: 'Extension System Quality Evaluation',
4+
5+
// Rate limiting to avoid 429 errors
6+
maxConcurrency: 1,
7+
delay: 2000, // 2 second delay between tests
8+
9+
// Extension prompt
10+
prompts: ['file://../prompts/ext-prompt.txt'],
11+
12+
// Configure LLM provider using OpenAI-compatible endpoint
13+
providers: [
14+
{
15+
id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
16+
label: `${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`,
17+
config: {
18+
apiBaseUrl: process.env.LLM_BASE_URL,
19+
apiKey: process.env.LLM_AUTH_TOKEN,
20+
temperature: 0.7,
21+
max_tokens: 5000,
22+
},
23+
env: {
24+
OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,
25+
OPENAI_BASE_URL: process.env.LLM_BASE_URL,
26+
},
27+
},
28+
],
29+
30+
defaultTest: {
31+
options: {
32+
provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
33+
},
34+
},
35+
36+
tests: [
37+
// Test 1: Extension Manifest Validation
38+
{
39+
description: 'Extension: Manifest contains all required fields',
40+
vars: {
41+
user_input:
42+
'Create a Spec Kit extension for Jira integration that syncs spec tasks to Jira issues, maps priority levels, and tracks issue status updates.',
43+
},
44+
assert: [
45+
{ type: 'icontains', value: 'schema_version' },
46+
{ type: 'icontains', value: 'extension' },
47+
{ type: 'icontains', value: 'provides' },
48+
{ type: 'icontains', value: 'commands' },
49+
{
50+
type: 'python',
51+
value: 'file://../graders/custom_graders.py:check_extension_manifest',
52+
},
53+
],
54+
},
55+
56+
// Test 2: Extension Skill Quality (self-containment)
57+
{
58+
description: 'Extension: Command is self-contained with no external references',
59+
vars: {
60+
user_input:
61+
'Create a Spec Kit extension for automated code review that runs linting, checks test coverage, and generates a review summary report.',
62+
},
63+
assert: [
64+
{
65+
type: 'python',
66+
value: 'file://../graders/custom_graders.py:check_extension_self_containment',
67+
},
68+
{
69+
type: 'llm-rubric',
70+
value:
71+
'Grade the extension command quality (0-1):\n' +
72+
'1. Does the command have a clear Purpose section?\n' +
73+
'2. Does it list Prerequisites?\n' +
74+
'3. Does it have step-by-step execution instructions?\n' +
75+
'4. Does it include error handling guidance?\n' +
76+
'5. Is it self-contained (no @rule, @persona, @example references)?\n' +
77+
'Return average score 0-1.',
78+
threshold: 0.7,
79+
},
80+
],
81+
},
82+
83+
// Test 3: Extension Config Template Quality
84+
{
85+
description: 'Extension: Config template has documented options and defaults',
86+
vars: {
87+
user_input:
88+
'Create a Spec Kit extension for Slack notifications that posts spec status updates to channels, supports thread replies, and allows custom message templates.',
89+
},
90+
assert: [
91+
{
92+
type: 'python',
93+
value: 'file://../graders/custom_graders.py:check_extension_config',
94+
},
95+
{
96+
type: 'llm-rubric',
97+
value:
98+
'Grade the configuration template quality (0-1):\n' +
99+
'1. Are configuration options clearly documented with comments?\n' +
100+
'2. Are required vs optional fields marked?\n' +
101+
'3. Are sensible default values provided?\n' +
102+
'4. Is there guidance on environment variable overrides?\n' +
103+
'5. Is the YAML structure logical and well-organized?\n' +
104+
'Return average score 0-1.',
105+
threshold: 0.7,
106+
},
107+
],
108+
},
109+
],
110+
};

0 commit comments

Comments
 (0)