tikalk
diff --git a/.gitignore b/.gitignore
Lines changed: 1 addition & 0 deletions
diff --git a/evals/configs/promptfooconfig-arch.js b/evals/configs/promptfooconfig-arch.js
Lines changed: 139 additions & 0 deletions
diff --git a/evals/configs/promptfooconfig-clarify.js b/evals/configs/promptfooconfig-clarify.js
Lines changed: 86 additions & 0 deletions
diff --git a/evals/configs/promptfooconfig-ext.js b/evals/configs/promptfooconfig-ext.js
Lines changed: 110 additions & 0 deletions
@@ -59,6 +59,7 @@ docs/dev
 .specify/extensions/*/local-config.yml
 
 # Evaluation artifacts
+eval-results/
 eval-results*.json
 *.backup
 .promptfoo/
 
@@ -0,0 +1,139 @@
+// PromptFoo configuration for Architecture Template tests (four test cases, one provider)
+module.exports = {
+  description: 'Architecture Template Quality Evaluation',
+
+  // Rate limiting to avoid 429 errors. promptfoo only reads these settings from
+  // `evaluateOptions`; as top-level keys they are not applied to the run.
+  evaluateOptions: { maxConcurrency: 1, delay: 2000 }, // one call at a time, 2 s apart
+
+  // Architecture prompt template under test
+  prompts: ['file://../prompts/arch-prompt.txt'],
+
+  // OpenAI-compatible provider; model, URL and token come from LLM_MODEL, LLM_BASE_URL, LLM_AUTH_TOKEN
+  providers: [
+    {
+      id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+      label: `${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`,
+      config: {
+        apiBaseUrl: process.env.LLM_BASE_URL,
+        apiKey: process.env.LLM_AUTH_TOKEN,
+        temperature: 0.7,
+        max_tokens: 6000,
+      },
+      env: {
+        OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,
+        OPENAI_BASE_URL: process.env.LLM_BASE_URL,
+      },
+    },
+  ],
+
+  defaultTest: {
+    options: {
+      provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+    },
+  },
+
+  tests: [
+    // Test 1: Architecture Init Quality - required view names plus a structural Python grader
+    {
+      description: 'Architecture: Init produces valid Rozanski & Woods structure',
+      vars: {
+        user_input:
+          'Create an architecture description for an e-commerce platform with a web frontend, REST API backend, PostgreSQL database, and Redis cache. The system handles user authentication, product catalog, shopping cart, and order processing.',
+      },
+      assert: [
+        { type: 'icontains', value: 'context view' },
+        { type: 'icontains', value: 'functional view' },
+        { type: 'icontains', value: 'deployment view' },
+        { type: 'icontains', value: 'stakeholder' },
+        {
+          type: 'python',
+          value: 'file://../graders/custom_graders.py:check_arch_structure',
+        },
+      ],
+    },
+
+    // Test 2: Blackbox Context View - Python grader plus an LLM rubric (pass at >= 0.7)
+    {
+      description: 'Architecture: Context View enforces blackbox system representation',
+      vars: {
+        user_input:
+          'Create an architecture description for a SaaS project management tool that integrates with GitHub, Slack, and Google Calendar. Users access it via web browser. An admin manages team settings.',
+      },
+      assert: [
+        {
+          type: 'python',
+          value: 'file://../graders/custom_graders.py:check_blackbox_context_view',
+        },
+        {
+          type: 'llm-rubric',
+          value:
+            'Check if the Context View section treats the system as a single blackbox.\n' +
+            'The Context View should:\n' +
+            '1. Show the system as ONE unified node (not broken into internal services)\n' +
+            '2. Show external actors (users, admins) interacting with the system\n' +
+            '3. Show external systems (GitHub, Slack, Google Calendar) as separate nodes\n' +
+            '4. NOT show internal databases, caches, queues, or microservices in this view\n' +
+            'Return 1.0 if blackbox constraint is followed, 0.5 if partially, 0.0 if internal details are exposed.',
+          threshold: 0.7,
+        },
+      ],
+    },
+
+    // Test 3: Architecture Simplicity for Simple Systems - guards against over-engineering
+    {
+      description: 'Architecture: Simple app gets simple architecture (no over-engineering)',
+      vars: {
+        user_input:
+          'Create an architecture description for a simple personal blog with basic CRUD for posts, a SQLite database, and static file serving. Single developer, no team.',
+      },
+      assert: [
+        {
+          type: 'python',
+          value: 'file://../graders/custom_graders.py:check_arch_simplicity',
+        },
+        {
+          type: 'llm-rubric',
+          value:
+            'Is the architecture appropriately simple for a personal blog?\n' +
+            'Check for:\n' +
+            '- No microservices architecture for a blog\n' +
+            '- No Kubernetes or complex orchestration\n' +
+            '- No message queues or event sourcing\n' +
+            '- Simple deployment (single server or basic hosting)\n' +
+            '- Monolith or simple client-server is appropriate\n' +
+            'Return 1.0 if appropriately simple, 0.5 if somewhat over-engineered, 0.0 if heavily over-engineered.',
+          threshold: 0.7,
+        },
+      ],
+    },
+
+    // Test 4: ADR Quality - keyword check, Python grader and an LLM rubric (pass at >= 0.7)
+    {
+      description: 'Architecture: ADRs follow template structure with required sections',
+      vars: {
+        user_input:
+          'Create an architecture description for a real-time chat application with WebSocket support, message persistence, user presence tracking, and file sharing. The system must handle 10,000 concurrent users.',
+      },
+      assert: [
+        { type: 'icontains', value: 'adr' },
+        {
+          type: 'python',
+          value: 'file://../graders/custom_graders.py:check_adr_quality',
+        },
+        {
+          type: 'llm-rubric',
+          value:
+            'Grade the ADR quality in this architecture document (0-1):\n' +
+            '1. Does each ADR have a clear Status (Proposed/Accepted/Deprecated/Discovered)?\n' +
+            '2. Does each ADR have a Context section explaining why the decision was needed?\n' +
+            '3. Does each ADR have a clear Decision statement?\n' +
+            '4. Does each ADR document Consequences (positive, negative, risks)?\n' +
+            '5. Are alternatives documented with neutral trade-offs (not "rejected because")?\n' +
+            'Return average score 0-1.',
+          threshold: 0.7,
+        },
+      ],
+    },
+  ],
+};
@@ -0,0 +1,86 @@
+// PromptFoo configuration for Clarify Command tests (two test cases, one provider)
+module.exports = {
+  description: 'Clarify Command Quality Evaluation',
+
+  // Rate limiting to avoid 429 errors. promptfoo only reads these settings from
+  // `evaluateOptions`; as top-level keys they are not applied to the run.
+  evaluateOptions: { maxConcurrency: 1, delay: 2000 }, // one call at a time, 2 s apart
+
+  // Clarify prompt template under test
+  prompts: ['file://../prompts/clarify-prompt.txt'],
+
+  // OpenAI-compatible provider; model, URL and token come from LLM_MODEL, LLM_BASE_URL, LLM_AUTH_TOKEN
+  providers: [
+    {
+      id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+      label: `${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`,
+      config: {
+        apiBaseUrl: process.env.LLM_BASE_URL,
+        apiKey: process.env.LLM_AUTH_TOKEN,
+        temperature: 0.3,
+        max_tokens: 4000,
+      },
+      env: {
+        OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,
+        OPENAI_BASE_URL: process.env.LLM_BASE_URL,
+      },
+    },
+  ],
+
+  defaultTest: {
+    options: {
+      provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+      // Drop any preamble before the "## 1. Ambiguity Analysis" heading; output lacking it is only trimmed
+      transform: 'output.replace(/^.*?(?=## 1\\.\\s+Ambiguity Analysis)/s, "").trim()',
+    },
+  },
+
+  tests: [
+    // Test 1: Clarify identifies gaps in a deliberately vague spec
+    {
+      description: 'Clarify: Identifies ambiguities in a vague specification',
+      vars: {
+        user_input:
+          'Build a notification system. It should be fast and support multiple channels. Users should be able to configure their preferences. The system needs to handle high volumes.',
+      },
+      assert: [
+        {
+          type: 'llm-rubric',
+          value:
+            'Grade the clarification quality (0-1):\n' +
+            '1. Does it identify that "fast" is vague and needs quantification?\n' +
+            '2. Does it ask what "multiple channels" means (email, SMS, push, webhook)?\n' +
+            '3. Does it question what "high volumes" means with specific numbers?\n' +
+            '4. Does it ask about preference configuration scope (per-channel, per-event, schedules)?\n' +
+            '5. Are questions specific and actionable (not generic)?\n' +
+            'Return average score 0-1.',
+          threshold: 0.7,
+        },
+        { type: 'icontains', value: 'clarification' },
+      ],
+    },
+
+    // Test 2: Architect Clarify focuses on architectural concerns (LLM rubric only, pass at >= 0.7)
+    {
+      description: 'Clarify: Focuses on architectural concerns for system-level spec',
+      vars: {
+        user_input:
+          'We have an existing monolith handling 500 req/s. We want to add real-time features (live updates, presence indicators) and eventually support 50,000 concurrent users. The team has 3 backend developers. Current stack is Django + PostgreSQL.',
+      },
+      assert: [
+        {
+          type: 'llm-rubric',
+          value:
+            'Grade the architectural focus of clarification questions (0-1):\n' +
+            '1. Does it ask about the WebSocket/SSE approach for real-time (architecture decision)?\n' +
+            '2. Does it question scaling strategy (horizontal vs vertical, breaking the monolith)?\n' +
+            '3. Does it address data flow for real-time updates (pub/sub, polling, change data capture)?\n' +
+            '4. Does it consider team size vs complexity (3 devs vs microservices risk)?\n' +
+            '5. Does it focus on ARCHITECTURE concerns rather than feature details?\n' +
+            'Return average score 0-1.',
+          threshold: 0.7,
+        },
+      ],
+    },
+  ],
+};
@@ -0,0 +1,110 @@
+// PromptFoo configuration for Extension System tests (three test cases, one provider)
+module.exports = {
+  description: 'Extension System Quality Evaluation',
+
+  // Rate limiting to avoid 429 errors. promptfoo only reads these settings from
+  // `evaluateOptions`; as top-level keys they are not applied to the run.
+  evaluateOptions: { maxConcurrency: 1, delay: 2000 }, // one call at a time, 2 s apart
+
+  // Extension prompt template under test
+  prompts: ['file://../prompts/ext-prompt.txt'],
+
+  // OpenAI-compatible provider; model, URL and token come from LLM_MODEL, LLM_BASE_URL, LLM_AUTH_TOKEN
+  providers: [
+    {
+      id: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+      label: `${process.env.LLM_MODEL || 'Sonnet 4.5'} (via AI API Gateway)`,
+      config: {
+        apiBaseUrl: process.env.LLM_BASE_URL,
+        apiKey: process.env.LLM_AUTH_TOKEN,
+        temperature: 0.7,
+        max_tokens: 5000,
+      },
+      env: {
+        OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,
+        OPENAI_BASE_URL: process.env.LLM_BASE_URL,
+      },
+    },
+  ],
+
+  defaultTest: {
+    options: {
+      provider: `openai:chat:${process.env.LLM_MODEL || 'claude-sonnet-4-5-20250929'}`,
+    },
+  },
+
+  tests: [
+    // Test 1: Extension Manifest Validation - required keys plus a Python manifest grader
+    {
+      description: 'Extension: Manifest contains all required fields',
+      vars: {
+        user_input:
+          'Create a Spec Kit extension for Jira integration that syncs spec tasks to Jira issues, maps priority levels, and tracks issue status updates.',
+      },
+      assert: [
+        { type: 'icontains', value: 'schema_version' },
+        { type: 'icontains', value: 'extension' },
+        { type: 'icontains', value: 'provides' },
+        { type: 'icontains', value: 'commands' },
+        {
+          type: 'python',
+          value: 'file://../graders/custom_graders.py:check_extension_manifest',
+        },
+      ],
+    },
+
+    // Test 2: Extension Skill Quality (self-containment) - Python grader plus LLM rubric (>= 0.7)
+    {
+      description: 'Extension: Command is self-contained with no external references',
+      vars: {
+        user_input:
+          'Create a Spec Kit extension for automated code review that runs linting, checks test coverage, and generates a review summary report.',
+      },
+      assert: [
+        {
+          type: 'python',
+          value: 'file://../graders/custom_graders.py:check_extension_self_containment',
+        },
+        {
+          type: 'llm-rubric',
+          value:
+            'Grade the extension command quality (0-1):\n' +
+            '1. Does the command have a clear Purpose section?\n' +
+            '2. Does it list Prerequisites?\n' +
+            '3. Does it have step-by-step execution instructions?\n' +
+            '4. Does it include error handling guidance?\n' +
+            '5. Is it self-contained (no @rule, @persona, @example references)?\n' +
+            'Return average score 0-1.',
+          threshold: 0.7,
+        },
+      ],
+    },
+
+    // Test 3: Extension Config Template Quality - Python grader plus LLM rubric (>= 0.7)
+    {
+      description: 'Extension: Config template has documented options and defaults',
+      vars: {
+        user_input:
+          'Create a Spec Kit extension for Slack notifications that posts spec status updates to channels, supports thread replies, and allows custom message templates.',
+      },
+      assert: [
+        {
+          type: 'python',
+          value: 'file://../graders/custom_graders.py:check_extension_config',
+        },
+        {
+          type: 'llm-rubric',
+          value:
+            'Grade the configuration template quality (0-1):\n' +
+            '1. Are configuration options clearly documented with comments?\n' +
+            '2. Are required vs optional fields marked?\n' +
+            '3. Are sensible default values provided?\n' +
+            '4. Is there guidance on environment variable overrides?\n' +
+            '5. Is the YAML structure logical and well-organized?\n' +
+            'Return average score 0-1.',
+          threshold: 0.7,
+        },
+      ],
+    },
+  ],
+};