Skip to content

Commit 4007491

Browse files
authored
Merge pull request #3 from tikalk/eval_github_actions
Run AI evals on PRs and fix doc links
2 parents 2779aef + d09a2d5 commit 4007491

File tree

12 files changed

+195
-36
lines changed

12 files changed

+195
-36
lines changed

.github/workflows/eval.yml

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
name: AI Evals
22

33
on:
4-
workflow_dispatch: # Manual trigger only
4+
pull_request:
5+
branches: [main]
6+
workflow_dispatch: # Manual trigger still available
57
inputs:
68
model:
79
description: 'Model to use for evaluation'
810
required: false
9-
default: 'claude-sonnet-4-5-20250929'
11+
default: 'GLM-4.6V-Flash'
1012
type: string
1113

1214
jobs:
@@ -24,6 +26,14 @@ jobs:
2426
with:
2527
node-version: '20'
2628

29+
- name: Restore promptfoo cache
30+
uses: actions/cache@v4
31+
with:
32+
path: ~/.promptfoo/cache
33+
key: promptfoo-${{ hashFiles('evals/configs/**', 'evals/prompts/**') }}-${{ github.event.inputs.model || secrets.LLM_MODEL || 'default' }}
34+
restore-keys: |
35+
promptfoo-${{ hashFiles('evals/configs/**', 'evals/prompts/**') }}-
36+
2737
- name: Setup Python
2838
uses: actions/setup-python@v5
2939
with:
@@ -32,24 +42,32 @@ jobs:
3242
- name: Install Python dependencies
3343
run: |
3444
python -m pip install --upgrade pip
35-
# No requirements.txt needed for check_eval_scores.py (uses stdlib only)
45+
pip install -e ".[test]"
46+
47+
- name: Run pytest
48+
run: |
49+
pytest tests/ -v --tb=short
3650
3751
- name: Run Evaluations
52+
continue-on-error: true
3853
env:
3954
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
4055
LLM_AUTH_TOKEN: ${{ secrets.LLM_AUTH_TOKEN }}
41-
LLM_MODEL: ${{ github.event.inputs.model || 'claude-sonnet-4-5-20250929' }}
56+
LLM_MODEL: ${{ github.event.inputs.model || secrets.LLM_MODEL || 'llama-3.3-70b-versatile' }}
57+
PROMPTFOO_REQUEST_BACKOFF_MS: '30000'
4258
run: |
4359
chmod +x ./evals/scripts/run-promptfoo-eval.sh
4460
./evals/scripts/run-promptfoo-eval.sh --json
4561
4662
- name: Check Quality Thresholds
4763
id: check_thresholds
64+
continue-on-error: true
4865
run: |
4966
python3 evals/scripts/check_eval_scores.py \
5067
--results eval-results.json \
51-
--min-score 0.70 \
52-
--min-pass-rate 0.70 \
68+
--min-score 0.50 \
69+
--min-pass-rate 0.85 \
70+
--allow-api-errors \
5371
--verbose || echo "threshold_failed=true" >> $GITHUB_OUTPUT
5472
5573
- name: Generate Summary
@@ -101,10 +119,10 @@ jobs:
101119
summary += f"- {test_name} (score: {score:.2f})\n"
102120
103121
# Success message
104-
if pass_rate >= 70:
122+
if pass_rate >= 85:
105123
summary += "\n✅ **Quality thresholds met!**"
106124
else:
107-
summary += "\n⚠️ **Quality thresholds not met.** Please review failures."
125+
summary += "\n⚠️ **Quality thresholds not met (target: 85%).** Please review failures."
108126
109127
# Write to output file for PR comment
110128
with open('eval_summary.txt', 'w') as f:
@@ -169,8 +187,21 @@ jobs:
169187
eval_summary.txt
170188
retention-days: 30
171189

172-
- name: Fail if thresholds not met
173-
if: steps.check_thresholds.outputs.threshold_failed == 'true'
190+
# Note: This step will fail but not block the PR (informational only)
191+
- name: Report threshold status
192+
if: always()
174193
run: |
175-
echo "❌ Quality thresholds not met"
176-
exit 1
194+
if [ "${{ steps.check_thresholds.outcome }}" = "failure" ]; then
195+
echo "⚠️ Quality thresholds not met (target: 85% pass rate)"
196+
echo "This is informational only and does not block the PR."
197+
exit 0
198+
else
199+
echo "✅ Quality thresholds met!"
200+
fi
201+
202+
# - name: Fail if thresholds not met
203+
# if: steps.check_thresholds.outputs.threshold_failed == 'true'
204+
# run: |
205+
# echo "❌ Quality thresholds not met"
206+
# exit 1
207+

evals/configs/promptfooconfig-arch.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ module.exports = {
33
description: 'Architecture Template Quality Evaluation',
44

55
// Rate limiting to avoid 429 errors
6-
maxConcurrency: 1,
7-
delay: 2000, // 2 second delay between tests
6+
evaluateOptions: {
7+
maxConcurrency: 1,
8+
delay: process.env.CI ? 15000 : 2000, // 15s in CI to avoid rate limiting, 2s locally
9+
},
810

911
// Architecture prompt
1012
prompts: ['file://../prompts/arch-prompt.txt'],

evals/configs/promptfooconfig-clarify.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ module.exports = {
33
description: 'Clarify Command Quality Evaluation',
44

55
// Rate limiting to avoid 429 errors
6-
maxConcurrency: 1,
7-
delay: 2000, // 2 second delay between tests
6+
evaluateOptions: {
7+
maxConcurrency: 1,
8+
delay: process.env.CI ? 15000 : 2000, // 15s in CI to avoid rate limiting, 2s locally
9+
},
810

911
// Clarify prompt
1012
prompts: ['file://../prompts/clarify-prompt.txt'],

evals/configs/promptfooconfig-ext.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ module.exports = {
33
description: 'Extension System Quality Evaluation',
44

55
// Rate limiting to avoid 429 errors
6-
maxConcurrency: 1,
7-
delay: 2000, // 2 second delay between tests
6+
evaluateOptions: {
7+
maxConcurrency: 1,
8+
delay: process.env.CI ? 15000 : 2000, // 15s in CI to avoid rate limiting, 2s locally
9+
},
810

911
// Extension prompt
1012
prompts: ['file://../prompts/ext-prompt.txt'],

evals/configs/promptfooconfig-plan.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ module.exports = {
33
description: 'Plan Template Quality Evaluation',
44

55
// Rate limiting to avoid 429 errors
6-
maxConcurrency: 1,
7-
delay: 2000, // 2 second delay between tests
6+
evaluateOptions: {
7+
maxConcurrency: 1,
8+
delay: process.env.CI ? 15000 : 2000, // 15s in CI to avoid rate limiting, 2s locally
9+
},
810

911
// Plan prompt only
1012
prompts: ['file://../prompts/plan-prompt.txt'],

evals/configs/promptfooconfig-spec.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ module.exports = {
33
description: 'Spec Template Quality Evaluation',
44

55
// Rate limiting to avoid 429 errors
6-
maxConcurrency: 1,
7-
delay: 5000, // 5 second delay between tests (increased for Groq)
6+
evaluateOptions: {
7+
maxConcurrency: 1,
8+
delay: process.env.CI ? 15000 : 5000, // 15s in CI to avoid rate limiting, 5s locally
9+
},
810

911
// Spec prompt only
1012
prompts: ['file://../prompts/spec-prompt.txt'],

evals/configs/promptfooconfig-trace.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ module.exports = {
33
description: 'Trace Template Quality Evaluation',
44

55
// Rate limiting to avoid 429 errors
6-
maxConcurrency: 1,
7-
delay: 2000, // 2 second delay between tests
6+
evaluateOptions: {
7+
maxConcurrency: 1,
8+
delay: process.env.CI ? 15000 : 2000, // 15s in CI to avoid rate limiting, 2s locally
9+
},
810

911
// Trace prompt
1012
prompts: ['file://../prompts/trace-prompt.txt'],

evals/configs/promptfooconfig.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ module.exports = {
1717
apiKey: process.env.LLM_AUTH_TOKEN,
1818
temperature: 0.7,
1919
max_tokens: 6000,
20+
retry: 5,
2021
},
2122
env: {
2223
OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,

evals/graders/custom_graders.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,3 +1240,73 @@ def check_architectural_focus(output: str, context: dict) -> dict:
12401240
'score': score,
12411241
'reason': ' '.join(reasons)
12421242
}
1243+
1244+
1245+
def check_completeness(output: str, context: dict) -> dict:
    """
    Grade a generated specification for breadth of requirement coverage.

    Four signals are always scored (functional requirements, user stories,
    non-functional requirements, edge cases); a fifth domain-vocabulary
    signal is added only for e-commerce style requests. The signals are
    averaged with equal weight.

    Args:
        output: The generated specification text.
        context: Grader context; reads vars.user_input for domain detection.

    Returns:
        dict with 'pass' (avg >= 0.6), 'score' (0.0-1.0 average), and
        'reason' (human-readable breakdown) keys.
    """
    import re

    text = output.lower()
    component_scores = []
    notes = []

    # 1. Functional requirements: either "FR-<n>" identifiers or the
    #    phrase "functional requirement" anywhere in the output.
    if re.search(r'fr-\d+|functional requirement', output, re.IGNORECASE):
        component_scores.append(1.0)
        notes.append('functional requirements present')
    else:
        component_scores.append(0.0)
        notes.append('missing functional requirements')

    # 2. User stories ("As a <role>, I want ..."): full credit at 3+.
    stories = re.findall(r'as a .+?, i want', output, re.IGNORECASE)
    component_scores.append(min(1.0, len(stories) / 3))
    notes.append(f'{len(stories)} user stories')

    # 3. Non-functional requirements: count topic keywords, full credit at 2+.
    nfr_hits = sum(term in text for term in
                   ('performance', 'security', 'scalability', 'availability', 'nfr-'))
    component_scores.append(min(1.0, nfr_hits / 2))
    notes.append(f'{nfr_hits} NFR topics')

    # 4. Edge-case coverage: count failure-mode keywords, full credit at 2+.
    edge_hits = sum(term in text for term in
                    ('edge case', 'error', 'failure', 'timeout', 'invalid', 'exception'))
    component_scores.append(min(1.0, edge_hits / 2))
    notes.append(f'{edge_hits} edge case terms')

    # 5. Domain vocabulary — only scored when the user request looks like
    #    an e-commerce checkout feature; full credit at 3+ of 6 terms.
    request = context.get('vars', {}).get('user_input', '').lower()
    if any(kw in request for kw in ('checkout', 'cart', 'payment')):
        domain_terms = ('cart', 'payment', 'order', 'checkout',
                        'confirmation', 'inventory')
        domain_hits = sum(term in text for term in domain_terms)
        component_scores.append(min(1.0, domain_hits / 3))
        notes.append(f'{domain_hits}/6 e-commerce terms')

    avg = sum(component_scores) / len(component_scores) if component_scores else 0.0

    return {
        'pass': avg >= 0.6,
        'score': avg,
        'reason': f'Completeness: {avg:.0%} ({", ".join(notes)})'
    }

evals/prompts/plan-prompt.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
You are tasked with creating an implementation plan.
22

3+
LANGUAGE REQUIREMENT: You MUST respond entirely in English. All output, including any thinking, reasoning, or explanations, must be in English only. Do not use any other language.
4+
35
USER REQUIREMENTS:
46
{{ user_input }}
57

0 commit comments

Comments
 (0)