Skip to content

Commit 4007491

Browse files
authored
Merge pull request #3 from tikalk/eval_github_actions
Run AI evals on PRs and fix doc links
2 parents 2779aef + d09a2d5 commit 4007491

File tree

12 files changed

+195
-36
lines changed

12 files changed

+195
-36
lines changed

.github/workflows/eval.yml

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,14 @@
11
name: AI Evals
22

33
on:
4-
workflow_dispatch: # Manual trigger only
4+
pull_request:
5+
branches: [main]
6+
workflow_dispatch: # Manual trigger still available
57
inputs:
68
model:
79
description: 'Model to use for evaluation'
810
required: false
9-
default: 'claude-sonnet-4-5-20250929'
11+
default: 'GLM-4.6V-Flash'
1012
type: string
1113

1214
jobs:
@@ -24,6 +26,14 @@ jobs:
2426
with:
2527
node-version: '20'
2628

29+
- name: Restore promptfoo cache
30+
uses: actions/cache@v4
31+
with:
32+
path: ~/.promptfoo/cache
33+
key: promptfoo-${{ hashFiles('evals/configs/**', 'evals/prompts/**') }}-${{ github.event.inputs.model || secrets.LLM_MODEL || 'default' }}
34+
restore-keys: |
35+
promptfoo-${{ hashFiles('evals/configs/**', 'evals/prompts/**') }}-
36+
2737
- name: Setup Python
2838
uses: actions/setup-python@v5
2939
with:
@@ -32,24 +42,32 @@ jobs:
3242
- name: Install Python dependencies
3343
run: |
3444
python -m pip install --upgrade pip
35-
# No requirements.txt needed for check_eval_scores.py (uses stdlib only)
45+
pip install -e ".[test]"
46+
47+
- name: Run pytest
48+
run: |
49+
pytest tests/ -v --tb=short
3650
3751
- name: Run Evaluations
52+
continue-on-error: true
3853
env:
3954
LLM_BASE_URL: ${{ secrets.LLM_BASE_URL }}
4055
LLM_AUTH_TOKEN: ${{ secrets.LLM_AUTH_TOKEN }}
41-
LLM_MODEL: ${{ github.event.inputs.model || 'claude-sonnet-4-5-20250929' }}
56+
LLM_MODEL: ${{ github.event.inputs.model || secrets.LLM_MODEL || 'llama-3.3-70b-versatile' }}
57+
PROMPTFOO_REQUEST_BACKOFF_MS: '30000'
4258
run: |
4359
chmod +x ./evals/scripts/run-promptfoo-eval.sh
4460
./evals/scripts/run-promptfoo-eval.sh --json
4561
4662
- name: Check Quality Thresholds
4763
id: check_thresholds
64+
continue-on-error: true
4865
run: |
4966
python3 evals/scripts/check_eval_scores.py \
5067
--results eval-results.json \
51-
--min-score 0.70 \
52-
--min-pass-rate 0.70 \
68+
--min-score 0.50 \
69+
--min-pass-rate 0.85 \
70+
--allow-api-errors \
5371
--verbose || echo "threshold_failed=true" >> $GITHUB_OUTPUT
5472
5573
- name: Generate Summary
@@ -101,10 +119,10 @@ jobs:
101119
summary += f"- {test_name} (score: {score:.2f})\n"
102120
103121
# Success message
104-
if pass_rate >= 70:
122+
if pass_rate >= 85:
105123
summary += "\n✅ **Quality thresholds met!**"
106124
else:
107-
summary += "\n⚠️ **Quality thresholds not met.** Please review failures."
125+
summary += "\n⚠️ **Quality thresholds not met (target: 85%).** Please review failures."
108126
109127
# Write to output file for PR comment
110128
with open('eval_summary.txt', 'w') as f:
@@ -169,8 +187,21 @@ jobs:
169187
eval_summary.txt
170188
retention-days: 30
171189

172-
- name: Fail if thresholds not met
173-
if: steps.check_thresholds.outputs.threshold_failed == 'true'
190+
# Note: This step will fail but not block the PR (informational only)
191+
- name: Report threshold status
192+
if: always()
174193
run: |
175-
echo "❌ Quality thresholds not met"
176-
exit 1
194+
if [ "${{ steps.check_thresholds.outcome }}" = "failure" ]; then
195+
echo "⚠️ Quality thresholds not met (target: 85% pass rate)"
196+
echo "This is informational only and does not block the PR."
197+
exit 0
198+
else
199+
echo "✅ Quality thresholds met!"
200+
fi
201+
202+
# - name: Fail if thresholds not met
203+
# if: steps.check_thresholds.outputs.threshold_failed == 'true'
204+
# run: |
205+
# echo "❌ Quality thresholds not met"
206+
# exit 1
207+

evals/configs/promptfooconfig-arch.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ module.exports = {
33
description: 'Architecture Template Quality Evaluation',
44

55
// Rate limiting to avoid 429 errors
6-
maxConcurrency: 1,
7-
delay: 2000, // 2 second delay between tests
6+
evaluateOptions: {
7+
maxConcurrency: 1,
8+
delay: process.env.CI ? 15000 : 2000, // 15s in CI to avoid rate limiting, 2s locally
9+
},
810

911
// Architecture prompt
1012
prompts: ['file://../prompts/arch-prompt.txt'],

evals/configs/promptfooconfig-clarify.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ module.exports = {
33
description: 'Clarify Command Quality Evaluation',
44

55
// Rate limiting to avoid 429 errors
6-
maxConcurrency: 1,
7-
delay: 2000, // 2 second delay between tests
6+
evaluateOptions: {
7+
maxConcurrency: 1,
8+
delay: process.env.CI ? 15000 : 2000, // 15s in CI to avoid rate limiting, 2s locally
9+
},
810

911
// Clarify prompt
1012
prompts: ['file://../prompts/clarify-prompt.txt'],

evals/configs/promptfooconfig-ext.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ module.exports = {
33
description: 'Extension System Quality Evaluation',
44

55
// Rate limiting to avoid 429 errors
6-
maxConcurrency: 1,
7-
delay: 2000, // 2 second delay between tests
6+
evaluateOptions: {
7+
maxConcurrency: 1,
8+
delay: process.env.CI ? 15000 : 2000, // 15s in CI to avoid rate limiting, 2s locally
9+
},
810

911
// Extension prompt
1012
prompts: ['file://../prompts/ext-prompt.txt'],

evals/configs/promptfooconfig-plan.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ module.exports = {
33
description: 'Plan Template Quality Evaluation',
44

55
// Rate limiting to avoid 429 errors
6-
maxConcurrency: 1,
7-
delay: 2000, // 2 second delay between tests
6+
evaluateOptions: {
7+
maxConcurrency: 1,
8+
delay: process.env.CI ? 15000 : 2000, // 15s in CI to avoid rate limiting, 2s locally
9+
},
810

911
// Plan prompt only
1012
prompts: ['file://../prompts/plan-prompt.txt'],

evals/configs/promptfooconfig-spec.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ module.exports = {
33
description: 'Spec Template Quality Evaluation',
44

55
// Rate limiting to avoid 429 errors
6-
maxConcurrency: 1,
7-
delay: 5000, // 5 second delay between tests (increased for Groq)
6+
evaluateOptions: {
7+
maxConcurrency: 1,
8+
delay: process.env.CI ? 15000 : 5000, // 15s in CI to avoid rate limiting, 5s locally
9+
},
810

911
// Spec prompt only
1012
prompts: ['file://../prompts/spec-prompt.txt'],

evals/configs/promptfooconfig-trace.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ module.exports = {
33
description: 'Trace Template Quality Evaluation',
44

55
// Rate limiting to avoid 429 errors
6-
maxConcurrency: 1,
7-
delay: 2000, // 2 second delay between tests
6+
evaluateOptions: {
7+
maxConcurrency: 1,
8+
delay: process.env.CI ? 15000 : 2000, // 15s in CI to avoid rate limiting, 2s locally
9+
},
810

911
// Trace prompt
1012
prompts: ['file://../prompts/trace-prompt.txt'],

evals/configs/promptfooconfig.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ module.exports = {
1717
apiKey: process.env.LLM_AUTH_TOKEN,
1818
temperature: 0.7,
1919
max_tokens: 6000,
20+
retry: 5,
2021
},
2122
env: {
2223
OPENAI_API_KEY: process.env.LLM_AUTH_TOKEN,

evals/graders/custom_graders.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1240,3 +1240,73 @@ def check_architectural_focus(output: str, context: dict) -> dict:
12401240
'score': score,
12411241
'reason': ' '.join(reasons)
12421242
}
1243+
1244+
1245+
def check_completeness(output: str, context: dict) -> dict:
    """
    Grade a generated specification for breadth of requirement coverage.

    Four signals are always scored (functional requirements, user stories,
    non-functional requirements, edge cases); a fifth domain-vocabulary
    signal is added only for e-commerce style requests. The signals are
    averaged with equal weight.

    Args:
        output: The generated specification text.
        context: Grader context; reads vars.user_input for domain detection.

    Returns:
        dict with 'pass' (avg >= 0.6), 'score' (0.0-1.0 average), and
        'reason' (human-readable breakdown) keys.
    """
    import re

    text = output.lower()
    component_scores = []
    notes = []

    # 1. Functional requirements: either "FR-<n>" identifiers or the
    #    phrase "functional requirement" anywhere in the output.
    if re.search(r'fr-\d+|functional requirement', output, re.IGNORECASE):
        component_scores.append(1.0)
        notes.append('functional requirements present')
    else:
        component_scores.append(0.0)
        notes.append('missing functional requirements')

    # 2. User stories ("As a <role>, I want ..."): full credit at 3+.
    stories = re.findall(r'as a .+?, i want', output, re.IGNORECASE)
    component_scores.append(min(1.0, len(stories) / 3))
    notes.append(f'{len(stories)} user stories')

    # 3. Non-functional requirements: count topic keywords, full credit at 2+.
    nfr_hits = sum(term in text for term in
                   ('performance', 'security', 'scalability', 'availability', 'nfr-'))
    component_scores.append(min(1.0, nfr_hits / 2))
    notes.append(f'{nfr_hits} NFR topics')

    # 4. Edge-case coverage: count failure-mode keywords, full credit at 2+.
    edge_hits = sum(term in text for term in
                    ('edge case', 'error', 'failure', 'timeout', 'invalid', 'exception'))
    component_scores.append(min(1.0, edge_hits / 2))
    notes.append(f'{edge_hits} edge case terms')

    # 5. Domain vocabulary — only scored when the user request looks like
    #    an e-commerce checkout feature; full credit at 3+ of 6 terms.
    request = context.get('vars', {}).get('user_input', '').lower()
    if any(kw in request for kw in ('checkout', 'cart', 'payment')):
        domain_terms = ('cart', 'payment', 'order', 'checkout',
                        'confirmation', 'inventory')
        domain_hits = sum(term in text for term in domain_terms)
        component_scores.append(min(1.0, domain_hits / 3))
        notes.append(f'{domain_hits}/6 e-commerce terms')

    avg = sum(component_scores) / len(component_scores) if component_scores else 0.0

    return {
        'pass': avg >= 0.6,
        'score': avg,
        'reason': f'Completeness: {avg:.0%} ({", ".join(notes)})'
    }

evals/prompts/plan-prompt.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
You are tasked with creating an implementation plan.
22

3+
LANGUAGE REQUIREMENT: You MUST respond entirely in English. All output, including any thinking, reasoning, or explanations, must be in English only. Do not use any other language.
4+
35
USER REQUIREMENTS:
46
{{ user_input }}
57

0 commit comments

Comments
 (0)