Skip to content

Commit 63e6a81

Browse files
committed
Add workflow for validating test generation prompts
Introduces a new GitHub Actions workflow to validate test generation prompts in the 'shiny/pytest/generate' directory. Also renames workflow files for consistency, updates .gitignore to exclude new result and metadata files, and improves path handling in test metadata and evaluation scripts for robustness.
1 parent c896498 commit 63e6a81

File tree

6 files changed

+159
-4
lines changed

6 files changed

+159
-4
lines changed
File renamed without changes.
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
# Validates the test-generation prompts under shiny/pytest/generate by running
# the Inspect AI evaluation plus the generated tests three times in a row, then
# gating the commit/PR on the aggregated results.
name: Validate Test Generation Prompts

on:
  push:
    paths:
      - 'shiny/pytest/generate/**'
  pull_request:
    paths:
      - 'shiny/pytest/generate/**'

# Least-privilege token scopes. pull-requests: write is required for the
# actions/github-script step below to post the results comment; without an
# explicit grant, repos with read-only default token permissions would fail
# on issues.createComment.
permissions:
  contents: read
  pull-requests: write

# Cancel superseded runs on the same ref to save CI minutes.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  validate-prompts:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install dependencies
        run: |
          pip install -e ".[test]"

      - name: Install Playwright browsers
        run: |
          playwright install

      - name: Run Evaluation and Tests 3 Times
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          set -e  # Exit immediately if a command fails

          for i in {1..3}
          do
            echo "--- Starting Attempt $i of 3 ---"

            # Clean up results from previous attempt to ensure a clean slate
            rm -rf results/
            mkdir -p results/
            rm -f test-results.xml

            echo "[Attempt $i] Creating test metadata..."
            python tests/inspect-ai/scripts/create_test_metadata.py

            echo "[Attempt $i] Running Inspect AI evaluation..."
            inspect eval tests/inspect-ai/scripts/evaluation.py@shiny_test_evaluation \
              --log-dir results/ \
              --log-format json

            echo "[Attempt $i] Running Tests..."
            test_exit_code=0
            # Disable exit on error just for the pytest command to check the exit code
            set +e
            pytest tests/inspect-ai/apps --tb=short --disable-warnings -n auto --maxfail=2 --junit-xml=test-results.xml || test_exit_code=$?
            # Re-enable exit on error immediately
            set -e

            # Check if tests failed and how many failures occurred; the
            # '|| echo "0"' fallback keeps the assignment alive when the
            # JUnit file is missing or has no failures attribute.
            if [ "${test_exit_code:-0}" -ne 0 ]; then
              failure_count=$(grep -o 'failures="[0-9]*"' test-results.xml | grep -o '[0-9]*' || echo "0")
              echo "Found $failure_count test failures on attempt $i"

              # Fail the workflow if more than 1 test failed
              if [ "$failure_count" -gt 1 ]; then
                echo "More than 1 test failed on attempt $i - failing CI"
                exit 1
              fi
            fi
            echo "--- Attempt $i of 3 Succeeded ---"
          done

          echo "All 3 evaluation and test runs passed successfully."

      - name: Process Results
        run: |
          # Find the latest evaluation result file and process it.
          # NOTE: the Actions default bash shell runs with -e and -o pipefail,
          # so an unguarded 'ls results/*.json | head' would kill this step
          # when the glob matches nothing — before the friendly error below
          # could run. '|| true' (plus stderr suppression) keeps the step
          # alive so the if/else handles the empty case.
          latest_result=$(ls -t results/*.json 2>/dev/null | head -1 || true)
          if [ -f "$latest_result" ]; then
            echo "Processing results from: $latest_result"
            python tests/inspect-ai/utils/scripts/process_results.py "$latest_result"
          else
            echo "No result files found in results/ directory"
            exit 1
          fi

      - name: Check Quality Gate
        run: |
          if [ -f "results/summary.json" ]; then
            echo "Found summary file, checking quality gate..."
            python tests/inspect-ai/utils/scripts/quality_gate.py results/
          else
            echo "Summary file not found at results/summary.json"
            ls -la results/
            exit 1
          fi

      - name: Comment PR Results
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');

            try {
              const results = JSON.parse(fs.readFileSync('results/summary.json', 'utf8'));

              const comment = `## Inspect AI Evaluation Results

            - **Tests Passed**: ${results.passed}/${results.total}
            - **Quality Gate**: ${results.quality_gate_passed ? '✅ PASSED' : '❌ FAILED'}

            ### Details
            ${results.details}
            `;

              github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: comment
              });
            } catch (error) {
              // Fall back to a generic failure comment so the PR still gets
              // feedback even when the summary file is missing or malformed.
              console.error('Error reading summary file:', error);
              const comment = `## Inspect AI Evaluation Results

            ❌ **Error**: Could not read evaluation results summary file.

            Please check the workflow logs for details.`;

              github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: comment
              });
            }
File renamed without changes.

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,3 +124,6 @@ shiny_bookmarks/
124124
# setuptools_scm
125125
shiny/_version.py
126126
tests/inspect-ai/apps/*/test_*.py
127+
test-results.xml
128+
/results
129+
tests/inspect-ai/scripts/test_metadata.json

tests/inspect-ai/scripts/create_test_metadata.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
from itertools import islice
33
from pathlib import Path
44

5-
from ....shiny.pytest.generate import ShinyTestGenerator
5+
from shiny.pytest.generate import ShinyTestGenerator
66

77

88
def generate_shiny_test_metadata(
9-
apps_dir: str | Path = "apps", max_tests: int = 10
9+
apps_dir: str | Path = "tests/inspect-ai/apps", max_tests: int = 10
1010
) -> dict:
1111
"""
1212
Generate Shiny tests and metadata for apps in the specified directory.
@@ -21,6 +21,12 @@ def generate_shiny_test_metadata(
2121
generator = ShinyTestGenerator()
2222
apps_dir = Path(apps_dir)
2323

24+
if not apps_dir.exists() and apps_dir.is_relative_to("."):
25+
script_dir = Path(__file__).parent
26+
apps_dir = script_dir.parent / "apps"
27+
if not apps_dir.exists():
28+
apps_dir = script_dir.parent.parent.parent / "tests" / "inspect-ai" / "apps"
29+
2430
app_files = islice(apps_dir.glob("*/app*.py"), max_tests)
2531

2632
test_data = {}

tests/inspect-ai/scripts/evaluation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,8 +168,8 @@ def shiny_test_evaluation() -> Task:
168168
Inspect AI task for evaluating generated Shiny tests.
169169
"""
170170
# Load test data from the JSON file
171-
repo_root = Path(__file__).parent.parent # Go up from evals/ to repo root
172-
metadata_file = repo_root / "evals" / "test_metadata.json"
171+
script_dir = Path(__file__).parent # Current script directory
172+
metadata_file = script_dir / "test_metadata.json"
173173
with open(metadata_file, "r") as f:
174174
test_data = json.load(f)
175175

0 commit comments

Comments
 (0)