name: Validate Test Generation Prompts

on:
  push:
    paths:
      - 'shiny/pytest/generate/**'
  pull_request:
    paths:
      - 'shiny/pytest/generate/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

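# Single job: create test metadata, run the Inspect AI evaluation, execute the
# resulting pytest suite three times in a row, then summarize and gate on the results.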
jobs:
  validate-prompts:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

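      # Editable install with the "test" extra, which is assumed to pull in pytest,
      # pytest-xdist (for -n auto), playwright, and the inspect-ai CLI used below.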
      - name: Install dependencies
        run: |
          pip install -e ".[test]"

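      # Download the browser binaries Playwright drives; the app tests under
      # tests/inspect-ai/apps are assumed to exercise Shiny apps through these browsers.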
      - name: Install Playwright browsers
        run: |
          playwright install

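      # Each attempt starts from a clean results/ directory and tolerates at most one
      # pytest failure; a second failure, or any error in the metadata/evaluation
      # commands, fails the job immediately (set -e). The provider API keys below come
      # from repository secrets and are assumed to be the credentials Inspect AI needs.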
      - name: Run Evaluation and Tests 3 Times
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          set -e  # Exit immediately if a command fails

          for i in {1..3}
          do
            echo "--- Starting Attempt $i of 3 ---"

            # Clean up results from previous attempt to ensure a clean slate
            rm -rf results/
            mkdir -p results/
            rm -f test-results.xml

            echo "[Attempt $i] Creating test metadata..."
            python tests/inspect-ai/scripts/create_test_metadata.py

            echo "[Attempt $i] Running Inspect AI evaluation..."
            inspect eval tests/inspect-ai/scripts/evaluation.py@shiny_test_evaluation \
              --log-dir results/ \
              --log-format json
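            # The JSON logs written to results/ here are what the "Process Results"
            # step parses after the loop finishes.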

            echo "[Attempt $i] Running Tests..."
            test_exit_code=0
            # Disable exit on error just for the pytest command to check the exit code
            set +e
            pytest tests/inspect-ai/apps --tb=short --disable-warnings -n auto --maxfail=2 --junit-xml=test-results.xml || test_exit_code=$?
            # Re-enable exit on error immediately
            set -e

            # Check if tests failed and how many failures occurred
            if [ "${test_exit_code:-0}" -ne 0 ]; then
              # head -1 keeps a single number even if the JUnit XML repeats the failures attribute
              failure_count=$(grep -o 'failures="[0-9]*"' test-results.xml | head -1 | grep -o '[0-9]*' || echo "0")
              echo "Found $failure_count test failures on attempt $i"

              # Fail the workflow if more than 1 test failed
              if [ "$failure_count" -gt 1 ]; then
                echo "More than 1 test failed on attempt $i - failing CI"
                exit 1
              fi
            fi
            echo "--- Attempt $i of 3 Succeeded ---"
          done

          echo "All 3 evaluation and test runs passed successfully."

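      # Summarize the newest Inspect AI log in results/. process_results.py is
      # assumed to write results/summary.json for the quality gate and PR comment steps.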
      - name: Process Results
        run: |
          # Find the latest evaluation result file and process it
          latest_result=$(ls -t results/*.json | head -1)
          if [ -f "$latest_result" ]; then
            echo "Processing results from: $latest_result"
            python tests/inspect-ai/utils/scripts/process_results.py "$latest_result"
          else
            echo "No result files found in results/ directory"
            exit 1
          fi

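      # quality_gate.py is assumed to exit non-zero when the summarized scores fall
      # below the accepted thresholds, which fails this step and the workflow.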
      - name: Check Quality Gate
        run: |
          if [ -f "results/summary.json" ]; then
            echo "Found summary file, checking quality gate..."
            python tests/inspect-ai/utils/scripts/quality_gate.py results/
          else
            echo "Summary file not found at results/summary.json"
            ls -la results/
            exit 1
          fi

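      # Post the evaluation summary back to the pull request. The workflow's
      # GITHUB_TOKEN needs write access to issue comments for createComment to succeed.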
      - name: Comment PR Results
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');

            try {
              const results = JSON.parse(fs.readFileSync('results/summary.json', 'utf8'));

              const comment = `## Inspect AI Evaluation Results

            - **Tests Passed**: ${results.passed}/${results.total}
            - **Quality Gate**: ${results.quality_gate_passed ? '✅ PASSED' : '❌ FAILED'}

            ### Details
            ${results.details}
            `;

              await github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: comment
              });
            } catch (error) {
              console.error('Error reading summary file:', error);
              const comment = `## Inspect AI Evaluation Results

            ❌ **Error**: Could not read evaluation results summary file.

            Please check the workflow logs for details.`;

              await github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: comment
              });
            }