feat(cli): Add AI support to shiny add test
          
            #3
        
      Workflow file for this run
  
    
      This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
      Learn more about bidirectional Unicode characters
    
  
  
    
  | # Workflow that validates the test-generation prompts by running the steps of the validate-prompts job. | |
  | name: Validate Test Generation Prompts | |
| # Trigger on pushes and pull requests, but only when files under shiny/pytest/generate/ change. | |
| on: | |
| push: | |
| paths: | |
| - 'shiny/pytest/generate/**' | |
| pull_request: | |
| paths: | |
| - 'shiny/pytest/generate/**' | |
| # One concurrency group per workflow and git ref: a newer run cancels the run still in progress. | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.ref }} | |
| cancel-in-progress: true | |
| jobs: | |
| # Single job; its steps run in order on an ubuntu-latest runner. | |
| validate-prompts: | |
| runs-on: ubuntu-latest | |
| steps: | |
| # Check out the repository; later steps use its local action, Makefile targets and scripts under tests/inspect-ai/. | |
| - name: Checkout repository | |
| uses: actions/checkout@v4 | |
| # Pin the interpreter to Python 3.12. | |
| - name: Set up Python | |
| uses: actions/setup-python@v5 | |
| with: | |
| python-version: '3.12' | |
| # Local action stored in this repository at .github/py-shiny/setup. | |
| # NOTE(review): its contents are not visible here; it may overlap with the Python setup above and the install step below - confirm. | |
| - name: Setup py-shiny | |
| id: install | |
| uses: ./.github/py-shiny/setup | |
| # Upgrade pip, then run the Makefile targets install-deps and install. | |
| - name: Install dependencies | |
| run: | | |
| python -m pip install --upgrade pip | |
| make install-deps | |
| make install | |
| # Download the Playwright browser binaries. | |
| # NOTE(review): presumably required by the pytest run over tests/inspect-ai/apps in a later step - confirm. | |
| - name: Install Playwright browsers | |
| run: | | |
| playwright install | |
|       # Run the metadata -> Inspect AI evaluation -> pytest cycle three times in a row. | |
|       # Every attempt must pass; an attempt tolerates at most one failing or erroring test. | |
|       # Each attempt wipes results/, so only the last attempt's evaluation log is left for later steps. | |
|       - name: Run Evaluation and Tests 3 Times | |
|         env: | |
|           OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} | |
|           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} | |
|         run: | | |
|           set -e # Exit immediately if a command fails | |
|           for i in {1..3} | |
|           do | |
|             echo "--- Starting Attempt $i of 3 ---" | |
|             # Clean up results from previous attempt to ensure a clean slate | |
|             rm -rf results/ | |
|             mkdir -p results/ | |
|             rm -f test-results.xml | |
|             echo "[Attempt $i] Creating test metadata..." | |
|             python tests/inspect-ai/scripts/create_test_metadata.py | |
|             echo "[Attempt $i] Running Inspect AI evaluation..." | |
|             inspect eval tests/inspect-ai/scripts/evaluation.py@shiny_test_evaluation \ | |
|               --log-dir results/ \ | |
|               --log-format json | |
|             echo "[Attempt $i] Running Tests..." | |
|             # Capture pytest's exit status; the `||` list already keeps `set -e` from aborting here, | |
|             # so no `set +e` / `set -e` toggling is needed. | |
|             test_exit_code=0 | |
|             pytest tests/inspect-ai/apps --tb=short --disable-warnings -n auto --maxfail=2 --junit-xml=test-results.xml || test_exit_code=$? | |
|             if [ "$test_exit_code" -ne 0 ]; then | |
|               # A non-zero exit without a JUnit report means pytest broke before reporting; | |
|               # that must not be mistaken for "zero failures". | |
|               if [ ! -s test-results.xml ]; then | |
|                 echo "pytest exited with code $test_exit_code on attempt $i without writing test-results.xml - failing CI" | |
|                 exit 1 | |
|               fi | |
|               # Sum failures= and errors= over every <testsuite> element so the result is always a | |
|               # single integer; awk prints 0 when nothing matches, `|| true` covers shells with pipefail. | |
|               failure_count=$(grep -o '<testsuite [^>]*>' test-results.xml | grep -oE '(failures|errors)="[0-9]+"' | grep -oE '[0-9]+' | awk '{ total += $1 } END { print total + 0 }' || true) | |
|               echo "Found $failure_count test failures on attempt $i" | |
|               # pytest failed but the report lists nothing: not a tolerable single test failure. | |
|               if [ "$failure_count" -eq 0 ]; then | |
|                 echo "pytest exited with code $test_exit_code on attempt $i but the report lists no failures or errors - failing CI" | |
|                 exit 1 | |
|               fi | |
|               # Fail the workflow if more than 1 test failed | |
|               if [ "$failure_count" -gt 1 ]; then | |
|                 echo "More than 1 test failed on attempt $i - failing CI" | |
|                 exit 1 | |
|               fi | |
|             fi | |
|             echo "--- Attempt $i of 3 Succeeded ---" | |
|           done | |
|           echo "All 3 evaluation and test runs passed successfully." | |
| # Feed the newest evaluation log in results/ to process_results.py. | |
| # NOTE(review): presumably that script writes results/summary.json, which the following steps read - confirm. | |
| - name: Process Results | |
| run: | | |
| # Find the latest evaluation result file and process it | |
| # `ls -t` lists newest first, so `head -1` keeps the most recently modified JSON file. | |
| latest_result=$(ls -t results/*.json | head -1) | |
| if [ -f "$latest_result" ]; then | |
| echo "Processing results from: $latest_result" | |
| python tests/inspect-ai/utils/scripts/process_results.py "$latest_result" | |
| else | |
| # No JSON log was found: fail the step rather than continue without results. | |
| echo "No result files found in results/ directory" | |
| exit 1 | |
| fi | |
| # Run quality_gate.py on the results/ directory, but only when results/summary.json exists. | |
| - name: Check Quality Gate | |
| run: | | |
| if [ -f "results/summary.json" ]; then | |
| echo "Found summary file, checking quality gate..." | |
| python tests/inspect-ai/utils/scripts/quality_gate.py results/ | |
| else | |
| # Missing summary: list the directory contents for debugging, then fail the step. | |
| echo "Summary file not found at results/summary.json" | |
| ls -la results/ | |
| exit 1 | |
| fi | |
|       # Post the evaluation summary (or a read-error notice) as a comment on the pull request. | |
|       # NOTE(review): with no status function in `if:`, this step is skipped when an earlier step fails, | |
|       # so a failing quality gate is presumably never reported here - confirm whether `!cancelled() &&` is wanted. | |
|       - name: Comment PR Results | |
|         if: github.event_name == 'pull_request' | |
|         uses: actions/github-script@v7 | |
|         with: | |
|           script: | | |
|             const fs = require('fs'); | |
|             // Build the comment body first; only reading/parsing the summary is guarded, so an | |
|             // API failure below is not misreported as a summary-file read error. | |
|             let comment; | |
|             try { | |
|               const results = JSON.parse(fs.readFileSync('results/summary.json', 'utf8')); | |
|               comment = `## Inspect AI Evaluation Results | |
|             - **Tests Passed**: ${results.passed}/${results.total} | |
|             - **Quality Gate**: ${results.quality_gate_passed ? '✅ PASSED' : '❌ FAILED'} | |
|             ### Details | |
|             ${results.details} | |
|             `; | |
|             } catch (error) { | |
|               console.error('Error reading summary file:', error); | |
|               comment = `## Inspect AI Evaluation Results | |
|             ❌ **Error**: Could not read evaluation results summary file. | |
|             Please check the workflow logs for details.`; | |
|             } | |
|             // Await the request so the step waits for it and fails visibly if the comment cannot be posted. | |
|             await github.rest.issues.createComment({ | |
|               issue_number: context.issue.number, | |
|               owner: context.repo.owner, | |
|               repo: context.repo.repo, | |
|               body: comment, | |
|             }); | |