name: Validate Test Generation Prompts

on:
  push:
    paths:
      - 'shiny/pytest/generate/**'
  pull_request:
    paths:
      - 'shiny/pytest/generate/**'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

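# Single job: create test metadata, run the Inspect AI evaluation, execute the
# resulting pytest suite three times in a row, then summarize and gate on the results.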
jobs:
  validate-prompts:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

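      # Editable install with the "test" extra, which is assumed to pull in pytest,
      # pytest-xdist (for -n auto), playwright, and the inspect-ai CLI used below.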
      - name: Install dependencies
        run: |
          pip install -e ".[test]"

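      # Download the browser binaries Playwright drives; the app tests under
      # tests/inspect-ai/apps are assumed to exercise Shiny apps through these browsers.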
      - name: Install Playwright browsers
        run: |
          playwright install

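      # Each attempt starts from a clean results/ directory and tolerates at most one
      # pytest failure; a second failure, or any error in the metadata/evaluation
      # commands, fails the job immediately (set -e). The provider API keys below come
      # from repository secrets and are assumed to be the credentials Inspect AI needs.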
      - name: Run Evaluation and Tests 3 Times
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          set -e  # Exit immediately if a command fails

          for i in {1..3}
          do
            echo "--- Starting Attempt $i of 3 ---"

            # Clean up results from previous attempt to ensure a clean slate
            rm -rf results/
            mkdir -p results/
            rm -f test-results.xml

            echo "[Attempt $i] Creating test metadata..."
            python tests/inspect-ai/scripts/create_test_metadata.py

            echo "[Attempt $i] Running Inspect AI evaluation..."
            inspect eval tests/inspect-ai/scripts/evaluation.py@shiny_test_evaluation \
              --log-dir results/ \
              --log-format json
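            # The JSON logs written to results/ here are what the "Process Results"
            # step parses after the loop finishes.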

            echo "[Attempt $i] Running Tests..."
            test_exit_code=0
            # Disable exit on error just for the pytest command to check the exit code
            set +e
            pytest tests/inspect-ai/apps --tb=short --disable-warnings -n auto --maxfail=2 --junit-xml=test-results.xml || test_exit_code=$?
            # Re-enable exit on error immediately
            set -e

            # Check if tests failed and how many failures occurred
            if [ "${test_exit_code:-0}" -ne 0 ]; then
              # head -1 keeps a single number even if the JUnit XML repeats the failures attribute
              failure_count=$(grep -o 'failures="[0-9]*"' test-results.xml | head -1 | grep -o '[0-9]*' || echo "0")
              echo "Found $failure_count test failures on attempt $i"

              # Fail the workflow if more than 1 test failed
              if [ "$failure_count" -gt 1 ]; then
                echo "More than 1 test failed on attempt $i - failing CI"
                exit 1
              fi
            fi
            echo "--- Attempt $i of 3 Succeeded ---"
          done

          echo "All 3 evaluation and test runs passed successfully."

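      # Summarize the newest Inspect AI log in results/. process_results.py is
      # assumed to write results/summary.json for the quality gate and PR comment steps.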
      - name: Process Results
        run: |
          # Find the latest evaluation result file and process it
          latest_result=$(ls -t results/*.json | head -1)
          if [ -f "$latest_result" ]; then
            echo "Processing results from: $latest_result"
            python tests/inspect-ai/utils/scripts/process_results.py "$latest_result"
          else
            echo "No result files found in results/ directory"
            exit 1
          fi

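      # quality_gate.py is assumed to exit non-zero when the summarized scores fall
      # below the accepted thresholds, which fails this step and the workflow.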
      - name: Check Quality Gate
        run: |
          if [ -f "results/summary.json" ]; then
            echo "Found summary file, checking quality gate..."
            python tests/inspect-ai/utils/scripts/quality_gate.py results/
          else
            echo "Summary file not found at results/summary.json"
            ls -la results/
            exit 1
          fi

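      # Post the evaluation summary back to the pull request. The workflow's
      # GITHUB_TOKEN needs write access to issue comments for createComment to succeed.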
      - name: Comment PR Results
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');

            try {
              const results = JSON.parse(fs.readFileSync('results/summary.json', 'utf8'));

              const comment = `## Inspect AI Evaluation Results

            - **Tests Passed**: ${results.passed}/${results.total}
            - **Quality Gate**: ${results.quality_gate_passed ? '✅ PASSED' : '❌ FAILED'}

            ### Details
            ${results.details}
            `;

              await github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: comment
              });
            } catch (error) {
              console.error('Error reading summary file:', error);
              const comment = `## Inspect AI Evaluation Results

            ❌ **Error**: Could not read evaluation results summary file.

            Please check the workflow logs for details.`;

              await github.rest.issues.createComment({
                issue_number: context.issue.number,
                owner: context.repo.owner,
                repo: context.repo.repo,
                body: comment
              });
            }