
Commit 038a350

Refactor CI test evaluation and update Python version

Moved the test evaluation logic to a reusable shell script and updated workflows to use Python 3.13. Improved the testing-docs workflow to check for documentation sync and provide clearer PR comments. Added 'openai' to the chatlas test dependencies in pyproject.toml.

1 parent ea096ec · commit 038a350
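For reviewers who want to exercise the extracted script locally, a minimal sketch (assuming a checkout with the project's dev/test extras and Playwright browsers installed; the API key value is a placeholder):

```bash
# Hypothetical local run of the extracted evaluation script.
# Assumes Python 3.13 and the dev/test extras are already installed.
export ANTHROPIC_API_KEY="sk-ant-..."  # placeholder, not a real key
chmod +x tests/inspect-ai/scripts/run-test-evaluation.sh
./tests/inspect-ai/scripts/run-test-evaluation.sh
```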

File tree: 4 files changed (+125 −89 lines)

.github/workflows/verify-test-generation-prompts.yaml
Lines changed: 2 additions & 75 deletions

```diff
@@ -10,7 +10,7 @@ concurrency:
   cancel-in-progress: true
 
 env:
-  PYTHON_VERSION: "3.12"
+  PYTHON_VERSION: "3.13"
   ATTEMPTS: 3
   PYTHONUNBUFFERED: 1
 
@@ -70,80 +70,7 @@ jobs:
           ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
           PYTHONUNBUFFERED: 1
         timeout-minutes: 25
-        run: |
-          set -e  # Exit immediately if a command fails
-
-          # Function to log with timestamp
-          log_with_timestamp() {
-            echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
-          }
-
-          # Function to cleanup hanging processes
-          cleanup_processes() {
-            log_with_timestamp "Cleaning up any hanging processes..."
-            pkill -f "playwright" || true
-            pkill -f "chromium" || true
-            pkill -f "pytest" || true
-          }
-
-          # Set up trap to cleanup on exit
-          trap cleanup_processes EXIT
-
-          for i in {1..3}
-          do
-            log_with_timestamp "Starting Attempt $i of 3"
-
-            # Clean up results from previous attempt to ensure a clean slate
-            rm -rf results/
-            mkdir -p results/
-            rm -f test-results.xml
-
-            log_with_timestamp "[Attempt $i] Creating test metadata..."
-            python tests/inspect-ai/scripts/create_test_metadata.py
-
-            log_with_timestamp "[Attempt $i] Running Inspect AI evaluation..."
-            inspect eval tests/inspect-ai/scripts/evaluation.py@shiny_test_evaluation \
-              --log-dir results/ \
-              --log-format json
-
-            log_with_timestamp "[Attempt $i] Running Tests..."
-            test_exit_code=0
-            # Disable exit on error just for the pytest command to check the exit code
-            set +e
-            timeout 15m pytest tests/inspect-ai/apps \
-              --tb=short \
-              --disable-warnings \
-              --maxfail=2 \
-              --junit-xml=test-results.xml \
-              --durations=10 \
-              --timeout=300 \
-              --timeout-method=thread \
-              -v || test_exit_code=$?
-            # Re-enable exit on error immediately
-            set -e
-
-            # Check if timeout occurred
-            if [ "${test_exit_code:-0}" -eq 124 ]; then
-              log_with_timestamp "Tests timed out on attempt $i - this may indicate hanging tests"
-              cleanup_processes
-              exit 1
-            fi
-
-            # Check if tests failed and how many failures occurred
-            if [ "${test_exit_code:-0}" -ne 0 ]; then
-              failure_count=$(grep -o 'failures="[0-9]*"' test-results.xml | grep -o '[0-9]*' || echo "0")
-              log_with_timestamp "Found $failure_count test failures on attempt $i"
-
-              # Fail the workflow if more than 1 test failed
-              if [ "$failure_count" -gt 1 ]; then
-                log_with_timestamp "More than 1 test failed on attempt $i - failing CI"
-                exit 1
-              fi
-            fi
-            log_with_timestamp "Attempt $i of 3 Succeeded"
-          done
-
-          log_with_timestamp "All 3 evaluation and test runs passed successfully."
+        run: ./tests/inspect-ai/scripts/run-test-evaluation.sh
 
       - name: Upload test results
         if: always()
```
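One practical note, not shown in this diff: for the `run: ./tests/inspect-ai/scripts/run-test-evaluation.sh` step to work on the runner, the script's executable bit must be tracked by git. If the commit did not already record the mode bit, a standard way to set it is:

```bash
# Mark the new script executable in the git index so the workflow's
# `run:` step can invoke it directly (assumes the bit wasn't committed).
git update-index --chmod=+x tests/inspect-ai/scripts/run-test-evaluation.sh
git commit -m "chore: make run-test-evaluation.sh executable"
```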

.github/workflows/verify-testing-docs-on-change.yml
Lines changed: 47 additions & 13 deletions

````diff
@@ -33,40 +33,74 @@ jobs:
             echo "No changes detected in shiny/playwright/controller directory"
           fi
 
-      - name: Comment on PR about testing docs update
+      - name: Set up Python
+        if: steps.check-controller.outputs.controller_changed == 'true'
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.13'
+
+      - name: Install uv
+        if: steps.check-controller.outputs.controller_changed == 'true'
+        uses: astral-sh/setup-uv@v4
+
+      - name: Install dependencies
+        if: steps.check-controller.outputs.controller_changed == 'true'
+        run: |
+          uv pip install --system --upgrade pip
+          uv pip install --system -e ".[dev,test]"
+
+      - name: Update testing docs and check for changes
         if: steps.check-controller.outputs.controller_changed == 'true'
+        id: check-docs-changes
+        run: |
+          # Store the current state of the documentation file
+          cp shiny/pytest/generate/data/docs/documentation_testing.json documentation_testing_before.json
+
+          # Run the make command to update testing docs
+          make update-testing-docs
+
+          # Check if the documentation file has changed
+          if ! diff -q documentation_testing_before.json shiny/pytest/generate/data/docs/documentation_testing.json > /dev/null 2>&1; then
+            echo "docs_changed=true" >> $GITHUB_OUTPUT
+            echo "Documentation file has changed after running make update-testing-docs"
+            echo "The generated documentation is out of sync with the current controller changes."
+            exit 1
+          else
+            echo "docs_changed=false" >> $GITHUB_OUTPUT
+            echo "Documentation file is up to date"
+          fi
+
+      - name: Comment on PR about testing docs update
+        if: steps.check-docs-changes.outputs.docs_changed == 'true'
         uses: marocchino/sticky-pull-request-comment@v2
         with:
           header: testing-docs-update
           message: |
-            🤖 **Testing Documentation Update Required**
+            🚨 **Testing Documentation Out of Sync**
 
-            We detected changes in the `shiny/playwright/controller` directory. These changes may affect the testing documentation used by the `shiny add test` command.
+            We detected changes in the `shiny/playwright/controller` directory that affect the testing documentation used by the `shiny add test` command.
 
-            **Please run the following command to update the testing documentation:**
+            **The generated documentation is out of sync with your controller changes. Please run:**
 
             ```bash
             make update-testing-docs
             ```
 
+            **Then commit the updated `shiny/pytest/generate/data/docs/documentation_testing.json` file.**
+
             <details><summary>Additional details</summary>
 
-            This command will:
-            1. Install repomix if not already installed
-            2. Build the latest documentation with quartodoc
-            3. Generate repomix output for testing docs
-            4. Process the output to update the AI test generator documentation
-            5. Clean up temporary files
+            The updated documentation file ensures that the AI test generator has access to the latest controller API documentation.
 
             </details>
 
-            This will ensure that the AI test generator has access to the latest controller API documentation.
+            ❌ **This check will fail until the documentation is updated and committed.**
 
             ---
             *This comment was automatically generated by the validate_testing_docs workflow.*
 
-      - name: Remove comment when no controller changes
-        if: steps.check-controller.outputs.controller_changed == 'false'
+      - name: Remove comment when no controller changes or docs are up to date
+        if: steps.check-controller.outputs.controller_changed == 'false' || (steps.check-controller.outputs.controller_changed == 'true' && steps.check-docs-changes.outputs.docs_changed == 'false')
         uses: marocchino/sticky-pull-request-comment@v2
         with:
           header: testing-docs-update
````
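The new check-docs-changes step can be reproduced before pushing; a rough local equivalent using the same paths as the workflow (the `/tmp` location is an arbitrary choice):

```bash
# Rough local equivalent of the CI docs-sync check above.
# File paths come from the workflow; /tmp/docs_before.json is arbitrary.
cp shiny/pytest/generate/data/docs/documentation_testing.json /tmp/docs_before.json
make update-testing-docs
if ! diff -q /tmp/docs_before.json \
     shiny/pytest/generate/data/docs/documentation_testing.json > /dev/null 2>&1; then
  echo "Docs out of sync: commit the regenerated documentation_testing.json"
  exit 1
fi
echo "Documentation file is up to date"
```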

pyproject.toml
Lines changed: 1 addition & 1 deletion

```diff
@@ -91,7 +91,7 @@ test = [
     "dask[dataframe]",
     "pyarrow",
     "pyarrow-stubs",
-    "chatlas[anthropic]",
+    "chatlas[anthropic,openai]",
     "chatlas[openai]",
 ]
 dev = [
```
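To pick up the widened chatlas extra locally, reinstalling the test extras is enough; for example, mirroring the uv invocation used in the workflow (omit `--system` when working in a local virtualenv):

```bash
# Reinstall the project's test extras so chatlas[anthropic,openai]
# and its OpenAI client dependency are available locally.
uv pip install -e ".[dev,test]"
```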
tests/inspect-ai/scripts/run-test-evaluation.sh
Lines changed: 75 additions & 0 deletions

```diff
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+set -e  # Exit immediately if a command fails
+
+# Function to log with timestamp
+log_with_timestamp() {
+  echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1"
+}
+
+# Function to cleanup hanging processes
+cleanup_processes() {
+  log_with_timestamp "Cleaning up any hanging processes..."
+  pkill -f "playwright" || true
+  pkill -f "chromium" || true
+  pkill -f "pytest" || true
+}
+
+# Set up trap to cleanup on exit
+trap cleanup_processes EXIT
+
+for i in {1..3}
+do
+  log_with_timestamp "Starting Attempt $i of 3"
+
+  # Clean up results from previous attempt to ensure a clean slate
+  rm -rf results/
+  mkdir -p results/
+  rm -f test-results.xml
+
+  log_with_timestamp "[Attempt $i] Creating test metadata..."
+  python tests/inspect-ai/scripts/create_test_metadata.py
+
+  log_with_timestamp "[Attempt $i] Running Inspect AI evaluation..."
+  inspect eval tests/inspect-ai/scripts/evaluation.py@shiny_test_evaluation \
+    --log-dir results/ \
+    --log-format json
+
+  log_with_timestamp "[Attempt $i] Running Tests..."
+  test_exit_code=0
+  # Disable exit on error just for the pytest command to check the exit code
+  set +e
+  timeout 15m pytest tests/inspect-ai/apps \
+    --tb=short \
+    --disable-warnings \
+    --maxfail=2 \
+    --junit-xml=test-results.xml \
+    --durations=10 \
+    --timeout=300 \
+    --timeout-method=thread \
+    -v || test_exit_code=$?
+  # Re-enable exit on error immediately
+  set -e
+
+  # Check if timeout occurred
+  if [ "${test_exit_code:-0}" -eq 124 ]; then
+    log_with_timestamp "Tests timed out on attempt $i - this may indicate hanging tests"
+    cleanup_processes
+    exit 1
+  fi
+
+  # Check if tests failed and how many failures occurred
+  if [ "${test_exit_code:-0}" -ne 0 ]; then
+    failure_count=$(grep -o 'failures="[0-9]*"' test-results.xml | grep -o '[0-9]*' || echo "0")
+    log_with_timestamp "Found $failure_count test failures on attempt $i"
+
+    # Fail the workflow if more than 1 test failed
+    if [ "$failure_count" -gt 1 ]; then
+      log_with_timestamp "More than 1 test failed on attempt $i - failing CI"
+      exit 1
+    fi
+  fi
+  log_with_timestamp "Attempt $i of 3 Succeeded"
+done
+
+log_with_timestamp "All 3 evaluation and test runs passed successfully."
```
