Commit 6a7c449

select tests based on PR labels
1 parent 0471b55 commit 6a7c449

3 files changed: +289 −9 lines changed

.github/pull_request_template

Lines changed: 17 additions & 0 deletions

```diff
@@ -3,3 +3,20 @@
 # what changed

 # test plan
+
+---
+
+## 🧪 Test Execution
+
+By default, **unit tests**, **integration tests**, and **smoke tests** run on all PRs.
+
+For additional testing, add one or more of these labels to your PR:
+
+- `test-browserbase` - Run Browserbase integration tests (requires API credentials)
+- `test-performance` - Run performance and load tests
+- `test-llm` - Run LLM integration tests (requires API keys)
+- `test-e2e` - Run end-to-end workflow tests
+- `test-slow` - Run all slow-marked tests
+- `test-all` - Run the complete test suite (use sparingly)
+
+**Note**: Label-triggered tests only run when the labels are applied to the PR, not on individual commits.
```
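As a rough sketch of the gating this template describes (label names come from the commit; the helper itself is hypothetical and not part of the repo's code):

```python
# Hypothetical helper illustrating which optional suites a PR's labels trigger.
# The label names mirror the pull request template above.
OPTIONAL_SUITES = {
    "test-browserbase": "Browserbase integration tests",
    "test-performance": "performance and load tests",
    "test-llm": "LLM integration tests",
    "test-e2e": "end-to-end workflow tests",
    "test-slow": "slow-marked tests",
    "test-all": "complete test suite",
}

def triggered_suites(pr_labels):
    """Return the optional suites the given PR labels would run (sorted)."""
    return sorted(OPTIONAL_SUITES[name] for name in pr_labels if name in OPTIONAL_SUITES)

print(triggered_suites(["test-llm", "documentation"]))  # -> ['LLM integration tests']
```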

.github/workflows/test.yml

Lines changed: 225 additions & 6 deletions

```diff
@@ -5,6 +5,7 @@ on:
     branches: [ main, develop ]
   pull_request:
     branches: [ main, develop ]
+    types: [opened, synchronize, reopened, labeled, unlabeled]
   # schedule:
   #   # Run tests daily at 6 AM UTC
   #   - cron: '0 6 * * *'
```

```diff
@@ -140,7 +141,10 @@ jobs:
     name: Browserbase Integration Tests
     runs-on: ubuntu-latest
     needs: test-unit
-    if: github.event_name == 'schedule' || contains(github.event.head_commit.message, '[test-browserbase]')
+    if: |
+      github.event_name == 'schedule' ||
+      contains(github.event.pull_request.labels.*.name, 'test-browserbase') ||
+      contains(github.event.pull_request.labels.*.name, 'browserbase')

     steps:
       - uses: actions/checkout@v4
```
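For intuition, the new `if:` expression flattens the PR's labels to their names with the `*` object filter and tests membership with `contains()`. A rough Python equivalent (a sketch only; `event` mirrors the webhook payload shape, and the function names are illustrative):

```python
def label_names(event):
    """Mirror of github.event.pull_request.labels.*.name: project each
    label object in the list down to its "name" field."""
    return [label["name"] for label in event.get("pull_request", {}).get("labels", [])]

def browserbase_should_run(event, event_name):
    """Rough equivalent of the workflow's `if:` condition for this job."""
    names = label_names(event)
    return (
        event_name == "schedule"
        or "test-browserbase" in names
        or "browserbase" in names
    )

event = {"pull_request": {"labels": [{"name": "browserbase"}, {"name": "bug"}]}}
print(browserbase_should_run(event, "pull_request"))  # -> True
```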
```diff
@@ -183,7 +187,10 @@ jobs:
     name: Performance Tests
     runs-on: ubuntu-latest
     needs: test-unit
-    if: github.event_name == 'schedule' || contains(github.event.head_commit.message, '[test-performance]')
+    if: |
+      github.event_name == 'schedule' ||
+      contains(github.event.pull_request.labels.*.name, 'test-performance') ||
+      contains(github.event.pull_request.labels.*.name, 'performance')

     steps:
       - uses: actions/checkout@v4
```

```diff
@@ -253,6 +260,192 @@ jobs:
           name: smoke-test-results
           path: junit-smoke.xml

+  test-llm:
+    name: LLM Integration Tests
+    runs-on: ubuntu-latest
+    needs: test-unit
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'test-llm') ||
+      contains(github.event.pull_request.labels.*.name, 'llm')
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+          pip install jsonschema
+          # Install temporary Google GenAI wheel
+          pip install temp/google_genai-1.14.0-py3-none-any.whl
+
+      - name: Run LLM tests
+        run: |
+          pytest tests/ -v \
+            --cov=stagehand \
+            --cov-report=xml \
+            --junit-xml=junit-llm.xml \
+            -m "llm" \
+            --tb=short
+        env:
+          MODEL_API_KEY: ${{ secrets.MODEL_API_KEY || 'mock-model-key' }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY || 'mock-openai-key' }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY || 'mock-anthropic-key' }}
+
+      - name: Upload LLM test results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: llm-test-results
+          path: junit-llm.xml
+
+  test-e2e:
+    name: End-to-End Tests
+    runs-on: ubuntu-latest
+    needs: test-unit
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'test-e2e') ||
+      contains(github.event.pull_request.labels.*.name, 'e2e')
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+          pip install jsonschema
+          # Install temporary Google GenAI wheel
+          pip install temp/google_genai-1.14.0-py3-none-any.whl
+          playwright install chromium
+
+      - name: Run E2E tests
+        run: |
+          pytest tests/ -v \
+            --cov=stagehand \
+            --cov-report=xml \
+            --junit-xml=junit-e2e.xml \
+            -m "e2e" \
+            --tb=short
+        env:
+          BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY || 'mock-api-key' }}
+          BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID || 'mock-project-id' }}
+          MODEL_API_KEY: ${{ secrets.MODEL_API_KEY || 'mock-model-key' }}
+          STAGEHAND_API_URL: ${{ secrets.STAGEHAND_API_URL || 'http://localhost:3000' }}
+
+      - name: Upload E2E test results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: e2e-test-results
+          path: junit-e2e.xml
+
+  test-slow:
+    name: Slow Tests
+    runs-on: ubuntu-latest
+    needs: test-unit
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'test-slow') ||
+      contains(github.event.pull_request.labels.*.name, 'slow')
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+          pip install jsonschema
+          # Install temporary Google GenAI wheel
+          pip install temp/google_genai-1.14.0-py3-none-any.whl
+          playwright install chromium
+
+      - name: Run slow tests
+        run: |
+          pytest tests/ -v \
+            --cov=stagehand \
+            --cov-report=xml \
+            --junit-xml=junit-slow.xml \
+            -m "slow" \
+            --tb=short
+        env:
+          BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY || 'mock-api-key' }}
+          BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID || 'mock-project-id' }}
+          MODEL_API_KEY: ${{ secrets.MODEL_API_KEY || 'mock-model-key' }}
+
+      - name: Upload slow test results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: slow-test-results
+          path: junit-slow.xml
+
+  test-all:
+    name: Complete Test Suite
+    runs-on: ubuntu-latest
+    needs: test-unit
+    if: |
+      contains(github.event.pull_request.labels.*.name, 'test-all') ||
+      contains(github.event.pull_request.labels.*.name, 'full-test')
+
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.11"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -e ".[dev]"
+          pip install jsonschema
+          # Install temporary Google GenAI wheel
+          pip install temp/google_genai-1.14.0-py3-none-any.whl
+          playwright install chromium
+
+      - name: Run complete test suite
+        run: |
+          pytest tests/ -v \
+            --cov=stagehand \
+            --cov-report=xml \
+            --cov-report=html \
+            --junit-xml=junit-all.xml \
+            --maxfail=10 \
+            --tb=short
+        env:
+          BROWSERBASE_API_KEY: ${{ secrets.BROWSERBASE_API_KEY }}
+          BROWSERBASE_PROJECT_ID: ${{ secrets.BROWSERBASE_PROJECT_ID }}
+          MODEL_API_KEY: ${{ secrets.MODEL_API_KEY }}
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
+          STAGEHAND_API_URL: ${{ secrets.STAGEHAND_API_URL }}
+
+      - name: Upload complete test results
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: complete-test-results
+          path: |
+            junit-all.xml
+            htmlcov/
+
   coverage-report:
     name: Coverage Report
     runs-on: ubuntu-latest
```

```diff
@@ -342,12 +535,38 @@ jobs:
           echo "- Unit test configurations: $UNIT_TESTS" >> $GITHUB_STEP_SUMMARY
           echo "- Integration test categories: $INTEGRATION_TESTS" >> $GITHUB_STEP_SUMMARY

-          # Check for test failures
-          if [ -f test-results/*/junit-*.xml ]; then
-            echo "- Test artifacts generated successfully ✅" >> $GITHUB_STEP_SUMMARY
+          # Check for optional test runs
+          if [ -f test-results/*/junit-browserbase.xml ]; then
+            echo "- Browserbase tests: ✅ Executed" >> $GITHUB_STEP_SUMMARY
+          else
+            echo "- Browserbase tests: ⏭️ Skipped (add 'test-browserbase' label to run)" >> $GITHUB_STEP_SUMMARY
+          fi
+
+          if [ -f test-results/*/junit-performance.xml ]; then
+            echo "- Performance tests: ✅ Executed" >> $GITHUB_STEP_SUMMARY
+          else
+            echo "- Performance tests: ⏭️ Skipped (add 'test-performance' label to run)" >> $GITHUB_STEP_SUMMARY
+          fi
+
+          if [ -f test-results/*/junit-llm.xml ]; then
+            echo "- LLM tests: ✅ Executed" >> $GITHUB_STEP_SUMMARY
           else
-            echo "- Test artifacts missing ❌" >> $GITHUB_STEP_SUMMARY
+            echo "- LLM tests: ⏭️ Skipped (add 'test-llm' label to run)" >> $GITHUB_STEP_SUMMARY
           fi

+          if [ -f test-results/*/junit-e2e.xml ]; then
+            echo "- E2E tests: ✅ Executed" >> $GITHUB_STEP_SUMMARY
+          else
+            echo "- E2E tests: ⏭️ Skipped (add 'test-e2e' label to run)" >> $GITHUB_STEP_SUMMARY
+          fi
+
+          echo "" >> $GITHUB_STEP_SUMMARY
+          echo "### Available Test Labels" >> $GITHUB_STEP_SUMMARY
+          echo "- \`test-browserbase\` - Run Browserbase integration tests" >> $GITHUB_STEP_SUMMARY
+          echo "- \`test-performance\` - Run performance and load tests" >> $GITHUB_STEP_SUMMARY
+          echo "- \`test-llm\` - Run LLM integration tests" >> $GITHUB_STEP_SUMMARY
+          echo "- \`test-e2e\` - Run end-to-end workflow tests" >> $GITHUB_STEP_SUMMARY
+          echo "- \`test-slow\` - Run all slow-marked tests" >> $GITHUB_STEP_SUMMARY
+          echo "- \`test-all\` - Run complete test suite" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
           echo "Detailed results are available in the artifacts section." >> $GITHUB_STEP_SUMMARY
```
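One caveat about artifact checks of this shape (an editorial observation, not part of the commit): `[ -f glob ]` only behaves while the glob expands to exactly one path; with several matching junit files the test gets multiple arguments and errors out. A more defensive sketch, using a hypothetical scratch directory:

```shell
# Mimic a test-results/<job>/junit-*.xml layout with two matching files.
mkdir -p /tmp/labelcheck/a /tmp/labelcheck/b
touch /tmp/labelcheck/a/junit-llm.xml /tmp/labelcheck/b/junit-llm.xml

found=no
# Iterate instead of `[ -f glob ]`, which breaks once the glob matches >1 file.
for f in /tmp/labelcheck/*/junit-llm.xml; do
  if [ -f "$f" ]; then
    found=yes
    break
  fi
done
echo "llm-results:$found"  # prints "llm-results:yes"
```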

tests/README.md

Lines changed: 47 additions & 3 deletions

````diff
@@ -121,18 +121,62 @@ pytest -m local
 # Browserbase tests (requires credentials)
 pytest -m browserbase

+# LLM integration tests (requires API keys)
+pytest -m llm
+
+# End-to-end workflow tests
+pytest -m e2e
+
+# Performance tests
+pytest -m performance
+
+# Slow tests
+pytest -m slow
+
 # Mock-only tests (no external dependencies)
 pytest -m mock
 ```

+### PR Label-Based Testing
+
+Instead of manually running specific test categories, you can add labels to your PR:
+
+| PR Label | Equivalent Command | Description |
+|----------|-------------------|-------------|
+| `test-browserbase` | `pytest -m browserbase` | Browserbase integration tests |
+| `test-performance` | `pytest -m performance` | Performance and load tests |
+| `test-llm` | `pytest -m llm` | LLM provider integration tests |
+| `test-e2e` | `pytest -m e2e` | End-to-end workflow tests |
+| `test-slow` | `pytest -m slow` | All time-intensive tests |
+| `test-all` | `pytest` | Complete test suite |
+
+**Benefits of label-based testing:**
+- No need to modify commit messages
+- Tests can be triggered after PR creation
+- Multiple test categories can run simultaneously
+- Team members can add/remove labels as needed
+
 ### CI/CD Test Execution

 The tests are automatically run in GitHub Actions with different configurations:

+#### Always Run on PRs:
 - **Unit Tests**: Run on Python 3.9, 3.10, 3.11, 3.12
-- **Integration Tests**: Run on Python 3.11 with different categories
-- **Browserbase Tests**: Run on schedule or with `[test-browserbase]` in commit message
-- **Performance Tests**: Run on schedule or with `[test-performance]` in commit message
+- **Integration Tests**: Run on Python 3.11 with different categories (api, browser, end_to_end)
+- **Smoke Tests**: Quick validation tests
+
+#### Label-Triggered Tests:
+Add these labels to your PR to run additional test suites:
+
+- **`test-browserbase`** or **`browserbase`**: Browserbase integration tests
+- **`test-performance`** or **`performance`**: Performance and load tests
+- **`test-llm`** or **`llm`**: LLM integration tests
+- **`test-e2e`** or **`e2e`**: End-to-end workflow tests
+- **`test-slow`** or **`slow`**: All slow-marked tests
+- **`test-all`** or **`full-test`**: Complete test suite
+
+#### Scheduled Tests:
+- **Daily**: Comprehensive test suite including Browserbase and performance tests

 ## 🎯 Test Coverage Requirements
````
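The `pytest -m <marker>` selections documented in this README rely on pytest's mark mechanism. A minimal sketch of how a test opts into one of these suites (the test body is illustrative, not from the repo; markers such as `llm` should also be registered under `[tool.pytest.ini_options] markers` to avoid unknown-mark warnings):

```python
import pytest

@pytest.mark.llm  # collected by `pytest -m llm`
def test_prompt_is_nonempty():
    # Illustrative assertion only; a real test would exercise a provider client.
    prompt = "Summarize the release notes."
    assert prompt.strip()
```

Marker expressions also combine, e.g. `pytest -m "llm or e2e"` runs both suites in one invocation.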
