🏃‍♂️ Add auto-generated small benchmarks on merge

codegen-sh[bot] · codegen-sh[bot] · commit 89ee9ffffd44 · 2025-07-06T21:38:58.000Z
- Create merge-benchmark.yml workflow that triggers on PR merges
- Run small benchmarks with 10 iterations per provider
- Add merge-small.json config for lightweight benchmark settings
- Update comprehensive benchmark workflow naming for clarity
- Document dual benchmark system in README
- Distinguish between small (merge-triggered) and comprehensive (manual/scheduled) benchmarks
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -1,11 +1,11 @@
-name: Outline Benchmarks
+name: Comprehensive Benchmarks (Manual/Scheduled)
 
 on:
   schedule:
-    # Run benchmarks daily at 2 AM UTC
+    # Run comprehensive benchmarks daily at 2 AM UTC
     - cron: '0 2 * * *'
   workflow_dispatch:
-    # Allow manual triggering
+    # Allow manual triggering for high-iteration benchmarks
   push:
     branches:
       - main
@@ -14,7 +14,7 @@ on:
       - '.github/workflows/benchmark.yml'
 
 jobs:
-  benchmark:
+  comprehensive-benchmark:
     runs-on: ubuntu-latest
 
     steps:
@@ -59,7 +59,7 @@ jobs:
       run: |
         git add benchmarks/results/
         if ! git diff --cached --quiet; then
-          git commit -m "📊 Automated benchmark results - $(date '+%Y-%m-%d %H:%M:%S')"
+          git commit -m "📊 Comprehensive benchmark results (scheduled) - $(date '+%Y-%m-%d %H:%M:%S')"
           git push
         else
           echo "No new benchmark results to commit"
@@ -71,6 +71,6 @@ jobs:
       uses: actions/upload-artifact@v4
       if: always()
       with:
-        name: benchmark-results
+        name: comprehensive-benchmark-results
         path: benchmarks/results/
         retention-days: 30
diff --git a/.github/workflows/merge-benchmark.yml b/.github/workflows/merge-benchmark.yml
@@ -0,0 +1,122 @@
+name: Small Benchmarks (Auto on Merge)
+
+on:
+  pull_request:
+    types: [closed]
+    branches:
+      - main
+  workflow_dispatch:
+    # Allow manual triggering for testing
+
+jobs:
+  small-benchmark:
+    # Only run on merged PRs, not just closed ones
+    if: github.event.pull_request.merged == true || github.event_name == 'workflow_dispatch'
+    runs-on: ubuntu-latest
+    # This job pushes a results commit and comments on the merged PR, so its
+    # token needs write access even where the repository default is read-only.
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+      with:
+        token: ${{ secrets.GITHUB_TOKEN }}  # job token; a later step in this job runs `git push`
+
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.12'  # quoted so YAML keeps the version a string
+
+    - name: Install uv
+      uses: astral-sh/setup-uv@v3
+      with:
+        version: "latest"  # not pinned to a specific uv release
+
+    - name: Install dependencies
+      run: |
+        uv sync --all-extras
+
+    - name: Set up Docker Buildx  # NOTE(review): presumably for a Docker-backed benchmark path — confirm
+      uses: docker/setup-buildx-action@v3
+
+    - name: Configure Git  # identity for the results commit created later in this job
+      run: |
+        git config --global user.name "Benchmark Bot"
+        git config --global user.email "benchmark@grainchain.dev"
+
+    - name: Run small benchmarks (10 iterations)  # settings are read from merge-small.json, not from this file
+      run: |
+        uv run python benchmarks/scripts/grainchain_benchmark.py --config benchmarks/configs/merge-small.json
+      env:
+        DOCKER_HOST: unix:///var/run/docker.sock  # Unix socket path handed to the benchmark process
+
+    - name: Generate summary report
+      run: |
+        uv run python benchmarks/scripts/auto_publish.py --generate-summary
+      continue-on-error: true  # a failed summary does not stop the following commit/upload steps
+
+    - name: Commit and push results  # commits only when benchmarks/results/ has staged changes
+      run: |
+        git add benchmarks/results/
+        if ! git diff --cached --quiet; then
+          git commit -m "📊 Small benchmark results (merge-triggered) - $(date '+%Y-%m-%d %H:%M:%S')"
+          git push
+        else
+          echo "No new benchmark results to commit"
+        fi
+      env:  # NOTE(review): the script above never reads GITHUB_TOKEN itself — confirm this entry is needed
+        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+    - name: Upload benchmark artifacts
+      uses: actions/upload-artifact@v4
+      if: always()  # upload even when an earlier step failed
+      with:
+        name: small-benchmark-results
+        path: benchmarks/results/  # same directory the commit step stages
+        retention-days: 30
+
+    - name: Comment on PR with results
+      if: github.event_name == 'pull_request'  # manual workflow_dispatch runs have no PR to comment on
+      uses: actions/github-script@v7
+      with:
+        script: |
+          const fs = require('fs');
+          const path = require('path');
+
+          // Find the latest benchmark result file (last in name order; assumes names sort chronologically — TODO confirm)
+          const resultsDir = 'benchmarks/results';
+          if (fs.existsSync(resultsDir)) {
+            const files = fs.readdirSync(resultsDir)
+              .filter(f => f.endsWith('.json') && f.includes('grainchain_benchmark'))
+              .sort()
+              .reverse();
+
+            if (files.length > 0) {
+              const latestFile = files[0];
+              const results = JSON.parse(fs.readFileSync(path.join(resultsDir, latestFile), 'utf8'));
+
+              let comment = '## 🏃‍♂️ Small Benchmark Results (10 iterations)\n\n';
+              comment += `**Benchmark completed:** ${results.metadata?.timestamp || 'Unknown'}\n\n`;
+
+              if (results.summary?.provider_comparison) {
+                comment += '### Provider Performance Summary\n\n';
+                for (const [provider, metrics] of Object.entries(results.summary.provider_comparison)) {
+                  comment += `**${provider.toUpperCase()}:**\n`;
+                  comment += `- Success Rate: ${(metrics.success_rate || 0).toFixed(1)}%\n`;
+                  comment += `- Avg Duration: ${(metrics.avg_duration || 0).toFixed(2)}s\n\n`;
+                }
+              }
+
+              comment += `\n📊 [View detailed results](${context.payload.repository.html_url}/actions/runs/${context.runId})`;
+              // Await the request so an API failure rejects the script and fails this step.
+              await github.rest.issues.createComment({
+                issue_number: context.issue.number,
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                body: comment
+              });
+            }
+          }
diff --git a/README.md b/README.md
@@ -99,7 +99,19 @@ if not e2b_info.available:
 
 ## ⚡ Performance Benchmarks
 
-Compare sandbox providers with comprehensive performance testing:
+Grainchain features a dual benchmark system for comprehensive performance testing:
+
+### 🏃‍♂️ Automated Small Benchmarks
+- **Trigger**: Runs automatically whenever a pull request is merged into `main`
+- **Iterations**: 10 per provider (fast execution)
+- **Purpose**: Quick regression detection and merge validation
+- **Providers**: `local`, `e2b`
+
+### 🔬 Comprehensive Benchmarks
+- **Trigger**: Manual execution or daily scheduled runs
+- **Iterations**: 3+ per provider (thorough analysis)
+- **Purpose**: Detailed performance analysis and provider comparison
+- **Providers**: `local`, `e2b`, `daytona`, `morph`
 
 ### Quick Performance Test
 
@@ -117,6 +129,18 @@ grainchain benchmark --provider local --output benchmarks/results/
 ./scripts/benchmark_status.sh
 ```
 
+### Manual Small Benchmark
+
+Run the same lightweight benchmarks that execute on merge:
+
+```bash
+# Run small benchmarks (10 iterations) manually
+grainchain benchmark --config benchmarks/configs/merge-small.json
+
+# Or with custom iterations
+grainchain benchmark --provider local e2b --iterations 10
+```
+
 ### Full Benchmark Suite
 
 Run comprehensive benchmarks across all providers:
diff --git a/benchmarks/configs/merge-small.json b/benchmarks/configs/merge-small.json
@@ -0,0 +1,39 @@
+{
+  "providers": ["local", "e2b"],
+  "iterations": 10,
+  "timeout": 30,
+  "parallel_tests": false,
+  "detailed_metrics": true,
+  "export_formats": ["json", "markdown"],
+  "test_scenarios": {
+    "basic_commands": {
+      "enabled": true,
+      "timeout": 10
+    },
+    "python_execution": {
+      "enabled": true,
+      "timeout": 15
+    },
+    "file_operations": {
+      "enabled": true,
+      "timeout": 20,
+      "test_files": [
+        { "name": "small.txt", "size": 100 },
+        { "name": "medium.txt", "size": 10000 }
+      ]
+    },
+    "computational_tasks": {
+      "enabled": true,
+      "timeout": 30
+    }
+  },
+  "environment": {
+    "E2B_API_KEY": "from_env",
+    "E2B_TEMPLATE": "base"
+  },
+  "reporting": {
+    "include_raw_data": false,
+    "generate_charts": false,
+    "auto_commit": true
+  }
+}