biodatageeks
diff --git a/‎.github/workflows/benchmark.yml‎
Lines changed: 301 additions & 0 deletions b/‎.github/workflows/benchmark.yml‎
Lines changed: 301 additions & 0 deletions
diff --git a/‎README.md‎
Lines changed: 2 additions & 0 deletions b/‎README.md‎
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,301 @@
+# Manually triggered workflow that benchmarks a baseline tag against a target ref.
+name: Performance Benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      # Alert level as a percentage; validated later to lie within 100..1000.
+      alert_threshold:
+        type: string
+        required: false
+        default: '150'
+        description: 'Alert threshold percentage (e.g., 150 for 150% degradation)'
+      # Baseline revision; when empty the most recently created tag is used.
+      baseline_tag:
+        type: string
+        required: false
+        default: ''
+        description: 'Baseline git tag (leave empty for latest tag)'
+      # Ref to benchmark; when empty the ref that triggered the run is used.
+      target_branch:
+        type: string
+        required: false
+        default: ''
+        description: 'Target branch to benchmark (leave empty for current branch)'
+
+jobs:
+  # Single job: the baseline and the target are benchmarked one after the
+  # other inside the same checkout on the same runner.
+  benchmark:
+    runs-on: ubuntu-latest
+    # A later step pushes benchmark data to the gh-pages branch and another
+    # posts a PR comment; presumably these write scopes exist for those steps.
+    permissions:
+      contents: write
+      pull-requests: write
+
+    steps:
+      # Full clone so that tags and other refs can be resolved and checked out.
+      - name: Checkout repository
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # Fetch all history for tag identification
+
+      # Resolves the baseline revision: the `baseline_tag` input when given,
+      # otherwise the most recently created tag. Exposes it as output `tag`.
+      - name: Determine baseline tag
+        id: baseline
+        env:
+          # Handed over through the environment instead of being expanded into
+          # the script text, so the input value is never parsed as shell code.
+          BASELINE_TAG_INPUT: ${{ inputs.baseline_tag }}
+        run: |
+          if [ -n "$BASELINE_TAG_INPUT" ]; then
+            BASELINE_TAG="$BASELINE_TAG_INPUT"
+            echo "Using user-specified baseline tag: $BASELINE_TAG"
+          else
+            BASELINE_TAG=$(git tag --sort=-creatordate | head -1)
+            if [ -z "$BASELINE_TAG" ]; then
+              echo "Error: No git tags found. Please create a tag or specify baseline_tag input."
+              exit 1
+            fi
+            echo "Using latest tag as baseline: $BASELINE_TAG"
+          fi
+
+          # Verify tag exists before publishing it to later steps
+          if ! git rev-parse "$BASELINE_TAG" >/dev/null 2>&1; then
+            echo "Error: Tag '$BASELINE_TAG' does not exist"
+            exit 1
+          fi
+          echo "tag=$BASELINE_TAG" >> "$GITHUB_OUTPUT"
+
+      # Resolves the ref to benchmark: the `target_branch` input when given,
+      # otherwise the name of the ref that triggered the run. Output: `ref`.
+      - name: Determine target reference
+        id: target
+        env:
+          # Values travel through the environment so that branch names with
+          # shell metacharacters cannot alter the script.
+          TARGET_BRANCH_INPUT: ${{ inputs.target_branch }}
+          CURRENT_REF_NAME: ${{ github.ref_name }}
+        run: |
+          if [ -n "$TARGET_BRANCH_INPUT" ]; then
+            TARGET_REF="$TARGET_BRANCH_INPUT"
+          else
+            TARGET_REF="$CURRENT_REF_NAME"
+          fi
+          echo "ref=$TARGET_REF" >> "$GITHUB_OUTPUT"
+          echo "Target reference: $TARGET_REF"
+
+      # Checks that the alert threshold is numeric and within 100..1000, then
+      # exposes it as output `value`.
+      - name: Validate threshold
+        id: threshold
+        env:
+          # Expanding the input directly into the script would let it run as
+          # shell code before the checks below ever execute.
+          ALERT_THRESHOLD_INPUT: ${{ inputs.alert_threshold }}
+        run: |
+          THRESHOLD="$ALERT_THRESHOLD_INPUT"
+          # Validate threshold is a number between 100 and 1000
+          if ! [[ "$THRESHOLD" =~ ^[0-9]+(\.[0-9]+)?$ ]]; then
+            echo "Error: Threshold must be a number"
+            exit 1
+          fi
+          if (( $(echo "$THRESHOLD < 100" | bc -l) )); then
+            echo "Error: Threshold must be >= 100"
+            exit 1
+          fi
+          if (( $(echo "$THRESHOLD > 1000" | bc -l) )); then
+            echo "Error: Threshold must be <= 1000"
+            exit 1
+          fi
+          echo "value=$THRESHOLD" >> "$GITHUB_OUTPUT"
+          echo "Using alert threshold: $THRESHOLD%"
+
+      # Interpreter used by the plain `pip` and `python3` commands in later steps.
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      # Caches the polars-bio-bench directory between runs.
+      # NOTE(review): the key hashes files matching '**/lockfiles'; confirm that
+      # pattern matches the intended files, otherwise the key never changes.
+      - name: Cache polars-bio-bench repository
+        uses: actions/cache@v4
+        with:
+          path: polars-bio-bench
+          key: polars-bio-bench-${{ hashFiles('**/lockfiles') }}
+
+      # Make sure a polars-bio-bench working copy is present: refresh it when
+      # the directory already exists, clone it otherwise.
+      - name: Clone polars-bio-bench repository
+        run: |
+          if [ -d "polars-bio-bench" ]; then
+            git -C polars-bio-bench pull
+          else
+            git clone https://github.com/biodatageeks/polars-bio-bench.git
+          fi
+
+      # Installs Poetry using the installer script served from
+      # install.python-poetry.org.
+      - name: Install Poetry
+        run: |
+          # Without pipefail a failed download is masked: python3 would read an
+          # empty script from stdin and exit successfully. -f makes curl report
+          # HTTP errors instead of piping the error page into python3.
+          set -o pipefail
+          curl -fsSL https://install.python-poetry.org | python3 -
+
+      # Install the benchmark suite's dependencies with Poetry, from inside
+      # the polars-bio-bench directory.
+      - name: Install polars-bio-bench dependencies
+        working-directory: polars-bio-bench
+        run: poetry install
+
+      # ============================================
+      # BASELINE BENCHMARK
+      # ============================================
+
+      # Switch the working tree to the baseline revision resolved earlier.
+      - name: Checkout baseline code
+        env:
+          # The output may originate from a workflow input; pass it through the
+          # environment and quote it so it is used as a single argument only.
+          BASELINE_TAG: ${{ steps.baseline.outputs.tag }}
+        run: |
+          git checkout "$BASELINE_TAG"
+
+      # Install the currently checked-out (baseline) sources in editable mode.
+      # NOTE(review): the benchmarks later run through `poetry run`; confirm
+      # that environment actually picks up this pip-installed package.
+      - name: Install baseline polars-bio
+        run: pip install --editable .
+
+      # Creates the benchmark data directory and publishes the benchmark
+      # settings to all later steps. A plain `export` only lasts for this
+      # step's shell, so the values are also appended to $GITHUB_ENV.
+      - name: Set up benchmark environment
+        run: |
+          export BENCH_DATA_ROOT=/tmp/polars-bio-bench/
+          export POLARS_MAX_THREADS=1
+          {
+            echo "BENCH_DATA_ROOT=$BENCH_DATA_ROOT"
+            echo "POLARS_MAX_THREADS=$POLARS_MAX_THREADS"
+          } >> "$GITHUB_ENV"
+          mkdir -p "$BENCH_DATA_ROOT"
+
+      # Runs the small benchmark configuration while the baseline is installed
+      # and copies whatever is in results/ to baseline_results/. A failure here
+      # does not stop the job; the next step checks that results exist.
+      - name: Run baseline benchmarks
+        continue-on-error: true
+        working-directory: polars-bio-bench
+        env:
+          BENCH_DATA_ROOT: /tmp/polars-bio-bench/
+          POLARS_MAX_THREADS: '1'
+        run: |
+          # Run benchmarks using poetry
+          poetry run python src/run-benchmarks.py --bench-config conf/benchmark_small.yaml
+          # Copy results to parent directory with baseline prefix
+          mkdir -p ../baseline_results
+          cp -r results/* ../baseline_results/ 2>/dev/null || true
+
+      # Fails the job when baseline_results/ is missing or empty; otherwise
+      # lists it and prints the first lines of up to five CSV files.
+      - name: Verify baseline results
+        run: |
+          if [ -d baseline_results ] && [ -n "$(ls -A baseline_results)" ]; then
+            echo "Baseline results:"
+            ls -la baseline_results/
+            find baseline_results/ -name "*.csv" | head -5 | xargs -I {} sh -c 'echo "=== {} ===" && head -10 {}'
+          else
+            echo "Error: Baseline benchmark did not produce results"
+            exit 1
+          fi
+
+      # ============================================
+      # TARGET/PR BENCHMARK
+      # ============================================
+
+      # Switch the working tree to the target ref resolved earlier.
+      - name: Checkout target code
+        env:
+          # The output may originate from a workflow input; pass it through the
+          # environment and quote it so it is used as a single argument only.
+          TARGET_REF: ${{ steps.target.outputs.ref }}
+        run: |
+          git checkout "$TARGET_REF"
+
+      # Remove any installed polars-bio first; a failed uninstall is ignored.
+      - name: Clean previous installation
+        run: pip uninstall --yes polars-bio || true
+
+      # Install the currently checked-out (target) sources in editable mode.
+      - name: Install target polars-bio
+        run: pip install --editable .
+
+      # Runs the same benchmark configuration while the target is installed and
+      # copies whatever is in results/ to pr_results/. A failure here does not
+      # stop the job; the next step checks that results exist.
+      - name: Run target benchmarks
+        continue-on-error: true
+        working-directory: polars-bio-bench
+        env:
+          BENCH_DATA_ROOT: /tmp/polars-bio-bench/
+          POLARS_MAX_THREADS: '1'
+        run: |
+          # Run benchmarks using poetry
+          poetry run python src/run-benchmarks.py --bench-config conf/benchmark_small.yaml
+          # Copy results to parent directory with pr prefix
+          mkdir -p ../pr_results
+          cp -r results/* ../pr_results/ 2>/dev/null || true
+
+      # Fails the job when pr_results/ is missing or empty; otherwise lists it
+      # and prints the first lines of up to five CSV files.
+      - name: Verify target results
+        run: |
+          if [ -d pr_results ] && [ -n "$(ls -A pr_results)" ]; then
+            echo "Target results:"
+            ls -la pr_results/
+            find pr_results/ -name "*.csv" | head -5 | xargs -I {} sh -c 'echo "=== {} ===" && head -10 {}'
+          else
+            echo "Error: Target benchmark did not produce results"
+            exit 1
+          fi
+
+      # ============================================
+      # COMPARISON AND REPORTING
+      # ============================================
+
+      # Runs the repository's comparison script on both result sets, then
+      # exposes the regression count (`regressions`) and a boolean flag
+      # (`regression_detected`) read from comparison_summary.json.
+      - name: Parse and compare benchmark results
+        id: comparison
+        env:
+          # These outputs derive from workflow inputs, so they are passed as
+          # environment variables and quoted instead of being expanded into
+          # the script text.
+          THRESHOLD: ${{ steps.threshold.outputs.value }}
+          BASELINE_TAG: ${{ steps.baseline.outputs.tag }}
+          TARGET_REF: ${{ steps.target.outputs.ref }}
+        run: |
+          bash benchmarks/compare_benchmark_results.sh \
+            baseline_results \
+            pr_results \
+            "$THRESHOLD" \
+            "$BASELINE_TAG" \
+            "$TARGET_REF"
+
+          # Check if regressions were found; a missing field counts as zero so
+          # the numeric test below never sees the literal string "null"
+          REGRESSIONS=$(jq -r '.total_regressions // 0' comparison_summary.json)
+          echo "regressions=$REGRESSIONS" >> "$GITHUB_OUTPUT"
+
+          # Set exit status for later use
+          if [ "$REGRESSIONS" -gt 0 ]; then
+            echo "regression_detected=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "regression_detected=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      # Bundle raw results and comparison outputs into one downloadable
+      # artifact named benchmark-results.
+      - name: Upload benchmark results as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          path: |
+            baseline_results/
+            pr_results/
+            comparison_results/
+            comparison_reports/
+            comparison_report_combined.md
+            comparison_summary.json
+          name: benchmark-results
+
+      # Merge the per-operation result files into one JSON document; when jq
+      # fails, an empty JSON array is written instead.
+      - name: Combine benchmark results for gh-pages
+        run: |
+          # Combine all operation results into single JSON for gh-pages
+          if ! jq -s 'add' comparison_results/*_results.json > combined_benchmark_results.json 2>/dev/null; then
+            echo "[]" > combined_benchmark_results.json
+          fi
+
+      # Hand the combined results to github-action-benchmark, which is told to
+      # push to the gh-pages branch under dev/bench. Errors in this step do not
+      # fail the job.
+      - name: Store benchmark result to gh-pages
+        continue-on-error: true
+        uses: benchmark-action/github-action-benchmark@v1
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          name: polars-bio Performance
+          tool: 'customSmallerIsBetter'
+          output-file-path: combined_benchmark_results.json
+          gh-pages-branch: gh-pages
+          benchmark-data-dir-path: dev/bench
+          auto-push: true
+          alert-threshold: ${{ steps.threshold.outputs.value }}%
+          comment-on-alert: true
+          fail-on-alert: false
+
+      # Posts comparison_report_combined.md as a comment on the pull request.
+      # An existing comment from a Bot user whose body contains
+      # 'Benchmark Comparison' is updated in place; otherwise a new comment is
+      # created. Errors in this step do not fail the job.
+      # NOTE(review): this workflow only declares the workflow_dispatch
+      # trigger, so confirm whether the condition below can ever be true.
+      # NOTE(review): the lookup relies on the report text itself containing
+      # 'Benchmark Comparison' - verify against the report generator.
+      - name: Comment PR with comparison results
+        if: github.event_name == 'pull_request' || github.event.pull_request
+        uses: actions/github-script@v7
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const fs = require('fs');
+            const report = fs.readFileSync('comparison_report_combined.md', 'utf8');
+
+            // Find existing comment
+            const { data: comments } = await github.rest.issues.listComments({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number
+            });
+
+            const botComment = comments.find(comment =>
+              comment.user.type === 'Bot' &&
+              comment.body.includes('Benchmark Comparison')
+            );
+
+            const commentBody = `${report}\n\n---\n*Benchmark comparison generated by polars-bio CI*`;
+
+            if (botComment) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: botComment.id,
+                body: commentBody
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body: commentBody
+              });
+            }
+        continue-on-error: true
+
+      # Append the combined comparison report to the run's step summary.
+      - name: Post comparison as workflow summary
+        run: cat comparison_report_combined.md >> $GITHUB_STEP_SUMMARY
+
+      # Despite its name, this step only emits a warning annotation when the
+      # comparison step flagged regressions; it does not fail the workflow.
+      # To fail on regressions, add `exit 1` to the script below.
+      - name: Fail if regressions detected
+        if: steps.comparison.outputs.regression_detected == 'true'
+        run: |
+          echo "::warning::Performance regressions detected! Check the comparison report for details."
@@ -33,6 +33,8 @@ It provides a DataFrame API for genomics data and is designed to be blazing fast
 ## Performance benchmarks
 ![summary-results.png](docs/assets/summary-results.png)
 
+For developers: See [`benchmarks/README_BENCHMARKS.md`](benchmarks/README_BENCHMARKS.md) for information about running performance benchmarks via GitHub Actions.
+
 
 ## Citing