Run Pipelines #351

Workflow file for this run

name: Run Pipelines
on:
  schedule:
    # Run at 23:00 UTC daily
    - cron: "0 23 * * *"
  workflow_dispatch: # Allow manual trigger
    inputs:
      force_ingest:
        description: "Force reingest of data"
        type: boolean
        default: false
        required: false
      force_process:
        description: "Force reprocessing/re-export of data"
        type: boolean
        default: false
        required: false
      force_summaries:
        description: "Force regeneration of summaries"
        type: boolean
        default: false
        required: false
      run_untracked_repos:
        description: "Run untracked repos discovery"
        type: boolean
        default: false
        required: false
      summary_types_to_run:
        description: "Comma-separated list of summary types to run (repository,overall,contributors)"
        type: string
        default: "repository,overall,contributors"
      summary_intervals_to_run:
        description: "Comma-separated list of intervals to run (daily,weekly,monthly)"
        type: string
        default: "daily,weekly,monthly"
      startDate:
        description: "Start date for data processing (format: YYYY-MM-DD)"
        type: string
        required: false
      endDate:
        description: "End date for data processing (format: YYYY-MM-DD)"
        type: string
        required: false
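  # Example manual trigger (hypothetical values, via the GitHub CLI):
  #   gh workflow run "Run Pipelines" -f force_ingest=true \
  #     -f startDate=2025-01-01 -f endDate=2025-01-31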
env:
  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
  OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
  PIPELINE_DATA_BRANCH: "_data"
  DATA_DIR: "data"
  # ===========================================================================
  # FORK CONFIGURATION
  # ===========================================================================
  # Config is loaded from a JSON file. Priority: the PIPELINE_CONFIG_FILE
  # secret, falling back to config/example.json.
  #
  # For forks, you have two options:
  # 1. Create config/config.json (copy from config/example.json) - keeps this
  #    file unchanged
  # 2. Create your own named config (e.g., config/myorg.json) and point the
  #    PIPELINE_CONFIG_FILE secret at it
  #
  # Either option means zero merge conflicts when syncing with upstream.
  # ===========================================================================
  PIPELINE_CONFIG_FILE: ${{ secrets.PIPELINE_CONFIG_FILE || 'config/example.json' }}
jobs:
  ingest-export:
    name: Ingest/Export Pipeline
    runs-on: ubuntu-latest
    timeout-minutes: 60
    permissions:
      contents: write # Needed for pushing to branches
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest
      - name: Install dependencies
        run: bun install
      # Set common conditional variables
      - name: Set conditional variables
        id: set-vars
        run: |
          START_DATE_ARG="${{ github.event_name == 'workflow_dispatch' && github.event.inputs.startDate != '' && format(' -a {0}', github.event.inputs.startDate) || '' }}"
          END_DATE_ARG="${{ github.event_name == 'workflow_dispatch' && github.event.inputs.endDate != '' && format(' -b {0}', github.event.inputs.endDate) || '' }}"
          FORCE_INGEST_ARG="${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force_ingest == 'true' && ' -f' || '' }}"
          FORCE_PROCESS_ARG="${{ github.event_name == 'workflow_dispatch' && github.event.inputs.force_process == 'true' && ' -f' || '' }}"
          echo "start_date_arg=$START_DATE_ARG" >> $GITHUB_ENV
          echo "end_date_arg=$END_DATE_ARG" >> $GITHUB_ENV
          echo "force_ingest_arg=$FORCE_INGEST_ARG" >> $GITHUB_ENV
          echo "force_process_arg=$FORCE_PROCESS_ARG" >> $GITHUB_ENV
      # Set up pipeline-data branch worktree
      - name: Setup pipeline-data branch
        uses: ./.github/actions/pipeline-data
        with:
          operation: setup
          branch_name: ${{ env.PIPELINE_DATA_BRANCH }}
          data_dir: ${{ env.DATA_DIR }}
      # Restore database from pipeline-data branch
      - name: Restore database
        uses: ./.github/actions/restore-db
        with:
          operation: restore
          dump_dir: ${{ env.DATA_DIR }}/dump
          db_path: ${{ env.DATA_DIR }}/db.sqlite
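      # The dump/ directory on the data branch holds the database as diffable text
      # files; this step rebuilds db.sqlite from them, and a matching
      # "operation: dump" step at the end of the job writes it back, so the branch
      # tracks text diffs rather than the binary SQLite file.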
      - name: Run ingest pipeline
        run: bun run pipeline ingest${{ env.force_ingest_arg }}${{ env.start_date_arg }}${{ env.end_date_arg }}
      # Ingest untracked repos weekly (on Sundays) or on manual request
      - name: Run ingest-untracked pipeline
        if: github.event.inputs.run_untracked_repos == 'true' || github.event_name == 'schedule'
        run: |
          DAY_OF_WEEK=$(date +%u)
          # Run on Sundays for scheduled runs, or always for manual dispatch with the flag
          if [ "${{ github.event.inputs.run_untracked_repos }}" = "true" ] || [ "$DAY_OF_WEEK" = "7" ]; then
            if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
              # Manual run: fail on error (no fallback)
              bun run pipeline ingest-untracked
            else
              # Scheduled run: log a warning and continue
              bun run pipeline ingest-untracked || echo "::warning::Untracked repos ingestion failed (scheduled run, continuing)"
            fi
          fi
      - name: Run process pipeline
        run: bun run pipeline process${{ env.force_process_arg }}
      # Export everything missing, then overwrite the last 2 days to ensure overlap
      - name: Run export pipeline
        run: |
          bun run pipeline export${{ env.start_date_arg }}${{ env.end_date_arg }}${{ env.force_process_arg }}
          bun run pipeline export --days 2 -f
      # Dump SQLite database to diffable files before updating the pipeline-data branch
      - name: Dump SQLite database
        uses: ./.github/actions/restore-db
        with:
          operation: dump
          db_path: ${{ env.DATA_DIR }}/db.sqlite
          dump_dir: ${{ env.DATA_DIR }}/dump
      # Update pipeline-data branch with new data
      - name: Update pipeline-data branch
        uses: ./.github/actions/pipeline-data
        with:
          operation: update
          data_dir: ${{ env.DATA_DIR }}
          commit_message: "Ingest/export run: $(date -u +'%Y-%m-%d %H:%M')"
          branch_name: ${{ env.PIPELINE_DATA_BRANCH }}
      # Cleanup worktree (always runs)
      - name: Cleanup
        if: always()
        uses: ./.github/actions/pipeline-data
        with:
          operation: cleanup
          data_dir: ${{ env.DATA_DIR }}
          branch_name: ${{ env.PIPELINE_DATA_BRANCH }}
  generate-summaries:
    name: Generate Summaries
    needs: ingest-export
    runs-on: ubuntu-latest
    timeout-minutes: 30
    permissions:
      contents: write # Needed for pushing to branches
    # Skip summary generation if all summary types are disabled in a manual run
    if: ${{ github.event_name != 'workflow_dispatch' || github.event.inputs.summary_types_to_run != '' }}
    steps:
      - name: Checkout repository
        uses: actions/checkout@v6
      - name: Setup Bun
        uses: oven-sh/setup-bun@v2
        with:
          bun-version: latest
      - name: Install dependencies
        run: bun install --frozen-lockfile
      # Set up pipeline-data branch worktree
      - name: Setup pipeline-data branch
        uses: ./.github/actions/pipeline-data
        with:
          operation: setup
          branch_name: ${{ env.PIPELINE_DATA_BRANCH }}
          data_dir: ${{ env.DATA_DIR }}
      # Restore database from pipeline-data branch
      - name: Restore database
        uses: ./.github/actions/restore-db
        with:
          operation: restore
          dump_dir: ${{ env.DATA_DIR }}/dump
          db_path: ${{ env.DATA_DIR }}/db.sqlite
      # Import any markdown summaries that exist but are missing from the database
      - name: Import missing summaries from markdown files
        run: |
          bun run pipeline import-summaries --from-github --interval month
          bun run pipeline import-summaries --from-github --interval week
          bun run pipeline import-summaries --from-github --interval day
      # Determine which intervals and types to run
      - name: Set run conditions
        id: conditions
        run: |
          DAY_OF_WEEK=$(date +%u) # 1=Mon, 7=Sun
          # Scheduled runs
          if [ "${{ github.event_name }}" = "schedule" ]; then
            echo "RUN_DAILY=true" >> $GITHUB_OUTPUT
            if [ "$DAY_OF_WEEK" = "3" ] || [ "$DAY_OF_WEEK" = "6" ]; then
              echo "RUN_WEEKLY=true" >> $GITHUB_OUTPUT
            fi
            if [ "$DAY_OF_WEEK" = "7" ]; then
              echo "RUN_MONTHLY=true" >> $GITHUB_OUTPUT
              echo "RUN_CONTRIBUTORS=true" >> $GITHUB_OUTPUT
            fi
            echo "RUN_REPOSITORIES=true" >> $GITHUB_OUTPUT
            echo "RUN_OVERALL=true" >> $GITHUB_OUTPUT
          # Manual runs
          else
            SUMMARY_TYPES="${{ github.event.inputs.summary_types_to_run }}"
            SUMMARY_INTERVALS="${{ github.event.inputs.summary_intervals_to_run }}"
            if [[ "$SUMMARY_INTERVALS" == *'daily'* ]]; then echo "RUN_DAILY=true"; else echo "RUN_DAILY=false"; fi >> $GITHUB_OUTPUT
            if [[ "$SUMMARY_INTERVALS" == *'weekly'* ]]; then echo "RUN_WEEKLY=true"; else echo "RUN_WEEKLY=false"; fi >> $GITHUB_OUTPUT
            if [[ "$SUMMARY_INTERVALS" == *'monthly'* ]]; then echo "RUN_MONTHLY=true"; else echo "RUN_MONTHLY=false"; fi >> $GITHUB_OUTPUT
            if [[ "$SUMMARY_TYPES" == *'contributors'* ]]; then echo "RUN_CONTRIBUTORS=true"; else echo "RUN_CONTRIBUTORS=false"; fi >> $GITHUB_OUTPUT
            if [[ "$SUMMARY_TYPES" == *'repository'* ]]; then echo "RUN_REPOSITORIES=true"; else echo "RUN_REPOSITORIES=false"; fi >> $GITHUB_OUTPUT
            if [[ "$SUMMARY_TYPES" == *'overall'* ]]; then echo "RUN_OVERALL=true"; else echo "RUN_OVERALL=false"; fi >> $GITHUB_OUTPUT
          fi
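      # Net effect for scheduled runs: repository and overall summaries run every
      # day, weekly intervals are added on Wednesday and Saturday, and monthly
      # intervals plus contributor summaries are added on Sunday.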
      # Repository summaries must run BEFORE overall summaries
      # because overall summaries aggregate repository summaries
      - name: Run Repository Summaries
        if: steps.conditions.outputs.RUN_REPOSITORIES == 'true'
        uses: ./.github/actions/run-summary
        with:
          summary-type: "repository"
          daily: ${{ steps.conditions.outputs.RUN_DAILY }}
          weekly: ${{ steps.conditions.outputs.RUN_WEEKLY }}
          monthly: ${{ steps.conditions.outputs.RUN_MONTHLY }}
          force: ${{ github.event.inputs.force_summaries }}
          start-date: ${{ github.event.inputs.startDate }}
          end-date: ${{ github.event.inputs.endDate }}
      - name: Run Contributor Summaries
        if: steps.conditions.outputs.RUN_CONTRIBUTORS == 'true'
        uses: ./.github/actions/run-summary
        with:
          summary-type: "contributors"
          daily: ${{ steps.conditions.outputs.RUN_DAILY }}
          weekly: ${{ steps.conditions.outputs.RUN_WEEKLY }}
          monthly: ${{ steps.conditions.outputs.RUN_MONTHLY }}
          force: ${{ github.event.inputs.force_summaries }}
          start-date: ${{ github.event.inputs.startDate }}
          end-date: ${{ github.event.inputs.endDate }}
      - name: Run Overall Summaries
        if: steps.conditions.outputs.RUN_OVERALL == 'true'
        uses: ./.github/actions/run-summary
        with:
          summary-type: "overall"
          daily: ${{ steps.conditions.outputs.RUN_DAILY }}
          weekly: ${{ steps.conditions.outputs.RUN_WEEKLY }}
          monthly: ${{ steps.conditions.outputs.RUN_MONTHLY }}
          force: ${{ github.event.inputs.force_summaries }}
          start-date: ${{ github.event.inputs.startDate }}
          end-date: ${{ github.event.inputs.endDate }}
      # Force-regenerate steps also need the correct order: repository -> contributors -> overall
      - name: Force regenerate recent repository summaries
        if: github.event_name == 'schedule' && steps.conditions.outputs.RUN_REPOSITORIES == 'true'
        uses: ./.github/actions/run-summary
        with:
          summary-type: "repository"
          daily: "true"
          force: "true"
          days: 1
      - name: Force regenerate recent contributor summaries
        if: github.event_name == 'schedule' && steps.conditions.outputs.RUN_CONTRIBUTORS == 'true'
        uses: ./.github/actions/run-summary
        with:
          summary-type: "contributors"
          daily: "true"
          force: "true"
          days: 1
      - name: Force regenerate recent overall summaries
        if: github.event_name == 'schedule'
        uses: ./.github/actions/run-summary
        with:
          summary-type: "overall"
          daily: "true"
          force: "true"
          days: 1
      # Re-export stats files to include contributor summaries.
      # Date args are inlined here because GITHUB_ENV values set in the
      # ingest-export job do not carry across jobs.
      - name: Re-export stats with contributor summaries
        run: bun run pipeline export${{ github.event_name == 'workflow_dispatch' && github.event.inputs.startDate != '' && format(' -a {0}', github.event.inputs.startDate) || '' }}${{ github.event_name == 'workflow_dispatch' && github.event.inputs.endDate != '' && format(' -b {0}', github.event.inputs.endDate) || '' }}${{ github.event_name == 'schedule' && ' --days 2' || '' }} -f
      # Determine site URL for API exports
      - name: Determine site URL
        id: site-config
        run: |
          REPO_NAME="${{ github.event.repository.name }}"
          OWNER="${{ github.repository_owner }}"
          # Auto-detect the site URL based on repo type
          if [[ "$REPO_NAME" == *.github.io ]]; then
            echo "site_url=https://${OWNER}.github.io" >> $GITHUB_OUTPUT
          else
            echo "site_url=https://${OWNER}.github.io/${REPO_NAME}" >> $GITHUB_OUTPUT
          fi
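      # For example (hypothetical names): owner "acme" with repo "pipeline-stats"
      # yields https://acme.github.io/pipeline-stats, while a user/org pages repo
      # named "acme.github.io" yields https://acme.github.io.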
      # Export JSON API files
      - name: Export leaderboard JSON files
        run: bun run pipeline export-leaderboard
        env:
          SITE_URL: ${{ steps.site-config.outputs.site_url }}
      - name: Export summaries to JSON API
        run: bun run pipeline export-summaries
        env:
          SITE_URL: ${{ steps.site-config.outputs.site_url }}
      # Dump SQLite database to diffable files
      - name: Dump SQLite database
        uses: ./.github/actions/restore-db
        with:
          operation: dump
          db_path: ${{ env.DATA_DIR }}/db.sqlite
          dump_dir: ${{ env.DATA_DIR }}/dump
      # Update pipeline-data branch with new summaries
      - name: Update pipeline-data branch with summaries
        uses: ./.github/actions/pipeline-data
        with:
          operation: update
          data_dir: ${{ env.DATA_DIR }}
          commit_message: "Summary generation run: $(date -u +'%Y-%m-%d %H:%M')"
          branch_name: ${{ env.PIPELINE_DATA_BRANCH }}
      # Cleanup worktree (always runs)
      - name: Cleanup
        if: always()
        uses: ./.github/actions/pipeline-data
        with:
          operation: cleanup
          data_dir: ${{ env.DATA_DIR }}
          branch_name: ${{ env.PIPELINE_DATA_BRANCH }}