Add sheet2docs automation script and workflow (#4729)

theletterf · v1v · claude · web-flow · commit 64852ce4e126 · 2026-01-21T17:27:35.000Z
## Summary Fixes elastic/docs-content-internal#654 ## Generative AI disclosure  1. Did you use a generative AI (GenAI) tool to assist in creating this contribution? - [X] Yes - [ ] No 2. If you answered "Yes" to the previous question, please specify the tool(s) and model(s) used (e.g., Google Gemini, OpenAI ChatGPT-4, etc.). Tool(s) and model(s) used: Claude Opus 4.5 in Cursor --------- Co-authored-by: Victor Martinez <victormartinezrubio@gmail.com> Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/.github/workflows/sync-sheets-keyless.yml b/.github/workflows/sync-sheets-keyless.yml
@@ -0,0 +1,134 @@
+name: Sync Google Sheets to CSV (Keyless Auth)
+
+on:
+  # Scheduled trigger - daily at 2 AM UTC
+  schedule:
+    - cron: '0 2 * * *'
+
+  # Temporary trigger: runs the workflow on pushes to the feature branch so
+  # it can be validated before merge. No workflow_dispatch inputs exist on
+  # push, so dry_run is unset and the PR step is not skipped.
+  # TODO: Remove after testing
+  push:
+    branches: ['add-sheet2csv-automation']
+
+  # Manual trigger
+  workflow_dispatch:
+    inputs:
+      dry_run:
+        description: 'Dry run (skip PR creation)'
+        required: false
+        default: false
+        type: boolean
+
+# Required permissions
+permissions:
+  contents: write  # Lets the Create Pull Request step push the automated/sheets-sync branch
+  pull-requests: write  # Lets the Create Pull Request step open or update the PR
+  id-token: write  # Required for OIDC token authentication
+
+jobs:
+  sync-sheet:  # Single job: set up Python, authenticate, run the sync script, publish the result
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v6  # NOTE(review): tag-pinned, unlike the SHA-pinned actions below
+
+      - name: Set up Python
+        uses: actions/setup-python@v6  # NOTE(review): tag-pinned, unlike the SHA-pinned actions below
+        with:
+          python-version: '3.13'  # Quoted so the version stays a string
+          cache: 'pip'
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r scripts/sheet2docs/requirements.txt
+
+      # Keyless authentication using Workload Identity Federation (OIDC; relies on `id-token: write`)
+      - name: Authenticate to Google Cloud
+        uses: google-github-actions/auth@7c6bc770dae815cd3e89ee6cdf493a5fab2cc093 # v3.0.0
+        with:
+          workload_identity_provider: ${{ vars.GCP_WORKLOAD_IDENTITY_PROVIDER }}
+          service_account: ${{ vars.GCP_SERVICE_ACCOUNT_EMAIL }}
+          project_id: ${{ vars.GCP_PROJECT_ID }}
+          access_token_scopes: 'https://www.googleapis.com/auth/spreadsheets.readonly,https://www.googleapis.com/auth/drive.readonly'  # NOTE(review): the auth action docs tie this to token_format: access_token - confirm it has an effect here
+
+      # The auth action sets GOOGLE_APPLICATION_CREDENTIALS automatically, so no credentials are passed to this step
+      - name: Run sync script
+        env:
+          GOOGLE_SHEET_URL: ${{ secrets.GOOGLE_SHEET_URL }}  # Referenced as "${GOOGLE_SHEET_URL}" in config.yml
+        run: |
+          python scripts/sheet2docs/sync_sheet.py --config scripts/sheet2docs/config.yml --verbose
+
+      - name: Create Pull Request
+        if: github.event.inputs.dry_run != 'true'  # Inputs are unset on schedule/push runs, so only a manual dry run skips this step
+        uses: peter-evans/create-pull-request@c0f553fe549906ede9cf27b5156039d195d2ece0 # v8.1.0
+        with:
+          token: ${{ github.token }}  # Uses the workflow token, scoped by the `permissions` block
+          branch: automated/sheets-sync  # Fixed branch name, so later runs update the same PR
+          delete-branch: true
+          title: "Update CSV from Google Sheets"
+          commit-message: |
+            Update CSV from Google Sheets
+
+            Generated: ${{ github.run_id }}
+            Workflow run: ${{ github.run_number }}
+          body: |
+            ## Summary
+
+            This PR updates the CSV file with the latest data from Google Sheets.
+
+            ### Changes
+
+            Please review the changes in the Files tab to ensure the data looks correct.
+
+            ### Next Steps
+
+            - [ ] Review the CSV changes
+            - [ ] Verify data accuracy
+            - [ ] Merge when ready
+
+            ---
+
+            🤖 Automated update from [sync-sheets workflow](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }})
+          add-paths: |
+            explore-analyze/elastic-inference/models.csv
+
+      - name: Upload CSV artifact
+        if: always()  # Runs even when an earlier step failed
+        uses: actions/upload-artifact@v6  # NOTE(review): tag-pinned, unlike the SHA-pinned actions above
+        with:
+          name: generated-csv-${{ github.run_number }}  # Run number keeps artifact names distinct per run
+          path: explore-analyze/elastic-inference/models.csv  # Must match output.directory/filename in config.yml
+          retention-days: 30
+          if-no-files-found: warn  # A missing CSV produces a warning instead of failing the step
+
+      - name: Generate job summary
+        if: always()
+        env:
+          JOB_STATUS: ${{ job.status }}
+          # Inputs exist only on workflow_dispatch; report 'false' otherwise
+          DRY_RUN: ${{ github.event.inputs.dry_run || 'false' }}
+        run: |
+          CSV_PATH="explore-analyze/elastic-inference/models.csv"
+          # Everything echoed inside the braces is appended to the job summary
+          {
+            echo "## Sync Summary"
+            echo ""
+            echo "**Status:** ${JOB_STATUS}"
+            echo "**Dry Run:** ${DRY_RUN}"
+            echo ""
+            if [ -f "$CSV_PATH" ]; then
+              echo "### Generated Files"
+              echo ""
+              printf '```\n%s\n```\n\n' "$(ls -lh "$CSV_PATH")"
+              # Header line plus the first 5 data rows
+              echo "### Preview (first 5 rows)"
+              echo ""
+              printf '```csv\n%s\n```\n' "$(head -n 6 "$CSV_PATH")"
+            else
+              echo "⚠️ No CSV file found"
+            fi
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/scripts/sheet2docs/config.yml b/scripts/sheet2docs/config.yml
@@ -0,0 +1,39 @@
+# Google Sheets to CSV Configuration
+#
+# This file defines which Google Sheet to sync and how to transform the data
+
+# Google Sheet source
+source:
+  # Google Sheets URL or spreadsheet ID
+  # SECURITY: For public repos, use environment variable substitution
+  # to avoid exposing the sheet URL
+  #
+  # Option 1: Direct value (for testing/private repos only)
+  # sheet_url: "<sheet URL or spreadsheet ID>"
+  #
+  # Option 2 (in use): Environment variable (recommended for public repos)
+  # Set GOOGLE_SHEET_URL as a GitHub Secret
+  sheet_url: "${GOOGLE_SHEET_URL}"
+
+  # Name of the tab/sheet within the spreadsheet
+  # Can also use environment variable if needed
+  tab_name: "Models"
+
+# Columns copied from the sheet into the CSV
+# Each entry names a sheet header via `source`; add `target` to rename it
+columns:
+  - source: 'Type'
+  - source: 'Author'
+  - source: 'Name'
+  - source: 'ID'
+
+# Where and how the generated CSV is written
+output:
+  # File name of the generated CSV
+  filename: 'models.csv'
+
+  # Target directory, relative to the repository root
+  directory: 'explore-analyze/elastic-inference'
+
+  # Field separator used in the CSV (default: comma)
+  delimiter: ','
diff --git a/scripts/sheet2docs/readme.txt b/scripts/sheet2docs/readme.txt
@@ -0,0 +1,100 @@
+sheet2docs - Google Sheets to CSV Sync
+======================================
+
+This automation syncs data from a Google Sheet to a CSV file in the docs-content
+repository. It runs daily at 2 AM UTC and creates/updates a pull request when
+the sheet data changes.
+
+
+How it works
+------------
+
+1. GitHub Actions runs the sync workflow daily (or manually).
+2. The Python script fetches data from the configured Google Sheet.
+3. If the CSV has changed, a PR is created or updated on the `automated/sheets-sync` branch.
+4. A team member reviews and merges the PR.
+5. The updated CSV is available in the repository.
+
+
+Configuration
+-------------
+
+Edit `scripts/sheet2docs/config.yml` to configure:
+
+- source.sheet_url: The Google Sheet URL (uses GOOGLE_SHEET_URL secret)
+- source.tab_name: The tab/sheet name within the spreadsheet
+- columns: Which columns to include and optionally rename
+- output.filename: The output CSV filename
+- output.directory: Where to save the CSV (relative to repo root)
+
+
+Adding or removing columns
+--------------------------
+
+Edit the `columns` section in config.yml:
+
+    columns:
+      - source: "Column Name"           # Keep original name
+      - source: "Old Name"
+        target: "New Name"              # Rename in CSV
+
+
+Changing the output location
+----------------------------
+
+Edit the `output` section in config.yml:
+
+    output:
+      filename: "models.csv"
+      directory: "path/to/output"
+
+Note: You must also update the CSV path in the workflow file
+(.github/workflows/sync-sheets-keyless.yml): add-paths, artifact path, CSV_PATH.
+
+
+Running manually
+----------------
+
+1. Go to the Actions tab in GitHub.
+2. Select "Sync Google Sheets to CSV (Keyless Auth)".
+3. Click "Run workflow".
+4. Optionally enable "Dry run" to test without creating a PR.
+
+
+Required GitHub configuration
+-----------------------------
+
+Secrets:
+- GOOGLE_SHEET_URL: Full URL of the Google Sheet
+
+Variables:
+- GCP_WORKLOAD_IDENTITY_PROVIDER: Workload Identity Provider resource name
+- GCP_SERVICE_ACCOUNT_EMAIL: Service account email
+- GCP_PROJECT_ID: GCP project ID
+
+
+Google Sheet setup
+------------------
+
+The Google Sheet must be shared with the service account email
+(Viewer permission). The service account email is the value of
+GCP_SERVICE_ACCOUNT_EMAIL.
+
+
+Troubleshooting
+---------------
+
+"Spreadsheet not found"
+  - Ensure the sheet is shared with the service account email.
+  - Check that GOOGLE_SHEET_URL secret is correct.
+
+"Tab 'X' not found"
+  - Verify the tab name matches exactly (case-sensitive).
+
+"Column 'X' not found"
+  - Column names must match the sheet headers exactly (case-sensitive).
+
+"Google Sheets API has not been enabled"
+  - Enable Google Sheets API and Drive API in the GCP project.
+
+For detailed GCP setup, see SETUP-KEYLESS.md.
diff --git a/scripts/sheet2docs/requirements.txt b/scripts/sheet2docs/requirements.txt
@@ -0,0 +1,7 @@
+# Google Sheets API
+gspread==6.1.2
+google-auth==2.35.0
+requests>=2.31.0  # NOTE(review): lower bound only; the other dependencies are pinned exactly
+
+# Configuration parsing
+PyYAML==6.0.2
diff --git a/scripts/sheet2docs/sync_sheet.py b/scripts/sheet2docs/sync_sheet.py