open-metadata · ayush-shah · Jan 14, 2026 · Jan 7, 2026 · Jan 7, 2026 · Jan 7, 2026
diff --git a/.github/workflows/broken-links-nightly.yml b/.github/workflows/broken-links-nightly.yml
@@ -0,0 +1,211 @@
+name: Broken Links Check - Nightly
+
+on:
+  schedule:
+    # Run every day at 2:00 AM UTC
+    - cron: '0 2 * * *'
+  workflow_dispatch: # Allow manual trigger
+
+permissions:
+  contents: read
+
+jobs:
+  check-broken-links:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+
+      # Installs the Node.js runtime and enables the npm download cache.
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          # Declared exactly once: duplicate mapping keys are invalid YAML and
+          # most loaders silently keep only the last one.
+          node-version: '20'
+          cache: 'npm'
+
+      # Installs the project's npm dependencies; the link checker in the next
+      # step is run from ./node_modules/.bin.
+      - name: Install dependencies
+        run: |
+          PUPPETEER_SKIP_DOWNLOAD=true npm install
+        env:
+          # NOTE(review): NODE_ENV=production normally makes npm omit
+          # devDependencies - confirm the mint CLI is a regular dependency.
+          NODE_ENV: production
+
+      # Runs the mint link checker, stores its output in broken-links-output.txt
+      # and publishes the step outputs summary, has_broken_links, total_links
+      # and total_files. Broken links do not fail this step.
+      - name: Run broken links check
+        id: broken_links
+        run: |
+          MINT_BIN=./node_modules/.bin/mint
+
+          # Fail loudly when the CLI is missing; otherwise the "|| true" below
+          # would turn an installation problem into a false "no broken links".
+          if [ ! -x "$MINT_BIN" ]; then
+            echo "::error::mint CLI not found at $MINT_BIN"
+            exit 1
+          fi
+
+          echo "Running broken links check..."
+          # The checker's exit status is ignored on purpose; the result is
+          # derived from its output below.
+          OUTPUT=$("$MINT_BIN" broken-links 2>&1 || true)
+          echo "$OUTPUT"
+
+          # Save output to file
+          echo "$OUTPUT" > broken-links-output.txt
+
+          # Extract the summary line. -m1 keeps only the first match so the
+          # value is always a single line, as the key=value output format needs.
+          SUMMARY=$(echo "$OUTPUT" | grep -m1 -E "found [0-9]+ broken links" || echo "No broken links found")
+          echo "summary=$SUMMARY" >> "$GITHUB_OUTPUT"
+
+          # Check if there are any broken links (non-zero count)
+          if echo "$SUMMARY" | grep -qE "found [1-9][0-9]* broken links"; then
+            echo "has_broken_links=true" >> "$GITHUB_OUTPUT"
+            # Count total broken links (first number on the summary line)
+            TOTAL_LINKS=$(echo "$SUMMARY" | grep -oE "[0-9]+" | head -1)
+            echo "total_links=$TOTAL_LINKS" >> "$GITHUB_OUTPUT"
+            # Count files with broken links (last number on the summary line)
+            TOTAL_FILES=$(echo "$SUMMARY" | grep -oE "[0-9]+" | tail -1)
+            echo "total_files=$TOTAL_FILES" >> "$GITHUB_OUTPUT"
+          else
+            echo "has_broken_links=false" >> "$GITHUB_OUTPUT"
+            echo "total_links=0" >> "$GITHUB_OUTPUT"
+            echo "total_files=0" >> "$GITHUB_OUTPUT"
+          fi
+
+      # Builds slack-payload.json (Slack Block Kit) from the outputs of the
+      # broken_links step: a report with a sample of the findings when broken
+      # links exist, otherwise a short success message.
+      - name: Prepare Slack message
+        id: slack_message
+        env:
+          # Values are handed over as environment variables instead of being
+          # interpolated into the script text, so they are never parsed as shell.
+          HAS_BROKEN_LINKS: ${{ steps.broken_links.outputs.has_broken_links }}
+          TOTAL_LINKS: ${{ steps.broken_links.outputs.total_links }}
+          TOTAL_FILES: ${{ steps.broken_links.outputs.total_files }}
+          REPO: ${{ github.repository }}
+          RUN_ID: ${{ github.run_id }}
+        run: |
+          OUTPUT_FILE="broken-links-output.txt"
+          REPO_URL="https://github.com/$REPO"
+          # Slack rejects section text longer than 3000 characters, so the
+          # sample is capped by characters as well as by lines. The character
+          # cap leaves room for the heading, the code fences and the note.
+          MAX_LINES=100
+          MAX_CHARS=2800
+
+          if [ "$HAS_BROKEN_LINKS" = "true" ]; then
+            # Create a truncated version of the output for Slack
+            SAMPLE=$(head -n "$MAX_LINES" "$OUTPUT_FILE")
+            LINE_COUNT=$(wc -l < "$OUTPUT_FILE")
+            WAS_TRUNCATED=false
+
+            if [ "$LINE_COUNT" -gt "$MAX_LINES" ]; then
+              WAS_TRUNCATED=true
+            fi
+            if [ "${#SAMPLE}" -gt "$MAX_CHARS" ]; then
+              SAMPLE="${SAMPLE:0:$MAX_CHARS}"
+              WAS_TRUNCATED=true
+            fi
+
+            # Add the truncation note exactly once, and only if something was cut
+            TRUNCATED_OUTPUT="$SAMPLE"
+            if [ "$WAS_TRUNCATED" = "true" ]; then
+              TRUNCATED_OUTPUT="${TRUNCATED_OUTPUT}"$'\n\n'"[Output truncated. See full report in workflow artifacts.]"
+            fi
+
+            # Create JSON payload for Slack using jq for proper escaping
+            jq -n \
+              --arg repo "$REPO" \
+              --arg repo_url "$REPO_URL" \
+              --arg date "$(date -u +"%Y-%m-%d %H:%M UTC")" \
+              --arg total_links "$TOTAL_LINKS" \
+              --arg total_files "$TOTAL_FILES" \
+              --arg output "$TRUNCATED_OUTPUT" \
+              --arg run_url "$REPO_URL/actions/runs/$RUN_ID" \
+              '{
+                "blocks": [
+                  {
+                    "type": "header",
+                    "text": {
+                      "type": "plain_text",
+                      "text": "🔗 Broken Links Detection Report",
+                      "emoji": true
+                    }
+                  },
+                  {
+                    "type": "section",
+                    "fields": [
+                      {
+                        "type": "mrkdwn",
+                        "text": ("*Repository:*\n<" + $repo_url + "|" + $repo + ">")
+                      },
+                      {
+                        "type": "mrkdwn",
+                        "text": ("*Date:*\n" + $date)
+                      }
+                    ]
+                  },
+                  {
+                    "type": "section",
+                    "fields": [
+                      {
+                        "type": "mrkdwn",
+                        "text": ("*Total Broken Links:*\n" + $total_links)
+                      },
+                      {
+                        "type": "mrkdwn",
+                        "text": ("*Files Affected:*\n" + $total_files)
+                      }
+                    ]
+                  },
+                  {
+                    "type": "divider"
+                  },
+                  {
+                    "type": "section",
+                    "text": {
+                      "type": "mrkdwn",
+                      "text": ("*Sample of Broken Links:*\n```" + $output + "```")
+                    }
+                  },
+                  {
+                    "type": "section",
+                    "text": {
+                      "type": "mrkdwn",
+                      "text": ("📄 *Full report is available in the <" + $run_url + "|GitHub Actions workflow run>*")
+                    }
+                  }
+                ]
+              }' > slack-payload.json
+          else
+            # Create success message for Slack using jq
+            jq -n \
+              --arg repo "$REPO" \
+              --arg repo_url "$REPO_URL" \
+              --arg date "$(date -u +"%Y-%m-%d %H:%M UTC")" \
+              '{
+                "blocks": [
+                  {
+                    "type": "header",
+                    "text": {
+                      "type": "plain_text",
+                      "text": "✅ Broken Links Check Passed",
+                      "emoji": true
+                    }
+                  },
+                  {
+                    "type": "section",
+                    "fields": [
+                      {
+                        "type": "mrkdwn",
+                        "text": ("*Repository:*\n<" + $repo_url + "|" + $repo + ">")
+                      },
+                      {
+                        "type": "mrkdwn",
+                        "text": ("*Date:*\n" + $date)
+                      }
+                    ]
+                  },
+                  {
+                    "type": "section",
+                    "text": {
+                      "type": "mrkdwn",
+                      "text": "No broken links detected in the documentation. 🎉"
+                    }
+                  }
+                ]
+              }' > slack-payload.json
+          fi
+
+          cat slack-payload.json
+
+      # Posts slack-payload.json to the Slack incoming webhook. Because of
+      # always() this also runs after a failed step, so it guards against both
+      # a missing secret and a payload that was never generated.
+      - name: Send Slack notification
+        if: always()
+        env:
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
+        run: |
+          if [ -z "$SLACK_WEBHOOK_URL" ]; then
+            echo "::warning::SLACK_WEBHOOK_URL secret is not set. Skipping Slack notification."
+            echo "Please add SLACK_WEBHOOK_URL to repository secrets to enable Slack notifications."
+            exit 0
+          fi
+
+          if [ ! -s slack-payload.json ]; then
+            echo "::warning::slack-payload.json was not generated. Skipping Slack notification."
+            exit 0
+          fi
+
+          # --fail makes curl exit non-zero on HTTP error responses, so the
+          # success message below is only printed when the request succeeded.
+          curl --fail --silent --show-error -X POST \
+            -H 'Content-type: application/json' \
+            --data @slack-payload.json \
+            "$SLACK_WEBHOOK_URL"
+
+          echo "✅ Slack notification sent successfully"
+
+      # Stores the complete checker output as a workflow artifact, only when
+      # the broken_links step reported broken links.
+      - name: Upload broken links report
+        if: steps.broken_links.outputs.has_broken_links == 'true'
+        uses: actions/upload-artifact@v4
+        with:
+          # The run number keeps artifact names distinct between runs.
+          name: broken-links-report-${{ github.run_number }}
+          path: broken-links-output.txt
+          retention-days: 30
diff --git a/.github/workflows/broken-links-pr.yml b/.github/workflows/broken-links-pr.yml
@@ -0,0 +1,187 @@
+name: Broken Links Check - PR
+
+on:
+  pull_request:
+    types: [opened, synchronize, reopened]
+    paths:
+      - '**.mdx'
+      - '**.md'
+      - 'docs.json'
+
+permissions:
+  contents: read
+  pull-requests: write
+
+jobs:
+  check-broken-links:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          # Full history: a later step runs git diff against the pull request's
+          # base commit, which a shallow clone would not contain.
+          fetch-depth: 0
+
+      # Installs the Node.js runtime and enables the npm download cache.
+      - name: Setup Node.js
+        uses: actions/setup-node@v4
+        with:
+          # Declared exactly once: duplicate mapping keys are invalid YAML and
+          # most loaders silently keep only the last one.
+          node-version: '22'
+          cache: 'npm'
+
+      # Installs the project's npm dependencies; the link checker in the next
+      # step is run from ./node_modules/.bin.
+      - name: Install dependencies
+        run: |
+          PUPPETEER_SKIP_DOWNLOAD=true npm install
+        env:
+          # NOTE(review): NODE_ENV=production normally makes npm omit
+          # devDependencies - confirm the mint CLI is a regular dependency.
+          NODE_ENV: production
+
+      # Runs the mint link checker over the repository, stores its output in
+      # broken-links-output.txt and publishes the step outputs summary and
+      # has_broken_links. Broken links do not fail this step.
+      - name: Run broken links check
+        id: broken_links
+        run: |
+          MINT_BIN=./node_modules/.bin/mint
+
+          # Fail loudly when the CLI is missing; otherwise the "|| true" below
+          # would turn an installation problem into a false "no broken links".
+          if [ ! -x "$MINT_BIN" ]; then
+            echo "::error::mint CLI not found at $MINT_BIN"
+            exit 1
+          fi
+
+          echo "Running broken links check..."
+          # The checker's exit status is ignored on purpose; the result is
+          # derived from its output below.
+          OUTPUT=$("$MINT_BIN" broken-links 2>&1 || true)
+          echo "$OUTPUT"
+
+          # Save output to file for parsing
+          echo "$OUTPUT" > broken-links-output.txt
+
+          # Extract the summary line (e.g., "found 91 broken links in 47 files").
+          # -m1 keeps only the first match so the value is always a single line,
+          # as the key=value output format needs.
+          SUMMARY=$(echo "$OUTPUT" | grep -m1 -E "found [0-9]+ broken links" || echo "No broken links found")
+          echo "summary=$SUMMARY" >> "$GITHUB_OUTPUT"
+
+          # Check if there are any broken links (non-zero count)
+          if echo "$SUMMARY" | grep -qE "found [1-9][0-9]* broken links"; then
+            echo "has_broken_links=true" >> "$GITHUB_OUTPUT"
+          else
+            echo "has_broken_links=false" >> "$GITHUB_OUTPUT"
+          fi
+
+      # Writes the paths of the .md, .mdx and .json files touched by this PR to
+      # changed-files.txt, one per line, and prints them to the log.
+      - name: Get changed files
+        id: changed_files
+        run: |
+          # Get list of changed .mdx, .md, and .json files in this PR
+          # The three-dot range diffs HEAD against its merge base with the PR's base commit.
+          # If grep matches nothing, the fallback writes a single empty line to the file.
+          git diff --name-only ${{ github.event.pull_request.base.sha }}...HEAD | grep -E '\.(mdx?|json)$' > changed-files.txt || echo "" > changed-files.txt
+          echo "Changed files:"
+          cat changed-files.txt
+
+      # Reduces the full checker output to the entries belonging to files
+      # changed in this PR. The result is written to filtered-links.txt and
+      # published as the multi-line step output "filtered".
+      - name: Filter broken links for changed files
+        id: filter_links
+        run: |
+          # Create a script to filter broken links for changed files
+          cat > filter-links.sh << 'EOF'
+          #!/bin/bash
+
+          OUTPUT_FILE="broken-links-output.txt"
+          CHANGED_FILES="changed-files.txt"
+
+          # Read changed files into array
+          mapfile -t CHANGED < "$CHANGED_FILES"
+
+          # Parse broken links output and filter for changed files
+          FILTERED_OUTPUT=""
+          CURRENT_FILE=""
+          SHOULD_INCLUDE_FILE=false
+
+          while IFS= read -r line; do
+            # A file header starts with an alphanumeric character and ends with .mdx, .md, or .json
+            if [[ $line =~ ^[a-zA-Z0-9] && ( $line == *.mdx || $line == *.md || $line == *.json ) ]]; then
+              CURRENT_FILE="$line"
+              # Check if this file is in changed files (exact, literal match)
+              SHOULD_INCLUDE_FILE=false
+              for changed in "${CHANGED[@]}"; do
+                if [[ "$CURRENT_FILE" == "$changed" ]]; then
+                  SHOULD_INCLUDE_FILE=true
+                  break
+                fi
+              done
+              if $SHOULD_INCLUDE_FILE; then
+                FILTERED_OUTPUT+="${CURRENT_FILE}"$'\n'
+              fi
+            elif $SHOULD_INCLUDE_FILE && [[ $line =~ ^[[:space:]]*⎿ ]]; then
+              # This is a broken link for a changed file
+              FILTERED_OUTPUT+="${line}"$'\n'
+            fi
+          done < "$OUTPUT_FILE"
+
+          if [ -z "$FILTERED_OUTPUT" ]; then
+            echo "No broken links found in changed files."
+          else
+            printf '%s' "$FILTERED_OUTPUT"
+          fi
+          EOF
+
+          chmod +x filter-links.sh
+          FILTERED=$(./filter-links.sh)
+
+          # Save filtered output
+          echo "$FILTERED" > filtered-links.txt
+
+          # Multi-line step output, written exactly once. The delimiter is
+          # specific enough that it should not occur inside the data itself.
+          {
+            echo 'filtered<<FILTERED_LINKS_EOF'
+            echo "$FILTERED"
+            echo 'FILTERED_LINKS_EOF'
+          } >> "$GITHUB_OUTPUT"
+
+      # Creates the "Broken Links Check Report" comment on the pull request, or
+      # updates the existing one so that repeated runs do not pile up comments.
+      - name: Post PR comment
+        uses: actions/github-script@v7
+        env:
+          # Passed through the environment instead of being interpolated into
+          # the script text: the summary is derived from checker output over PR
+          # content and must never be parsed as JavaScript.
+          SUMMARY: ${{ steps.broken_links.outputs.summary }}
+          HAS_BROKEN_LINKS: ${{ steps.broken_links.outputs.has_broken_links }}
+        with:
+          github-token: ${{ secrets.GITHUB_TOKEN }}
+          script: |
+            const fs = require('fs');
+
+            const REPORT_MARKER = '🔗 Broken Links Check Report';
+            const NO_LINKS_MESSAGE = 'No broken links found in changed files.';
+            // GitHub rejects comment bodies longer than 65536 characters, so
+            // both embedded outputs are capped well below that in total.
+            const MAX_FILTERED_CHARS = 20000;
+            const MAX_FULL_CHARS = 40000;
+
+            // Cuts text to `limit` characters and says so when it had to.
+            const truncate = (text, limit) =>
+              text.length > limit
+                ? text.slice(0, limit) + '\n[... truncated; see the workflow run log for the full output]'
+                : text;
+
+            const summary = process.env.SUMMARY || '';
+            const hasBrokenLinks = process.env.HAS_BROKEN_LINKS === 'true';
+            // Trimmed because the file ends with a newline, which would make
+            // the comparison with NO_LINKS_MESSAGE below never match.
+            const filteredLinks = fs.readFileSync('filtered-links.txt', 'utf8').trim();
+
+            // Read full output for overall repository status
+            const fullOutput = fs.readFileSync('broken-links-output.txt', 'utf8');
+
+            // Create comment body
+            let commentBody = `## ${REPORT_MARKER}\n\n`;
+
+            if (hasBrokenLinks) {
+              commentBody += `### 📊 Overall Repository Status\n\`\`\`\n${summary}\n\`\`\`\n\n`;
+
+              commentBody += '### 📝 Broken Links in Changed Files\n';
+              if (filteredLinks && filteredLinks !== NO_LINKS_MESSAGE) {
+                commentBody += '```\n' + truncate(filteredLinks, MAX_FILTERED_CHARS) + '\n```\n\n';
+              } else {
+                commentBody += '✅ No broken links found in the files changed by this PR.\n\n';
+              }
+
+              commentBody += '<details>\n<summary>📋 Full Repository Report (click to expand)</summary>\n\n';
+              commentBody += '```\n' + truncate(fullOutput, MAX_FULL_CHARS) + '\n```\n';
+              commentBody += '</details>\n';
+            } else {
+              commentBody += '✅ **No broken links found!**\n';
+            }
+
+            commentBody += '\n---\n*This check was performed automatically. Please review and fix any broken links before merging.*';
+
+            // Find existing comment. Paginated so the earlier report is still
+            // found on pull requests with more than one page of comments.
+            const comments = await github.paginate(github.rest.issues.listComments, {
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              issue_number: context.issue.number,
+              per_page: 100,
+            });
+
+            const botComment = comments.find(comment =>
+              comment.user.type === 'Bot' &&
+              comment.body.includes(REPORT_MARKER)
+            );
+
+            // Post or update comment
+            if (botComment) {
+              await github.rest.issues.updateComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                comment_id: botComment.id,
+                body: commentBody
+              });
+            } else {
+              await github.rest.issues.createComment({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                issue_number: context.issue.number,
+                body: commentBody
+              });
+            }