# GitHub Actions workflow: Monitor ClickHouse Errors (issue/PR #858)
name: Monitor ClickHouse Errors

# Hourly monitor: queries ClickHouse system.query_log for OOM / critical
# errors and files (or appends to) a GitHub issue when any are found.
on:
  schedule:
    # Every hour at the top of the hour
    - cron: '0 * * * *'
  workflow_dispatch:

# The issue-filing step drives `gh issue create/comment` and
# `gh label create` with GITHUB_TOKEN, which requires issues: write
# (default token permissions may be read-only on this repo/org).
permissions:
  contents: read
  issues: write

jobs:
  check-errors:
    runs-on: ubuntu-latest
    steps:
- name: Query ClickHouse for OOM and critical errors
env:
CLICKHOUSE_HOST: ${{ secrets.CLICKHOUSE_HOST }}
CLICKHOUSE_USER: ${{ secrets.CLICKHOUSE_MONITORING_USER }}
CLICKHOUSE_PASSWORD: ${{ secrets.CLICKHOUSE_MONITORING_PASSWORD }}
run: |
QUERY=$(cat <<'SQL'
SELECT
exception_code,
user,
count() as cnt,
any(substring(exception, 1, 300)) as sample_error,
any(substring(query, 1, 200)) as sample_query
FROM system.query_log
WHERE type = 'ExceptionWhileProcessing'
AND exception_code IN (
241, -- MEMORY_LIMIT_EXCEEDED
242, -- TABLE_SIZE_EXCEEDED
243 -- QUERY_WAS_CANCELLED (often memory-related)
)
AND event_time > now() - INTERVAL 65 MINUTE
AND user NOT IN ('operator-internal', 'monitoring-internal', 'observability-internal')
GROUP BY exception_code, user
FORMAT JSONEachRow
SETTINGS max_memory_usage = 500000000
SQL
)
HTTP_CODE=$(curl -s -o /tmp/oom-result.txt -w '%{http_code}' \
"https://${CLICKHOUSE_HOST}:8443/" \
-u "${CLICKHOUSE_USER}:${CLICKHOUSE_PASSWORD}" \
--data "$QUERY")
RESULT=$(cat /tmp/oom-result.txt)
if [ "$HTTP_CODE" != "200" ]; then
echo "::error::ClickHouse query failed with HTTP $HTTP_CODE"
echo "Response body: $RESULT"
echo "has_errors=true" >> "$GITHUB_ENV"
echo "query_unreachable=true" >> "$GITHUB_ENV"
echo "$RESULT" > /tmp/query-error.txt
elif [ -z "$RESULT" ]; then
echo "No critical errors found in the last hour."
echo "has_errors=false" >> "$GITHUB_ENV"
echo "query_unreachable=false" >> "$GITHUB_ENV"
else
echo "Critical errors detected:"
echo "$RESULT" | jq .
echo "has_errors=true" >> "$GITHUB_ENV"
echo "query_unreachable=false" >> "$GITHUB_ENV"
echo "$RESULT" > /tmp/errors.jsonl
fi
- name: Query top resource-consuming queries
env:
CLICKHOUSE_HOST: ${{ secrets.CLICKHOUSE_HOST }}
CLICKHOUSE_USER: ${{ secrets.CLICKHOUSE_MONITORING_USER }}
CLICKHOUSE_PASSWORD: ${{ secrets.CLICKHOUSE_MONITORING_PASSWORD }}
run: |
# Top 10 queries by memory usage
MEM_QUERY=$(cat <<'SQL'
SELECT
query_id,
event_time,
user,
query_duration_ms,
read_rows,
round(memory_usage / 1024 / 1024, 0) as mem_mb,
round(read_bytes / 1024 / 1024, 0) as read_mb,
substring(query, 1, 200) as query_snippet
FROM system.query_log
WHERE type = 'QueryFinish'
AND query_kind = 'Select'
AND event_time > now() - INTERVAL 65 MINUTE
AND user NOT IN ('operator-internal', 'monitoring-internal', 'observability-internal')
ORDER BY memory_usage DESC
LIMIT 10
FORMAT JSONEachRow
SETTINGS max_memory_usage = 500000000
SQL
)
MEM_HTTP_CODE=$(curl -s -o /tmp/mem-result.txt -w '%{http_code}' \
"https://${CLICKHOUSE_HOST}:8443/" \
-u "${CLICKHOUSE_USER}:${CLICKHOUSE_PASSWORD}" \
--data "$MEM_QUERY")
if [ "$MEM_HTTP_CODE" = "200" ] && [ -s /tmp/mem-result.txt ]; then
echo "Top queries by memory retrieved."
echo "has_mem_data=true" >> "$GITHUB_ENV"
# Write to step summary
echo "### Top Queries by Memory Usage (last 65 min)" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
echo "| Query ID | Time | User | Duration (ms) | Memory | Rows Read | Read | Query |" >> "$GITHUB_STEP_SUMMARY"
echo "|---|---|---|---|---|---|---|---|" >> "$GITHUB_STEP_SUMMARY"
cat /tmp/mem-result.txt | jq -r '"| `\(.query_id[0:8])` | \(.event_time) | \(.user) | \(.query_duration_ms) | \(.mem_mb) MB | \(.read_rows) | \(.read_mb) MB | `\(.query_snippet | gsub("[\\n\\r]"; " ") | .[0:80])…` |"' >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
else
echo "::warning::Failed to fetch top memory queries (HTTP $MEM_HTTP_CODE)"
echo "has_mem_data=false" >> "$GITHUB_ENV"
echo "### Top Queries by Memory Usage" >> "$GITHUB_STEP_SUMMARY"
echo "⚠️ Query failed (HTTP $MEM_HTTP_CODE)" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
fi
# Top 10 queries by duration
DUR_QUERY=$(cat <<'SQL'
SELECT
query_id,
event_time,
user,
query_duration_ms,
read_rows,
round(memory_usage / 1024 / 1024, 0) as mem_mb,
round(read_bytes / 1024 / 1024, 0) as read_mb,
substring(query, 1, 200) as query_snippet
FROM system.query_log
WHERE type = 'QueryFinish'
AND query_kind = 'Select'
AND event_time > now() - INTERVAL 65 MINUTE
AND user NOT IN ('operator-internal', 'monitoring-internal', 'observability-internal')
ORDER BY query_duration_ms DESC
LIMIT 10
FORMAT JSONEachRow
SETTINGS max_memory_usage = 500000000
SQL
)
DUR_HTTP_CODE=$(curl -s -o /tmp/dur-result.txt -w '%{http_code}' \
"https://${CLICKHOUSE_HOST}:8443/" \
-u "${CLICKHOUSE_USER}:${CLICKHOUSE_PASSWORD}" \
--data "$DUR_QUERY")
if [ "$DUR_HTTP_CODE" = "200" ] && [ -s /tmp/dur-result.txt ]; then
echo "Top queries by duration retrieved."
echo "has_dur_data=true" >> "$GITHUB_ENV"
# Write to step summary
echo "### Top Queries by Duration (last 65 min)" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
echo "| Query ID | Time | User | Duration (ms) | Memory | Rows Read | Read | Query |" >> "$GITHUB_STEP_SUMMARY"
echo "|---|---|---|---|---|---|---|---|" >> "$GITHUB_STEP_SUMMARY"
cat /tmp/dur-result.txt | jq -r '"| `\(.query_id[0:8])` | \(.event_time) | \(.user) | \(.query_duration_ms) | \(.mem_mb) MB | \(.read_rows) | \(.read_mb) MB | `\(.query_snippet | gsub("[\\n\\r]"; " ") | .[0:80])…` |"' >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
else
echo "::warning::Failed to fetch top duration queries (HTTP $DUR_HTTP_CODE)"
echo "has_dur_data=false" >> "$GITHUB_ENV"
echo "### Top Queries by Duration" >> "$GITHUB_STEP_SUMMARY"
echo "⚠️ Query failed (HTTP $DUR_HTTP_CODE)" >> "$GITHUB_STEP_SUMMARY"
echo "" >> "$GITHUB_STEP_SUMMARY"
fi
- name: Create or update GitHub Issue
if: env.has_errors == 'true'
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
RUN_URL="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
TIMESTAMP=$(date -u '+%Y-%m-%d %H:%M UTC')
# Start building the issue body
cat > /tmp/issue-body.md <<HEADER_EOF
## ClickHouse Critical Errors Detected
The hourly monitoring check found critical errors.
**Time window**: last 65 minutes (as of ${TIMESTAMP})
HEADER_EOF
# Query health section
if [ "${{ env.query_unreachable }}" = "true" ]; then
QUERY_ERROR=$(cat /tmp/query-error.txt 2>/dev/null || echo "Unknown error")
cat >> /tmp/issue-body.md <<UNREACHABLE_EOF
### :rotating_light: Query Unreachable
The monitoring query itself failed — ClickHouse may be under severe memory pressure or unreachable.
\`\`\`
${QUERY_ERROR}
\`\`\`
UNREACHABLE_EOF
fi
# OOM errors table
if [ -f /tmp/errors.jsonl ] && [ -s /tmp/errors.jsonl ]; then
ERRORS=$(cat /tmp/errors.jsonl)
TABLE_ROWS=$(echo "$ERRORS" | jq -r '"| \(.exception_code) | \(.user) | \(.cnt) | \(.sample_error | gsub("[\\n\\r]"; " ") | .[0:120]) |"')
cat >> /tmp/issue-body.md <<ERRORS_EOF
### OOM and Critical Errors
| Exception Code | User | Count | Sample Error |
|---|---|---|---|
${TABLE_ROWS}
ERRORS_EOF
fi
# Top queries by memory
if [ "${{ env.has_mem_data }}" = "true" ] && [ -s /tmp/mem-result.txt ]; then
MEM_ROWS=$(cat /tmp/mem-result.txt | jq -r '"| \(.query_id) | \(.event_time) | \(.user) | \(.query_duration_ms) | \(.mem_mb) MB | \(.read_rows) | \(.query_snippet | gsub("[\\n\\r]"; " ") | .[0:80]) |"')
cat >> /tmp/issue-body.md <<MEM_EOF
### Top Queries by Memory Usage
| Query ID | Time | User | Duration (ms) | Memory | Rows Read | Query |
|---|---|---|---|---|---|---|
${MEM_ROWS}
MEM_EOF
fi
# Top queries by duration
if [ "${{ env.has_dur_data }}" = "true" ] && [ -s /tmp/dur-result.txt ]; then
DUR_ROWS=$(cat /tmp/dur-result.txt | jq -r '"| \(.query_id) | \(.event_time) | \(.user) | \(.query_duration_ms) | \(.mem_mb) MB | \(.read_rows) | \(.query_snippet | gsub("[\\n\\r]"; " ") | .[0:80]) |"')
cat >> /tmp/issue-body.md <<DUR_EOF
### Top Queries by Duration
| Query ID | Time | User | Duration (ms) | Memory | Rows Read | Query |
|---|---|---|---|---|---|---|
${DUR_ROWS}
DUR_EOF
fi
# Error codes reference and footer
cat >> /tmp/issue-body.md <<FOOTER_EOF
### Error Codes Reference
| Code | Name | Description |
|---|---|---|
| 241 | MEMORY_LIMIT_EXCEEDED | Query exceeded memory limit (OOM) |
| 242 | TABLE_SIZE_EXCEEDED | Table size limit exceeded |
| 243 | QUERY_WAS_CANCELLED | Query cancelled (often memory-related) |
---
*Auto-generated by [monitor-clickhouse](${RUN_URL}) workflow.*
FOOTER_EOF
# Remove leading whitespace from heredoc lines
sed -i 's/^ //' /tmp/issue-body.md
# Check for an existing open issue with the clickhouse-oom label
EXISTING=$(gh issue list \
--repo "${{ github.repository }}" \
--label "clickhouse-oom" \
--state open \
--limit 1 \
--json number \
--jq '.[0].number // empty')
if [ -n "$EXISTING" ]; then
gh issue comment "$EXISTING" \
--repo "${{ github.repository }}" \
--body-file /tmp/issue-body.md
echo "Updated existing issue #${EXISTING}"
else
gh label create "clickhouse-oom" \
--repo "${{ github.repository }}" \
--description "ClickHouse OOM or critical error alert" \
--color "D93F0B" \
2>/dev/null || true
gh issue create \
--repo "${{ github.repository }}" \
--title "ClickHouse critical errors detected ($(date -u '+%Y-%m-%d'))" \
--body-file /tmp/issue-body.md \
--label "clickhouse-oom"
echo "Created new issue"
fi