#!/bin/bash
#
# Databend Meta Server Diagnostic Script
# Purpose: Identify root causes for node failures, OOM errors, and query log issues
# Author: Generated for databend-meta troubleshooting

# Strict mode:
#   -e           exit when a command fails outside of a condition
#   -u           treat expansion of an unset variable as an error
#   -o pipefail  a pipeline fails if any of its stages fails
set -euo pipefail

# Colors for output.
# Stored as literal backslash sequences; they only become real escape
# characters when printed with an escape-interpreting command.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Global variables
# Report file name (relative, so it is created in the working directory);
# the timestamp is fixed once, when this line runs.
REPORT_FILE="databend-meta-diagnostic-$(date +%Y%m%d-%H%M%S).txt"
# Scratch directory path, suffixed with the PID of this shell.
TEMP_DIR="/tmp/databend-meta-diag-$$"
# Whitespace-separated PIDs of databend-meta processes. Must start out truly
# empty: the checks use -n / -z on it to decide whether anything is running.
DATABEND_PROCESSES=""

# Helper functions

#######################################
# Announce a diagnostic step on the console and record it in the report.
# Globals:   BLUE, NC (read); REPORT_FILE (file is appended to)
# Arguments: $1 - step description (treated as empty when omitted)
# Outputs:   colorized "[HH:MM:SS] STEP: ..." line on stdout and a plain
#            "[YYYY-MM-DD HH:MM:SS] STEP: ..." line in REPORT_FILE
#######################################
log_step() {
  local message="${1:-}"
  # %b expands the escape sequences held in the color variables; the message
  # goes through %s so backslashes in it are printed verbatim.
  printf '%b[%s] STEP: %s%b\n' "$BLUE" "$(date '+%H:%M:%S')" "$message" "$NC"
  printf '[%s] STEP: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" >> "$REPORT_FILE"
}

#######################################
# Report the outcome of a diagnostic step on the console and in the report.
# Globals:   GREEN, NC (read); REPORT_FILE (file is appended to)
# Arguments: $1 - result description (treated as empty when omitted)
# Outputs:   colorized "[HH:MM:SS] RESULT: ..." line on stdout and a plain
#            "[YYYY-MM-DD HH:MM:SS] RESULT: ..." line in REPORT_FILE
#######################################
log_result() {
  local message="${1:-}"
  # %b expands the color escapes; %s keeps the message text verbatim.
  printf '%b[%s] RESULT: %s%b\n' "$GREEN" "$(date '+%H:%M:%S')" "$message" "$NC"
  printf '[%s] RESULT: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" >> "$REPORT_FILE"
}

#######################################
# Emit a warning on the console and record it in the report.
# Globals:   YELLOW, NC (read); REPORT_FILE (file is appended to)
# Arguments: $1 - warning text (treated as empty when omitted)
# Outputs:   colorized "[HH:MM:SS] WARNING: ..." line on stdout and a plain
#            "[YYYY-MM-DD HH:MM:SS] WARNING: ..." line in REPORT_FILE
#######################################
log_warning() {
  local message="${1:-}"
  # %b expands the color escapes; %s keeps the message text verbatim.
  printf '%b[%s] WARNING: %s%b\n' "$YELLOW" "$(date '+%H:%M:%S')" "$message" "$NC"
  printf '[%s] WARNING: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" >> "$REPORT_FILE"
}

#######################################
# Emit an error message on the console and record it in the report.
# Globals:   RED, NC (read); REPORT_FILE (file is appended to)
# Arguments: $1 - error text (treated as empty when omitted)
# Outputs:   colorized "[HH:MM:SS] ERROR: ..." line on stdout and a plain
#            "[YYYY-MM-DD HH:MM:SS] ERROR: ..." line in REPORT_FILE
#######################################
log_error() {
  local message="${1:-}"
  # %b expands the color escapes; %s keeps the message text verbatim.
  printf '%b[%s] ERROR: %s%b\n' "$RED" "$(date '+%H:%M:%S')" "$message" "$NC"
  printf '[%s] ERROR: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$message" >> "$REPORT_FILE"
}

#######################################
# Create a private scratch directory and arrange for its removal on exit.
# Globals:   TEMP_DIR (written: set to the path of the new directory)
#            TMPDIR   (read: parent directory, defaults to /tmp)
# Arguments: none
# Returns:   0 on success, 1 if the directory could not be created
#######################################
create_temp_dir() {
  # mktemp picks an unpredictable name and creates the directory with
  # owner-only permissions, unlike a fixed /tmp/<name>-<pid> path.
  TEMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/databend-meta-diag.XXXXXX")" || {
    printf 'ERROR: unable to create temporary directory\n' >&2
    return 1
  }
  # Single quotes defer expansion until the trap fires, and the expansion is
  # quoted there; ':?' refuses to run rm with an empty path.
  trap 'rm -rf -- "${TEMP_DIR:?}"' EXIT
}

# Initialize diagnostic report
#######################################
# Start a fresh report file with a banner, then log the initialization.
# Globals:   REPORT_FILE (file is truncated and rewritten)
# Arguments: none
# Outputs:   banner in REPORT_FILE; progress lines via log_step/log_result
#######################################
init_report() {
  local host
  host="$(hostname 2>/dev/null || uname -n)"

  # The banner is written with '>' (truncate) BEFORE anything is logged:
  # log_step appends to the same file, so logging first would have its
  # entry wiped out by the truncation.
  cat > "$REPORT_FILE" << EOF
================================================================================
DATABEND META SERVER DIAGNOSTIC REPORT
Generated: $(date '+%Y-%m-%d %H:%M:%S')
Hostname: ${host}
================================================================================

EOF

  log_step "Initializing databend-meta diagnostic report"
  log_result "Report initialized: $REPORT_FILE"
}

# Check system basic information
#######################################
# Append a SYSTEM INFORMATION section (host, uptime, kernel, distribution,
# architecture, CPU count) to the report.
# Globals:   REPORT_FILE (file is appended to)
# Arguments: none
# Outputs:   section in REPORT_FILE; progress lines via log_step/log_result
#######################################
check_system_info() {
  log_step "Collecting system information"

  # Read PRETTY_NAME straight from os-release; a missing file or key leaves
  # the "Unknown" default in place without printing an error.
  local distro="Unknown" os_line=""
  os_line="$(grep -m1 '^PRETTY_NAME=' /etc/os-release 2>/dev/null || true)"
  if [[ -n "$os_line" ]]; then
    distro=${os_line#PRETTY_NAME=}
    # The value may be double-quoted, single-quoted or bare.
    distro=${distro#\"}
    distro=${distro%\"}
    distro=${distro#\'}
    distro=${distro%\'}
  fi

  # Each probe has a fallback so one missing tool cannot abort the whole
  # run under 'set -e'.
  {
    echo "=== SYSTEM INFORMATION ==="
    echo "Hostname: $(hostname 2>/dev/null || uname -n)"
    echo "Uptime: $(uptime 2>/dev/null || echo 'Unknown')"
    echo "Kernel: $(uname -r)"
    echo "Distribution: ${distro}"
    echo "Architecture: $(uname -m)"
    echo "CPU cores: $(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 'Unknown')"
    echo ""
  } >> "$REPORT_FILE"

  log_result "System information collected"
}

# Check memory and OOM killer activity
#######################################
# Append memory, swap and OOM-killer findings to the report and flag any
# databend process that appears in the kernel's "killed process" messages.
# Globals:   REPORT_FILE (file is appended to)
# Arguments: none
# Outputs:   sections in REPORT_FILE; progress/warnings via log_* helpers
#######################################
check_memory_oom() {
  log_step "Checking memory status and OOM killer activity"

  # Read the kernel ring buffer once and remember whether that worked, so
  # an unreadable log is not reported as "no OOM activity".
  local kernel_log="" kernel_log_ok=true
  kernel_log="$(dmesg 2>/dev/null)" || kernel_log_ok=false

  local swap_info="" oom_lines="" journal_lines=""
  {
    echo "=== MEMORY STATUS ==="
    free -h 2>/dev/null || echo "Unable to run free"
    echo ""

    echo "=== SWAP USAGE ==="
    # 'swapon --show' may print nothing at all when there is no swap, so
    # decide on the output rather than on the exit status.
    swap_info="$(swapon --show 2>/dev/null || true)"
    if [[ -n "$swap_info" ]]; then
      printf '%s\n' "$swap_info"
    else
      echo "No swap configured"
    fi
    echo ""

    echo "=== OOM KILLER ACTIVITY (last 100 entries) ==="
    if [[ "$kernel_log_ok" != true ]]; then
      echo "Unable to read kernel log (dmesg failed; root privileges may be required)"
    else
      oom_lines="$(grep -iE 'killed process|out of memory|oom-killer|memory: usage' <<< "$kernel_log" \
        | tail -n 100 || true)"
      if [[ -n "$oom_lines" ]]; then
        printf '%s\n' "$oom_lines"
      else
        echo "No OOM killer activity found in dmesg"
      fi
    fi
    echo ""

    echo "=== RECENT OOM KILLS IN SYSTEM LOG ==="
    journal_lines="$(journalctl --no-pager --since "7 days ago" 2>/dev/null \
      | grep -iE 'killed process|out of memory|oom' \
      | tail -n 50 || true)"
    if [[ -n "$journal_lines" ]]; then
      printf '%s\n' "$journal_lines"
    else
      echo "No recent OOM kills found in journal"
    fi
    echo ""
  } >> "$REPORT_FILE"

  # Check if databend processes were killed by OOM
  if [[ "$kernel_log_ok" != true ]]; then
    log_warning "Kernel log unavailable; could not check for OOM-killed databend processes"
    return 0
  fi

  local databend_kills="" oom_databend=0
  databend_kills="$(grep -i 'killed process' <<< "$kernel_log" | grep -i 'databend' || true)"
  if [[ -n "$databend_kills" ]]; then
    oom_databend="$(grep -c '' <<< "$databend_kills")"
    log_warning "Found $oom_databend databend processes killed by OOM killer"
    {
      echo "=== DATABEND PROCESSES KILLED BY OOM ==="
      printf '%s\n' "$databend_kills"
      echo ""
    } >> "$REPORT_FILE"
  else
    log_result "No databend processes found in OOM killer logs"
  fi
}

# Check databend-meta processes
#######################################
# Find running databend-meta processes and append their resource usage to
# the report.
# Globals:   DATABEND_PROCESSES (written: space-separated PIDs, or empty)
#            REPORT_FILE        (file is appended to)
# Arguments: none
# Outputs:   sections in REPORT_FILE; progress/warnings via log_* helpers
#######################################
check_databend_processes() {
  log_step "Analyzing databend-meta processes"

  # Find running databend processes. 'pgrep -f' matches the whole command
  # line, so a diagnostic script whose own path matches the pattern would
  # list itself; drop this shell's PIDs from the result.
  local candidates="" pid="" process_count=0
  candidates="$(pgrep -f "databend.*meta" || true)"
  DATABEND_PROCESSES=""
  # shellcheck disable=SC2086 # word splitting on the PID list is intended
  for pid in $candidates; do
    if [[ "$pid" == "$$" || "$pid" == "${BASHPID:-$$}" ]]; then
      continue
    fi
    DATABEND_PROCESSES+="${DATABEND_PROCESSES:+ }${pid}"
    process_count=$((process_count + 1))
  done

  local fd_entries="" fd_count=""
  {
    echo "=== DATABEND META PROCESSES ==="
    if [[ -n "$DATABEND_PROCESSES" ]]; then
      echo "Running databend-meta processes:"
      # Ask ps for exactly the PIDs found above instead of grepping 'ps aux'
      # (which can match unrelated lines and fails the run when empty).
      ps -ww -o user,pid,pcpu,pmem,vsz,rss,tty,stat,start,time,args \
        -p "${DATABEND_PROCESSES// /,}" 2>/dev/null \
        || echo "Unable to list processes with ps"
      echo ""

      echo "=== PROCESS RESOURCE USAGE ==="
      # shellcheck disable=SC2086 # word splitting on the PID list is intended
      for pid in $DATABEND_PROCESSES; do
        # The process may have exited since pgrep ran.
        [[ -d "/proc/$pid" ]] || continue
        echo "PID $pid:"
        echo "Command: $({ tr '\0' ' ' < "/proc/$pid/cmdline"; } 2>/dev/null || echo 'N/A')"
        echo "Memory (VmRSS): $(grep VmRSS "/proc/$pid/status" 2>/dev/null || echo 'N/A')"
        echo "Memory (VmSize): $(grep VmSize "/proc/$pid/status" 2>/dev/null || echo 'N/A')"
        echo "Threads: $(grep Threads "/proc/$pid/status" 2>/dev/null || echo 'N/A')"
        # Report N/A only when the fd directory cannot be listed; an empty
        # listing is a genuine count of 0.
        if fd_entries="$(ls -A "/proc/$pid/fd" 2>/dev/null)"; then
          fd_count="$(grep -c . <<< "$fd_entries" || true)"
        else
          fd_count='N/A'
        fi
        echo "File descriptors: ${fd_count}"
        echo ""
      done
    else
      echo "No databend-meta processes currently running"
    fi
    echo ""
  } >> "$REPORT_FILE"

  if [[ -n "$DATABEND_PROCESSES" ]]; then
    log_result "Found ${process_count} databend-meta processes"
  else
    log_warning "No databend-meta processes currently running"
  fi
}

# Check system resource limits and usage
#######################################
# Append CPU, load, disk, inode, per-process memory and limit information
# to the report.
# Globals:   DATABEND_PROCESSES (read: whitespace-separated PIDs)
#            REPORT_FILE        (file is appended to)
# Arguments: none
# Outputs:   sections in REPORT_FILE; progress lines via log_step/log_result
#######################################
check_system_resources() {
  log_step "Checking system resource usage and limits"

  local pid=""
  {
    echo "=== CPU USAGE ==="
    # 'sed -n 1,20p' reads its whole input, so the producer never receives
    # SIGPIPE; with 'head' an early close can fail the pipeline under
    # 'set -o pipefail' and abort the run.
    { top -bn1 2>/dev/null || echo "Unable to run top"; } | sed -n '1,20p'
    echo ""

    echo "=== LOAD AVERAGE ==="
    cat /proc/loadavg 2>/dev/null || echo "N/A"
    echo ""

    echo "=== DISK USAGE ==="
    df -h 2>/dev/null || echo "Unable to run df -h"
    echo ""

    echo "=== INODE USAGE ==="
    df -i 2>/dev/null || echo "Unable to run df -i"
    echo ""

    echo "=== MEMORY USAGE BY PROCESS ==="
    { ps aux --sort=-%mem 2>/dev/null || echo "Unable to run ps"; } | sed -n '1,20p'
    echo ""

    echo "=== SYSTEM LIMITS ==="
    echo "Max open files (system): $(cat /proc/sys/fs/file-max 2>/dev/null || echo 'N/A')"
    echo "Current open files: $(cut -f1 /proc/sys/fs/file-nr 2>/dev/null || echo 'N/A')"
    echo "Max processes: $(cat /proc/sys/kernel/pid_max 2>/dev/null || echo 'N/A')"
    echo "Max memory map areas: $(cat /proc/sys/vm/max_map_count 2>/dev/null || echo 'N/A')"
    echo ""

    if [[ -n "${DATABEND_PROCESSES//[[:space:]]/}" ]]; then
      echo "=== DATABEND PROCESS LIMITS ==="
      # shellcheck disable=SC2086 # word splitting on the PID list is intended
      for pid in $DATABEND_PROCESSES; do
        # Skip PIDs whose process has already exited.
        [[ -d "/proc/$pid" ]] || continue
        echo "PID $pid limits:"
        grep -E "open files|processes|address space" "/proc/$pid/limits" 2>/dev/null \
          || echo "Unable to read limits"
        echo ""
      done
    fi
  } >> "$REPORT_FILE"

  log_result "System resource information collected"
}

# Check databend-meta logs
#######################################
# Look for databend-meta log files in a list of candidate directories and
# append recent error lines from the first directory that has any. When no
# directory has logs, fall back to the systemd journal of the
# 'databend-meta' unit.
# Globals:   REPORT_FILE (file is appended to)
# Arguments: none
# Outputs:   sections in REPORT_FILE; progress lines via log_step/log_result
#######################################
check_databend_logs() {
  log_step "Analyzing databend-meta logs"

  # Common log locations to check
  local -a log_paths=(
    "/var/log/databend"
    "/opt/databend/logs"
    "/usr/local/databend/logs"
  )
  # The per-user location is a glob and has to be expanded unquoted; inside
  # quotes it would stay a literal '*' path that never exists. When nothing
  # matches, the literal pattern is kept so it still shows up in the
  # "Checked locations" message.
  local home_logs=""
  for home_logs in /home/*/databend/logs; do
    log_paths+=("$home_logs")
  done
  log_paths+=("$(pwd)/logs" "./logs")

  local found_logs=false log_path="" logfile="" errors="" journal=""
  local -a logfiles=()
  {
    echo "=== DATABEND META LOG ANALYSIS ==="

    for log_path in "${log_paths[@]}"; do
      [[ -d "$log_path" ]] || continue

      # Run find once per directory; NUL-delimited so unusual file names
      # survive intact.
      logfiles=()
      while IFS= read -r -d '' logfile; do
        logfiles+=("$logfile")
      done < <(find "$log_path" \( -name "*meta*log*" -o -name "*databend*log*" \) -print0 2>/dev/null)
      (( ${#logfiles[@]} > 0 )) || continue

      echo "Found logs in: $log_path"
      printf '%s\n' "${logfiles[@]:0:10}"
      echo ""
      found_logs=true

      # Analyze recent errors in logs
      echo "=== RECENT ERRORS IN LOGS (last 100 lines) ==="
      for logfile in "${logfiles[@]}"; do
        # find may also return matching directories; only read regular files.
        [[ -f "$logfile" ]] || continue
        echo "Analyzing: $logfile"
        errors="$(tail -n 100 -- "$logfile" 2>/dev/null \
          | grep -i -E "error|panic|fatal|oom|memory|fail" \
          | tail -n 20 || true)"
        if [[ -n "$errors" ]]; then
          printf '%s\n' "$errors"
        else
          echo "No recent errors found"
        fi
        echo ""
      done
      # Only the first location that contains logs is analyzed.
      break
    done

    if [[ "$found_logs" == false ]]; then
      echo "No databend-meta log files found in standard locations"
      echo "Checked locations: ${log_paths[*]}"
      echo ""

      # Try to find logs using systemd if service is running
      echo "=== CHECKING SYSTEMD LOGS ==="
      journal="$(journalctl -u databend-meta --since "24 hours ago" --no-pager 2>/dev/null \
        | tail -n 100 || true)"
      if [[ -n "$journal" ]]; then
        printf '%s\n' "$journal"
      else
        echo "No systemd logs found for databend-meta service"
      fi
      echo ""
    fi
  } >> "$REPORT_FILE"

  log_result "Log analysis completed"
}

# Check network and connectivity
#######################################
# Append listening ports, per-process network connections and firewall
# status to the report.
# Globals:   DATABEND_PROCESSES (read: whitespace-separated PIDs)
#            REPORT_FILE        (file is appended to)
# Arguments: none
# Outputs:   sections in REPORT_FILE; progress lines via log_step/log_result
#######################################
check_network() {
  log_step "Checking network configuration and connectivity"

  local listening="" connections="" rules="" pid=""
  {
    echo "=== NETWORK CONFIGURATION ==="
    listening="$(ss -tlnp 2>/dev/null | grep -E ":9191|:8080|:3307|:8000" || true)"
    if [[ -n "$listening" ]]; then
      printf '%s\n' "$listening"
    else
      echo "No databend-related ports found listening"
    fi
    echo ""

    echo "=== NETWORK CONNECTIONS ==="
    if [[ -n "${DATABEND_PROCESSES//[[:space:]]/}" ]]; then
      # shellcheck disable=SC2086 # word splitting on the PID list is intended
      for pid in $DATABEND_PROCESSES; do
        echo "Connections for PID $pid:"
        # lsof ORs its selection options by default; '-a' ANDs them so only
        # the network files of this PID are listed. sed (unlike head) reads
        # all of its input, so lsof is never cut off by SIGPIPE.
        connections="$(lsof -a -p "$pid" -i 2>/dev/null | sed -n '1,20p' || true)"
        if [[ -n "$connections" ]]; then
          printf '%s\n' "$connections"
        else
          echo "Unable to check connections for PID $pid"
        fi
        echo ""
      done
    fi

    echo "=== FIREWALL STATUS ==="
    if command -v ufw > /dev/null 2>&1; then
      ufw status 2>/dev/null || echo "UFW not active"
    elif command -v firewall-cmd > /dev/null 2>&1; then
      firewall-cmd --state 2>/dev/null || echo "Firewalld not active"
    elif command -v iptables > /dev/null 2>&1; then
      # stderr is silenced on iptables itself, the stage that can fail.
      rules="$(iptables -L -n 2>/dev/null | sed -n '1,20p' || true)"
      if [[ -n "$rules" ]]; then
        printf '%s\n' "$rules"
      else
        echo "Unable to check iptables"
      fi
    else
      echo "No common firewall tools found"
    fi
    echo ""
  } >> "$REPORT_FILE"

  log_result "Network analysis completed"
}

# Generate final diagnostic report
#######################################
# Append the DIAGNOSTIC SUMMARY (critical findings, recommendations and
# next steps) to the report.
# Globals:   DATABEND_PROCESSES (read: whitespace-separated PIDs)
#            REPORT_FILE        (read; file is appended to)
# Arguments: none
# Outputs:   summary in REPORT_FILE; progress lines via log_step/log_result
#######################################
generate_final_report() {
  log_step "Generating final diagnostic summary"

  # Check for OOM issues. 'grep -c' prints 0 and exits 1 when nothing
  # matches, hence the '|| true'; oom_count is always a number.
  local kernel_log="" oom_count=0
  kernel_log="$(dmesg 2>/dev/null || true)"
  oom_count="$(grep -i 'killed process' <<< "$kernel_log" | grep -ic 'databend' || true)"

  # Check memory pressure: "available" as a percentage of "total". The row
  # is selected by its label under LC_ALL=C rather than by line number, and
  # rows without an "available" column or with a zero total produce no
  # value instead of a bogus one or a division by zero.
  local mem_available=""
  mem_available="$(LC_ALL=C free -m 2>/dev/null \
    | awk '$1 == "Mem:" && NF >= 7 && $2 > 0 { printf "%.0f", $7 / $2 * 100 }' || true)"

  # Check disk space. '-P' keeps every filesystem on a single line, so the
  # percentage really is field 5 of line 2.
  local disk_usage=""
  disk_usage="$(df -P / 2>/dev/null | awk 'NR == 2 { gsub(/%/, "", $5); print $5 }' || true)"

  # Split the PID list on any whitespace to count the processes.
  local -a pids=()
  read -r -d '' -a pids <<< "$DATABEND_PROCESSES" || true

  # An absolute REPORT_FILE already is the full location.
  local report_location=""
  case "$REPORT_FILE" in
    /*) report_location="$REPORT_FILE" ;;
    *) report_location="$(pwd)/$REPORT_FILE" ;;
  esac

  {
    echo ""
    echo "================================================================================"
    echo "DIAGNOSTIC SUMMARY"
    echo "================================================================================"
    echo ""

    echo "=== CRITICAL FINDINGS ==="

    if (( oom_count > 0 )); then
      echo "❌ CRITICAL: $oom_count databend processes killed by OOM killer"
    fi

    if [[ "$mem_available" =~ ^[0-9]+$ ]]; then
      if (( mem_available < 10 )); then
        echo "❌ CRITICAL: Very low available memory (${mem_available}%)"
      elif (( mem_available < 20 )); then
        echo "⚠️ WARNING: Low available memory (${mem_available}%)"
      fi
    else
      echo "⚠️ WARNING: Unable to determine available memory"
    fi

    # Check if processes are running
    if (( ${#pids[@]} == 0 )); then
      echo "❌ CRITICAL: No databend-meta processes currently running"
    else
      echo "✅ INFO: ${#pids[@]} databend-meta processes running"
    fi

    if [[ "$disk_usage" =~ ^[0-9]+$ ]]; then
      if (( disk_usage > 90 )); then
        echo "❌ CRITICAL: Root filesystem ${disk_usage}% full"
      elif (( disk_usage > 80 )); then
        echo "⚠️ WARNING: Root filesystem ${disk_usage}% full"
      fi
    else
      echo "⚠️ WARNING: Unable to determine root filesystem usage"
    fi

    echo ""
    echo "=== RECOMMENDATIONS ==="
    echo "1. Check system logs for OOM killer activity"
    echo "2. Monitor memory usage during peak loads"
    echo "3. Consider increasing system memory or optimizing databend-meta configuration"
    echo "4. Verify databend-meta service configuration and startup scripts"
    echo "5. Check application logs for query execution patterns"
    echo ""

    echo "=== NEXT STEPS ==="
    echo "1. Review the detailed findings above"
    echo "2. Share this report with databend support team"
    echo "3. Consider implementing monitoring for memory usage"
    echo "4. Set up log rotation if not already configured"
    echo ""

    echo "Report generated: $(date '+%Y-%m-%d %H:%M:%S')"
    echo "Report location: ${report_location}"

  } >> "$REPORT_FILE"

  log_result "Final diagnostic report generated"
}

# Main execution
#######################################
# Run every diagnostic stage in order and tell the user where the report is.
# Globals:   BLUE, GREEN, YELLOW, NC, REPORT_FILE (read)
# Arguments: accepted for forward compatibility; currently unused
# Outputs:   banner and closing instructions on stdout
#######################################
main() {
  printf '%b\n' "${BLUE}Databend Meta Server Diagnostic Tool${NC}"
  echo "======================================"

  create_temp_dir
  init_report

  # check_databend_processes must run before the stages that read
  # DATABEND_PROCESSES (resources, network, final report).
  check_system_info
  check_memory_oom
  check_databend_processes
  check_system_resources
  check_databend_logs
  check_network
  generate_final_report

  # The file name goes through %s so it is printed verbatim; only the color
  # variables are escape-expanded via %b.
  echo ""
  printf '%b\n' "${GREEN}Diagnostic completed successfully!${NC}"
  printf 'Report saved to: %b%s%b\n' "$YELLOW" "$REPORT_FILE" "$NC"
  echo ""
  printf '%b\n' "${BLUE}To view the report:${NC}"
  printf 'cat %s\n' "$REPORT_FILE"
  echo ""
  printf '%b\n' "${BLUE}To share with support:${NC}"
  printf 'Send the file: %s\n' "$REPORT_FILE"
}

# Run main function
main "$@"