1+ #! /bin/bash
2+ # Health check script for EKS Helm Client GitHub Action
3+ # This script verifies that the container is healthy and ready to execute
4+
set -e

# Exit statuses returned by the health checks (0 = healthy, 1 = unhealthy)
declare -r HEALTH_OK=0
declare -r HEALTH_ERROR=1

# Overall time budget in seconds for one health check run; it should stay
# below the Docker healthcheck timeout
declare -r HEALTH_TIMEOUT=8
13+
# Color codes for output. Every log helper in this script writes to stderr,
# so colors are enabled only when stderr (fd 2) is a terminal. When it is not,
# the codes are empty strings so no stray characters end up in captured logs.
if [[ -t 2 ]]; then
  readonly RED='\033[0;31m'
  readonly GREEN='\033[0;32m'
  readonly YELLOW='\033[1;33m'
  readonly NC='\033[0m'
else
  readonly RED=''
  readonly GREEN=''
  readonly YELLOW=''
  readonly NC=''
fi
26+
27+ # Logging functions
# Print an informational health message to stderr.
# Arguments: $1 - message text, printed verbatim after the "[HEALTH]" tag
# Outputs:   one line on stderr; nothing on stdout
log_health_info() {
  # printf instead of echo: a message such as "-n" must not be read as an option
  printf '[HEALTH] %s\n' "${1:-}" >&2
}
31+
# Print an error health message to stderr, tagged "[HEALTH ERROR]".
# Globals:   RED, NC (read) - color escape codes; may be empty or unset
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stderr; nothing on stdout
log_health_error() {
  # %b expands the escape sequences stored in the color variables only; the
  # message goes through %s so backslashes in it are never interpreted.
  printf '%b[HEALTH ERROR]%b %s\n' "${RED:-}" "${NC:-}" "${1:-}" >&2
}
35+
# Print a success health message to stderr, tagged "[HEALTH OK]".
# Globals:   GREEN, NC (read) - color escape codes; may be empty or unset
# Arguments: $1 - message text, printed verbatim
# Outputs:   one line on stderr; nothing on stdout
log_health_ok() {
  # %b expands the escape sequences stored in the color variables only; the
  # message goes through %s so backslashes in it are never interpreted.
  printf '%b[HEALTH OK]%b %s\n' "${GREEN:-}" "${NC:-}" "${1:-}" >&2
}
39+
# Check whether a command name can be resolved by the shell.
# Arguments:
#   $1 - command name to look up
#   $2 - human-readable description for log output (defaults to $1)
# Outputs:   one log line via log_health_info / log_health_error
# Returns:   0 if `command -v` resolves the name, 1 otherwise
check_command() {
  local cmd=$1
  local description=${2:-$cmd}

  # The name must be passed exactly as given: any padding inside the quotes
  # would make the lookup search for a command whose name contains spaces.
  if command -v "$cmd" >/dev/null 2>&1; then
    log_health_info "✓ $description is available"
    return 0
  fi

  log_health_error "✗ $description is not available or not executable"
  return 1
}
53+
# Check that a command actually runs by invoking its version sub-command.
# Arguments:
#   $1 - command to execute
#   $2 - version arguments as one space-separated string, for example
#        "--version" or "version --client" (defaults to "--version")
#   $3 - human-readable description for log output (defaults to $1)
# Outputs:   one log line via log_health_info / log_health_error
# Returns:   0 if the command exits successfully within 3 seconds, 1 otherwise
check_command_version() {
  local cmd=$1
  local version_flag=${2:---version}
  local description=${3:-$cmd}
  local -a version_args=()

  # Callers pass multi-word invocations as a single string. Split it into
  # separate arguments; passed as one word, the tool would receive a single
  # unknown argument such as "version --client" and fail.
  IFS=' ' read -r -a version_args <<<"$version_flag"

  if timeout 3 "$cmd" "${version_args[@]}" >/dev/null 2>&1; then
    log_health_info "✓ $description is functional"
    return 0
  fi

  log_health_error "✗ $description failed to execute or return version"
  return 1
}
68+
# Check filesystem health: the working directories must exist and be
# writable, and a file must be creatable and removable under /tmp.
# Arguments: optional list of directories to check; when none are given the
#            default set (/opt/kubernetes, /opt/helm, /opt/scripts, /tmp) is used
# Outputs:   one log line per check via log_health_info / log_health_error
# Returns:   0 if every check passes, 1 if any check fails
check_filesystem() {
  log_health_info "Checking filesystem health..."

  local -a dirs=("$@")
  if (( ${#dirs[@]} == 0 )); then
    dirs=("/opt/kubernetes" "/opt/helm" "/opt/scripts" "/tmp")
  fi
  local dir
  local failed=0

  # Check if required directories exist and are writable
  for dir in "${dirs[@]}"; do
    if [[ -d "$dir" && -w "$dir" ]]; then
      log_health_info "✓ Directory $dir is accessible and writable"
    else
      log_health_error "✗ Directory $dir is not accessible or not writable"
      failed=1
    fi
  done

  # Test write capability in temp directory. mktemp picks an unpredictable
  # name, so the probe cannot collide with (or be redirected through) a
  # pre-existing /tmp entry the way a fixed "<name>-$$" path could.
  local probe_file=""
  if probe_file=$(mktemp /tmp/health-check.XXXXXX 2>/dev/null) \
    && echo "health-check" >"$probe_file" 2>/dev/null \
    && rm -f -- "$probe_file" 2>/dev/null; then
    log_health_info "✓ Filesystem write test passed"
  else
    # Best-effort cleanup if the probe was created but a later step failed
    if [[ -n "$probe_file" ]]; then
      rm -f -- "$probe_file" 2>/dev/null || true
    fi
    log_health_error "✗ Filesystem write test failed"
    failed=1
  fi

  return "$failed"
}
97+
# Check memory and basic system resources.
# Reads MemAvailable from /proc/meminfo (when that file exists) and the free
# space of /tmp from df. A value that cannot be determined is treated as 0,
# so it is reported as a shortage instead of breaking the arithmetic.
# Outputs:   one log line per check via log_health_info / log_health_error
# Returns:   0 if all checks pass, 1 if any check fails
check_system_resources() {
  log_health_info "Checking system resources..."

  local failed=0

  # Check available memory (must be more than 50MB)
  if [[ -f /proc/meminfo ]]; then
    local available_mem
    available_mem=$(awk '/^MemAvailable:/ {print $2; exit}' /proc/meminfo 2>/dev/null) \
      || available_mem=""
    # awk exits successfully even when nothing matched, so validate the value
    # itself rather than relying on the exit status.
    [[ "$available_mem" =~ ^[0-9]+$ ]] || available_mem=0
    local available_mb=$(( available_mem / 1024 ))

    if (( available_mb > 50 )); then
      log_health_info "✓ Sufficient memory available: ${available_mb}MB"
    else
      log_health_error "✗ Low memory: ${available_mb}MB available"
      failed=1
    fi
  fi

  # Check disk space in /tmp (must be more than 100MB).
  # -P keeps each filesystem on a single line; -k forces 1024-byte blocks.
  local available_space
  available_space=$(df -Pk /tmp 2>/dev/null | awk 'NR==2 {print $4}') \
    || available_space=""
  [[ "$available_space" =~ ^[0-9]+$ ]] || available_space=0
  local available_space_mb=$(( available_space / 1024 ))

  if (( available_space_mb > 100 )); then
    log_health_info "✓ Sufficient disk space: ${available_space_mb}MB"
  else
    log_health_error "✗ Low disk space: ${available_space_mb}MB available"
    failed=1
  fi

  return "$failed"
}
130+
# Check that the tools this container relies on are installed and runnable.
# Basic utilities only need to resolve via check_command. The cloud and
# Kubernetes CLIs must also run their version sub-command successfully.
# eksctl is optional: its absence is logged but is not a failure.
# Outputs:   log lines from check_command / check_command_version
# Returns:   0 if every required tool passes, 1 otherwise
check_required_tools() {
  log_health_info "Checking required tools..."

  local failed=0

  # Check basic commands
  check_command "bash" "Bash shell" || failed=1
  check_command "curl" "curl" || failed=1
  check_command "jq" "jq JSON processor" || failed=1
  check_command "envsubst" "envsubst" || failed=1
  check_command "nc" "netcat" || failed=1

  # Tools that must exist AND execute. Format: command|version arguments|description
  # kubectl uses "version --client" without --short: that flag was removed in
  # newer kubectl releases, where the short form is already the default.
  local -a versioned_tools=(
    "aws|--version|AWS CLI"
    "kubectl|version --client|kubectl"
    "helm|version --short|Helm"
    "aws-iam-authenticator|version|AWS IAM Authenticator"
  )
  local entry tool_cmd tool_args tool_desc
  for entry in "${versioned_tools[@]}"; do
    IFS='|' read -r tool_cmd tool_args tool_desc <<<"$entry"
    if check_command "$tool_cmd" "$tool_desc"; then
      check_command_version "$tool_cmd" "$tool_args" "$tool_desc" || failed=1
    else
      failed=1
    fi
  done

  # Check eksctl (optional but should be present): a missing binary is only
  # noted, but an installed binary that cannot run counts as a failure.
  if check_command "eksctl" "eksctl"; then
    check_command_version "eksctl" "version" "eksctl" || failed=1
  else
    log_health_info "⚠ eksctl not available (optional)"
  fi

  return "$failed"
}
181+
# Check that the configuration templates are readable and contain the
# ${CLUSTER_NAME} and ${REGION_CODE} placeholders.
# Arguments: optional list of template paths; when none are given,
#            /config.template and /private-config.template are checked
# Outputs:   log lines via log_health_info / log_health_error
# Returns:   0 if every template passes, 1 otherwise
check_templates() {
  log_health_info "Checking configuration templates..."

  local failed=0
  local -a templates=("$@")
  if (( ${#templates[@]} == 0 )); then
    templates=("/config.template" "/private-config.template")
  fi
  local template

  for template in "${templates[@]}"; do
    if [[ -f "$template" && -r "$template" ]]; then
      log_health_info "✓ Template $template is accessible"

      # Basic validation that template contains expected variables.
      # -F matches the placeholders as fixed strings, so "$", "{" and "}"
      # need no regex escaping. The single quotes are deliberate: the text
      # must not be expanded by the shell.
      # shellcheck disable=SC2016
      if grep -qF -- '${CLUSTER_NAME}' "$template" \
        && grep -qF -- '${REGION_CODE}' "$template"; then
        log_health_info "✓ Template $template contains required variables"
      else
        log_health_error "✗ Template $template missing required variables"
        failed=1
      fi
    else
      log_health_error "✗ Template $template is not accessible"
      failed=1
    fi
  done

  return "$failed"
}
208+
# Check that the helper scripts exist as regular files and are executable.
# Arguments: optional list of script paths; when none are given,
#            /entrypoint.sh, /cleanup.sh and /setup-tools.sh are checked
# Outputs:   one log line per script via log_health_info / log_health_error
# Returns:   0 if every script passes, 1 otherwise
check_scripts() {
  log_health_info "Checking script dependencies..."

  local failed=0
  local -a scripts=("$@")
  if (( ${#scripts[@]} == 0 )); then
    scripts=("/entrypoint.sh" "/cleanup.sh" "/setup-tools.sh")
  fi
  local script

  for script in "${scripts[@]}"; do
    if [[ -f "$script" && -x "$script" ]]; then
      log_health_info "✓ Script $script is executable"
    else
      log_health_error "✗ Script $script is not executable or missing"
      failed=1
    fi
  done

  return "$failed"
}
227+
# Check environment readiness: a scratch directory can be created inside the
# working directory, a file can be written into it, and it can be removed.
# Arguments: $1 - base directory to probe (defaults to /opt/scripts)
# Outputs:   log lines via log_health_info / log_health_error
# Returns:   0 if the file operations succeed, 1 otherwise
check_environment_readiness() {
  local base_dir=${1:-/opt/scripts}

  log_health_info "Checking environment readiness..."

  # mktemp -d creates a uniquely named directory and fails if the base
  # directory is missing; it never creates the base directory as a side effect.
  local test_dir=""
  local status=1
  if test_dir=$(mktemp -d "${base_dir}/health-test.XXXXXX" 2>/dev/null); then
    if echo "test" >"${test_dir}/test.txt" 2>/dev/null \
      && [[ -f "${test_dir}/test.txt" ]]; then
      status=0
    fi
    # ${test_dir:?} guards against ever running rm -rf on an empty path
    rm -rf -- "${test_dir:?}" 2>/dev/null || true
  fi

  if (( status == 0 )); then
    log_health_info "✓ Environment file operations work"
    return 0
  fi

  log_health_error "✗ Environment file operations failed"
  return 1
}
248+
# Validate environment variables that the tooling depends on.
# PATH must contain /usr/local/bin as a whole entry. HELM_HOME is optional,
# but when it is set to a non-empty value it must name an existing directory.
# Globals:   PATH, HELM_HOME (read)
# Outputs:   log lines via log_health_info / log_health_error
# Returns:   0 if all checks pass, 1 otherwise
check_environment_variables() {
  log_health_info "Checking environment configuration..."

  local failed=0

  # Wrap PATH in colons so the entry is matched as a whole component, not as
  # a substring of some longer directory name (a trailing slash is accepted).
  if [[ ":${PATH}:" == *":/usr/local/bin:"* || ":${PATH}:" == *":/usr/local/bin/:"* ]]; then
    log_health_info "✓ PATH includes /usr/local/bin"
  else
    log_health_error "✗ PATH missing /usr/local/bin"
    failed=1
  fi

  # Check Helm environment variables if they exist. The expansion must not be
  # padded: any extra character inside the quotes makes -n always true.
  if [[ -n "${HELM_HOME:-}" ]]; then
    if [[ -d "${HELM_HOME}" ]]; then
      log_health_info "✓ HELM_HOME directory exists: ${HELM_HOME}"
    else
      log_health_error "✗ HELM_HOME directory doesn't exist: ${HELM_HOME}"
      failed=1
    fi
  fi

  return "$failed"
}
273+
# Full health check: runs every individual check in a fixed order and
# reports one overall verdict. A failing check does not stop the run; the
# remaining checks still execute so all problems are logged.
# Returns: HEALTH_OK when every check passed, HEALTH_ERROR otherwise
main_health_check() {
  log_health_info " Starting container health check..."

  # Checks are executed in exactly this order
  local -a checks=(
    check_filesystem
    check_system_resources
    check_required_tools
    check_templates
    check_scripts
    check_environment_readiness
    check_environment_variables
  )
  local check_fn
  local failures=0

  for check_fn in "${checks[@]}"; do
    "$check_fn" || failures=1
  done

  # Report the combined verdict
  if [[ $failures -ne 0 ]]; then
    log_health_error " Container health check failed"
    return $HEALTH_ERROR
  fi

  log_health_ok " Container health check passed"
  return $HEALTH_OK
}
298+
# Quick health check (for frequent docker health checks).
# Only verifies the most critical pieces: kubectl, helm and aws resolve via
# `command -v`, and a file can be created and removed under /tmp.
# Outputs:   nothing
# Returns:   0 if all quick checks pass, 1 otherwise
quick_health_check() {
  local failed=0
  local tool

  # Check if critical commands exist
  for tool in kubectl helm aws; do
    command -v "$tool" >/dev/null 2>&1 || failed=1
  done

  # Check if we can write to temp. mktemp picks an unpredictable file name,
  # unlike a fixed "/tmp/<name>-$$" path that another process could pre-create.
  local probe_file=""
  if probe_file=$(mktemp /tmp/quick-health.XXXXXX 2>/dev/null) \
    && echo "test" >"$probe_file" 2>/dev/null; then
    rm -f -- "$probe_file" 2>/dev/null || failed=1
  else
    failed=1
    # Best-effort cleanup if the file was created but could not be written
    if [[ -n "$probe_file" ]]; then
      rm -f -- "$probe_file" 2>/dev/null || true
    fi
  fi

  return "$failed"
}
314+
# Timeout handler: logs that the time budget (HEALTH_TIMEOUT seconds) ran
# out, then terminates the script with the HEALTH_ERROR status.
timeout_handler() {
  local budget=${HEALTH_TIMEOUT}

  log_health_error " Health check timed out after ${budget} seconds"
  exit $HEALTH_ERROR
}
320+
# Set up timeout
trap timeout_handler TERM

# timeout(1) runs the checks in a brand-new bash process. Shell functions and
# unexported variables are not inherited by child processes, so export every
# function defined so far together with the constants they read. Without this
# the child shell fails with "command not found".
export HEALTH_OK HEALTH_ERROR HEALTH_TIMEOUT RED GREEN YELLOW NC
while read -r _ _ health_fn; do
  export -f "$health_fn"
done < <(declare -F)
unset health_fn

# "--" becomes $0 of the inline script, so the script's own arguments start at $1
timeout "$HEALTH_TIMEOUT" bash -c '
  # Check if quick mode is requested (for Docker health checks)
  if [[ "${1:-}" == "--quick" ]]; then
    quick_health_check
  else
    main_health_check
  fi
' -- "$@" &

# Capture the status through "||": under "set -e" a bare failing "wait" would
# end the script before the status could be examined.
exit_code=0
wait "$!" || exit_code=$?

# Clean exit
trap - TERM

# timeout(1) exits with status 124 when the time limit was reached; report it
# and leave with HEALTH_ERROR like every other failure.
if (( exit_code == 124 )); then
  timeout_handler
fi

exit "$exit_code"
0 commit comments