@@ -2,10 +2,13 @@ services:
22 db :
33 image : postgres:13.4
44 healthcheck :
5- test : [ "CMD-SHELL", "pg_isready -U postgres || exit 1" ]
6- interval : 10s
7- timeout : 5s
8- retries : 60
5+ # Two-step probe: pg_isready confirms the server accepts connections, then psql runs SELECT 1 to prove a query actually executes
6+ # The ${VAR:-wildbook} forms fall back to "wildbook" when WILDBOOK_DB_USER / WILDBOOK_DB_NAME are unset
7+ test : [ "CMD-SHELL", "pg_isready -U ${WILDBOOK_DB_USER:-wildbook} -d ${WILDBOOK_DB_NAME:-wildbook} && psql -U ${WILDBOOK_DB_USER:-wildbook} -d ${WILDBOOK_DB_NAME:-wildbook} -c 'SELECT 1' || exit 1" ]
8+ interval : 30s
9+ timeout : 10s
10+ retries : 3
11+ start_period : 60s
912 labels :
1013 - autoheal=true
1114 user : postgres
@@ -74,10 +77,13 @@ services:
7477 opensearch :
7578 image : opensearchproject/opensearch:2.15.0
7679 healthcheck :
77- test : [ "CMD-SHELL", "curl --silent --fail 127.0.0.1:9200/_cluster/health || exit 1" ]
78- interval : 10s
79- timeout : 5s
80- retries : 60
80+ # Healthy only when /_cluster/health responds and reports status green or yellow; a red status or a failed curl marks the container unhealthy
81+ # Green = all shards allocated, Yellow = all primaries allocated but some replicas are not, Red = at least one primary shard is unallocated
82+ test : [ "CMD-SHELL", "curl --silent --fail 127.0.0.1:9200/_cluster/health | grep -qE '\"status\":\"(green|yellow)\"' || exit 1" ]
83+ interval : 30s
84+ timeout : 10s
85+ retries : 3
86+ start_period : 120s
8187 labels :
8288 - autoheal=true
8389 volumes :
@@ -108,6 +114,15 @@ services:
108114 # TODO dkim and spf needs to be added/supported
109115 smtp :
110116 image : boky/postfix
117+ healthcheck :
118+ # Check if the postfix master process is running: pgrep -x succeeds only when a process is named exactly "master"
119+ test : [ "CMD-SHELL", "pgrep -x master || exit 1" ]
120+ interval : 60s
121+ timeout : 10s
122+ retries : 3
123+ start_period : 30s
124+ labels :
125+ - autoheal=true
111126 networks :
112127 - intranet
113128 ports :
@@ -125,11 +140,14 @@ services:
125140 image : nginx:1.23.4
126141 depends_on :
127142 - wildbook
128- # healthcheck:
129- # test: [ "CMD", "curl", "-f", "http://localhost:84/"]
130- # interval: 10s
131- # timeout: 5s
132- # retries: 60
143+ healthcheck :
144+ # Check that the nginx config is valid (nginx -t) and that the process recorded in the pid file is alive (kill -0); no HTTP probe is made
145+ # nginx:1.23.4 doesn't include curl (per the original note), so we check the pid file; $$ is the Compose escape for a literal $ so the container shell sees $(...)
146+ test : [ "CMD-SHELL", "nginx -t 2>/dev/null && kill -0 $$(cat /var/run/nginx.pid 2>/dev/null) || exit 1" ]
147+ interval : 30s
148+ timeout : 10s
149+ retries : 3
150+ start_period : 30s
133151 labels :
134152 - autoheal=true
135153 volumes :
0 commit comments