Skip to content

Commit e89d5c9

Browse files
authored
Abstracted wait-script to use label names and variables for time limits. (#252)
1 parent 3e66d4b commit e89d5c9

File tree

4 files changed

+214
-95
lines changed

4 files changed

+214
-95
lines changed

.github/workflows/helm-tests.yml

Lines changed: 142 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -95,8 +95,21 @@ jobs:
9595
run: |
9696
echo "===== Jobs ====="
9797
kubectl get jobs -o wide
98+
echo ""
99+
echo "===== Job Status Details ====="
100+
kubectl get jobs -o custom-columns='NAME:.metadata.name,COMPLETIONS:.spec.completions,SUCCESSFUL:.status.succeeded,FAILED:.status.failed,AGE:.metadata.creationTimestamp'
101+
echo ""
102+
echo "===== All Pods ====="
103+
kubectl get pods -o wide
104+
echo ""
98105
echo "===== Pods (pgstac) ====="
99106
kubectl get pods | grep -i pgstac || true
107+
echo ""
108+
echo "===== Pod Phase Summary ====="
109+
kubectl get pods --no-headers | awk '{print $3}' | sort | uniq -c
110+
echo ""
111+
# Heading reflects the `tail -20` applied to the event listing that follows (no time filter is used).
echo "===== Events (most recent 20) ====="
112+
kubectl get events --sort-by='.lastTimestamp' | tail -20
100113
101114
- name: debug pgstac-eoapi-superuser-init-db job failure
102115
if: steps.helm-render-install-eoapi-templates.outcome == 'failure'
@@ -141,87 +154,165 @@ jobs:
141154
if: steps.helm-render-install-eoapi-templates.outcome == 'failure'
142155
continue-on-error: true
143156
run: |
144-
echo "Extracting pgstac-migrate job info and logs for debugging..."
157+
echo "Extracting comprehensive pgstac-migrate job info and logs for debugging..."
145158
146-
# Get job details
147-
echo "===== pgstac-migrate Job Details ====="
148-
kubectl get job "$RELEASE_NAME-pgstac-migrate" -o yaml || echo "Could not get pgstac-migrate job details"
159+
# Get all jobs with details
160+
echo "===== All Jobs Status ====="
161+
kubectl get jobs -o wide
162+
echo ""
149163
150-
# Get pod details
151-
echo "===== pgstac-migrate Pod Details ====="
152-
kubectl get pods -l app=pgstac-migrate --all-namespaces || echo "Could not find pgstac-migrate pods"
164+
# Get specific job details using labels
165+
echo "===== pgstac-migrate Job Details (by label) ====="
166+
kubectl get jobs -l app=pgstac-migrate -o yaml || echo "Could not get pgstac-migrate job details"
167+
echo ""
168+
169+
# Get pod details by app label - first across all namespaces, then in the current namespace
170+
echo "===== pgstac-migrate Pod Details (by label) ====="
171+
kubectl get pods -l app=pgstac-migrate --all-namespaces -o wide || echo "Could not find pgstac-migrate pods by label"
172+
echo ""
173+
174+
echo "===== pgstac-migrate Pod Details (by app label) ====="
175+
kubectl get pods -l app=pgstac-migrate -o wide || echo "Could not find pgstac-migrate pods by app label"
176+
echo ""
153177
154-
# Extract logs from pgstac-migrate pod(s)
178+
# Extract logs from all pgstac-migrate pods (running, completed, failed)
155179
echo "===== pgstac-migrate Pod Logs ====="
156-
echo "Looking for completed pods from pgstac-migrate job..."
157-
COMPLETED_PODS=$(kubectl get pods --selector=job-name="$RELEASE_NAME-pgstac-migrate" --field-selector=status.phase=Succeeded,status.phase=Failed -o jsonpath='{.items[*].metadata.name}' 2>/dev/null)
158-
if [ -n "$COMPLETED_PODS" ]; then
159-
echo "Found completed pods from pgstac-migrate job. Extracting logs from each:"
160-
for POD in $COMPLETED_PODS; do
161-
echo "--- Logs from completed pod $POD ---"
162-
kubectl logs pod/$POD || echo "Could not get logs from pod $POD"
180+
ALL_PODS=$(kubectl get pods -l app=pgstac-migrate -o jsonpath='{.items[*].metadata.name}' 2>/dev/null)
181+
if [ -n "$ALL_PODS" ]; then
182+
echo "Found pgstac-migrate job pods. Extracting logs from each:"
183+
for POD in $ALL_PODS; do
184+
echo "--- Pod $POD status ---"
185+
kubectl get pod "$POD" -o wide
186+
echo "--- Logs from pod $POD ---"
187+
kubectl logs pod/$POD --tail=100 || echo "Could not get logs from pod $POD"
188+
echo "--- Previous logs from pod $POD (if container restarted) ---"
189+
kubectl logs pod/$POD --previous --tail=50 || echo "No previous logs for pod $POD"
190+
echo ""
163191
done
164192
else
165-
echo "No completed pods found for pgstac-migrate job"
193+
echo "No pods found for pgstac-migrate jobs"
194+
echo "Checking for pods with broader label search..."
195+
# Broaden the search: select every pod that carries a job-name label (i.e. pods created by any Job)
# rather than repeating the app=pgstac-migrate query that has just returned nothing.
LABEL_PODS=$(kubectl get pods -l job-name -o jsonpath='{.items[*].metadata.name}' 2>/dev/null)
196+
if [ -n "$LABEL_PODS" ]; then
197+
for POD in $LABEL_PODS; do
198+
echo "--- Pod $POD (found by label) ---"
199+
kubectl describe pod "$POD"
200+
kubectl logs pod/$POD --tail=50 || true
201+
done
202+
fi
166203
fi
167204
168205
# Get details about the database pods/services
169206
echo "===== Database Pod/Service Details ====="
170-
# Find database service
171-
kubectl get svc | grep "db" || echo "Could not find database services"
172-
# Find database pods
173-
kubectl get pods | grep "db-" || echo "Could not find database pods"
207+
kubectl get svc | grep -E "db|postgres" || echo "Could not find database services"
208+
kubectl get pods | grep -E "db-|postgres" || echo "Could not find database pods"
209+
echo ""
210+
211+
# Check ConfigMaps and Secrets
212+
echo "===== Relevant ConfigMaps ====="
213+
kubectl get configmaps | grep -E "pgstac|initdb" || echo "No pgstac configmaps found"
214+
echo ""
174215
175216
# Check for any events related to the job or pods
176-
echo "===== Related Kubernetes Events ====="
177-
kubectl get events | grep -E "pgstac|db" || echo "No relevant events found"
217+
# The listing that follows is filtered with grep and not truncated, so the heading makes no claim about a count.
echo "===== Related Kubernetes Events ====="
218+
kubectl get events --sort-by='.lastTimestamp' | grep -E "pgstac|db|migrate" || echo "No relevant events found"
178219
179220
- id: watchservices
180221
name: watch services boot
181222
timeout-minutes: 3
182223
continue-on-error: true
183224
run: |
184225
# Wait for services to be ready using native readiness checks
226+
echo "===== Current Pod Status ====="
227+
kubectl get pods -o wide
228+
echo ""
229+
185230
echo "Waiting for raster service to be ready..."
186-
kubectl wait --for=condition=Ready pod -l app=raster-${RELEASE_NAME} --timeout=180s
231+
kubectl wait --for=condition=Ready pod -l app=raster-${RELEASE_NAME} --timeout=180s || {
232+
echo "Raster service failed to become ready. Checking status..."
233+
kubectl get pods -l app=raster-${RELEASE_NAME} -o wide
234+
kubectl describe pods -l app=raster-${RELEASE_NAME}
235+
exit 1
236+
}
187237
echo "raster service is ready, moving on..."
188-
238+
189239
echo "Waiting for vector service to be ready..."
190-
kubectl wait --for=condition=Ready pod -l app=vector-${RELEASE_NAME} --timeout=180s
240+
kubectl wait --for=condition=Ready pod -l app=vector-${RELEASE_NAME} --timeout=180s || {
241+
echo "Vector service failed to become ready. Checking status..."
242+
kubectl get pods -l app=vector-${RELEASE_NAME} -o wide
243+
kubectl describe pods -l app=vector-${RELEASE_NAME}
244+
exit 1
245+
}
191246
echo "vector service is ready, moving on..."
192-
247+
193248
echo "Waiting for stac service to be ready..."
194-
kubectl wait --for=condition=Ready pod -l app=stac-${RELEASE_NAME} --timeout=180s
249+
kubectl wait --for=condition=Ready pod -l app=stac-${RELEASE_NAME} --timeout=180s || {
250+
echo "STAC service failed to become ready. Checking status..."
251+
kubectl get pods -l app=stac-${RELEASE_NAME} -o wide
252+
kubectl describe pods -l app=stac-${RELEASE_NAME}
253+
exit 1
254+
}
195255
echo "all services are ready, moving on..."
196256
197257
- name: cleanup if services fail to boot
198258
if: steps.watchservices.outcome == 'failure'
199259
run: |
200-
echo "The watchservices step failed or timed out. Extracting pod logs for debugging..."
201-
202-
# Get and display all pods status
203-
echo "===== Pod Status ====="
204-
kubectl get pods
260+
echo "The watchservices step failed or timed out. Extracting comprehensive debugging info..."
205261
206-
# Extract logs from raster pod init container (wait-for-pgstac-jobs)
207-
echo "===== Raster Pod Init Container Logs (wait-for-pgstac-jobs) ====="
208-
kubectl get pod | grep "^raster-$RELEASE_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} -c wait-for-pgstac-jobs --tail=100 || echo "Could not get raster init container logs"
209-
210-
# Extract logs from raster pod main container
211-
echo "===== Raster Pod Main Container Logs ====="
212-
kubectl get pod | grep "^raster-$RELEASE_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} --tail=100 || echo "Could not get raster main container logs"
213-
214-
# Extract logs from vector pod
215-
echo "===== Vector Pod Logs ====="
216-
kubectl get pod | grep "^vector-$RELEASE_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} --tail=100 || echo "Could not get vector logs"
217-
218-
# Extract logs from stac pod
219-
echo "===== STAC Pod Logs ====="
220-
kubectl get pod | grep "^stac-$RELEASE_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} --tail=100 || echo "Could not get STAC logs"
221-
222-
# Check if pods are in pending state or have issues
223-
echo "===== Pod Descriptions for Troubleshooting ====="
224-
kubectl get pod | grep "$RELEASE_NAME" | cut -d' ' -f1 | xargs -I{} kubectl describe pod/{} || echo "Could not describe pods"
262+
# Get and display all pods status with more detail
263+
echo "===== Pod Status (detailed) ====="
264+
kubectl get pods -o wide
265+
echo ""
266+
267+
echo "===== Pod Readiness Summary ====="
268+
kubectl get pods --no-headers | awk '{print $2, $3}' | sort | uniq -c
269+
echo ""
270+
271+
# Check init container logs for all services
272+
for SERVICE in raster vector stac multidim; do
273+
echo "===== $SERVICE Service Pod Status ====="
274+
kubectl get pods -l app=$SERVICE-$RELEASE_NAME -o wide || echo "No $SERVICE pods found"
275+
276+
POD_NAME=$(kubectl get pod -l app=$SERVICE-$RELEASE_NAME -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "")
277+
if [ -n "$POD_NAME" ]; then
278+
echo "===== $SERVICE Pod ($POD_NAME) Init Container Logs ====="
279+
kubectl logs pod/$POD_NAME -c wait-for-pgstac-jobs --tail=100 || echo "Could not get $SERVICE init container logs"
280+
echo ""
281+
282+
echo "===== $SERVICE Pod ($POD_NAME) Main Container Logs ====="
283+
kubectl logs pod/$POD_NAME --tail=100 || echo "Could not get $SERVICE main container logs"
284+
echo ""
285+
286+
echo "===== $SERVICE Pod ($POD_NAME) Description ====="
287+
kubectl describe pod/$POD_NAME
288+
echo ""
289+
fi
290+
done
291+
292+
# Show job status that init containers might be waiting for
293+
echo "===== Job Status (what init containers are waiting for) ====="
294+
kubectl get jobs -o wide
295+
echo ""
296+
297+
# Check pgstac jobs using labels instead of hardcoded names
298+
for APP_LABEL in pgstac-migrate pgstac-load-samples; do
299+
echo "===== Jobs with app=$APP_LABEL Status ====="
300+
JOBS=$(kubectl get jobs -l app=$APP_LABEL -o name 2>/dev/null || true)
301+
if [ -n "$JOBS" ]; then
302+
for JOB in $JOBS; do
303+
echo "--- Job $JOB ---"
304+
kubectl get "$JOB" -o yaml 2>/dev/null | grep -A 10 -E "conditions|status:" || echo "Could not get status for $JOB"
305+
done
306+
else
307+
echo "No jobs found with app=$APP_LABEL label"
308+
fi
309+
echo ""
310+
done
311+
312+
# Check recent events
313+
echo "===== Recent Events (last 50) ====="
314+
kubectl get events --sort-by='.lastTimestamp' | tail -50
315+
echo ""
225316
226317
# force GH action to show failed result
227318
exit 128

charts/eoapi/templates/_pgstac_init.tpl

Lines changed: 0 additions & 30 deletions
This file was deleted.

charts/eoapi/templates/services/_common.tpl

Lines changed: 63 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -37,32 +37,81 @@ initContainers:
3737
image: bitnami/kubectl:latest
3838
env:
3939
{{- include "eoapi.commonEnvVars" (dict "service" "init" "root" .) | nindent 2 }}
40+
resources:
41+
requests:
42+
cpu: "50m"
43+
memory: "64Mi"
44+
limits:
45+
cpu: "100m"
46+
memory: "128Mi"
4047
command:
4148
- /bin/sh
4249
- -c
4350
- |
4451
set -eu
4552
46-
MIGRATE_JOB="${RELEASE_NAME:-eoapi}-pgstac-migrate"
47-
SAMPLES_JOB="${RELEASE_NAME:-eoapi}-pgstac-load-samples"
53+
# Configurable parameters: an environment variable, when set, overrides the values.yaml setting, which falls back to a built-in default
54+
SLEEP_INTERVAL="${PGSTAC_WAIT_SLEEP_INTERVAL:-{{ .Values.pgstacBootstrap.settings.waitConfig.sleepInterval | default 5 }}}"
55+
TIMEOUT_SECONDS="${PGSTAC_WAIT_TIMEOUT:-{{ .Values.pgstacBootstrap.settings.waitConfig.timeout | default 900 }}}"
4856
49-
wait_complete () {
50-
job="$1"
51-
echo "Waiting for $job to complete..."
52-
# Optional: fail fast after 15 min so CI doesn't hang forever
53-
deadline=$(( $(date +%s) + 900 ))
57+
wait_for_job_by_label () {
58+
label_selector="$1"
59+
job_description="$2"
60+
echo "Waiting for job with label $label_selector to complete (timeout: ${TIMEOUT_SECONDS}s, interval: ${SLEEP_INTERVAL}s)..."
61+
deadline=$(( $(date +%s) + TIMEOUT_SECONDS ))
62+
5463
while :; do
55-
# If job doesn't exist yet or SA can't read it, jsonpath may be empty
56-
status="$(kubectl get job "$job" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null || true)"
57-
[ "$status" = "True" ] && { echo "$job completed"; return 0; }
58-
[ $(date +%s) -ge $deadline ] && { echo "Timeout waiting for $job"; exit 1; }
59-
sleep 5
64+
# Check if deadline exceeded
65+
[ $(date +%s) -ge $deadline ] && { echo "Timeout waiting for $job_description job"; exit 1; }
66+
67+
# Get jobs matching the label
68+
jobs=$(kubectl get job -l "$label_selector" -o name 2>/dev/null || true)
69+
70+
if [ -z "$jobs" ]; then
71+
echo "No $job_description jobs found yet, waiting..."
72+
# Honor the configured poll interval while the job has not been created yet,
# instead of a hard-coded 5 seconds.
sleep "$SLEEP_INTERVAL"
73+
continue
74+
fi
75+
76+
# Check each job's status
77+
all_complete=true
78+
any_failed=false
79+
80+
for job in $jobs; do
81+
# Get completion and failure status
82+
complete_status=$(kubectl get "$job" -o jsonpath='{.status.conditions[?(@.type=="Complete")].status}' 2>/dev/null || echo "Unknown")
83+
failed_status=$(kubectl get "$job" -o jsonpath='{.status.conditions[?(@.type=="Failed")].status}' 2>/dev/null || echo "False")
84+
85+
job_name=$(echo "$job" | cut -d'/' -f2)
86+
87+
if [ "$failed_status" = "True" ]; then
88+
echo "ERROR: $job_description job $job_name failed!"
89+
echo "Job details:"
90+
kubectl describe "$job" || true
91+
echo "Job logs:"
92+
kubectl logs -l "job-name=$job_name" --tail=50 || true
93+
any_failed=true
94+
elif [ "$complete_status" != "True" ]; then
95+
echo "$job_description job $job_name not yet complete (Complete: $complete_status, Failed: $failed_status)"
96+
all_complete=false
97+
else
98+
echo "$job_description job $job_name completed successfully"
99+
fi
100+
done
101+
102+
# Exit with error if any job failed
103+
[ "$any_failed" = true ] && exit 1
104+
105+
# Exit successfully if all jobs completed
106+
[ "$all_complete" = true ] && return 0
107+
108+
sleep $SLEEP_INTERVAL
60109
done
61110
}
62111

63-
wait_complete "$MIGRATE_JOB"
112+
wait_for_job_by_label "app=pgstac-migrate" "pgstac-migrate"
64113
{{- if .Values.pgstacBootstrap.settings.loadSamples }}
65-
wait_complete "$SAMPLES_JOB"
114+
wait_for_job_by_label "app=pgstac-load-samples" "pgstac-load-samples"
66115
{{- end }}
67116
{{- end }}
68117
{{- end -}}

charts/eoapi/values.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,15 @@ pgstacBootstrap:
161161
settings:
162162
# General configuration options
163163
loadSamples: true # Set to false to disable sample data loading
164+
165+
# Wait configuration for init containers waiting for pgstac jobs
166+
# These parameters control how long services wait for pgstac migration jobs to complete
167+
waitConfig:
168+
# Sleep interval between job status checks (in seconds)
169+
sleepInterval: 5
170+
# Maximum time to wait for each pgstac job (migrate, then sample loading if enabled) to complete (in seconds)
171+
# Default: 900 seconds (15 minutes)
172+
timeout: 900
164173
### Database connection settings TEMPORARY UNTIL WE HAVE A BETTER SOLUTION
165174
### FOR CONFIGURING THE DB CONNECTION
166175
user: eoapi

0 commit comments

Comments
 (0)