|
95 | 95 | run: |
96 | 96 | echo "===== Jobs =====" |
97 | 97 | kubectl get jobs -o wide |
| 98 | + echo "" |
| 99 | + echo "===== Job Status Details =====" |
| 100 | + kubectl get jobs -o custom-columns='NAME:.metadata.name,COMPLETIONS:.spec.completions,SUCCESSFUL:.status.succeeded,FAILED:.status.failed,AGE:.metadata.creationTimestamp' |
| 101 | + echo "" |
| 102 | + echo "===== All Pods =====" |
| 103 | + kubectl get pods -o wide |
| 104 | + echo "" |
98 | 105 | echo "===== Pods (pgstac) =====" |
99 | 106 | kubectl get pods | grep -i pgstac || true |
| 107 | + echo "" |
| 108 | + echo "===== Pod Phase Summary =====" |
| 109 | + kubectl get pods --no-headers | awk '{print $3}' | sort | uniq -c |
| 110 | + echo "" |
| 111 | + echo "===== Recent Events (last 20) ====="
| 112 | + kubectl get events --sort-by='.lastTimestamp' | tail -20 |
100 | 113 |
|
101 | 114 | - name: debug pgstac-eoapi-superuser-init-db job failure |
102 | 115 | if: steps.helm-render-install-eoapi-templates.outcome == 'failure' |
@@ -141,87 +154,165 @@ jobs: |
141 | 154 | if: steps.helm-render-install-eoapi-templates.outcome == 'failure' |
142 | 155 | continue-on-error: true |
143 | 156 | run: |
144 | | - echo "Extracting pgstac-migrate job info and logs for debugging..." |
| 157 | + echo "Extracting comprehensive pgstac-migrate job info and logs for debugging..." |
145 | 158 |
|
146 | | - # Get job details |
147 | | - echo "===== pgstac-migrate Job Details =====" |
148 | | - kubectl get job "$RELEASE_NAME-pgstac-migrate" -o yaml || echo "Could not get pgstac-migrate job details" |
| 159 | + # Get all jobs with details |
| 160 | + echo "===== All Jobs Status =====" |
| 161 | + kubectl get jobs -o wide |
| 162 | + echo "" |
149 | 163 |
|
150 | | - # Get pod details |
151 | | - echo "===== pgstac-migrate Pod Details =====" |
152 | | - kubectl get pods -l app=pgstac-migrate --all-namespaces || echo "Could not find pgstac-migrate pods" |
| 164 | + # Get specific job details using labels |
| 165 | + echo "===== pgstac-migrate Job Details (by label) =====" |
| 166 | + kubectl get jobs -l app=pgstac-migrate -o yaml || echo "Could not get pgstac-migrate job details" |
| 167 | + echo "" |
| 168 | +
|
| 169 | + # Get pod details by app label (all namespaces, then current namespace)
| 170 | + echo "===== pgstac-migrate Pod Details (all namespaces) ====="
| 171 | + kubectl get pods -l app=pgstac-migrate --all-namespaces -o wide || echo "Could not find pgstac-migrate pods by label" |
| 172 | + echo "" |
| 173 | +
|
| 174 | + echo "===== pgstac-migrate Pod Details (current namespace) ====="
| 175 | + kubectl get pods -l app=pgstac-migrate -o wide || echo "Could not find pgstac-migrate pods by app label" |
| 176 | + echo "" |
153 | 177 |
|
154 | | - # Extract logs from pgstac-migrate pod(s) |
| 178 | + # Extract logs from all pgstac-migrate pods (running, completed, failed) |
155 | 179 | echo "===== pgstac-migrate Pod Logs =====" |
156 | | - echo "Looking for completed pods from pgstac-migrate job..." |
157 | | - COMPLETED_PODS=$(kubectl get pods --selector=job-name="$RELEASE_NAME-pgstac-migrate" --field-selector=status.phase=Succeeded,status.phase=Failed -o jsonpath='{.items[*].metadata.name}' 2>/dev/null) |
158 | | - if [ -n "$COMPLETED_PODS" ]; then |
159 | | - echo "Found completed pods from pgstac-migrate job. Extracting logs from each:" |
160 | | - for POD in $COMPLETED_PODS; do |
161 | | - echo "--- Logs from completed pod $POD ---" |
162 | | - kubectl logs pod/$POD || echo "Could not get logs from pod $POD" |
| 180 | + ALL_PODS=$(kubectl get pods -l app=pgstac-migrate -o jsonpath='{.items[*].metadata.name}' 2>/dev/null) |
| 181 | + if [ -n "$ALL_PODS" ]; then |
| 182 | + echo "Found pgstac-migrate job pods. Extracting logs from each:" |
| 183 | + for POD in $ALL_PODS; do |
| 184 | + echo "--- Pod $POD status ---" |
| 185 | + kubectl get pod "$POD" -o wide |
| 186 | + echo "--- Logs from pod $POD ---" |
| 187 | + kubectl logs pod/$POD --tail=100 || echo "Could not get logs from pod $POD" |
| 188 | + echo "--- Previous logs from pod $POD (if container restarted) ---" |
| 189 | + kubectl logs pod/$POD --previous --tail=50 || echo "No previous logs for pod $POD" |
| 190 | + echo "" |
163 | 191 | done |
164 | 192 | else |
165 | | - echo "No completed pods found for pgstac-migrate job" |
| 193 | + echo "No pods found for pgstac-migrate jobs" |
| 194 | + echo "Falling back to the job-name label selector..."
| 195 | + LABEL_PODS=$(kubectl get pods -l job-name="$RELEASE_NAME-pgstac-migrate" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null)
| 196 | + if [ -n "$LABEL_PODS" ]; then |
| 197 | + for POD in $LABEL_PODS; do |
| 198 | + echo "--- Pod $POD (found by job-name label) ---"
| 199 | + kubectl describe pod "$POD" |
| 200 | + kubectl logs pod/$POD --tail=50 || true |
| 201 | + done |
| 202 | + fi |
166 | 203 | fi |
167 | 204 |
|
168 | 205 | # Get details about the database pods/services |
169 | 206 | echo "===== Database Pod/Service Details =====" |
170 | | - # Find database service |
171 | | - kubectl get svc | grep "db" || echo "Could not find database services" |
172 | | - # Find database pods |
173 | | - kubectl get pods | grep "db-" || echo "Could not find database pods" |
| 207 | + kubectl get svc | grep -E "db|postgres" || echo "Could not find database services" |
| 208 | + kubectl get pods | grep -E "db-|postgres" || echo "Could not find database pods" |
| 209 | + echo "" |
| 210 | +
|
| 211 | + # Check ConfigMaps and Secrets |
| 212 | + echo "===== Relevant ConfigMaps =====" |
| 213 | + kubectl get configmaps | grep -E "pgstac|initdb" || echo "No pgstac configmaps found" |
| 214 | + echo "" |
174 | 215 |
|
175 | 216 | # Check for any events related to the job or pods |
176 | | - echo "===== Related Kubernetes Events =====" |
177 | | - kubectl get events | grep -E "pgstac|db" || echo "No relevant events found" |
| 217 | + echo "===== Related Kubernetes Events (sorted by time) ====="
| 218 | + kubectl get events --sort-by='.lastTimestamp' | grep -E "pgstac|db|migrate" || echo "No relevant events found" |
178 | 219 |
|
179 | 220 | - id: watchservices |
180 | 221 | name: watch services boot |
181 | 222 | timeout-minutes: 3 |
182 | 223 | continue-on-error: true |
183 | 224 | run: |
184 | 225 | # Wait for services to be ready using native readiness checks |
| 226 | + echo "===== Current Pod Status =====" |
| 227 | + kubectl get pods -o wide |
| 228 | + echo "" |
| 229 | +
|
185 | 230 | echo "Waiting for raster service to be ready..." |
186 | | - kubectl wait --for=condition=Ready pod -l app=raster-${RELEASE_NAME} --timeout=180s |
| 231 | + kubectl wait --for=condition=Ready pod -l app=raster-${RELEASE_NAME} --timeout=180s || { |
| 232 | + echo "Raster service failed to become ready. Checking status..." |
| 233 | + kubectl get pods -l app=raster-${RELEASE_NAME} -o wide |
| 234 | + kubectl describe pods -l app=raster-${RELEASE_NAME} |
| 235 | + exit 1 |
| 236 | + } |
187 | 237 | echo "raster service is ready, moving on..." |
188 | | - |
| 238 | +
|
189 | 239 | echo "Waiting for vector service to be ready..." |
190 | | - kubectl wait --for=condition=Ready pod -l app=vector-${RELEASE_NAME} --timeout=180s |
| 240 | + kubectl wait --for=condition=Ready pod -l app=vector-${RELEASE_NAME} --timeout=180s || { |
| 241 | + echo "Vector service failed to become ready. Checking status..." |
| 242 | + kubectl get pods -l app=vector-${RELEASE_NAME} -o wide |
| 243 | + kubectl describe pods -l app=vector-${RELEASE_NAME} |
| 244 | + exit 1 |
| 245 | + } |
191 | 246 | echo "vector service is ready, moving on..." |
192 | | - |
| 247 | +
|
193 | 248 | echo "Waiting for stac service to be ready..." |
194 | | - kubectl wait --for=condition=Ready pod -l app=stac-${RELEASE_NAME} --timeout=180s |
| 249 | + kubectl wait --for=condition=Ready pod -l app=stac-${RELEASE_NAME} --timeout=180s || { |
| 250 | + echo "STAC service failed to become ready. Checking status..." |
| 251 | + kubectl get pods -l app=stac-${RELEASE_NAME} -o wide |
| 252 | + kubectl describe pods -l app=stac-${RELEASE_NAME} |
| 253 | + exit 1 |
| 254 | + } |
195 | 255 | echo "all services are ready, moving on..." |
196 | 256 |
|
197 | 257 | - name: cleanup if services fail to boot |
198 | 258 | if: steps.watchservices.outcome == 'failure' |
199 | 259 | run: |
200 | | - echo "The watchservices step failed or timed out. Extracting pod logs for debugging..." |
201 | | -
|
202 | | - # Get and display all pods status |
203 | | - echo "===== Pod Status =====" |
204 | | - kubectl get pods |
| 260 | + echo "The watchservices step failed or timed out. Extracting comprehensive debugging info..." |
205 | 261 |
|
206 | | - # Extract logs from raster pod init container (wait-for-pgstac-jobs) |
207 | | - echo "===== Raster Pod Init Container Logs (wait-for-pgstac-jobs) =====" |
208 | | - kubectl get pod | grep "^raster-$RELEASE_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} -c wait-for-pgstac-jobs --tail=100 || echo "Could not get raster init container logs" |
209 | | -
|
210 | | - # Extract logs from raster pod main container |
211 | | - echo "===== Raster Pod Main Container Logs =====" |
212 | | - kubectl get pod | grep "^raster-$RELEASE_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} --tail=100 || echo "Could not get raster main container logs" |
213 | | -
|
214 | | - # Extract logs from vector pod |
215 | | - echo "===== Vector Pod Logs =====" |
216 | | - kubectl get pod | grep "^vector-$RELEASE_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} --tail=100 || echo "Could not get vector logs" |
217 | | -
|
218 | | - # Extract logs from stac pod |
219 | | - echo "===== STAC Pod Logs =====" |
220 | | - kubectl get pod | grep "^stac-$RELEASE_NAME" | cut -d' ' -f1 | xargs -I{} kubectl logs pod/{} --tail=100 || echo "Could not get STAC logs" |
221 | | -
|
222 | | - # Check if pods are in pending state or have issues |
223 | | - echo "===== Pod Descriptions for Troubleshooting =====" |
224 | | - kubectl get pod | grep "$RELEASE_NAME" | cut -d' ' -f1 | xargs -I{} kubectl describe pod/{} || echo "Could not describe pods" |
| 262 | + # Get and display all pods status with more detail |
| 263 | + echo "===== Pod Status (detailed) =====" |
| 264 | + kubectl get pods -o wide |
| 265 | + echo "" |
| 266 | + |
| 267 | + echo "===== Pod Readiness Summary =====" |
| 268 | + kubectl get pods --no-headers | awk '{print $2, $3}' | sort | uniq -c |
| 269 | + echo "" |
| 270 | +
|
| 271 | + # Check init container logs for all services |
| 272 | + for SERVICE in raster vector stac multidim; do |
| 273 | + echo "===== $SERVICE Service Pod Status =====" |
| 274 | + kubectl get pods -l app=$SERVICE-$RELEASE_NAME -o wide || echo "No $SERVICE pods found" |
| 275 | + |
| 276 | + POD_NAME=$(kubectl get pod -l app=$SERVICE-$RELEASE_NAME -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") |
| 277 | + if [ -n "$POD_NAME" ]; then |
| 278 | + echo "===== $SERVICE Pod ($POD_NAME) Init Container Logs =====" |
| 279 | + kubectl logs pod/$POD_NAME -c wait-for-pgstac-jobs --tail=100 || echo "Could not get $SERVICE init container logs" |
| 280 | + echo "" |
| 281 | + |
| 282 | + echo "===== $SERVICE Pod ($POD_NAME) Main Container Logs =====" |
| 283 | + kubectl logs pod/$POD_NAME --tail=100 || echo "Could not get $SERVICE main container logs" |
| 284 | + echo "" |
| 285 | + |
| 286 | + echo "===== $SERVICE Pod ($POD_NAME) Description =====" |
| 287 | + kubectl describe pod/$POD_NAME |
| 288 | + echo "" |
| 289 | + fi |
| 290 | + done |
| 291 | +
|
| 292 | + # Show job status that init containers might be waiting for |
| 293 | + echo "===== Job Status (what init containers are waiting for) =====" |
| 294 | + kubectl get jobs -o wide |
| 295 | + echo "" |
| 296 | + |
| 297 | + # Check pgstac jobs using labels instead of hardcoded names |
| 298 | + for APP_LABEL in pgstac-migrate pgstac-load-samples; do |
| 299 | + echo "===== Jobs with app=$APP_LABEL Status =====" |
| 300 | + JOBS=$(kubectl get jobs -l app=$APP_LABEL -o name 2>/dev/null || true) |
| 301 | + if [ -n "$JOBS" ]; then |
| 302 | + for JOB in $JOBS; do |
| 303 | + echo "--- Job $JOB ---" |
| 304 | + kubectl get "$JOB" -o yaml 2>/dev/null | grep -A 10 -E "conditions|status:" || echo "Could not get status for $JOB" |
| 305 | + done |
| 306 | + else |
| 307 | + echo "No jobs found with app=$APP_LABEL label" |
| 308 | + fi |
| 309 | + echo "" |
| 310 | + done |
| 311 | +
|
| 312 | + # Check recent events |
| 313 | + echo "===== Recent Events (last 50) =====" |
| 314 | + kubectl get events --sort-by='.lastTimestamp' | tail -50 |
| 315 | + echo "" |
225 | 316 |
|
226 | 317 | # force GH action to show failed result |
227 | 318 | exit 128 |
|