Skip to content

Commit eea82a7

Browse files
authored
Merge branch 'main' into user/asklar/on_device
2 parents 5b6929f + 5055c79 commit eea82a7

File tree

1 file changed

+53
-6
lines changed

1 file changed

+53
-6
lines changed

.github/workflows/sync-db.yml

Lines changed: 53 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -139,29 +139,48 @@ jobs:
139139
sleep 15
140140
141141
# Find the job created by k8up for this restore
142-
for i in {1..30}; do
143-
JOB_NAME=$(kubectl get jobs -n default -l k8up.io/owned-by=restore -o jsonpath='{.items[?(@.metadata.ownerReferences[0].name=="'$RESTORE_NAME'")].metadata.name}' 2>/dev/null)
142+
# k8up creates jobs with name pattern "restore-<restore-name>"
143+
# Since our restore is named "restore-from-prod-*", the job will be "restore-restore-from-prod-*"
144+
for i in {1..60}; do
145+
JOB_NAME=$(kubectl get jobs -n default --no-headers 2>/dev/null | grep "^restore-$RESTORE_NAME" | awk '{print $1}' | head -1)
144146
if [ -n "$JOB_NAME" ]; then
145147
echo "Found restore job: $JOB_NAME"
146148
break
147149
fi
148-
echo "Waiting for job to be created... ($i/30)"
150+
echo "Waiting for job to be created... ($i/60)"
149151
sleep 2
150152
done
151153
152154
if [ -z "$JOB_NAME" ]; then
153-
echo "ERROR: Restore job not found"
154-
kubectl get restore $RESTORE_NAME -n default -o yaml
155+
echo "ERROR: Restore job not found after 120 seconds"
156+
echo "Checking restore resource status:"
157+
kubectl get restore $RESTORE_NAME -n default
158+
kubectl describe restore $RESTORE_NAME -n default
159+
160+
echo "Checking for any restore jobs:"
161+
kubectl get jobs -n default | grep restore || echo "No restore jobs found"
162+
163+
echo "Checking k8up operator logs:"
164+
kubectl logs deployment/k8up -n default --tail=50 | grep -i restore || echo "No restore logs found"
155165
exit 1
156166
fi
157167
158168
# Wait for the restore job to complete (max 15 minutes)
169+
echo "Waiting for restore job to complete..."
159170
kubectl wait --for=condition=complete \
160171
job/$JOB_NAME \
161172
--timeout=900s -n default || {
162173
echo "Restore job failed or timed out"
174+
echo "Job status:"
175+
kubectl get job/$JOB_NAME -n default
176+
echo "Job details:"
163177
kubectl describe job/$JOB_NAME -n default
164-
kubectl logs job/$JOB_NAME -n default --tail=100
178+
echo "Job logs (if available):"
179+
kubectl logs job/$JOB_NAME -n default --tail=100 || echo "No logs available"
180+
181+
# Check if it's a credential issue
182+
echo "Checking if credentials exist:"
183+
kubectl get secret prod-to-staging-sync-credentials -n default || echo "Credentials secret missing!"
165184
exit 1
166185
}
167186
@@ -370,11 +389,36 @@ jobs:
370389
- name: Cleanup
371390
if: always()
372391
run: |
392+
# Wait for any restore jobs to finish before cleanup (max 5 minutes for large database restores)
393+
echo "Checking for running restore jobs..."
394+
RESTORE_NAME="${{ steps.restore.outputs.restore_name }}"
395+
396+
for i in {1..60}; do
397+
# Check specifically for our restore job in Running or Pending state
398+
# Count our restore jobs that still have active pods. .status.active is the
# Job API's count of pending + running pods; the default `kubectl get jobs`
# table has no reliable Running/Pending text to grep for (older servers print
# no status column at all), so read the field directly. `<none>` coerces to 0.
RUNNING_JOBS=$(kubectl get jobs -n default --no-headers -o custom-columns=NAME:.metadata.name,ACTIVE:.status.active 2>/dev/null | awk -v prefix="restore-${RESTORE_NAME}" 'index($1, prefix) == 1 && $2 + 0 > 0 { n++ } END { print n + 0 }')
399+
if [ "$RUNNING_JOBS" -eq 0 ]; then
400+
echo "No running restore jobs found"
401+
break
402+
fi
403+
echo "Waiting for $RUNNING_JOBS restore job(s) to finish... ($i/60)"
404+
sleep 5
405+
done
406+
373407
# Clean up jobs first
374408
if [ -n "${{ steps.copy-job.outputs.job_name }}" ]; then
375409
kubectl delete job ${{ steps.copy-job.outputs.job_name }} -n default || true
376410
fi
377411
412+
# Clean up the restore job if it exists (with proper error handling)
413+
if [ -n "$RESTORE_NAME" ]; then
414+
if kubectl get job "restore-$RESTORE_NAME" -n default >/dev/null 2>&1; then
415+
echo "Deleting restore job: restore-$RESTORE_NAME"
416+
kubectl delete job "restore-$RESTORE_NAME" -n default || echo "Failed to delete job, may have already been cleaned up"
417+
else
418+
echo "Restore job restore-$RESTORE_NAME not found, skipping deletion"
419+
fi
420+
fi
421+
378422
# Remove restore PVC (will wait for jobs to finish)
379423
kubectl delete pvc restore-data-pvc -n default || true
380424
@@ -384,5 +428,8 @@ jobs:
384428
# Clean up old restore resources (keep last 3)
385429
kubectl get restore -n default --sort-by=.metadata.creationTimestamp -o name | head -n -3 | xargs -r kubectl delete || true
386430
431+
# Clean up old restore jobs (keep last 3)
432+
kubectl get jobs -n default --sort-by=.metadata.creationTimestamp -o name | grep '^job.batch/restore-' | head -n -3 | xargs -r kubectl delete -n default || true
433+
387434
# Clean up old copy jobs (keep last 3)
388435
kubectl get jobs -n default --sort-by=.metadata.creationTimestamp -o name | grep 'copy-pgdata-' | head -n -3 | xargs -r kubectl delete -n default || true

0 commit comments

Comments
 (0)