Skip to content

Commit 6849a79

Browse files
committed
fix: rollback hardening
1 parent 6e9ab03 commit 6849a79

File tree

3 files changed

+131
-102
lines changed

3 files changed

+131
-102
lines changed

hack/lib-premiumv2-migration-common.sh

Lines changed: 73 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ PVC_BACKUP_DIR="${PVC_BACKUP_DIR:-pvc-backups}"
3333

3434
# ---------- Timeouts ----------
3535
BIND_TIMEOUT_SECONDS="${BIND_TIMEOUT_SECONDS:-60}"
36-
MONITOR_TIMEOUT_MINUTES="${MONITOR_TIMEOUT_MINUTES:-120}"
36+
MONITOR_TIMEOUT_MINUTES="${MONITOR_TIMEOUT_MINUTES:-300}"
3737
WORKLOAD_DETACH_TIMEOUT_MINUTES="${WORKLOAD_DETACH_TIMEOUT_MINUTES:-0}"
3838

3939
# ---------- Kubectl retry configurations ----------
@@ -91,17 +91,12 @@ finalize_audit_summary() {
9191
else
9292
echo
9393
info "Best-effort revert command summary:"
94-
declare -A REVERT_COUNT
9594
local line act revert
9695
for line in "${AUDIT_LINES[@]}"; do
9796
IFS='|' read -r _ act _ _ _ revert _ <<<"$line"
9897
[[ -z "$revert" || "$revert" == "N/A" ]] && continue
99-
REVERT_COUNT["$revert"]=$(( ${REVERT_COUNT["$revert"]:-0} + 1 ))
98+
printf ' %s\n' "$revert"
10099
done
101-
for cmd in "${!REVERT_COUNT[@]}"; do
102-
printf ' (%d) %s\n' "${REVERT_COUNT[$cmd]}" "$cmd"
103-
done
104-
unset REVERT_COUNT
105100
fi
106101
if [[ -n "$AUDIT_LOG_FILE" ]]; then
107102
{
@@ -649,10 +644,6 @@ run_prerequisites_checks() {
649644
size=$(kcmd get pv "$pv" -o jsonpath='{.spec.capacity.storage}' 2>/dev/null || true)
650645
[[ -z "$sc" ]] && PREREQ_ISSUES+=("PVC/$ns/$pvc missing storageClassName")
651646
[[ -z "$size" ]] && PREREQ_ISSUES+=("PV/$pv capacity missing")
652-
zone=$(kcmd get pv "$pv" -o jsonpath='{.spec.nodeAffinity.required.nodeSelectorTerms[*].matchExpressions[?(@.key=="topology.disk.csi.azure.com/zone")].values[0]}' 2>/dev/null || true)
653-
[[ -z "$zone" ]] && zone=$(kcmd get pv "$pv" -o jsonpath='{.spec.nodeAffinity.required.nodeSelectorTerms[*].matchExpressions[?(@.key=="topology.kubernetes.io/zone")].values[0]}' 2>/dev/null || true)
654-
[[ -z "$zone" ]] && zone=$(kcmd get pv "$pv" -o jsonpath='{.metadata.labels.topology\.kubernetes\.io/zone}' 2>/dev/null || true)
655-
[[ -z "$zone" ]] && PREREQ_ISSUES+=("PV/$pv no zone label")
656647
if [[ -n "$sc" ]]; then
657648
if [[ -z "${_SC_JSON_CACHE[$sc]:-}" ]]; then
658649
_SC_JSON_CACHE[$sc]=$(kcmd get sc "$sc" -o json 2>/dev/null || echo "")
@@ -798,34 +789,56 @@ ensure_snapshot() {
798789
}
799790

800791
print_migration_cleanup_report() {
801-
# Prints suggested cleanup commands for (1) successful migrations, (2) incomplete ones.
802-
# Relies on:
803-
# MIG_PVCS (array of "namespace|pvc")
804-
# MIGRATION_MODE (dual|inplace) – influences which artifacts to look for
805-
# Uses naming helpers & kcmd.
806792
local mode="${MIGRATION_MODE:-dual}"
807793
local success_header_printed=false
808794
local failed_header_printed=false
809795
local any=false
810796

811-
if ! (( ${#MIG_PVCS[@]} == 0 )); then
797+
if (( ${#MIG_PVCS[@]} == 0 )); then
812798
warn "print_migration_cleanup_report: MIG_PVCS empty (nothing to report)."
813799
return 0
814800
fi
815801

816802
info "Generating migration cleanup / investigation report (mode=${mode})..."
817803

804+
# Cache Released PVs that (a) still carry a claimRef (namespace and name) and (b) have skuName PremiumV2_LRS in their CSI volumeAttributes.
805+
# Output columns (TSV):
806+
# namespace pvcName pvName reclaimPolicy storageClass capacity skuName
807+
local released_pv_lines
808+
released_pv_lines="$(kcmd get pv -o json 2>/dev/null | jq -r '
809+
.items[]
810+
| select(.status.phase=="Released"
811+
and .spec.claimRef
812+
and .spec.claimRef.namespace!=null
813+
and .spec.claimRef.name!=null
814+
and .spec.csi!=null
815+
)
816+
| . as $pv
817+
| (
818+
$pv.spec.csi.volumeAttributes.skuName
819+
// $pv.spec.csi.volumeAttributes.skuname
820+
// ""
821+
) as $sku
822+
| select($sku=="PremiumV2_LRS")
823+
| [
824+
.spec.claimRef.namespace,
825+
.spec.claimRef.name,
826+
.metadata.name,
827+
(.spec.persistentVolumeReclaimPolicy // ""),
828+
(.spec.storageClassName // ""),
829+
(.spec.capacity.storage // ""),
830+
$sku
831+
] | @tsv
832+
' 2>/dev/null || true)"
833+
818834
for ENTRY in "${MIG_PVCS[@]}"; do
819835
local ns="${ENTRY%%|*}" pvc="${ENTRY##*|}"
820836
local done lbl pv
821-
lbl=$(kcmd get pvc "$pvc" -n "$ns" -o jsonpath="{.metadata.labels['$MIGRATION_DONE_LABEL_KEY']}" 2>/dev/null || true)
837+
lbl=$(kcmd get pvc "$pvc" -n "$ns" -o go-template="{{ index .metadata.labels \"${MIGRATION_DONE_LABEL_KEY}\" }}" 2>/dev/null || true)
822838
pv=$(kcmd get pvc "$pvc" -n "$ns" -o jsonpath='{.spec.volumeName}' 2>/dev/null || true)
823839
[[ "$lbl" == "$MIGRATION_DONE_LABEL_VALUE" ]] && done=true || done=false
824840

825-
# Common derived names
826-
local snap="" int_pvc="" int_pv="" pv2_pvc="" target_pvc
827-
target_pvc="$pvc"
828-
841+
local snap="" int_pvc="" int_pv="" pv2_pvc=""
829842
if [[ -n "$pv" ]]; then
830843
snap="$(name_snapshot "$pv")"
831844
int_pv="$(name_csi_pv "$pv")"
@@ -843,76 +856,67 @@ print_migration_cleanup_report() {
843856
any=true
844857
echo " Source PVC: $ns/$pvc"
845858

846-
# Dual mode intermediate PV/PVC (should be safe to remove post migration)
847859
if [[ "$mode" == "dual" ]]; then
848-
if kcmd get pvc "$int_pvc" -n "$ns" >/dev/null 2>&1; then
849-
if kcmd get pvc "$int_pvc" -n "$ns" -o json \
850-
| jq -e --arg k "$CREATED_BY_LABEL_KEY" --arg v "$MIGRATION_TOOL_ID" '.metadata.labels[$k]==$v' >/dev/null; then
851-
echo " - delete intermediate PVC: kubectl delete pvc $int_pvc -n $ns"
852-
fi
860+
if kcmd get pvc "$int_pvc" -n "$ns" >/dev/null 2>&1 && \
861+
kcmd get pvc "$int_pvc" -n "$ns" -o json | jq -e --arg k "$CREATED_BY_LABEL_KEY" --arg v "$MIGRATION_TOOL_ID" '.metadata.labels[$k]==$v' >/dev/null; then
862+
echo " - delete intermediate PVC: kubectl delete pvc $int_pvc -n $ns"
853863
fi
854-
if kcmd get pv "$int_pv" >/dev/null 2>&1; then
855-
if kcmd get pv "$int_pv" -o json \
856-
| jq -e --arg k "$CREATED_BY_LABEL_KEY" --arg v "$MIGRATION_TOOL_ID" '.metadata.labels[$k]==$v' >/dev/null; then
857-
echo " - delete intermediate PV: kubectl delete pv $int_pv"
858-
fi
864+
if kcmd get pv "$int_pv" >/dev/null 2>&1 && \
865+
kcmd get pv "$int_pv" -o json | jq -e --arg k "$CREATED_BY_LABEL_KEY" --arg v "$MIGRATION_TOOL_ID" '.metadata.labels[$k]==$v' >/dev/null; then
866+
echo " - delete intermediate PV: kubectl delete pv $int_pv"
859867
fi
860-
# pv2 PVC (only if tool-labeled and user wants to manually consolidate; usually KEEP)
861-
if kcmd get pvc "$pv2_pvc" -n "$ns" >/dev/null 2>&1; then
862-
if kcmd get pvc "$pv2_pvc" -n "$ns" -o json \
863-
| jq -e --arg k "$CREATED_BY_LABEL_KEY" --arg v "$MIGRATION_TOOL_ID" '.metadata.labels[$k]==$v' >/dev/null; then
864-
echo " - (optional) pv2 PVC present: $pv2_pvc (KEEP for workload; delete only if decommissioning): kubectl delete pvc $pv2_pvc -n $ns"
865-
fi
868+
if kcmd get pvc "$pv2_pvc" -n "$ns" >/dev/null 2>&1 && \
869+
kcmd get pvc "$pv2_pvc" -n "$ns" -o json | jq -e --arg k "$CREATED_BY_LABEL_KEY" --arg v "$MIGRATION_TOOL_ID" '.metadata.labels[$k]==$v' >/dev/null; then
870+
echo " - (optional) pv2 PVC present: $pv2_pvc (KEEP unless decommissioning)"
866871
fi
867872
fi
868-
869-
# Snapshot (safe to delete once satisfied with migration)
870-
if [[ -n "$snap" ]] && kcmd get volumesnapshot "$snap" -n "$ns" >/dev/null 2>&1; then
871-
if kcmd get volumesnapshot "$snap" -n "$ns" -o json \
872-
| jq -e --arg k "$CREATED_BY_LABEL_KEY" --arg v "$MIGRATION_TOOL_ID" '.metadata.labels[$k]==$v' >/dev/null; then
873-
echo " - delete snapshot: kubectl delete volumesnapshot $snap -n $ns"
874-
fi
873+
if [[ -n "$snap" ]] && kcmd get volumesnapshot "$snap" -n "$ns" >/dev/null 2>&1 && \
874+
kcmd get volumesnapshot "$snap" -n "$ns" -o json | jq -e --arg k "$CREATED_BY_LABEL_KEY" --arg v "$MIGRATION_TOOL_ID" '.metadata.labels[$k]==$v' >/dev/null; then
875+
echo " - delete snapshot: kubectl delete volumesnapshot $snap -n $ns"
875876
fi
876-
877-
# Original PV (in dual mode original disk persists; user decision)
878877
if [[ "$mode" == "dual" && -n "$pv" ]]; then
879-
echo " - (review) original PV: $pv (delete only after confirming replacement in production)"
878+
echo " - (review) original PV: $pv"
880879
fi
881-
882880
else
883881
$failed_header_printed || {
884882
echo
885883
warn "Artifacts for incomplete/pending migrations"
886-
echo " (Review before deletion; these may be needed for retry or rollback)"
884+
echo " (Review before deletion; may be needed for retry/rollback)"
887885
failed_header_printed=true
888886
}
889887
any=true
890888
echo " Incomplete PVC: $ns/$pvc"
891889

892-
# List any intermediate or target artifacts existing
893890
if [[ "$mode" == "dual" ]]; then
894-
if kcmd get pvc "$int_pvc" -n "$ns" >/dev/null 2>&1; then
895-
echo " - intermediate PVC exists: $int_pvc (kubectl describe pvc $int_pvc -n $ns)"
896-
fi
897-
if kcmd get pv "$int_pv" >/dev/null 2>&1; then
898-
echo " - intermediate PV exists: $int_pv (kubectl describe pv $int_pv)"
899-
fi
900-
if kcmd get pvc "$pv2_pvc" -n "$ns" >/dev/null 2>&1; then
901-
echo " - pv2 PVC (target) exists: $pv2_pvc (kubectl describe pvc $pv2_pvc -n $ns)"
902-
fi
891+
[[ "$(kcmd get pvc "$int_pvc" -n "$ns" -o name 2>/dev/null || true)" ]] && \
892+
echo " - intermediate PVC exists: $int_pvc"
893+
[[ "$(kcmd get pv "$int_pv" -o name 2>/dev/null || true)" ]] && \
894+
echo " - intermediate PV exists: $int_pv"
895+
[[ "$(kcmd get pvc "$pv2_pvc" -n "$ns" -o name 2>/dev/null || true)" ]] && \
896+
echo " - pv2 PVC (target) exists: $pv2_pvc"
903897
else
904-
# inplace: target pvc == pvc
905-
if kcmd get pvc "$pvc" -n "$ns" >/dev/null 2>&1; then
898+
[[ "$(kcmd get pvc "$pvc" -n "$ns" -o name 2>/dev/null || true)" ]] && \
906899
echo " - current PVC phase: $(kcmd get pvc "$pvc" -n "$ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)"
907-
fi
908900
fi
909901
if [[ -n "$snap" ]] && kcmd get volumesnapshot "$snap" -n "$ns" >/dev/null 2>&1; then
910-
echo " - snapshot exists: $snap (kubectl describe volumesnapshot $snap -n $ns)"
902+
echo " - snapshot exists: $snap"
911903
fi
912-
if [[ -n "$pv" ]]; then
913-
echo " - source PV: $pv (kubectl describe pv $pv)"
914-
fi
915-
echo " - retry guidance: leave artifacts intact; script will reuse or recreate as needed."
904+
[[ -n "$pv" ]] && echo " - source PV: $pv"
905+
echo " - retry guidance: leave artifacts intact; script will reuse them."
906+
fi
907+
908+
# Released PremiumV2 PVs whose claimRef still points at this claim but which are not its current PV (likely leftover pv2 PVs after a rollback, usually in inplace mode)
909+
if [[ -n "$released_pv_lines" ]]; then
910+
local had_rel=false
911+
while IFS=$'\t' read -r rns rpvc rpv rreclaim rsc rcap rsku; do
912+
[[ -z "$rns" ]] && continue
913+
if [[ "$rns" == "$ns" && "$rpvc" == "$pvc" && "$rpv" != "$pv" ]]; then
914+
$had_rel || { echo " - released PremiumV2 PV(s) associated (not currently) with claim:"; had_rel=true; }
915+
echo " * $rpv (sku=$rsku reclaim=${rreclaim:-?} sc=${rsc:-?} size=${rcap:-?})"
916+
echo " inspect: kubectl describe pv $rpv"
917+
echo " delete : kubectl delete pv $rpv # after verifying data & rollback success"
918+
fi
919+
done <<< "$released_pv_lines"
916920
fi
917921
done
918922

hack/premium-to-premiumv2-migrator-dualpvc.sh

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -150,9 +150,6 @@ EOF
150150
return 2
151151
}
152152

153-
# ---------------- Discover Tagged PVCs ----------------
154-
populate_pvcs
155-
156153
# ------------- Pre-Req & Conflicts -------------
157154
print_combined_validation_report_and_exit_if_needed() {
158155
local prereq_count
@@ -193,7 +190,7 @@ for ENTRY in "${MIG_PVCS[@]}"; do
193190
continue
194191
fi
195192

196-
DONE_LABEL=$(kcmd get pvc "$pvc" -n "$pvc_ns" -o jsonpath="{.metadata.labels['$MIGRATION_DONE_LABEL_KEY']}" 2>/dev/null || true)
193+
DONE_LABEL=$(kcmd get pvc "$pvc" -n "$pvc_ns" -o go-template="{{ index .metadata.labels \"${MIGRATION_DONE_LABEL_KEY}\" }}" 2>/dev/null || true)
197194
[[ "$DONE_LABEL" == "$MIGRATION_DONE_LABEL_VALUE" ]] && { info "Already migrated $pvc_ns/$pvc"; continue; }
198195

199196
pv=$(kcmd get pvc "$pvc" -n "$pvc_ns" -o jsonpath='{.spec.volumeName}' 2>/dev/null || true)
@@ -241,14 +238,14 @@ for ENTRY in "${MIG_PVCS[@]}"; do
241238
done
242239

243240
# ------------- Monitoring Loop -------------
244-
deadline=$(( $(date +%s) + MONITOR_TIMEOUT_MINUTES*60 ))
241+
deadline=$(( $(date +%s) + MONITOR_TIMEOUT_MINUTES * 60 ))
245242
info "Monitoring migrations (timeout ${MONITOR_TIMEOUT_MINUTES}m)..."
246243

247244
while true; do
248245
ALL_DONE=true
249246
for ENTRY in "${MIG_PVCS[@]}"; do
250247
pvc_ns="${ENTRY%%|*}" pvc="${ENTRY##*|}" pv2_pvc="$(name_pv2_pvc "$pvc")"
251-
if kcmd get pvc "$pvc" -n "$pvc_ns" -o jsonpath="{.metadata.labels['$MIGRATION_DONE_LABEL_KEY']}" 2>/dev/null | grep -q "^${MIGRATION_DONE_LABEL_VALUE}\$"; then
248+
if kcmd get pvc "$pvc" -n "$pvc_ns" -o go-template="{{ index .metadata.labels \"${MIGRATION_DONE_LABEL_KEY}\" }}" | grep -q "^${MIGRATION_DONE_LABEL_VALUE}\$"; then
252249
continue
253250
fi
254251
STATUS=$(kcmd get pvc "$pv2_pvc" -n "$pvc_ns" -o jsonpath='{.status.phase}' 2>/dev/null || true)
@@ -267,7 +264,7 @@ while true; do
267264
if [[ "$STATUS" == "Bound" && -z "$reason" && $MIGRATION_FORCE_INPROGRESS_AFTER_MINUTES -gt 0 ]]; then
268265
orig_pv=$(kcmd get pvc "$pvc" -n "$pvc_ns" -o jsonpath='{.spec.volumeName}' 2>/dev/null || true)
269266
[[ -z "$orig_pv" ]] && continue
270-
inprog_val=$(kcmd get pv "$orig_pv" -o jsonpath="{.metadata.labels['$MIGRATION_INPROGRESS_LABEL_KEY']}" 2>/dev/null || true)
267+
inprog_val=$(kcmd get pv "$orig_pv" -o go-template="{{ index .metadata.labels \"${MIGRATION_INPROGRESS_LABEL_KEY}\" }}" 2>/dev/null || true)
271268
if [[ "$inprog_val" != "$MIGRATION_INPROGRESS_LABEL_VALUE" ]]; then
272269
cts=$(kcmd get pvc "$pv2_pvc" -n "$pvc_ns" -o jsonpath='{.metadata.creationTimestamp}' 2>/dev/null || true)
273270
if [[ -n "$cts" ]]; then
@@ -292,11 +289,11 @@ while true; do
292289
warn "Monitor timeout reached."
293290
for ENTRY in "${MIG_PVCS[@]}"; do
294291
pvc_ns="${ENTRY%%|*}" pvc="${ENTRY##*|}"
295-
kcmd get pvc "$pvc" -n "$pvc_ns" -o jsonpath="{.metadata.labels['$MIGRATION_DONE_LABEL_KEY']}" 2>/dev/null | \
292+
kcmd get pvc "$pvc" -n "$pvc_ns" -o go-template="{{ index .metadata.labels \"${MIGRATION_DONE_LABEL_KEY}\" }}" | \
296293
grep -q "^${MIGRATION_DONE_LABEL_VALUE}\$" && continue
297294
orig_pv=$(kcmd get pvc "$pvc" -n "$pvc_ns" -o jsonpath='{.spec.volumeName}' 2>/dev/null || true)
298295
[[ -z "$orig_pv" ]] && continue
299-
inprog_val=$(kcmd get pv "$orig_pv" -o jsonpath="{.metadata.labels['$MIGRATION_INPROGRESS_LABEL_KEY']}" 2>/dev/null || true)
296+
inprog_val=$(kcmd get pv "$orig_pv" -o go-template="{{ index .metadata.labels \"${MIGRATION_INPROGRESS_LABEL_KEY}\" }}" 2>/dev/null || true)
300297
if [[ "$inprog_val" != "$MIGRATION_INPROGRESS_LABEL_VALUE" ]]; then
301298
warn "Timeout fallback: labeling PV $orig_pv with ${MIGRATION_INPROGRESS_LABEL_KEY}=${MIGRATION_INPROGRESS_LABEL_VALUE}"
302299
kcmd label pv "$orig_pv" "${MIGRATION_INPROGRESS_LABEL_KEY}=${MIGRATION_INPROGRESS_LABEL_VALUE}" --overwrite
@@ -315,7 +312,7 @@ echo
315312
info "Summary:"
316313
for entry in "${MIG_PVCS[@]}"; do
317314
ns="${entry%%|*}" pvc="${entry##*|}"
318-
lbl=$(kcmd get pvc "$pvc" -n "$ns" -o jsonpath="{.metadata.labels['$MIGRATION_DONE_LABEL_KEY']}" 2>/dev/null || true)
315+
lbl=$(kcmd get pvc "$pvc" -n "$ns" -o go-template="{{ index .metadata.labels \"${MIGRATION_DONE_LABEL_KEY}\" }}" 2>/dev/null || true)
319316
if [[ "$lbl" == "$MIGRATION_DONE_LABEL_VALUE" ]]; then
320317
echo "$ns/$pvc migrated"
321318
else
@@ -334,6 +331,6 @@ if (( ${#PV2_BIND_TIMEOUTS[@]} > 0 )); then
334331
printf ' - %s\n' "${PV2_BIND_TIMEOUTS[@]}"
335332
fi
336333

337-
print_migration_cleanup_report
334+
MIGRATION_MODE=dual print_migration_cleanup_report
338335
finalize_audit_summary "$SCRIPT_START_TS" "$SCRIPT_START_EPOCH"
339336
ok "Script finished."

0 commit comments

Comments
 (0)