Skip to content

Commit 520b709

Browse files
nammnfealebenpae
authored and committed
cluster-cleaner: handle more edge-cases and force deletions on resources that are stuck (#4164)
# Summary
- brute-force if we are stuck: remove finalizers if a resource is still present after 10 seconds of trying to delete it
- add a bunch more logging
- handle edge cases, e.g. some resources were already deleted elsewhere

## Proof of Work
- running manually in the OpenShift static cluster freed the cluster

## Checklist
- [ ] Have you linked a jira ticket and/or is the ticket in the title?
- [ ] Have you checked whether your jira ticket required DOCSP changes?
- [ ] Have you checked for release_note changes?

## Reminder (Please remove this when merging)
- Please try to Approve or Reject Changes the PR, keep PRs in review as short as possible
- Our Short Guide for PRs: [Link](REDACTED)
- Remember the following Communication Standards - use comment prefixes for clarity:
  * **blocking**: Must be addressed before approval.
  * **follow-up**: Can be addressed in a later PR or ticket.
  * **q**: Clarifying question.
  * **nit**: Non-blocking suggestions.
  * **note**: Side-note, non-actionable. Example: Praise
  * --> no prefix is considered a question
1 parent 4a05978 commit 520b709

File tree

3 files changed

+57
-10
lines changed

3 files changed

+57
-10
lines changed

docker/cluster-cleaner/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
name: cluster-cleaner
22
description: The background cleaner
3-
version: 0.13
3+
version: 0.14

docker/cluster-cleaner/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
IMAGE_VERSION=0.13
1+
IMAGE_VERSION=0.14
22

33
.PHONY: all
44
all: build push install
Lines changed: 55 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,23 @@
11
#!/usr/bin/env sh
22

3+
#######################################
# Delete every resource of one type in a namespace, force-removing any that
# survive a normal delete.
#
# First issues `kubectl delete --all` and waits up to 10 seconds. Whatever is
# still listed afterwards has its metadata.finalizers cleared with a merge
# patch and is then deleted with --force --grace-period=0.
#
# The body runs in a subshell (note the parentheses instead of braces) so the
# working variables below do not overwrite same-named variables in the caller,
# e.g. a `for namespace in ...` loop variable. `local` is avoided because the
# script is executed by plain `sh`.
#
# Arguments:
#   $1 - resource type as understood by kubectl (e.g. "mdb", "mdbu", "om")
#   $2 - namespace name (without the "namespace/" prefix)
# Outputs:
#   Progress messages and kubectl output on stdout/stderr.
# Returns:
#   0 in practice; every destructive kubectl call is best-effort (`|| true`).
#######################################
delete_resources_safely() (
  resource_type="$1"
  namespace="$2"

  echo "Attempting normal deletion of $resource_type in $namespace..."
  kubectl delete "${resource_type}" --all -n "${namespace}" --wait=true --timeout=10s || true

  # Check if any resources are still stuck
  resources=$(kubectl get "${resource_type}" -n "${namespace}" --no-headers -o custom-columns=":metadata.name")

  # Word splitting is intentional here: one resource name per word.
  # shellcheck disable=SC2086
  for resource in ${resources}; do
    echo "${resource_type}/${resource} is still present, force deleting..."

    kubectl patch "${resource_type}" "${resource}" -n "${namespace}" -p '{"metadata":{"finalizers":null}}' --type=merge || true
    # Once the finalizers are gone the object usually disappears right away;
    # --ignore-not-found keeps that expected case from being logged as an error.
    kubectl delete "${resource_type}" "${resource}" -n "${namespace}" --force --grace-period=0 --ignore-not-found || true
  done
)
20+
321
if [ -z ${DELETE_OLDER_THAN_AMOUNT+x} ] || [ -z ${DELETE_OLDER_THAN_UNIT+x} ]; then
422
echo "Need to set both 'DELETE_OLDER_THAN_AMOUNT' and 'DELETE_OLDER_THAN_UNIT' environment variables."
523
exit 1
@@ -11,20 +29,49 @@ if [ -z ${LABELS+x} ]; then
1129
fi
1230

1331
echo "Deleting namespaces for evg tasks that are older than ${DELETE_OLDER_THAN_AMOUNT} ${DELETE_OLDER_THAN_UNIT} with label ${LABELS}"
32+
echo "Which are:"
33+
kubectl get namespace -l "${LABELS}" -o name
1434
for namespace in $(kubectl get namespace -l "${LABELS}" -o name); do
15-
creation_time=$(kubectl get "${namespace}" -o jsonpath='{.metadata.creationTimestamp}')
35+
creation_time=$(kubectl get "${namespace}" -o jsonpath='{.metadata.creationTimestamp}' 2>/dev/null || echo "")
1636

17-
if ! ./is_older_than.py "${creation_time}" "${DELETE_OLDER_THAN_AMOUNT}" "${DELETE_OLDER_THAN_UNIT}"; then
37+
if [ -z "$creation_time" ]; then
38+
echo "Namespace ${namespace} does not exist or has no creation timestamp, skipping."
1839
continue
1940
fi
2041

2142
namespace_name=$(echo "${namespace}" | cut -d '/' -f 2)
2243

23-
csrs_in_namespace=$(kubectl get csr -o name | grep "${namespace_name}")
24-
kubectl delete "${csrs_in_namespace}"
44+
if ! ./is_older_than.py "${creation_time}" "${DELETE_OLDER_THAN_AMOUNT}" "${DELETE_OLDER_THAN_UNIT}"; then
45+
echo "Skipping ${namespace_name}, not old enough."
46+
continue
47+
fi
48+
49+
echo "Deleting ${namespace_name}"
50+
51+
csrs_in_namespace=$(kubectl get csr -o name | grep "${namespace_name}" || true)
52+
if [ -n "${csrs_in_namespace}" ]; then
53+
kubectl delete "${csrs_in_namespace}"
54+
fi
55+
56+
delete_resources_safely "mdb" "${namespace_name}"
57+
delete_resources_safely "mdbu" "${namespace_name}"
58+
delete_resources_safely "om" "${namespace_name}"
59+
60+
echo "Attempting to delete namespace: ${namespace_name}"
2561

26-
kubectl delete mdb --all -n "${namespace_name=}"
27-
kubectl delete mdbu --all -n "${namespace_name=}"
28-
kubectl delete om --all -n "{namespace_name=}"
29-
kubectl delete "${namespace}"
62+
if kubectl get namespace "${namespace_name}" >/dev/null 2>&1; then
63+
kubectl delete namespace "${namespace_name}" --wait=true --timeout=10s || true
64+
else
65+
echo "Namespace ${namespace_name} not found, skipping deletion."
66+
fi
67+
68+
if kubectl get namespace "${namespace_name}" >/dev/null 2>&1; then
69+
echo "Namespace ${namespace_name} is still stuck, removing finalizers..."
70+
kubectl patch namespace "${namespace_name}" -p '{"metadata":{"finalizers":null}}' --type=merge
71+
72+
echo "Force deleting namespace: ${namespace_name}"
73+
kubectl delete namespace "${namespace_name}" --wait=true --timeout=30s
74+
else
75+
echo "Namespace ${namespace_name} deleted successfully."
76+
fi
3077
done

0 commit comments

Comments
 (0)