Skip to content

Commit ec9e1ba

Browse files
svteb authored and martin-mat committed
Fix: CNF uninstallation resource discovery
Refs: #2310 - Original implementation of wait_for_deployment_uninstallation utilized label discovery to find descendant resources of some core resource, this introduced errors as labels weren't a sufficient identifier. To resolve this issue we now search for descendants through UIDs and ownerRef. - Utility functions to retrieve UIDs and recursively find descendants were introduced in the KubectlClient.Get module - Minor refactors to existing code. - constant WORKLOAD_RESOURCES was extended to include job and cronjob since they are also workload resources. - spec test to verify that failing uninstallation gets caught was tagged incorrectly, resulting in it being skipped in actions (this discovery prompted some changes to it) Signed-off-by: svteb <slavo.valko@tietoevry.com>
1 parent 1fdd2b8 commit ec9e1ba

File tree

5 files changed

+209
-101
lines changed

5 files changed

+209
-101
lines changed

spec/setup_spec.cr

Lines changed: 9 additions & 8 deletions
Original file line number · Diff line number · Diff line change
@@ -221,11 +221,11 @@ describe "Installation" do
221221
end
222222
end
223223

224-
it "cnf_uninstall should fail for a stuck manifest deployment" do
224+
it "cnf_uninstall should fail for a stuck manifest deployment", tags: ["cnf_installation"] do
225225
result = ShellCmd.cnf_install("cnf-path=sample-cnfs/sample_stuck_finalizer/cnf-testsuite.yml")
226226
result[:status].success?.should be_true
227227

228-
result = ShellCmd.cnf_uninstall(timeout: 60, expect_failure: true)
228+
result = ShellCmd.cnf_uninstall(timeout: 30, expect_failure: true)
229229
(/Some resources of "stuck_finalizer" did not finish deleting within the timeout/ =~ result[:output]).should_not be_nil
230230
ensure
231231
# Patch out finalizer
@@ -241,20 +241,21 @@ describe "Installation" do
241241
(/CNF uninstallation skipped/ =~ result[:output]).should_not be_nil
242242
end
243243

244-
it "cnf_uninstall should fail for a stuck helm deployment" do
244+
it "cnf_uninstall should fail for a stuck helm deployment", tags: ["cnf_installation"] do
245245
result = ShellCmd.cnf_install("cnf-path=sample-cnfs/sample_stuck_helm_deployment/")
246246
result[:status].success?.should be_true
247247

248248
# Inject finalizer into Deployment pod template
249249
KubectlClient::Utils.patch(
250-
"deployment",
251-
"stuck-nginx",
250+
"pod",
251+
nil,
252252
"cnf-default",
253-
"json",
254-
"[{\"op\":\"add\",\"path\":\"/spec/template/metadata/finalizers\",\"value\":[\"example.com/stuck\"]}]"
253+
"merge",
254+
%({"metadata":{"finalizers":["example.com/stuck"]}}),
255+
{ "app.kubernetes.io/instance" => "stuck-nginx" }
255256
)
256257

257-
result = ShellCmd.cnf_uninstall(timeout: 60, expect_failure: true)
258+
result = ShellCmd.cnf_uninstall(timeout: 30, expect_failure: true)
258259
(/Some resources of "stuck-nginx" did not finish deleting within the timeout/ =~ result[:output]).should_not be_nil
259260
ensure
260261
# Patch out finalizer

src/modules/kubectl_client/kubectl_client.cr

Lines changed: 11 additions & 6 deletions
Original file line number · Diff line number · Diff line change
@@ -11,12 +11,17 @@ module KubectlClient
1111

1212
alias CMDResult = NamedTuple(status: Process::Status, output: String, error: String)
1313
alias BackgroundCMDResult = NamedTuple(process: Process, output: String, error: String)
14-
15-
WORKLOAD_RESOURCES = {deployment: "Deployment",
16-
pod: "Pod",
17-
replicaset: "ReplicaSet",
18-
statefulset: "StatefulSet",
19-
daemonset: "DaemonSet"}
14+
alias ResourceDescendant = NamedTuple(kind: String, name: String, namespace: String?, uid: String )
15+
16+
WORKLOAD_RESOURCES = {
17+
deployment: "Deployment",
18+
pod: "Pod",
19+
replicaset: "ReplicaSet",
20+
statefulset: "StatefulSet",
21+
daemonset: "DaemonSet",
22+
job: "Job",
23+
cronjob: "CronJob"
24+
}
2025

2126
module ShellCMD
2227
# logger should have method name (any other scopes, if necessary) that is calling attached using .for() method.

src/modules/kubectl_client/modules/get.cr

Lines changed: 66 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -546,5 +546,71 @@ module KubectlClient
546546

547547
runtimes.uniq
548548
end
549+
550+
def self.resource_uid(kind : String, resource_name : String, namespace : String? = nil) : String?
551+
logger = @@logger.for("resource_uid")
552+
logger.debug { "Get uid of #{kind}/#{resource_name} in #{namespace || "default"}" }
553+
554+
begin
555+
resource_json = resource(kind, resource_name, namespace)
556+
resource_json.dig?("metadata", "uid").try &.as_s?
557+
rescue KubectlClient::ShellCMD::NotFoundError
558+
nil
559+
end
560+
end
561+
562+
def self.resources_by_owner_uid(kind : String, owner_uid : String, namespace : String? = nil) : Array(JSON::Any)
563+
logger = @@logger.for("resources_by_owner_uid")
564+
logger.debug { "Looking for #{kind} owned by #{owner_uid} in #{namespace || "all namespaces"}" }
565+
566+
resources_json = resource(kind, namespace: namespace, all_namespaces: namespace.nil?)
567+
items = (resources_json.dig?("items").try &.as_a?) || [] of JSON::Any
568+
569+
matched = items.select do |item|
570+
refs = (item.dig?("metadata", "ownerReferences").try &.as_a?) || [] of JSON::Any
571+
refs.any? { |ref| ref["uid"].as_s == owner_uid }
572+
end
573+
574+
matched
575+
end
576+
577+
# find all descendant uids (BFS)
578+
def self.descendants(
579+
root_kind : String,
580+
root_name : String,
581+
root_namespace : String? = nil
582+
) : Array(ResourceDescendant)
583+
seen_uids = Set(String).new
584+
queue = [] of ResourceDescendant
585+
results = [] of ResourceDescendant
586+
587+
queue << { kind: root_kind, name: root_name, namespace: root_namespace, uid: "" }
588+
589+
while current = queue.shift?
590+
kind = current[:kind]
591+
name = current[:name]
592+
namespace = current[:namespace]
593+
594+
uid = resource_uid(kind, name, namespace)
595+
next unless uid
596+
597+
if seen_uids.includes?(uid)
598+
next
599+
end
600+
601+
seen_uids.add(uid)
602+
results << { kind: kind, name: name, namespace: namespace, uid: uid }
603+
604+
# enqueue its direct children
605+
WORKLOAD_RESOURCES.values.each do |child_kind|
606+
resources_by_owner_uid(child_kind, uid, namespace).each do |child|
607+
child_name = child.dig("metadata", "name").as_s
608+
queue << { kind: child_kind, name: child_name, namespace: namespace, uid: "" }
609+
end
610+
end
611+
end
612+
613+
results
614+
end
549615
end
550616
end

src/modules/kubectl_client/modules/wait.cr

Lines changed: 64 additions & 51 deletions
Original file line number · Diff line number · Diff line change
@@ -143,67 +143,80 @@ module KubectlClient
143143
def self.resource_wait_for_uninstall(
144144
kind : String,
145145
resource_name : String,
146-
labels : Hash(String, String) = {} of String => String,
147-
namespace : String? = "default",
148-
wait_count : Int32 = GENERIC_OPERATION_TIMEOUT
146+
namespace : String? = "default",
147+
wait_count : Int32 = GENERIC_OPERATION_TIMEOUT,
148+
descendants : Array(ResourceDescendant) = [] of ResourceDescendant
149149
) : Bool
150-
logger = @@logger.for("resource_wait_for_uninstall")
151-
logger.info { "Waiting for resource #{kind}/#{resource_name} to uninstall" }
152-
153-
seconds_count = 0
154-
deleted_once = false
155-
156-
# Initial fetch (marks deleted_once if missing up front)
157-
resource =
158-
begin
159-
KubectlClient::Get.resource(kind, resource_name, namespace)
160-
rescue KubectlClient::ShellCMD::NotFoundError
161-
deleted_once = true
162-
EMPTY_JSON
163-
end
164-
165-
until seconds_count > wait_count
166-
if seconds_count % RESOURCE_WAIT_LOG_INTERVAL == 0
167-
logger.info { "seconds elapsed while waiting: #{seconds_count}" }
150+
logger = @@logger.for("resource_wait_for_uninstall")
151+
logger.info { "Waiting up to #{wait_count}s for #{kind}/#{resource_name} to uninstall" }
152+
153+
# Make a mutable copy of the descendant list
154+
pending = descendants.dup
155+
seconds = 0
156+
root_alive = true
157+
158+
while seconds <= wait_count
159+
if (seconds % RESOURCE_WAIT_LOG_INTERVAL).zero?
160+
logger.info { "#{seconds}s elapsed, pending descendants=#{pending.size}" }
168161
end
169-
170-
# Only call Get.resource until we see it NotFound
171-
present = if !deleted_once
162+
163+
# Check whether the root resource still exists
164+
if root_alive
172165
begin
173166
KubectlClient::Get.resource(kind, resource_name, namespace)
174-
true
175167
rescue KubectlClient::ShellCMD::NotFoundError
176-
deleted_once = true
177-
false
168+
root_alive = false
169+
logger.info { "Root resource #{kind}/#{resource_name} is gone" }
178170
end
179-
else
180-
false
181171
end
182-
183-
# Once parent is gone, look for any leftover pods
184-
pods = if !present
185-
all_pods = KubectlClient::Get.resource("pods", nil, namespace).dig("items").as_a
186-
KubectlClient::Get.pods_by_labels(all_pods, labels)
187-
elsif resource != EMPTY_JSON
188-
begin
189-
KubectlClient::Get.pods_by_resource_labels(resource, namespace)
190-
rescue KubectlClient::ShellCMD::NotFoundError
191-
[] of JSON::Any
172+
173+
# Once the root is gone, prune descendants by UID
174+
unless root_alive
175+
pending.clone.each do |descendant|
176+
begin
177+
data = KubectlClient::Get.resource(
178+
descendant[:kind],
179+
nil,
180+
descendant[:namespace],
181+
descendant[:namespace].nil?,
182+
)
183+
184+
items = data["items"].as_a? || [] of JSON::Any
185+
exists = items.any? do |obj|
186+
obj["metadata"]["uid"].as_s == descendant[:uid]
187+
end
188+
rescue KubectlClient::ShellCMD::NotFoundError
189+
exists = false
190+
end
191+
192+
unless exists
193+
pending.delete(descendant)
194+
logger.info { "Descendant #{descendant[:kind]}/#{descendant[:name]} (uid=#{descendant[:uid]}) deleted, #{pending.size} remaining" }
195+
end
196+
end
197+
198+
if pending.empty?
199+
logger.info { "#{kind}/#{resource_name} and all descendants are deleted" }
200+
return true
192201
end
193-
else
194-
[] of JSON::Any
195-
end
196-
197-
if !present && pods.empty?
198-
logger.info { "#{kind}/#{resource_name} fully deleted" }
199-
return true
200202
end
201-
202-
sleep 1.second
203-
seconds_count += 1
203+
204+
sleep 1
205+
seconds += 1
204206
end
205-
206-
logger.warn { "#{kind}/#{resource_name} still present after #{wait_count}s" }
207+
208+
# If we reach here, we timed out
209+
leftovers = [] of String
210+
leftovers << "root=#{kind}/#{resource_name}" if root_alive
211+
212+
unless pending.empty?
213+
details = pending.map do |desc|
214+
"#{desc[:kind]}/#{desc[:name]}(namespace=#{desc[:namespace]})"
215+
end.join(", ")
216+
leftovers << "pending=#{details}"
217+
end
218+
219+
logger.warn { "Timeout after #{wait_count}s; still present: #{leftovers.join(" & ")}" }
207220
false
208221
end
209222

0 commit comments

Comments
 (0)