Skip to content

Commit 0c3bade

Browse files
committed
[connectedk8s troubleshoot] Adding kube-namespace parameter
Work item: https://msazure.visualstudio.com/One/_workitems/edit/33153715
1 parent 109b85f commit 0c3bade

File tree

4 files changed

+108
-1
lines changed

4 files changed

+108
-1
lines changed

src/connectedk8s/azext_connectedk8s/_constants.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,7 @@
304304
KAP_CR_Save_Failed_Fault_Type = "Error occured while fetching KAP CR snapshot"
305305
Fetch_KAP_CR_Save_Failed_Fault_Type = "Exception occured while fetching KAP CR snapshot"
306306
Fetch_Arc_Agent_Logs_Failed_Fault_Type = "Error occured in arc agents logger"
307+
Fetch_Namespace_Pod_Logs_Failed_Fault_Type = "Error occured in namespace pods logger"
307308
Fetch_Arc_Agents_Events_Logs_Failed_Fault_Type = (
308309
"Error occured in arc agents events logger"
309310
)
@@ -371,6 +372,7 @@
371372
# Name of the checks and operations
372373
Retrieve_Arc_Agents_Event_Logs = "retrieved_arc_agents_event_logs"
373374
Retrieve_Arc_Agents_Logs = "retrieved_arc_agents_logs"
375+
Retrieve_Namespace_Logs = "retrieved_namespace_logs"
374376
Retrieve_Deployments_Logs = "retrieved_deployments_logs"
375377
Retrieve_Arc_Workload_Identity_Events_Logs = (
376378
"retrieved_arc_workload_identity_event_logs"

src/connectedk8s/azext_connectedk8s/_params.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -605,3 +605,8 @@ def load_arguments(self: Connectedk8sCommandsLoader, _: CLICommand) -> None:
605605
action="store_true",
606606
help="Skip SSL verification for any cluster connection.",
607607
)
608+
c.argument(
609+
"kube_namespace",
610+
options_list=["--kube-namespace"],
611+
help="Kube namespace to troubleshoot from current machine.",
612+
)

src/connectedk8s/azext_connectedk8s/_troubleshootutils.py

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -287,6 +287,97 @@ def retrieve_arc_agents_logs(
287287

288288
return consts.Diagnostic_Check_Failed, storage_space_available
289289

290+
def retrieve_namespace_logs(
    corev1_api_instance: CoreV1Api,
    filepath_with_timestamp: str,
    storage_space_available: bool,
    kube_namespace: str,
) -> tuple[str, bool]:
    """Save the container logs of every Running pod in ``kube_namespace``.

    Logs are written to
    ``<filepath_with_timestamp>/ns_<kube_namespace>/<pod>/<container>.txt``.
    A folder is created for every pod in the namespace, but logs are only
    fetched for pods whose phase is ``Running``.

    Args:
        corev1_api_instance: Kubernetes core/v1 API client used to list the
            pods and read their logs.
        filepath_with_timestamp: Root folder of the current troubleshoot run.
        storage_space_available: Whether earlier steps still had disk space.
            When ``False`` nothing is collected and the check is reported as
            failed.
        kube_namespace: Namespace whose pod logs should be collected.

    Returns:
        A ``(check_status, storage_space_available)`` tuple, where
        ``check_status`` is ``consts.Diagnostic_Check_Passed`` on success and
        ``consts.Diagnostic_Check_Failed`` otherwise. The flag is returned as
        ``False`` once an out-of-space error has been hit.
    """
    # Local import: only this function needs errno, and the module's import
    # block is left untouched.
    import errno

    print(f"Step: {get_utctimestring()}: Retrieve logs from pods in '{kube_namespace}' namespace.")
    try:
        if storage_space_available:
            # To retrieve all of the pods that are present in the Cluster namespace
            namespace_pod_list = corev1_api_instance.list_namespaced_pod(
                namespace=kube_namespace
            )
            # creating a folder for the namespace inside the timestamp folder
            namespace_folder_name = f"ns_{kube_namespace}"
            namespace_logs_path = os.path.join(
                filepath_with_timestamp, namespace_folder_name
            )
            # Tolerate a pre-existing folder, exactly like the per-pod folders
            # below, so a leftover directory does not fail the whole check.
            with contextlib.suppress(FileExistsError):
                os.mkdir(namespace_logs_path)

            # Traversing through all pods in the namespace
            for each_namespace_pod in namespace_pod_list.items:
                # Fetching the current Pod name and creating a folder with that name
                # inside the namespace folder
                pod_name = each_namespace_pod.metadata.name

                pod_name_logs_path = os.path.join(namespace_logs_path, pod_name)
                with contextlib.suppress(FileExistsError):
                    os.mkdir(pod_name_logs_path)
                # If the pod is not in Running state we wont be able to get logs of the containers
                if each_namespace_pod.status.phase != "Running":
                    continue
                # Traversing through all of the containers present inside each pods
                for each_container in each_namespace_pod.spec.containers:
                    # Fetching the Container name
                    container_name = each_container.name
                    # Reading the logs of this container
                    container_log = corev1_api_instance.read_namespaced_pod_log(
                        name=pod_name, container=container_name, namespace=kube_namespace
                    )
                    # Path to add the pods container logs.
                    namespace_pod_container_logs_path = os.path.join(
                        pod_name_logs_path, container_name + ".txt"
                    )
                    # Explicit UTF-8 so non-ASCII log content cannot raise
                    # UnicodeEncodeError under a narrower platform default encoding.
                    with open(
                        namespace_pod_container_logs_path, "w", encoding="utf-8"
                    ) as container_file:
                        container_file.write(str(container_log))

            return consts.Diagnostic_Check_Passed, storage_space_available

    # For handling storage or OS exception that may occur during the execution
    except OSError as e:
        # Prefer the numeric errno; the string match is kept as a fallback for
        # OSErrors that only carry the code inside their message.
        if e.errno == errno.ENOSPC or "[Errno 28]" in str(e):
            storage_space_available = False
            telemetry.set_exception(
                exception=e,
                fault_type=consts.No_Storage_Space_Available_Fault_Type,
                summary="No space left on device",
            )
            # Remove the partially written output to free the space again.
            shutil.rmtree(filepath_with_timestamp, ignore_errors=False)
        else:
            logger.exception(
                "An exception has occured while trying to fetch the namespace pod "
                "logs from the cluster."
            )
            telemetry.set_exception(
                exception=e,
                fault_type=consts.Fetch_Namespace_Pod_Logs_Failed_Fault_Type,
                summary="Error occured in namespace pods logger",
            )
            diagnoser_output.append(
                "An exception has occured while trying to fetch the namespace pods logs from "
                f"the cluster. Exception: {e}\n"
            )

    # To handle any exception that may occur during the execution
    except Exception as e:
        logger.exception(
            "An exception has occured while trying to fetch the namespace pods logs "
            "from the cluster."
        )
        telemetry.set_exception(
            exception=e,
            fault_type=consts.Fetch_Namespace_Pod_Logs_Failed_Fault_Type,
            summary="Error occured in namespace pods logger",
        )
        diagnoser_output.append(
            "An exception has occured while trying to fetch the namespace pods logs from the "
            f"cluster. Exception: {e}\n"
        )

    # Reached when storage was already exhausted on entry or an exception was handled above.
    return consts.Diagnostic_Check_Failed, storage_space_available
380+
290381

291382
def retrieve_arc_agents_event_logs(
292383
filepath_with_timestamp: str,

src/connectedk8s/azext_connectedk8s/custom.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3992,6 +3992,7 @@ def troubleshoot(
39923992
skip_ssl_verification: bool = False,
39933993
no_wait: bool = False,
39943994
tags: dict[str, str] | None = None,
3995+
kube_namespace: str | None = None,
39953996
) -> None:
39963997
try:
39973998
logger.warning("Diagnoser running. This may take a while ...\n")
@@ -4090,7 +4091,7 @@ def troubleshoot(
40904091

40914092
# Check if agents have been added to the cluster
40924093
arc_agents_pod_list = corev1_api_instance.list_namespaced_pod(
4093-
namespace="azure-arc"
4094+
namespace = "azure-arc",
40944095
)
40954096

40964097
# To verify if arc agents have been added to the cluster
@@ -4102,6 +4103,14 @@ def troubleshoot(
41024103
) = troubleshootutils.retrieve_arc_agents_logs(
41034104
corev1_api_instance, filepath_with_timestamp, storage_space_available
41044105
)
4106+
# For storing pod logs in a given namespace
4107+
if kube_namespace:
4108+
(
4109+
diagnostic_checks[consts.Retrieve_Namespace_Logs],
4110+
storage_space_available,
4111+
) = troubleshootutils.retrieve_namespace_logs(
4112+
corev1_api_instance, filepath_with_timestamp, storage_space_available, kube_namespace
4113+
)
41054114

41064115
# For storing all arc agents events logs
41074116
(

0 commit comments

Comments
 (0)