Skip to content

Commit 9a57690

Browse files
tedim52 and claude committed
fix(k8s): add warning log for partially degraded logs collector pods
Surface partial pod failures in the logs collector DaemonSet via a warning log instead of silently reporting Running when some pods are stopped. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 04f1248 commit 9a57690

File tree

1 file changed

+13
-5
lines changed
  • container-engine-lib/lib/backend_impls/kubernetes/kubernetes_kurtosis_backend/logs_collector_functions

1 file changed

+13
-5
lines changed

container-engine-lib/lib/backend_impls/kubernetes/kubernetes_kurtosis_backend/logs_collector_functions/shared_helpers.go

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"github.com/kurtosis-tech/kurtosis/container-engine-lib/lib/backend_impls/kubernetes/kubernetes_kurtosis_backend/shared_helpers"
66
"github.com/kurtosis-tech/kurtosis/container-engine-lib/lib/backend_impls/kubernetes/kubernetes_manager"
7+
"github.com/sirupsen/logrus"
78
"github.com/kurtosis-tech/kurtosis/container-engine-lib/lib/backend_impls/kubernetes/kubernetes_resource_collectors"
89
"github.com/kurtosis-tech/kurtosis/container-engine-lib/lib/backend_impls/kubernetes/object_attributes_provider/kubernetes_label_key"
910
"github.com/kurtosis-tech/kurtosis/container-engine-lib/lib/backend_impls/kubernetes/object_attributes_provider/label_value_consts"
@@ -269,8 +270,8 @@ func getLogsCollectorsObjectFromKubernetesResources(ctx context.Context, kuberne
269270
}
270271

271272
// TODO: container status is outdated for k8s pods (see TODO in shared_helpers.GetContainerStatusFromPod)
272-
// in the meantime logs collector status is container.ContainerStatus_Running if all pods managed by the logs collector DaemonSet are running
273-
// if one is failing/or stopped, the logs collector is to considered to be stopped
273+
// logs collector status is container.ContainerStatus_Running if at least one pod managed by the logs collector DaemonSet is running
274+
// if some pods are stopped while others are running, a warning is logged to surface partial degradation
274275
func getLogsCollectorStatus(ctx context.Context, kubernetesManager *kubernetes_manager.KubernetesManager, logsCollectorDaemonSet *v1.DaemonSet) (container.ContainerStatus, error) {
275276
logsCollectorPods, err := kubernetesManager.GetPodsManagedByDaemonSet(ctx, logsCollectorDaemonSet)
276277
if err != nil {
@@ -281,22 +282,29 @@ func getLogsCollectorStatus(ctx context.Context, kubernetesManager *kubernetes_m
281282
return container.ContainerStatus_Stopped, stacktrace.NewError("No pods managed by logs collector daemon set were found. There should be at least one. This is likely a bug in Kurtosis.")
282283
}
283284

284-
hasRunningPod := false
285+
runningPods := 0
286+
stoppedPods := 0
285287
for _, pod := range logsCollectorPods {
286288
podStatus, err := shared_helpers.GetContainerStatusFromPod(pod)
287289
if err != nil {
288290
return container.ContainerStatus_Stopped, stacktrace.Propagate(err, "An error occurred retrieving container status for a pod managed by logs collectors collector daemon set '%v' with name: %v\n", logsCollectorDaemonSet.Name, pod.Name)
289291
}
290292

291293
if podStatus == container.ContainerStatus_Running {
292-
hasRunningPod = true
294+
runningPods++
295+
} else {
296+
stoppedPods++
293297
}
294298
}
295299

296-
if !hasRunningPod {
300+
if runningPods == 0 {
297301
return container.ContainerStatus_Stopped, nil
298302
}
299303

304+
if stoppedPods > 0 {
305+
logrus.Warnf("Logs collector daemon set '%v' has %d stopped pods out of %d total pods. The collector is partially degraded.", logsCollectorDaemonSet.Name, stoppedPods, len(logsCollectorPods))
306+
}
307+
300308
return container.ContainerStatus_Running, nil
301309
}
302310

0 commit comments

Comments
 (0)