Skip to content

Commit 1f99f4a

Browse files
Mana (마나) and claude committed
fix: CrashLoopBackOff detection via container state, fix doc schema
- Add derive_effective_status() that checks container waiting/terminated reasons instead of relying solely on pod phase (which stays "Running" even during CrashLoopBackOff) - Detects: CrashLoopBackOff, ImagePullBackOff, ErrImagePull, CreateContainerConfigError, OOMKilled, Error - Fix docs/dash-reference.md: full headless output schema now correctly shows {"clusters": [...], "infrastructure": {...}} wrapper - Add 3 new tests for derive_effective_status - 642 tests passing, 0 clippy warnings Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 14c6b33 commit 1f99f4a

File tree

2 files changed

+176
-54
lines changed

2 files changed

+176
-54
lines changed

docs/dash-reference.md

Lines changed: 77 additions & 53 deletions
Original file line number · Diff line number · Diff line change
@@ -114,64 +114,88 @@ _generated/clusters/
114114
### Full output (`scalex dash --headless`)
115115

116116
```json
117-
[
118-
{
119-
"name": "tower",
120-
"health": "green",
121-
"namespaces": ["default", "kube-system", "argocd"],
122-
"nodes": [
123-
{
124-
"name": "tower-cp-0",
125-
"status": "Ready",
126-
"roles": ["control-plane"],
127-
"cpu_capacity": "4",
128-
"mem_capacity": "8Gi",
129-
"cpu_allocatable": "3800m",
130-
"mem_allocatable": "7Gi"
131-
}
132-
],
133-
"pods": [
134-
{
135-
"name": "coredns-abc123",
136-
"namespace": "kube-system",
137-
"status": "Running",
138-
"ready": "1/1",
139-
"restarts": 0,
140-
"age": "5d",
141-
"node": "tower-cp-0"
142-
}
143-
],
144-
"deployments": [
145-
{
146-
"name": "coredns",
147-
"namespace": "kube-system",
148-
"ready": "2/2",
149-
"up_to_date": 2,
150-
"available": 2,
151-
"age": "5d"
117+
{
118+
"clusters": [
119+
{
120+
"name": "tower",
121+
"health": "green",
122+
"namespaces": ["default", "kube-system", "argocd"],
123+
"nodes": [
124+
{
125+
"name": "tower-cp-0",
126+
"status": "Ready",
127+
"roles": ["control-plane"],
128+
"cpu_capacity": "4",
129+
"mem_capacity": "8Gi",
130+
"cpu_allocatable": "3800m",
131+
"mem_allocatable": "7Gi"
132+
}
133+
],
134+
"pods": [
135+
{
136+
"name": "coredns-abc123",
137+
"namespace": "kube-system",
138+
"status": "Running",
139+
"ready": "1/1",
140+
"restarts": 0,
141+
"age": "5d",
142+
"node": "tower-cp-0"
143+
}
144+
],
145+
"deployments": [
146+
{
147+
"name": "coredns",
148+
"namespace": "kube-system",
149+
"ready": "2/2",
150+
"up_to_date": 2,
151+
"available": 2,
152+
"age": "5d"
153+
}
154+
],
155+
"services": [
156+
{
157+
"name": "kubernetes",
158+
"namespace": "default",
159+
"svc_type": "ClusterIP",
160+
"cluster_ip": "10.233.0.1",
161+
"ports": "443/TCP",
162+
"age": "5d"
163+
}
164+
],
165+
"resource_usage": {
166+
"cpu_percent": 0.0,
167+
"mem_percent": 0.0,
168+
"total_pods": 15,
169+
"running_pods": 15,
170+
"failed_pods": 0,
171+
"total_nodes": 1,
172+
"ready_nodes": 1
152173
}
153-
],
154-
"services": [
174+
}
175+
],
176+
"infrastructure": {
177+
"sdi_pools": [
155178
{
156-
"name": "kubernetes",
157-
"namespace": "default",
158-
"svc_type": "ClusterIP",
159-
"cluster_ip": "10.233.0.1",
160-
"ports": "443/TCP",
161-
"age": "5d"
179+
"pool_name": "tower",
180+
"purpose": "management",
181+
"nodes": [
182+
{
183+
"name": "tower-cp-0",
184+
"ip": "10.0.0.100",
185+
"host": "node-0",
186+
"cpu": 2,
187+
"mem_gb": 4,
188+
"disk_gb": 30,
189+
"status": "running",
190+
"gpu": false
191+
}
192+
]
162193
}
163194
],
164-
"resource_usage": {
165-
"cpu_percent": 0.0,
166-
"mem_percent": 0.0,
167-
"total_pods": 15,
168-
"running_pods": 15,
169-
"failed_pods": 0,
170-
"total_nodes": 1,
171-
"ready_nodes": 1
172-
}
195+
"total_vms": 1,
196+
"running_vms": 1
173197
}
174-
]
198+
}
175199
```
176200

177201
### Filtered output (`scalex dash --headless --resource pods`)

scalex-cli/src/dash/data.rs

Lines changed: 99 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -124,6 +124,10 @@ pub async fn fetch_pods(client: &Client, namespace: Option<&str>) -> Result<Vec<
124124
.map(|s| s.container_statuses.clone().unwrap_or_default())
125125
.unwrap_or_default();
126126

127+
// Derive effective status: check container waiting reasons
128+
// (e.g., CrashLoopBackOff shows phase=Running but container is waiting)
129+
let effective_status = derive_effective_status(&phase, &container_statuses);
130+
127131
let ready_count = container_statuses.iter().filter(|c| c.ready).count();
128132
let total_count = container_statuses.len();
129133
let restarts: i32 = container_statuses.iter().map(|c| c.restart_count).sum();
@@ -137,7 +141,7 @@ pub async fn fetch_pods(client: &Client, namespace: Option<&str>) -> Result<Vec<
137141
PodInfo {
138142
name: meta.name.clone().unwrap_or_default(),
139143
namespace: meta.namespace.clone().unwrap_or_default(),
140-
status: phase,
144+
status: effective_status,
141145
ready: format!("{}/{}", ready_count, total_count),
142146
restarts,
143147
age,
@@ -419,6 +423,42 @@ pub fn compute_resource_usage(nodes: &[NodeInfo], pods: &[PodInfo]) -> ResourceU
419423
}
420424
}
421425

426+
/// Derive effective pod status by checking container waiting reasons.
427+
/// K8s reports phase=Running even when containers are in CrashLoopBackOff.
428+
fn derive_effective_status(
429+
phase: &str,
430+
container_statuses: &[k8s_openapi::api::core::v1::ContainerStatus],
431+
) -> String {
432+
// Check for waiting containers with error reasons
433+
for cs in container_statuses {
434+
if let Some(state) = &cs.state {
435+
if let Some(waiting) = &state.waiting {
436+
if let Some(reason) = &waiting.reason {
437+
match reason.as_str() {
438+
"CrashLoopBackOff"
439+
| "ImagePullBackOff"
440+
| "ErrImagePull"
441+
| "CreateContainerConfigError"
442+
| "InvalidImageName" => {
443+
return reason.clone();
444+
}
445+
_ => {}
446+
}
447+
}
448+
}
449+
// Check for terminated containers with error
450+
if let Some(terminated) = &state.terminated {
451+
if let Some(reason) = &terminated.reason {
452+
if reason == "Error" || reason == "OOMKilled" {
453+
return reason.clone();
454+
}
455+
}
456+
}
457+
}
458+
}
459+
phase.to_string()
460+
}
461+
422462
fn format_age(now: chrono::DateTime<Utc>, created: chrono::DateTime<Utc>) -> String {
423463
let duration = now.signed_duration_since(created);
424464
let secs = duration.num_seconds();
@@ -510,4 +550,62 @@ mod tests {
510550
let created = now - chrono::Duration::days(3);
511551
assert_eq!(format_age(now, created), "3d");
512552
}
553+
554+
#[test]
fn derive_status_returns_phase_when_no_waiting() {
    // With no container statuses to inspect, the pod phase passes through untouched.
    let no_statuses: Vec<k8s_openapi::api::core::v1::ContainerStatus> = Vec::new();
    assert_eq!(derive_effective_status("Running", &no_statuses), "Running");
}
559+
560+
#[test]
fn derive_status_detects_crashloopbackoff() {
    use k8s_openapi::api::core::v1::{ContainerState, ContainerStateWaiting, ContainerStatus};

    // A crash-looping container: the pod phase stays "Running" while the
    // container sits in a waiting state whose reason is CrashLoopBackOff.
    let crash_looping = ContainerStatus {
        name: "app".into(),
        ready: false,
        restart_count: 5,
        image: "test:latest".into(),
        image_id: "".into(),
        state: Some(ContainerState {
            waiting: Some(ContainerStateWaiting {
                reason: Some("CrashLoopBackOff".into()),
                message: None,
            }),
            ..Default::default()
        }),
        ..Default::default()
    };

    let derived = derive_effective_status("Running", &[crash_looping]);
    assert_eq!(derived, "CrashLoopBackOff");
}
585+
586+
#[test]
fn derive_status_detects_oomkilled() {
    use k8s_openapi::api::core::v1::{
        ContainerState, ContainerStateTerminated, ContainerStatus,
    };

    // Phase reports "Running", but the container was terminated by the OOM
    // killer (reason OOMKilled, exit code 137) — that reason should win.
    let oom_killed = ContainerStatus {
        name: "app".into(),
        ready: false,
        restart_count: 1,
        image: "test:latest".into(),
        image_id: "".into(),
        state: Some(ContainerState {
            terminated: Some(ContainerStateTerminated {
                reason: Some("OOMKilled".into()),
                exit_code: 137,
                ..Default::default()
            }),
            ..Default::default()
        }),
        ..Default::default()
    };

    assert_eq!(derive_effective_status("Running", &[oom_killed]), "OOMKilled");
}
513611
}

0 commit comments

Comments (0)