Skip to content

Commit 093ff4c

Browse files
committed
Address CodeRabbit follow-up issues in backends and k8s smoke scripts
- fix backend executor/fallback validation and run_job_and_wait cleanup
- harden declarative/slurm/kubernetes behavior and NCCL warning handling
- make single/multi-node smoke scripts CI-safe with automatic cleanup
- tighten kubectl manifest-validation fallback rules

Signed-off-by: Emanuel Scoullos <escoullos@nvidia.com>
1 parent b766b9a commit 093ff4c

File tree

12 files changed

+137
-72
lines changed

12 files changed

+137
-72
lines changed

nemo_skills/pipeline/backends/factory.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,12 @@ def get_backend(
106106

107107
# Normalize executor name
108108
executor = executor.lower()
109+
primary_config = dict(cluster_config)
110+
primary_config["executor"] = executor
109111

110112
# Try primary backend
111113
try:
112-
backend = BackendFactory._create_backend(executor, cluster_config)
114+
backend = BackendFactory._create_backend(executor, primary_config)
113115

114116
# Health check
115117
if backend.health_check():
@@ -124,9 +126,12 @@ def get_backend(
124126
# Try fallback if configured
125127
fallback_executor = cluster_config.get("fallback_executor")
126128
if fallback and fallback_executor:
129+
fallback_executor = fallback_executor.lower()
127130
LOG.info(f"Attempting fallback to {fallback_executor} backend")
128131
try:
129-
fallback_backend = BackendFactory._create_backend(fallback_executor, cluster_config)
132+
fallback_config = dict(cluster_config)
133+
fallback_config["executor"] = fallback_executor
134+
fallback_backend = BackendFactory._create_backend(fallback_executor, fallback_config)
130135
if fallback_backend.health_check():
131136
LOG.info(f"Successfully initialized fallback {fallback_executor} backend")
132137
return fallback_backend

nemo_skills/pipeline/backends/integration.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -295,11 +295,12 @@ def run_job_and_wait(
295295
backend = get_backend(cluster_config)
296296
handle = backend.submit_job(spec)
297297
LOG.info(f"Submitted job {handle.job_id} to {backend.name} backend")
298-
299-
status = backend.wait_for_completion(handle, timeout=timeout)
300-
LOG.info(f"Job {handle.job_id} finished with status: {status.value}")
301-
302-
return status
298+
try:
299+
status = backend.wait_for_completion(handle, timeout=timeout)
300+
LOG.info(f"Job {handle.job_id} finished with status: {status.value}")
301+
return status
302+
finally:
303+
backend.cleanup(handle)
303304

304305

305306
def is_kubernetes_cluster(cluster_config: Dict) -> bool:

nemo_skills/pipeline/backends/kubernetes.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,11 @@ def __init__(self, cluster_config: Dict):
8888
self.config = cluster_config
8989

9090
# Validate config
91-
if cluster_config.get("executor") != "kubernetes":
91+
try:
92+
executor = cluster_config["executor"]
93+
except KeyError as exc:
94+
raise ValueError("KubernetesBackend requires executor='kubernetes' in config") from exc
95+
if executor != "kubernetes":
9296
raise ValueError("KubernetesBackend requires executor='kubernetes' in config")
9397

9498
self.namespace = cluster_config.get("namespace", "default")
@@ -499,7 +503,11 @@ def _inject_rdma_resources(self, containers: list):
499503
continue
500504
except (TypeError, ValueError):
501505
# Keep behavior permissive if a custom quantity format appears.
502-
pass
506+
LOG.warning(
507+
"Unable to parse GPU quantity for container '%s' (value=%r); continuing RDMA resource injection",
508+
container.name,
509+
gpu_count,
510+
)
503511

504512
limits[resource_name] = resource_count
505513
requests[resource_name] = resource_count

nemo_skills/pipeline/backends/local.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,15 @@ class LocalBackend(ComputeBackend):
9494

9595
def __init__(self, cluster_config: Dict):
9696
self.config = cluster_config
97-
self.use_docker = cluster_config.get("executor") == "local"
97+
try:
98+
executor = cluster_config["executor"]
99+
except KeyError as exc:
100+
raise ValueError("LocalBackend requires executor='local' or 'none' in config") from exc
101+
102+
if executor not in {"local", "none"}:
103+
raise ValueError("LocalBackend requires executor='local' or 'none' in config")
104+
105+
self.use_docker = executor == "local"
98106
self._jobs: Dict[str, LocalJob] = {}
99107

100108
@property

nemo_skills/pipeline/backends/slurm.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,13 +127,15 @@ def submit_job(self, spec: JobSpec) -> JobHandle:
127127

128128
# Submit using existing infrastructure
129129
with get_exp(spec.name, self.config) as exp:
130+
requested_gpus = main_container.resources.gpus if main_container.resources.gpus is not None else None
131+
130132
task = add_task(
131133
exp=exp,
132134
cmd=cmd,
133135
task_name=spec.name,
134136
cluster_config=self.config,
135137
container=container_image,
136-
num_gpus=main_container.resources.gpus or None,
138+
num_gpus=requested_gpus,
137139
num_nodes=1,
138140
heterogeneous=is_heterogeneous,
139141
)

nemo_skills/pipeline/utils/declarative.py

Lines changed: 31 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ def _sanitize_k8s_name(name: str, max_length: int = 63) -> tuple[str, bool]:
257257
if not name:
258258
name = "job"
259259

260-
was_modified = name != original.lower() or original != original.lower()
260+
was_modified = name != original
261261
return name, was_modified
262262

263263

@@ -537,9 +537,10 @@ def _run_nemo_run(
537537
LOG.info(f"Job '{job_name}' depends on task handle '{dep}' (from reused experiment)")
538538
elif isinstance(dep, dict):
539539
# Dict dependency = internal job reference (by job spec object)
540-
dep_name = dep.get("name")
541-
if not dep_name:
542-
raise ValueError(f"Job dependency must have a 'name' field: {dep}")
540+
try:
541+
dep_name = dep["name"]
542+
except KeyError as exc:
543+
raise ValueError(f"Job dependency must have a 'name' field: {dep}") from exc
543544
if dep_name in job_name_to_handle:
544545
internal_deps.append(job_name_to_handle[dep_name])
545546
LOG.info(
@@ -695,9 +696,10 @@ def _run_kubernetes(self, dry_run: bool = False, log_dir: Optional[str] = None,
695696
LOG.warning(f"External dependency '{dep}' not supported on Kubernetes, skipping")
696697
elif isinstance(dep, dict):
697698
# Dict dependency = internal job reference (same as _run_nemo_run)
698-
dep_name = dep.get("name")
699-
if not dep_name:
700-
raise ValueError(f"Job dependency must have a 'name' field: {dep}")
699+
try:
700+
dep_name = dep["name"]
701+
except KeyError as exc:
702+
raise ValueError(f"Job dependency must have a 'name' field: {dep}") from exc
701703
if dep_name in job_name_to_handle:
702704
dependency_handles.append(job_name_to_handle[dep_name])
703705
LOG.info(f"Job '{original_job_name}' depends on internal job '{dep_name}'")
@@ -744,8 +746,8 @@ def _run_kubernetes(self, dry_run: bool = False, log_dir: Optional[str] = None,
744746
LOG.info(f"Waiting for job '{job_name}' to complete (sequential mode)...")
745747
status = backend.wait_for_completion(handle)
746748
LOG.info(f"Job '{job_name}' completed with status: {status.value}")
747-
if status == JobStatus.FAILED:
748-
raise RuntimeError(f"Job '{job_name}' failed, aborting pipeline")
749+
if status != JobStatus.SUCCEEDED:
750+
raise RuntimeError(f"Job '{job_name}' did not succeed (status={status.value}), aborting pipeline")
749751

750752
if dry_run:
751753
LOG.info("Dry run complete. No jobs were submitted.")
@@ -808,15 +810,12 @@ def _convert_groups_to_job_spec(
808810
# Prepare the command (evaluates lazy commands)
809811
script, exec_config = self._prepare_command(command, self.cluster_config)
810812

811-
# Get the command string
812-
if callable(script.inline):
813-
cmd_result = script.inline()
814-
if isinstance(cmd_result, tuple):
815-
cmd_str, _ = cmd_result
816-
else:
817-
cmd_str = cmd_result
818-
else:
819-
cmd_str = script.inline
813+
# _prepare_command() resolves lazy callables; inline is expected to be a string now.
814+
cmd_str = script.inline
815+
if not isinstance(cmd_str, str):
816+
raise TypeError(
817+
f"Command '{command.name}' must resolve to a string inline command, got {type(cmd_str).__name__}"
818+
)
820819

821820
# Resolve container image
822821
container_image = self._resolve_container(exec_config, command, self.cluster_config)
@@ -843,7 +842,15 @@ def _convert_groups_to_job_spec(
843842
# Get ports from script if available
844843
ports = []
845844
if hasattr(script, "port"):
846-
ports = [script.port]
845+
script_port = script.port
846+
if isinstance(script_port, int) and 1 <= script_port <= 65535:
847+
ports = [script_port]
848+
elif script_port is not None:
849+
LOG.warning(
850+
"Ignoring invalid port value %r on command '%s'; expected int in [1, 65535]",
851+
script_port,
852+
command.name,
853+
)
847854

848855
# Create container spec
849856
container = ContainerSpec(
@@ -926,7 +933,11 @@ def _print_dry_run_job(self, job_name: str, spec: JobSpec):
926933
LOG.info(f" - {container.name}")
927934
LOG.info(f" Image: {container.image}")
928935
LOG.info(f" GPUs: {container.resources.gpus}")
929-
LOG.info(f" Command: {' '.join(container.command[:50])}...")
936+
command_text = " ".join(container.command)
937+
max_chars = 200
938+
if len(command_text) > max_chars:
939+
command_text = f"{command_text[:max_chars]}..."
940+
LOG.info(f" Command: {command_text}")
930941
if spec.dependencies:
931942
LOG.info(f"Dependencies: {spec.dependencies}")
932943
LOG.info(f"Timeout: {spec.timeout_seconds}s")

scripts/k8s-tests/check_nccl_logs.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -121,8 +121,10 @@ def parse_nccl_logs(log_text: str) -> NCCLCheckResult:
121121
if rank_match:
122122
result.ranks_seen.add(int(rank_match.group(1)))
123123

124-
# Errors
125-
if "NCCL WARN" in line or "NCCL ERROR" in line:
124+
# NCCL WARN lines are useful signal but not hard failures by themselves.
125+
if "NCCL WARN" in line:
126+
result.warnings.append(line.strip())
127+
if "NCCL ERROR" in line:
126128
result.errors.append(line.strip())
127129

128130
return result
@@ -189,9 +191,15 @@ def validate_result(
189191
messages.append(f"FAIL: World size {result.world_size} != expected {expected_world}")
190192
passed = False
191193

194+
# Check for NCCL warnings
195+
if result.warnings:
196+
messages.append(f"WARN: {len(result.warnings)} NCCL warning(s):")
197+
for warn in result.warnings[:5]:
198+
messages.append(f" - {warn}")
199+
192200
# Check for NCCL errors
193201
if result.errors:
194-
messages.append(f"FAIL: {len(result.errors)} NCCL error(s)/warning(s):")
202+
messages.append(f"FAIL: {len(result.errors)} NCCL error(s):")
195203
for err in result.errors[:5]:
196204
messages.append(f" - {err}")
197205
passed = False

scripts/k8s-tests/pipeline_smoke_test.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,7 @@
9292
MULTI_NODE_TRAIN_CMD = """
9393
export NCCL_DEBUG=INFO
9494
export NCCL_DEBUG_SUBSYS=INIT,NET
95-
torchrun \\
96-
--nproc_per_node={gpus} \\
97-
--nnodes={nodes} \\
98-
--node_rank=${{NODE_RANK:-0}} \\
99-
--master_addr=${{MASTER_ADDR:-localhost}} \\
100-
--master_port=${{MASTER_PORT:-29500}} \\
101-
-c "
95+
cat > /tmp/multinode_smoke_train.py << 'PYEOF'
10296
import os, torch, torch.distributed as dist
10397
from torch.nn.parallel import DistributedDataParallel as DDP
10498
import torch.nn as nn
@@ -108,7 +102,7 @@
108102
local_rank = int(os.environ.get('LOCAL_RANK', 0))
109103
device = torch.device(f'cuda:{{local_rank}}')
110104
torch.cuda.set_device(device)
111-
print(f'[Rank {{rank}}] Node {{os.environ.get(\"NODE_RANK\",\"?\")}} GPU: {{torch.cuda.get_device_name(device)}}')
105+
print(f'[Rank {{rank}}] Node {{os.environ.get("NODE_RANK","?")}} GPU: {{torch.cuda.get_device_name(device)}}')
112106
113107
model = DDP(nn.Linear(64, 64).to(device), device_ids=[local_rank])
114108
x = torch.randn(16, 64, device=device)
@@ -125,7 +119,14 @@
125119
print(f' World size: {{dist.get_world_size()}}')
126120
print(f' GPU: {{torch.cuda.get_device_name(0)}}')
127121
dist.destroy_process_group()
128-
"
122+
PYEOF
123+
torchrun \\
124+
--nproc_per_node={gpus} \\
125+
--nnodes={nodes} \\
126+
--node_rank=${{NODE_RANK:-0}} \\
127+
--master_addr=${{MASTER_ADDR:-localhost}} \\
128+
--master_port=${{MASTER_PORT:-29500}} \\
129+
/tmp/multinode_smoke_train.py
129130
"""
130131

131132

@@ -243,11 +244,8 @@ def main():
243244

244245
if status in (JobStatus.SUCCEEDED, JobStatus.FAILED):
245246
print(f"\n--- Logs from {name} ---")
246-
try:
247-
for line in backend.get_logs(handle):
248-
print(line, end="" if line.endswith("\n") else "\n")
249-
except Exception as e:
250-
print(f"Failed to get logs: {e}")
247+
for line in backend.get_logs(handle):
248+
print(line, end="" if line.endswith("\n") else "\n")
251249

252250
if status == JobStatus.FAILED:
253251
print(f"\nERROR: Job '{name}' failed")

scripts/k8s-tests/run_sft_k8s_real.py

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -171,11 +171,8 @@ def main():
171171

172172
if status in (JobStatus.SUCCEEDED, JobStatus.FAILED):
173173
print("\n--- Logs ---")
174-
try:
175-
for line in backend.get_logs(handle):
176-
print(line, end="" if line.endswith("\n") else "\n")
177-
except Exception as e:
178-
print(f"Log error: {e}")
174+
for line in backend.get_logs(handle):
175+
print(line, end="" if line.endswith("\n") else "\n")
179176

180177
# Cleanup
181178
backend.cleanup(handle)

scripts/k8s-tests/smoke_test_multi_node.sh

Lines changed: 18 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,21 @@ SERVICE_NAME="${JOB_NAME}-workers"
2828
MASTER_PORT=29500
2929
TIMEOUT_SECONDS=900 # 15 minutes
3030
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
31+
CLEANUP_DONE=0
32+
33+
cleanup_resources() {
34+
if [[ "$CLEANUP_DONE" -eq 1 ]]; then
35+
return
36+
fi
37+
CLEANUP_DONE=1
38+
39+
echo ""
40+
echo "Cleaning up Kubernetes resources..."
41+
kubectl delete job "$JOB_NAME" -n "$NAMESPACE" --ignore-not-found >/dev/null 2>&1 || true
42+
kubectl delete service "$SERVICE_NAME" -n "$NAMESPACE" --ignore-not-found >/dev/null 2>&1 || true
43+
}
44+
45+
trap cleanup_resources EXIT INT TERM
3146

3247
while [[ $# -gt 0 ]]; do
3348
case $1 in
@@ -267,12 +282,7 @@ if [ -n "$FIRST_POD" ]; then
267282
--expected-gpus-per-node "$NUM_GPUS" || true
268283
fi
269284

270-
# Cleanup
271-
echo ""
272-
read -p "Delete job and service? [y/N] " -n 1 -r
273-
echo
274-
if [[ $REPLY =~ ^[Yy]$ ]]; then
275-
kubectl delete job "$JOB_NAME" -n "$NAMESPACE" --ignore-not-found
276-
kubectl delete service "$SERVICE_NAME" -n "$NAMESPACE" --ignore-not-found
277-
echo "Resources deleted."
285+
if [[ "$JOB_STATUS" != "succeeded" ]]; then
286+
echo "Smoke test failed."
287+
exit 1
278288
fi

0 commit comments

Comments (0)