Skip to content

Commit a2d7288

Browse files
authored
Adding proper logging for device removal case (#912)
* Adding e2e yaml and data nic param
* Test: Fixing snapshot deleting uuid column
* Testing: Fixing e2e tests
* Testing: Fixing e2e yaml summary and adding e2e only
* Testing: Fixing e2e yaml summary and adding e2e only
* Testing: Fixing e2e yaml summary and adding e2e only
* Adding test fixes
* Adding test fixes
* Adding a scheduler
* Fixing deletes for lvol based on count
* Fixing deletes for lvol based on count
* Fixing notification and parallel lvol deletes
* Parallel lvol delete clone delete
* Parallel lvol delete clone delete
* Adding upgrade yaml, fixing upgrade case and adding device remove stress case
* Adding upgrade yaml, fixing upgrade case and adding device remove stress case
* Adding upgrade yaml, fixing upgrade case and adding device remove stress case
* Adding proper logging for device removal case
* Adding proper logging for device removal case
1 parent ff625f6 commit a2d7288

File tree

6 files changed

+42
-19
lines changed

6 files changed

+42
-19
lines changed

.github/workflows/e2e-bootstrap.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name: Bootstrap Cluster + Run E2E Tests
2-
run-name: "E2E Bootstrap | ${{ github.ref_name }} | ${{ inputs.MNODES }}"
2+
run-name: "E2E Bootstrap | ${{ inputs.SBCLI_BRANCH }} | ${{ inputs.MNODES }}"
33

44
on:
55
workflow_call:

.github/workflows/e2e-only.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name: Run E2E Tests on Existing Cluster
2-
run-name: "E2E | ${{ github.ref_name }} | ${{ inputs.MNODES }}"
2+
run-name: "E2E | ${{ inputs.SBCLI_BRANCH }} | ${{ inputs.MNODES }}"
33

44
on:
55
workflow_dispatch:

.github/workflows/stress-run-bootstrap.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name: Bootstrap Cluster + Stress Run Until Failure
2-
run-name: "Stress Bootstrap | ${{ github.ref_name }} | ${{ inputs.MNODES }}"
2+
run-name: "Stress Bootstrap | ${{ inputs.SBCLI_BRANCH }} | ${{ inputs.MNODES }}"
33

44
on:
55
workflow_dispatch:

e2e/e2e_tests/cluster_test_base.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -369,18 +369,39 @@ def collect_management_details(self, post_teardown=False):
369369
cmd = f"{self.base_cmd} cluster list-tasks {self.cluster_id} --limit 0 >& {base_path}/cluster_list_tasks{suffix}.txt"
370370
self.ssh_obj.exec_command(node=self.mgmt_nodes[0],
371371
command=cmd)
372-
372+
373+
# Collect subtasks for balancing_on_restart tasks
374+
try:
375+
tasks_out, _ = self.ssh_obj.exec_command(
376+
node=self.mgmt_nodes[0],
377+
command=f"{self.base_cmd} cluster list-tasks {self.cluster_id} --limit 0"
378+
)
379+
for line in (tasks_out or "").splitlines():
380+
if "balancing_on_restart" not in line:
381+
continue
382+
parts = [p.strip() for p in line.split("|")]
383+
# Table rows have a leading empty cell from '| id | ...'
384+
# Column layout: | id | function | status | ...
385+
tid = next((p for p in parts if p and p != "id"), None)
386+
if not tid:
387+
continue
388+
sub_cmd = f"{self.base_cmd} cluster get-subtasks {tid} >& {base_path}/subtask_{tid}{suffix}.txt"
389+
self.ssh_obj.exec_command(node=self.mgmt_nodes[0], command=sub_cmd)
390+
except Exception as e:
391+
self.logger.warning(f"Failed to collect subtasks: {e}")
392+
373393
cmd = f"{self.base_cmd} sn list >& {base_path}/sn_list{suffix}.txt"
374394
self.ssh_obj.exec_command(node=self.mgmt_nodes[0],
375395
command=cmd)
376-
cmd = f"{self.base_cmd} cluster get-capacity {self.cluster_id} >& {base_path}/cluster_capacity{suffix}.txt"
396+
397+
cmd = f"{self.base_cmd} sn list --json >& {base_path}/sn_list{suffix}.json"
377398
self.ssh_obj.exec_command(node=self.mgmt_nodes[0],
378399
command=cmd)
379-
400+
380401
cmd = f"{self.base_cmd} cluster get-capacity {self.cluster_id} >& {base_path}/cluster_capacity{suffix}.txt"
381402
self.ssh_obj.exec_command(node=self.mgmt_nodes[0],
382403
command=cmd)
383-
404+
384405
cmd = f"{self.base_cmd} cluster show {self.cluster_id} >& {base_path}/cluster_show{suffix}.txt"
385406
self.ssh_obj.exec_command(node=self.mgmt_nodes[0],
386407
command=cmd)

e2e/stress_test/continuous_single_node_outage.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,9 @@ def __init__(self, **kwargs):
6060
def _initialize_outage_log(self):
6161
"""Create or initialize the outage log file."""
6262
with open(self.outage_log_file, 'w') as log:
63-
log.write("Timestamp,Node,Outage_Type,Event\n")
63+
log.write("Timestamp,Device_ID,PCIe,Outage_Type,Event\n")
6464

65-
def log_outage_event(self, node, outage_type, event, outage_time=0):
65+
def log_outage_event(self, outage_type, event, outage_time=0):
6666
"""Log an outage event to the outage log file."""
6767
if outage_time:
6868
base_epoch = getattr(self, "outage_start_time", None)
@@ -73,8 +73,10 @@ def log_outage_event(self, node, outage_type, event, outage_time=0):
7373
else:
7474
ts_dt = datetime.now()
7575
timestamp = ts_dt.strftime('%Y-%m-%d %H:%M:%S')
76+
device_id = self.outage_device_id or "unknown"
77+
pcie = self.outage_device_pcie or ""
7678
with open(self.outage_log_file, 'a') as log:
77-
log.write(f"{timestamp},{node},{outage_type},{event}\n")
79+
log.write(f"{timestamp},{device_id},{pcie},{outage_type},{event}\n")
7880

7981
def create_lvols_with_fio(self, count):
8082
"""Create lvols and start FIO with random configurations."""
@@ -218,7 +220,7 @@ def perform_random_outage(self):
218220
f"Performing {outage_type} on device {self.outage_device_id} "
219221
f"(PCI: {self.outage_device_pcie}) on node {self.current_outage_node}"
220222
)
221-
self.log_outage_event(self.current_outage_node, outage_type, "Outage started")
223+
self.log_outage_event(outage_type, "Outage started")
222224

223225
if outage_type == "device_remove_logical":
224226
self.ssh_obj.exec_command(
@@ -239,7 +241,7 @@ def perform_random_outage(self):
239241
def restart_nodes_after_failover(self, outage_type):
240242
"""Recover the device after an outage."""
241243
self.logger.info(f"Recovering from {outage_type} for device {self.outage_device_id}")
242-
self.log_outage_event(self.current_outage_node, outage_type, "Recovery started")
244+
self.log_outage_event(outage_type, "Recovery started")
243245

244246
if outage_type == "device_remove_logical":
245247
self.ssh_obj.restart_device(
@@ -266,7 +268,7 @@ def restart_nodes_after_failover(self, outage_type):
266268
)
267269
self.sbcli_utils.wait_for_health_status(self.current_outage_node, True, timeout=600)
268270
self.outage_end_time = int(datetime.now().timestamp())
269-
self.log_outage_event(self.current_outage_node, outage_type, "Device recovered")
271+
self.log_outage_event(outage_type, "Device recovered")
270272

271273
search_start_iso = datetime.fromtimestamp(self.outage_start_time - 30).isoformat(timespec='microseconds')
272274
search_end_iso = datetime.fromtimestamp(self.outage_end_time + 10).isoformat(timespec='microseconds')

e2e/utils/ssh_utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1579,11 +1579,11 @@ def _safe(name: str) -> str:
15791579
# Always save a full container listing for later forensics
15801580
self.exec_command(
15811581
node,
1582-
f"bash -lc \"docker ps -a > '{base_dir}/docker_ps_a_{_safe(node)}_{ts}.txt' 2>&1 || true\""
1582+
f"bash -lc \"sudo docker ps -a > '{base_dir}/docker_ps_a_{_safe(node)}_{ts}.txt' 2>&1 || true\""
15831583
)
15841584

15851585
# Discover container names (include exited)
1586-
out, _ = self.exec_command(node, "bash -lc \"docker ps -a --format '{{{{.Names}}}}' 2>/dev/null || true\"")
1586+
out, _ = self.exec_command(node, "bash -lc \"sudo docker ps -a --format '{{.Names}}' 2>/dev/null || true\"")
15871587
containers = [c.strip() for c in (out or "").splitlines() if c.strip()]
15881588

15891589
if not containers:
@@ -1602,29 +1602,29 @@ def _safe(name: str) -> str:
16021602
self.exec_command(
16031603
node,
16041604
"bash -lc "
1605-
f"\"docker logs --timestamps {c} > '{cont_dir}/docker_logs_{sc}_{ts}.log.tmp' 2>&1 || true; "
1605+
f"\"sudo docker logs --timestamps {c} > '{cont_dir}/docker_logs_{sc}_{ts}.log.tmp' 2>&1 || true; "
16061606
f"mv -f '{cont_dir}/docker_logs_{sc}_{ts}.log.tmp' '{cont_dir}/docker_logs_{sc}_{ts}.log' || true\""
16071607
)
16081608

16091609
# docker inspect (JSON)
16101610
self.exec_command(
16111611
node,
16121612
"bash -lc "
1613-
f"\"docker inspect {c} > '{cont_dir}/docker_inspect_{sc}_{ts}.json.tmp' 2>&1 || true; "
1613+
f"\"sudo docker inspect {c} > '{cont_dir}/docker_inspect_{sc}_{ts}.json.tmp' 2>&1 || true; "
16141614
f"mv -f '{cont_dir}/docker_inspect_{sc}_{ts}.json.tmp' '{cont_dir}/docker_inspect_{sc}_{ts}.json' || true\""
16151615
)
16161616

16171617
# Optional extras that often help:
16181618
# docker top (may fail on exited containers, so '|| true')
16191619
self.exec_command(
16201620
node,
1621-
f"bash -lc \"docker top {c} > '{cont_dir}/docker_top_{sc}_{ts}.txt' 2>&1 || true\""
1621+
f"bash -lc \"sudo docker top {c} > '{cont_dir}/docker_top_{sc}_{ts}.txt' 2>&1 || true\""
16221622
)
16231623

16241624
# container fs usage (size); harmless if unsupported
16251625
self.exec_command(
16261626
node,
1627-
f"bash -lc \"docker inspect --size {c} > '{cont_dir}/docker_inspect_size_{sc}_{ts}.json' 2>&1 || true\""
1627+
f"bash -lc \"sudo docker inspect --size {c} > '{cont_dir}/docker_inspect_size_{sc}_{ts}.json' 2>&1 || true\""
16281628
)
16291629

16301630
# For convenience, also dump names list used

0 commit comments

Comments (0)