Skip to content

Commit a2d7288

Browse files
authored
Adding proper logging for device removal case (#912)
* Adding e2e yaml and data nic param
* Test: Fixing snapshot deleting uuid column
* Testing: Fixing e2e tests
* Testing: Fixing e2e yaml summary and adding e2e only
* Testing: Fixing e2e yaml summary and adding e2e only
* Testing: Fixing e2e yaml summary and adding e2e only
* Adding test fixes
* Adding test fixes
* Adding a scheduler
* Fixing deletes for lvol based on count
* Fixing deletes for lvol based on count
* Fixing notification and parallel lvol deletes
* Parallel lvol delete clone delete
* Parallel lvol delete clone delete
* Adding upgrade yaml, fixing upgrade case and adding device remove stress case
* Adding upgrade yaml, fixing upgrade case and adding device remove stress case
* Adding upgrade yaml, fixing upgrade case and adding device remove stress case
* Adding proper logging for device removal case
* Adding proper logging for device removal case
1 parent ff625f6 commit a2d7288

File tree

6 files changed

+42
-19
lines changed

6 files changed

+42
-19
lines changed

.github/workflows/e2e-bootstrap.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name: Bootstrap Cluster + Run E2E Tests
2-
run-name: "E2E Bootstrap | ${{ github.ref_name }} | ${{ inputs.MNODES }}"
2+
run-name: "E2E Bootstrap | ${{ inputs.SBCLI_BRANCH }} | ${{ inputs.MNODES }}"
33

44
on:
55
workflow_call:

.github/workflows/e2e-only.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name: Run E2E Tests on Existing Cluster
2-
run-name: "E2E | ${{ github.ref_name }} | ${{ inputs.MNODES }}"
2+
run-name: "E2E | ${{ inputs.SBCLI_BRANCH }} | ${{ inputs.MNODES }}"
33

44
on:
55
workflow_dispatch:

.github/workflows/stress-run-bootstrap.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
name: Bootstrap Cluster + Stress Run Until Failure
2-
run-name: "Stress Bootstrap | ${{ github.ref_name }} | ${{ inputs.MNODES }}"
2+
run-name: "Stress Bootstrap | ${{ inputs.SBCLI_BRANCH }} | ${{ inputs.MNODES }}"
33

44
on:
55
workflow_dispatch:

e2e/e2e_tests/cluster_test_base.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -369,18 +369,39 @@ def collect_management_details(self, post_teardown=False):
369369
cmd = f"{self.base_cmd} cluster list-tasks {self.cluster_id} --limit 0 >& {base_path}/cluster_list_tasks{suffix}.txt"
370370
self.ssh_obj.exec_command(node=self.mgmt_nodes[0],
371371
command=cmd)
372-
372+
373+
# Collect subtasks for balancing_on_restart tasks
374+
try:
375+
tasks_out, _ = self.ssh_obj.exec_command(
376+
node=self.mgmt_nodes[0],
377+
command=f"{self.base_cmd} cluster list-tasks {self.cluster_id} --limit 0"
378+
)
379+
for line in (tasks_out or "").splitlines():
380+
if "balancing_on_restart" not in line:
381+
continue
382+
parts = [p.strip() for p in line.split("|")]
383+
# Table rows have a leading empty cell from '| id | ...'
384+
# Column layout: | id | function | status | ...
385+
tid = next((p for p in parts if p and p != "id"), None)
386+
if not tid:
387+
continue
388+
sub_cmd = f"{self.base_cmd} cluster get-subtasks {tid} >& {base_path}/subtask_{tid}{suffix}.txt"
389+
self.ssh_obj.exec_command(node=self.mgmt_nodes[0], command=sub_cmd)
390+
except Exception as e:
391+
self.logger.warning(f"Failed to collect subtasks: {e}")
392+
373393
cmd = f"{self.base_cmd} sn list >& {base_path}/sn_list{suffix}.txt"
374394
self.ssh_obj.exec_command(node=self.mgmt_nodes[0],
375395
command=cmd)
376-
cmd = f"{self.base_cmd} cluster get-capacity {self.cluster_id} >& {base_path}/cluster_capacity{suffix}.txt"
396+
397+
cmd = f"{self.base_cmd} sn list --json >& {base_path}/sn_list{suffix}.json"
377398
self.ssh_obj.exec_command(node=self.mgmt_nodes[0],
378399
command=cmd)
379-
400+
380401
cmd = f"{self.base_cmd} cluster get-capacity {self.cluster_id} >& {base_path}/cluster_capacity{suffix}.txt"
381402
self.ssh_obj.exec_command(node=self.mgmt_nodes[0],
382403
command=cmd)
383-
404+
384405
cmd = f"{self.base_cmd} cluster show {self.cluster_id} >& {base_path}/cluster_show{suffix}.txt"
385406
self.ssh_obj.exec_command(node=self.mgmt_nodes[0],
386407
command=cmd)

e2e/stress_test/continuous_single_node_outage.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,9 +60,9 @@ def __init__(self, **kwargs):
6060
def _initialize_outage_log(self):
6161
"""Create or initialize the outage log file."""
6262
with open(self.outage_log_file, 'w') as log:
63-
log.write("Timestamp,Node,Outage_Type,Event\n")
63+
log.write("Timestamp,Device_ID,PCIe,Outage_Type,Event\n")
6464

65-
def log_outage_event(self, node, outage_type, event, outage_time=0):
65+
def log_outage_event(self, outage_type, event, outage_time=0):
6666
"""Log an outage event to the outage log file."""
6767
if outage_time:
6868
base_epoch = getattr(self, "outage_start_time", None)
@@ -73,8 +73,10 @@ def log_outage_event(self, node, outage_type, event, outage_time=0):
7373
else:
7474
ts_dt = datetime.now()
7575
timestamp = ts_dt.strftime('%Y-%m-%d %H:%M:%S')
76+
device_id = self.outage_device_id or "unknown"
77+
pcie = self.outage_device_pcie or ""
7678
with open(self.outage_log_file, 'a') as log:
77-
log.write(f"{timestamp},{node},{outage_type},{event}\n")
79+
log.write(f"{timestamp},{device_id},{pcie},{outage_type},{event}\n")
7880

7981
def create_lvols_with_fio(self, count):
8082
"""Create lvols and start FIO with random configurations."""
@@ -218,7 +220,7 @@ def perform_random_outage(self):
218220
f"Performing {outage_type} on device {self.outage_device_id} "
219221
f"(PCI: {self.outage_device_pcie}) on node {self.current_outage_node}"
220222
)
221-
self.log_outage_event(self.current_outage_node, outage_type, "Outage started")
223+
self.log_outage_event(outage_type, "Outage started")
222224

223225
if outage_type == "device_remove_logical":
224226
self.ssh_obj.exec_command(
@@ -239,7 +241,7 @@ def perform_random_outage(self):
239241
def restart_nodes_after_failover(self, outage_type):
240242
"""Recover the device after an outage."""
241243
self.logger.info(f"Recovering from {outage_type} for device {self.outage_device_id}")
242-
self.log_outage_event(self.current_outage_node, outage_type, "Recovery started")
244+
self.log_outage_event(outage_type, "Recovery started")
243245

244246
if outage_type == "device_remove_logical":
245247
self.ssh_obj.restart_device(
@@ -266,7 +268,7 @@ def restart_nodes_after_failover(self, outage_type):
266268
)
267269
self.sbcli_utils.wait_for_health_status(self.current_outage_node, True, timeout=600)
268270
self.outage_end_time = int(datetime.now().timestamp())
269-
self.log_outage_event(self.current_outage_node, outage_type, "Device recovered")
271+
self.log_outage_event(outage_type, "Device recovered")
270272

271273
search_start_iso = datetime.fromtimestamp(self.outage_start_time - 30).isoformat(timespec='microseconds')
272274
search_end_iso = datetime.fromtimestamp(self.outage_end_time + 10).isoformat(timespec='microseconds')

e2e/utils/ssh_utils.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1579,11 +1579,11 @@ def _safe(name: str) -> str:
15791579
# Always save a full container listing for later forensics
15801580
self.exec_command(
15811581
node,
1582-
f"bash -lc \"docker ps -a > '{base_dir}/docker_ps_a_{_safe(node)}_{ts}.txt' 2>&1 || true\""
1582+
f"bash -lc \"sudo docker ps -a > '{base_dir}/docker_ps_a_{_safe(node)}_{ts}.txt' 2>&1 || true\""
15831583
)
15841584

15851585
# Discover container names (include exited)
1586-
out, _ = self.exec_command(node, "bash -lc \"docker ps -a --format '{{{{.Names}}}}' 2>/dev/null || true\"")
1586+
out, _ = self.exec_command(node, "bash -lc \"sudo docker ps -a --format '{{.Names}}' 2>/dev/null || true\"")
15871587
containers = [c.strip() for c in (out or "").splitlines() if c.strip()]
15881588

15891589
if not containers:
@@ -1602,29 +1602,29 @@ def _safe(name: str) -> str:
16021602
self.exec_command(
16031603
node,
16041604
"bash -lc "
1605-
f"\"docker logs --timestamps {c} > '{cont_dir}/docker_logs_{sc}_{ts}.log.tmp' 2>&1 || true; "
1605+
f"\"sudo docker logs --timestamps {c} > '{cont_dir}/docker_logs_{sc}_{ts}.log.tmp' 2>&1 || true; "
16061606
f"mv -f '{cont_dir}/docker_logs_{sc}_{ts}.log.tmp' '{cont_dir}/docker_logs_{sc}_{ts}.log' || true\""
16071607
)
16081608

16091609
# docker inspect (JSON)
16101610
self.exec_command(
16111611
node,
16121612
"bash -lc "
1613-
f"\"docker inspect {c} > '{cont_dir}/docker_inspect_{sc}_{ts}.json.tmp' 2>&1 || true; "
1613+
f"\"sudo docker inspect {c} > '{cont_dir}/docker_inspect_{sc}_{ts}.json.tmp' 2>&1 || true; "
16141614
f"mv -f '{cont_dir}/docker_inspect_{sc}_{ts}.json.tmp' '{cont_dir}/docker_inspect_{sc}_{ts}.json' || true\""
16151615
)
16161616

16171617
# Optional extras that often help:
16181618
# docker top (may fail on exited containers, so '|| true')
16191619
self.exec_command(
16201620
node,
1621-
f"bash -lc \"docker top {c} > '{cont_dir}/docker_top_{sc}_{ts}.txt' 2>&1 || true\""
1621+
f"bash -lc \"sudo docker top {c} > '{cont_dir}/docker_top_{sc}_{ts}.txt' 2>&1 || true\""
16221622
)
16231623

16241624
# container fs usage (size); harmless if unsupported
16251625
self.exec_command(
16261626
node,
1627-
f"bash -lc \"docker inspect --size {c} > '{cont_dir}/docker_inspect_size_{sc}_{ts}.json' 2>&1 || true\""
1627+
f"bash -lc \"sudo docker inspect --size {c} > '{cont_dir}/docker_inspect_size_{sc}_{ts}.json' 2>&1 || true\""
16281628
)
16291629

16301630
# For convenience, also dump names list used

0 commit comments

Comments (0)