From cca256394979e1f9f0a8bd1bbdf5e332cce901a1 Mon Sep 17 00:00:00 2001
From: Roberto Alfieri <ralfieri@redhat.com>
Date: Tue, 4 Nov 2025 15:57:16 +0100
Subject: [PATCH] [ceph_migrate] trigger mgr failover when cluster health is
 degraded

Add a block to post.yaml that triggers a Ceph manager failover when
the cluster health status is not HEALTH_OK. This helps the cluster
recover from a degraded state after migration operations.

The block installs the cephadm package on all ComputeHCI nodes and
then executes 'cephadm shell -- ceph mgr fail' on the first ComputeHCI
node. This approach avoids container-based CLI complexity and uses
the native cephadm tool available on compute nodes where Ceph
daemons are running.

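The block only runs when ComputeHCI nodes are available, the ceph
status variable is defined, and the cluster health is degraded
(anything other than HEALTH_OK, e.g. HEALTH_WARN or HEALTH_ERR).

The handler in handlers/main.yml that runs "{{ ceph_cli }} mgr fail"
no longer delegates to the first ComputeHCI node and drops its
"when" guards.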

Signed-off-by: Roberto Alfieri <ralfieri@redhat.com>
---
 tests/roles/ceph_migrate/handlers/main.yml |  5 -----
 tests/roles/ceph_migrate/tasks/post.yaml   | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/tests/roles/ceph_migrate/handlers/main.yml b/tests/roles/ceph_migrate/handlers/main.yml
index 31ccdcdb1..2a09e77b1 100644
--- a/tests/roles/ceph_migrate/handlers/main.yml
+++ b/tests/roles/ceph_migrate/handlers/main.yml
@@ -3,8 +3,3 @@
   become: true
   ansible.builtin.command:
     "{{ ceph_cli }} mgr fail"
-  delegate_to: "{{ groups['ComputeHCI'][0] | default(inventory_hostname) }}"
-  when:
-    - groups['ComputeHCI'] is defined
-    - groups['ComputeHCI'] | length > 0
-    - ceph_cli is defined
diff --git a/tests/roles/ceph_migrate/tasks/post.yaml b/tests/roles/ceph_migrate/tasks/post.yaml
index f1ac70399..2860159c2 100644
--- a/tests/roles/ceph_migrate/tasks/post.yaml
+++ b/tests/roles/ceph_migrate/tasks/post.yaml
@@ -11,3 +11,25 @@
   vars:
     shell_header: "set -euo pipefail"
   when: ceph_daemons_layout.rgw | default(true) | bool
+
+# A non-OK health status triggers a mgr failover run from a ComputeHCI node.
+- name: Trigger ceph mgr failover when cluster health is degraded
+  when:
+    - groups['ComputeHCI'] | default([]) | length > 0
+    - ceph is defined
+    - ceph.health.status is defined
+    - ceph.health.status != 'HEALTH_OK'
+  block:
+    - name: Install cephadm on all compute nodes
+      become: true
+      ansible.builtin.package:
+        name: cephadm
+        state: present
+      loop: "{{ groups['ComputeHCI'] }}"
+      delegate_to: "{{ item }}"
+
+    - name: Force fail ceph mgr on first compute node
+      become: true
+      ansible.builtin.command: cephadm shell -- ceph mgr fail
+      changed_when: true  # mgr fail is a state-changing command
+      delegate_to: "{{ groups['ComputeHCI'][0] }}"