[ceph_migrate] trigger mgr failover when cluster health is degraded

rebtoor · rebtoor · commit 3321964b7d6a · 2025-11-14T10:07:40.000+01:00
Add a task in post.yaml to trigger Ceph manager failover when the
cluster health status is not HEALTH_OK. This helps recover from
degraded cluster states after migration operations.

The task installs the cephadm package on all ComputeHCI nodes and
then executes 'cephadm shell -- ceph mgr fail' on the first
ComputeHCI node. This approach avoids container-based CLI complexity
and uses the native cephadm tool available on compute nodes where
Ceph daemons are running.

The task only runs when ComputeHCI nodes are available and the
cluster health is degraded (HEALTH_WARN or HEALTH_ERR).

Signed-off-by: Roberto Alfieri <ralfieri@redhat.com>
diff --git a/tests/roles/ceph_migrate/tasks/post.yaml b/tests/roles/ceph_migrate/tasks/post.yaml
@@ -11,3 +11,25 @@
   vars:
     shell_header: "set -euo pipefail"
   when: ceph_daemons_layout.rgw | default(true) | bool
+
+# Trigger a Ceph mgr failover when the recorded health is not HEALTH_OK.
+- name: Remove faulty mgr
+  when:
+    - groups['ComputeHCI'] | default([]) | length > 0
+    - ceph is defined
+    - ceph.health.status | default('HEALTH_OK') != 'HEALTH_OK'
+  block:
+    - name: Install cephadm on all compute nodes
+      delegate_to: "{{ item }}"
+      loop: "{{ groups['ComputeHCI'] }}"
+      become: true
+      ansible.builtin.package:
+        state: present
+        name: cephadm
+
+    - name: Force fail ceph mgr on first compute node
+      delegate_to: "{{ groups['ComputeHCI'][0] }}"
+      become: true
+      ansible.builtin.command:
+        cmd: cephadm shell -- ceph mgr fail
+      changed_when: false