[ceph_migrate] trigger mgr failover when cluster health is degraded

rebtoor · rebtoor · commit 46909aa5a5cc · 2025-11-13T10:32:26.000+01:00
Add a task in post.yaml to trigger Ceph manager failover when the
cluster health status is not HEALTH_OK. This helps recover from
degraded cluster states after migration operations.

The task delegates to ComputeHCI nodes and uses /etc/ceph as the
config directory, ensuring proper execution context. It only runs
when ComputeHCI nodes are available and the cluster health is
degraded (HEALTH_WARN or HEALTH_ERR).

Also update fail_mgr.yaml to accept ceph_config_home as a
parameter, allowing callers to override the default temporary
client home directory. This enables the task to work correctly
when executed on ComputeHCI nodes where /etc/ceph is the
appropriate config location.

Signed-off-by: Roberto Alfieri <ralfieri@redhat.com>
diff --git a/tests/roles/ceph_migrate/tasks/fail_mgr.yaml b/tests/roles/ceph_migrate/tasks/fail_mgr.yaml
@@ -1,8 +1,12 @@
 # Get a client using -v /home/tripleo-admin/ceph_config:/etc/ceph:z as input
+- name: Set ceph_config_home if not provided  # resolves the config dir passed to ceph_cli.yaml
+  ansible.builtin.set_fact:  # a caller-supplied ceph_config_home wins; otherwise ceph_config_tmp_client_home
+    _ceph_config_home: "{{ ceph_config_home | default(ceph_config_tmp_client_home) }}"
+
 - name: Refresh ceph_cli
   ansible.builtin.include_tasks: ceph_cli.yaml
   vars:
-    ceph_config_home: "{{ ceph_config_tmp_client_home }}"
+    ceph_config_home: "{{ _ceph_config_home }}"  # caller override, or ceph_config_tmp_client_home when none is given
     ceph_fsid: "{{ mon_dump.fsid }}"
     ceph_cluster: ceph
 
diff --git a/tests/roles/ceph_migrate/tasks/post.yaml b/tests/roles/ceph_migrate/tasks/post.yaml
@@ -11,3 +11,19 @@
   vars:
     shell_header: "set -euo pipefail"
   when: ceph_daemons_layout.rgw | default(true) | bool
+
+- name: Remove faulty mgr  # runs fail_mgr.yaml when the cluster is not HEALTH_OK
+  delegate_to: "{{ groups['ComputeHCI'][0] | default(inventory_hostname) }}"
+  when:  # every condition in this list must hold for the block to run
+    - groups['ComputeHCI'] is defined
+    - groups['ComputeHCI'] | length > 0  # need at least one ComputeHCI host to delegate to
+    - ceph is defined
+    - ceph.health.status is defined  # NOTE(review): 'ceph' is assumed to be populated by an earlier task - confirm
+    - ceph.health.status != 'HEALTH_OK'  # skip when the cluster is already healthy
+  block:
+    - name: Include fail_mgr tasks
+      ansible.builtin.include_tasks: fail_mgr.yaml
+      vars:
+        ceph_config_home: /etc/ceph  # overrides fail_mgr.yaml's ceph_config_tmp_client_home fallback
+        ceph_fsid: "{{ mon_dump.fsid }}"
+        ceph_cluster: ceph