Skip to content

Commit f7c69e3

Browse files
committed
[ceph_migrate] trigger mgr failover when cluster health is degraded
Add a task in post.yaml to trigger Ceph manager failover when the cluster health status is not HEALTH_OK. This helps recover from degraded cluster states after migration operations. The task delegates to ComputeHCI nodes and uses /etc/ceph as the config directory, ensuring proper execution context. It only runs when ComputeHCI nodes are available and the cluster health is degraded (HEALTH_WARN or HEALTH_ERR).

Update fail_mgr.yaml to handle ceph_config_home intelligently based on execution context. When called from controller nodes (as in mon.yaml), it uses ceph_config_tmp_client_home, where the keyring files are available. When /etc/ceph is explicitly passed, or when the task executes on compute nodes, it uses that path instead. This ensures the task works correctly in all execution contexts.

Signed-off-by: Roberto Alfieri <[email protected]>
1 parent 2df6d47 commit f7c69e3

File tree

2 files changed

+39
-2
lines changed

2 files changed

+39
-2
lines changed

tests/roles/ceph_migrate/tasks/fail_mgr.yaml

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,30 @@
11
# Get a client using -v /home/tripleo-admin/ceph_config:/etc/ceph:z as input
2+
# When ceph_config_home is explicitly passed in vars (e.g., /etc/ceph for ComputeHCI nodes), use it
3+
# Otherwise, default to ceph_config_tmp_client_home for controller nodes
4+
# Since ceph_config_home defaults to /etc/ceph in defaults/main.yaml, when called from
5+
# controller nodes without explicit override, we should use tmp client home instead.
6+
# Logic: If ceph_config_home is not /etc/ceph, use it. If it's /etc/ceph, check if we're
7+
# on a compute node (in ComputeHCI group) - if yes use /etc/ceph, if no use tmp client home.
8+
- name: Set ceph_config_home - use tmp client home on controllers unless explicitly overridden
9+
ansible.builtin.set_fact:
10+
_ceph_config_home: >-
11+
{{
12+
ceph_config_home
13+
if ceph_config_home is defined and ceph_config_home != '/etc/ceph'
14+
else (
15+
'/etc/ceph'
16+
if (
17+
groups['ComputeHCI'] is defined and
18+
(inventory_hostname in groups['ComputeHCI'] or ansible_hostname | regex_search('compute'))
19+
)
20+
else ceph_config_tmp_client_home
21+
)
22+
}}
23+
224
- name: Refresh ceph_cli
325
ansible.builtin.include_tasks: ceph_cli.yaml
426
vars:
5-
ceph_config_home: "{{ ceph_config_tmp_client_home }}"
27+
ceph_config_home: "{{ _ceph_config_home }}"
628
ceph_fsid: "{{ mon_dump.fsid }}"
729
ceph_cluster: ceph
830

@@ -33,6 +55,5 @@
3355
# This time we fail because something is wrong
3456
- name: Fail if ceph orchestrator is still not responding
3557
ansible.builtin.command: "{{ ceph_cli }} orch status --format json"
36-
become: true
3758
async: 30
3859
poll: 1

tests/roles/ceph_migrate/tasks/post.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,19 @@
1111
vars:
1212
shell_header: "set -euo pipefail"
1313
when: ceph_daemons_layout.rgw | default(true) | bool
14+
15+
- name: Remove faulty mgr
16+
delegate_to: "{{ groups['ComputeHCI'][0] | default(inventory_hostname) }}"
17+
when:
18+
- groups['ComputeHCI'] is defined
19+
- groups['ComputeHCI'] | length > 0
20+
- ceph is defined
21+
- ceph.health.status is defined
22+
- ceph.health.status != 'HEALTH_OK'
23+
block:
24+
- name: Include fail_mgr tasks
25+
ansible.builtin.include_tasks: fail_mgr.yaml
26+
vars:
27+
ceph_config_home: /etc/ceph
28+
ceph_fsid: "{{ mon_dump.fsid }}"
29+
ceph_cluster: ceph

0 commit comments

Comments (0)