From cca256394979e1f9f0a8bd1bbdf5e332cce901a1 Mon Sep 17 00:00:00 2001
From: Roberto Alfieri
Date: Tue, 4 Nov 2025 15:57:16 +0100
Subject: [PATCH] [ceph_migrate] trigger mgr failover when cluster health is
 degraded

Add a task in post.yaml to trigger Ceph manager failover when the
cluster health status is not HEALTH_OK. This helps recover from
degraded cluster states after migration operations.

The task installs the cephadm package on all ComputeHCI nodes and then
executes 'cephadm shell -- ceph mgr fail' on the first compute node.
This approach avoids container-based CLI complexity and uses the native
cephadm tool available on compute nodes where Ceph daemons are running.

The task only runs when ComputeHCI nodes are available and the cluster
health is degraded (HEALTH_WARN or HEALTH_ERR).

Signed-off-by: Roberto Alfieri
---
NOTE(review): this copy of the patch arrived with its line breaks and
leading whitespace collapsed. The line structure and the YAML indentation
shown below (2-space steps) are reconstructed, not original -- confirm
against the upstream commit before applying.

Summary of the two hunks as visible here:
 * handlers/main.yml: removes `delegate_to` and the three-item `when`
   list from the task that runs "{{ ceph_cli }} mgr fail"; `become` and
   the command itself are unchanged context.
 * tasks/post.yaml: appends a "Remove faulty mgr" block. Its `when` list
   requires groups['ComputeHCI'] to be defined and non-empty, `ceph` and
   `ceph.health.status` to be defined, and the status to differ from
   'HEALTH_OK'. The block installs the `cephadm` package once per entry
   of groups['ComputeHCI'] (loop + delegate_to "{{ item }}"), then runs
   `cephadm shell -- ceph mgr fail` delegated to groups['ComputeHCI'][0].

NOTE(review): the first hunk header declares 8 old / 3 new lines, but only
two context lines plus five removals are recoverable in this copy;
presumably one blank context line was lost -- TODO confirm.
NOTE(review): the commit message describes only the post.yaml task; the
handlers/main.yml change is not mentioned -- consider documenting it.
NOTE(review): `changed_when: false` makes the mgr failover command always
report as unchanged -- confirm this is intended.

 tests/roles/ceph_migrate/handlers/main.yml |  5 -----
 tests/roles/ceph_migrate/tasks/post.yaml   | 22 ++++++++++++++++++++++
 2 files changed, 22 insertions(+), 5 deletions(-)

diff --git a/tests/roles/ceph_migrate/handlers/main.yml b/tests/roles/ceph_migrate/handlers/main.yml
index 31ccdcdb1..2a09e77b1 100644
--- a/tests/roles/ceph_migrate/handlers/main.yml
+++ b/tests/roles/ceph_migrate/handlers/main.yml
@@ -3,8 +3,3 @@
   become: true
   ansible.builtin.command: "{{ ceph_cli }} mgr fail"
-  delegate_to: "{{ groups['ComputeHCI'][0] | default(inventory_hostname) }}"
-  when:
-    - groups['ComputeHCI'] is defined
-    - groups['ComputeHCI'] | length > 0
-    - ceph_cli is defined
diff --git a/tests/roles/ceph_migrate/tasks/post.yaml b/tests/roles/ceph_migrate/tasks/post.yaml
index f1ac70399..2860159c2 100644
--- a/tests/roles/ceph_migrate/tasks/post.yaml
+++ b/tests/roles/ceph_migrate/tasks/post.yaml
@@ -11,3 +11,25 @@
   vars:
     shell_header: "set -euo pipefail"
   when: ceph_daemons_layout.rgw | default(true) | bool
+
+- name: Remove faulty mgr
+  when:
+    - groups['ComputeHCI'] is defined
+    - groups['ComputeHCI'] | length > 0
+    - ceph is defined
+    - ceph.health.status is defined
+    - ceph.health.status != 'HEALTH_OK'
+  block:
+    - name: Install cephadm on all compute nodes
+      become: true
+      ansible.builtin.package:
+        name: cephadm
+        state: present
+      loop: "{{ groups['ComputeHCI'] }}"
+      delegate_to: "{{ item }}"
+
+    - name: Force fail ceph mgr on first compute node
+      become: true
+      ansible.builtin.command: cephadm shell -- ceph mgr fail
+      changed_when: false
+      delegate_to: "{{ groups['ComputeHCI'][0] }}"