Skip to content

Commit f7c69e3

Browse files
committed
[ceph_migrate] trigger mgr failover when cluster health is degraded
Add a task in post.yaml to trigger Ceph manager failover when the cluster health status is not HEALTH_OK. This helps recover from degraded cluster states after migration operations. The task delegates to ComputeHCI nodes and uses /etc/ceph as the config directory, ensuring proper execution context. It only runs when ComputeHCI nodes are available and the cluster health is degraded (HEALTH_WARN or HEALTH_ERR).

Update fail_mgr.yaml to handle ceph_config_home intelligently based on execution context. When called from controller nodes (as in mon.yaml), it uses ceph_config_tmp_client_home, where the keyring files are available. When /etc/ceph is explicitly passed, or when the task executes on compute nodes, it uses that path instead. This ensures the task works correctly in all execution contexts.

Signed-off-by: Roberto Alfieri <[email protected]>
1 parent 2df6d47 commit f7c69e3

File tree

2 files changed

+39
-2
lines changed

2 files changed

+39
-2
lines changed

tests/roles/ceph_migrate/tasks/fail_mgr.yaml

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,30 @@
11
# Get a client using -v /home/tripleo-admin/ceph_config:/etc/ceph:z as input
2+
# When ceph_config_home is explicitly passed in vars (e.g., /etc/ceph for ComputeHCI nodes), use it
3+
# Otherwise, default to ceph_config_tmp_client_home for controller nodes
4+
# Since ceph_config_home defaults to /etc/ceph in defaults/main.yaml, when called from
5+
# controller nodes without explicit override, we should use tmp client home instead.
6+
# Logic: If ceph_config_home is not /etc/ceph, use it. If it's /etc/ceph, check if we're
7+
# on a compute node (in ComputeHCI group) - if yes use /etc/ceph, if no use tmp client home.
8+
- name: Set ceph_config_home - use tmp client home on controllers unless explicitly overridden
9+
ansible.builtin.set_fact:
10+
_ceph_config_home: >-
11+
{{
12+
ceph_config_home
13+
if ceph_config_home is defined and ceph_config_home != '/etc/ceph'
14+
else (
15+
'/etc/ceph'
16+
if (
17+
groups['ComputeHCI'] is defined and
18+
(inventory_hostname in groups['ComputeHCI'] or ansible_hostname | regex_search('compute'))
19+
)
20+
else ceph_config_tmp_client_home
21+
)
22+
}}
23+
224
- name: Refresh ceph_cli
325
ansible.builtin.include_tasks: ceph_cli.yaml
426
vars:
5-
ceph_config_home: "{{ ceph_config_tmp_client_home }}"
27+
ceph_config_home: "{{ _ceph_config_home }}"
628
ceph_fsid: "{{ mon_dump.fsid }}"
729
ceph_cluster: ceph
830

@@ -33,6 +55,5 @@
3355
# This time we fail because something is wrong
3456
- name: Fail if ceph orchestrator is still not responding
3557
ansible.builtin.command: "{{ ceph_cli }} orch status --format json"
36-
become: true
3758
async: 30
3859
poll: 1

tests/roles/ceph_migrate/tasks/post.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,19 @@
1111
vars:
1212
shell_header: "set -euo pipefail"
1313
when: ceph_daemons_layout.rgw | default(true) | bool
14+
15+
- name: Remove faulty mgr
16+
delegate_to: "{{ groups['ComputeHCI'][0] | default(inventory_hostname) }}"
17+
when:
18+
- groups['ComputeHCI'] is defined
19+
- groups['ComputeHCI'] | length > 0
20+
- ceph is defined
21+
- ceph.health.status is defined
22+
- ceph.health.status != 'HEALTH_OK'
23+
block:
24+
- name: Include fail_mgr tasks
25+
ansible.builtin.include_tasks: fail_mgr.yaml
26+
vars:
27+
ceph_config_home: /etc/ceph
28+
ceph_fsid: "{{ mon_dump.fsid }}"
29+
ceph_cluster: ceph

0 commit comments

Comments (0)