Skip to content

Commit 4c7d558

Browse files
committed
Add badfish support for iDRAC cleanup operations
This commit adds support for using badfish container to perform iDRAC cleanup operations on Dell hardware. Badfish is used to clear the iDRAC job queue and reset the iDRAC service to improve stability during boot operations. It does not replace redfish_command or URI modules, which continue to be used for standard Redfish operations. Changes: - Created new 'badfish' Ansible role with install.yml and call.yml tasks - Added 'reset_idrac' parameter (renamed from 'use_badfish') to control badfish-based iDRAC cleanup operations - Integrated badfish container installation into bastion bootstrap process - Updated boot-iso/dell.yml to use badfish for: - Clearing iDRAC job queue (always executed) - Resetting iDRAC service (when reset_idrac is enabled) - Waiting for iDRAC to be available after reset - Replaced fixed pause with wait_for module to verify host power down - All badfish operations use quay.io/quads/badfish container image The badfish role provides a reusable call.yml task file that accepts badfish_host, badfish_user, badfish_password, and badfish_args parameters, making it easy to call badfish commands from other roles. The 'reset_idrac' parameter pulls and uses the badfish container to perform iDRAC cleanup operations, which helps resolve issues with stuck job queues and improves iDRAC stability during virtual media boot operations. AI Model: Claude Sonnet 4.5
1 parent e6d5927 commit 4c7d558

File tree

6 files changed

+192
-23
lines changed

6 files changed

+192
-23
lines changed
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
---
2+
# badfish call tasks
3+
# Reusable task file to run badfish container with various options
4+
#
5+
# Required variables:
6+
# badfish_host: BMC address or hostname
7+
# badfish_user: BMC username
8+
# badfish_password: BMC password
9+
# badfish_args: List of badfish command and its arguments (e.g., ['--power-on'] or ['--mount-virtual-media', 'http://example.com/image.iso'])
10+
#
11+
# Optional variables:
12+
# badfish_dns: DNS server IP (for VPN environments)
13+
# badfish_no_log: Set to true to suppress logging of the badfish command and its output (default: false)
14+
# delay: Delay between retries in seconds (default: omit)
15+
# retries: Number of retries (default: omit)
16+
17+
# Fail fast, before any container is started, when a required connection
# parameter is missing or empty, or when badfish_args is not a list.
- name: Validate required badfish parameters
  ansible.builtin.assert:
    that:
      - badfish_host is defined
      - badfish_host | length > 0
      - badfish_user is defined
      - badfish_user | length > 0
      - badfish_password is defined
      - badfish_password | length > 0
      - badfish_args is defined
      # badfish_args is appended to a list when the command is built, so a
      # bare string or a mapping (both have a length) must be rejected here
      # instead of surfacing later as a template concatenation error.
      - badfish_args is not string
      - badfish_args is not mapping
      - badfish_args is sequence
      - badfish_args | length > 0
    fail_msg: "Missing, empty or invalid required badfish parameters. Required: badfish_host, badfish_user, badfish_password, badfish_args (non-empty list)"
    quiet: true
30+
31+
# Assemble the complete podman argv as a list (never a shell string):
#   podman run --rm [--dns <badfish_dns>] quay.io/quads/badfish
#     -H <host> -u <user> -p <password> <badfish_args...>
# The --dns pair is only included when badfish_dns is defined.
- name: Build and run badfish container command
  ansible.builtin.set_fact:
    badfish_command_list: >-
      {{
        ['podman', 'run', '--rm']
        + (['--dns', badfish_dns] if badfish_dns is defined else [])
        + ['quay.io/quads/badfish']
        + ['-H', badfish_host, '-u', badfish_user, '-p', badfish_password]
        + (badfish_args | default([]))
      }}
46+
47+
# Show the command that is about to run. The command list carries the BMC
# password (the value following -p), so it is masked before printing to keep
# the credential out of the play output and any log files.
- name: Display badfish command
  ansible.builtin.debug:
    msg: "Running badfish command: {{ badfish_command_list | join(' ') | replace(badfish_password | string, '********') }}"
50+
51+
# Execute the assembled argv directly (no shell involved). A non-zero exit
# code marks an attempt as failed, and attempts repeat until one succeeds.
# `retries`, `delay` and `badfish_no_log` are optional caller-supplied
# variables; the first two are omitted from the task when not provided.
- name: Run badfish container
  ansible.builtin.command:
    argv: "{{ badfish_command_list }}"
  register: badfish_result
  until: badfish_result is succeeded
  retries: "{{ retries | default(omit) }}"
  delay: "{{ delay | default(omit) }}"
  failed_when: badfish_result.rc != 0
  no_log: "{{ badfish_no_log | default(false) }}"
60+
61+
# Echo what badfish wrote to standard output, when the registered result
# carries a stdout field.
- name: Display badfish stdout
  when: badfish_result.stdout is defined
  ansible.builtin.debug:
    msg: "badfish stdout: {{ badfish_result.stdout }}"
65+
66+
# Echo what badfish wrote to standard error, when the registered result
# carries a stderr field.
- name: Display badfish stderr
  when: badfish_result.stderr is defined
  ansible.builtin.debug:
    msg: "badfish stderr: {{ badfish_result.stderr }}"
70+
71+
# Scan stderr for the literal text ERROR and abort when it is present,
# unless the caller supplied a variable named `ignore_errors` that evaluates
# to true (it defaults to false when undefined).
- name: Fail if badfish reports an error in stderr
  when: >-
    not (ignore_errors | default(false) | bool)
    and badfish_result.stderr is defined
    and 'ERROR' in badfish_result.stderr
  ansible.builtin.fail:
    msg: "badfish reported an error: {{ badfish_result.stderr }}"
77+
78+
# Re-export the registered command result under the fact name
# badfish_command_result so the including role or playbook can inspect it.
- name: Set fact with badfish result for external access
  ansible.builtin.set_fact:
    badfish_command_result: "{{ badfish_result }}"
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
---
2+
# badfish install tasks
3+
# Pulls the badfish container image from quay.io
4+
5+
# podman is the container runtime used below to pull the badfish image.
- name: Ensure podman package is installed
  ansible.builtin.package:
    state: present
    name: podman
9+
10+
# Fetch the badfish image ahead of time so later `podman run` calls do not
# have to download it.
# - argv avoids any string splitting of the command line.
# - The pull is retried a few times because it depends on the network and
#   the remote registry.
# - The exit code does not say whether the local image actually changed, so
#   the task is reported as unchanged instead of "changed" on every run.
- name: Pull badfish container image
  ansible.builtin.command:
    argv:
      - podman
      - pull
      - quay.io/quads/badfish
  register: _badfish_pull_result
  until: _badfish_pull_result is succeeded
  retries: 3
  delay: 10
  changed_when: false

ansible/roles/bastion-install/defaults/main.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,3 +13,7 @@ rh_crucible_url: https://github.com/perftool-incubator/crucible
1313

1414
# Since the use of tc on the bastion machine is rare, we disable rebooting the bastion machine by default
1515
bastion_install_tc_reboot: false
16+
17+
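# When true, the bastion install pre-pulls the badfish container image and the
# Dell boot-iso tasks reset the iDRAC service (racreset) and wait for it to
# come back. The iDRAC job queue is cleared with badfish regardless of this setting.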
19+
reset_idrac: false

ansible/roles/bastion-install/tasks/main.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,12 @@
4848
disable_gpg_check: yes
4949
when: ansible_facts['distribution_major_version'] is version('9', '>=')
5050

51+
# Run the badfish role's install tasks (podman package + image pull) on the
# bastion, but only when reset_idrac is enabled.
# NOTE(review): boot-iso/tasks/dell.yml calls badfish to clear the iDRAC job
# queue without checking reset_idrac — confirm podman and the image are
# available on the bastion when this task is skipped.
- name: Install badfish container image
  when: reset_idrac | bool
  ansible.builtin.include_role:
    name: badfish
    tasks_from: install
56+
5157
- name: Install python
5258
pip:
5359
name: python-hpilo

ansible/roles/boot-iso/tasks/dell.yml

Lines changed: 86 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2,41 +2,78 @@
22
# Dell tasks for booting an iso
33

44
# Use the explicitly supplied virtual_media_iso when defined, otherwise fall
# back to the boot_iso host variable of the node being processed (item).
- name: Set Virtual Media ISO
  ansible.builtin.set_fact:
    _virtual_media_iso: "{{ virtual_media_iso | default(hostvars[item]['boot_iso']) }}"
77

8+
# Clear the iDRAC job queue through the badfish role (`--clear-jobs --force`)
# before the host is powered off. Runs for every host; it is not gated by
# reset_idrac.
# NOTE(review): `ignore_errors` is set here as a task keyword on the include,
# while the badfish call tasks read a *variable* named ignore_errors —
# confirm that failures inside the included tasks are tolerated as intended.
- name: "Dell - Clear iDrac job queue for {{ item }} (badfish)"
  ansible.builtin.include_role:
    name: badfish
    tasks_from: call
  vars:
    badfish_args: ["--clear-jobs", "--force"]
    badfish_host: "{{ hostvars[item]['bmc_address'] }}"
    badfish_user: "{{ hostvars[item]['bmc_user'] }}"
    badfish_password: "{{ hostvars[item]['bmc_password'] }}"
  ignore_errors: true
20+
821
# Power the chassis off over IPMI before the ISO is attached.
# The command is passed as an argv list so each templated value (address,
# user, password) reaches ipmitool verbatim as one argument; a quoted command
# string would be re-split and breaks on credentials containing quotes or
# backslashes.
# Failure is tolerated: the result is registered as ipmi_poweroff so later
# tasks can check ipmi_poweroff.failed.
- name: "Dell - Power down machine prior to booting iso for {{ item }}"
  ansible.builtin.command:
    argv:
      - ipmitool
      - "-I"
      - lanplus
      - "-H"
      - "{{ hostvars[item]['bmc_address'] }}"
      - "-U"
      - "{{ hostvars[item]['bmc_user'] }}"
      - "-P"
      - "{{ hostvars[item]['bmc_password'] }}"
      - chassis
      - power
      - "off"
  ignore_errors: true
  register: ipmi_poweroff
1329

14-
- name: "Dell - Pause for power down for {{ item }}"
15-
pause:
16-
seconds: 10
30+
# Ask badfish to reset the iDRAC (`--racreset`); only when reset_idrac is
# enabled.
# NOTE(review): `ignore_errors` is a task keyword on the include here, while
# the badfish call tasks read a variable of the same name — confirm intent.
- name: "Dell - Reset iDRAC for {{ item }} (badfish)"
  when: reset_idrac | bool
  ansible.builtin.include_role:
    name: badfish
    tasks_from: call
  vars:
    badfish_args: ["--racreset"]
    badfish_host: "{{ hostvars[item]['bmc_address'] }}"
    badfish_user: "{{ hostvars[item]['bmc_user'] }}"
    badfish_password: "{{ hostvars[item]['bmc_password'] }}"
  ignore_errors: true
42+
43+
# Poll the host's SSH port (22) until it stops answering, starting after a
# 2 second delay and giving up after 60 seconds. The target is the host's
# ansible_host, falling back to its inventory_hostname.
# Skipped when the ipmitool power-off command itself failed.
- name: "Dell - Wait for power down for {{ item }}"
  when: not ipmi_poweroff.failed
  ansible.builtin.wait_for:
    host: "{{ hostvars[item]['ansible_host'] | default(hostvars[item]['inventory_hostname']) }}"
    port: 22
    state: stopped
    delay: 2
    timeout: 60
1851

19-
- name: Dell - Set OneTimeBoot VirtualCD
20-
uri:
21-
url: "https://{{ hostvars[item]['bmc_address'] }}/redfish/v1/Managers/iDRAC.Embedded.1/Actions/Oem/EID_674_Manager.ImportSystemConfiguration"
52+
# Fixed 30 second pause, only when reset_idrac is enabled, before the next
# task starts polling the iDRAC.
# NOTE(review): presumably gives the reset request time to take effect so the
# poll does not see the iDRAC while it is still up — confirm.
- name: "Ensure iDRAC reset order is passed for {{ item }}"
  ansible.builtin.pause:
    seconds: 30
  when: reset_idrac | bool
56+
57+
# After a reset, poll the Redfish service root (GET /redfish/v1) with the BMC
# credentials until it answers with 200, 201, 301 or 302: up to 60 attempts,
# 5 seconds apart. Certificate validation is disabled. Only runs when
# reset_idrac is enabled.
- name: "Dell - Wait for iDRAC to be available for {{ item }}"
  when: reset_idrac | bool
  ansible.builtin.uri:
    method: GET
    url: "https://{{ hostvars[item]['bmc_address'] }}/redfish/v1"
    user: "{{ hostvars[item]['bmc_user'] }}"
    password: "{{ hostvars[item]['bmc_password'] }}"
    headers:
      content-type: application/json
      Accept: application/json
    validate_certs: false
    status_code:
      - 200
      - 201
      - 301
      - 302
  register: racreset_result
  until: racreset_result.status in [200, 201, 301, 302]
  retries: 60
  delay: 5
  failed_when: false
3774

3875
- name: "Dell - Check for Virtual Media for {{ item }}"
39-
uri:
76+
ansible.builtin.uri:
4077
url: "https://{{ hostvars[item]['bmc_address'] }}/redfish/v1/Managers/iDRAC.Embedded.1/VirtualMedia/CD"
4178
user: "{{ hostvars[item]['bmc_user'] }}"
4279
password: "{{ hostvars[item]['bmc_password'] }}"
@@ -50,12 +87,15 @@
5087
status_code: 200
5188
return_content: yes
5289
register: check_virtual_media
90+
retries: 10
91+
delay: 10
92+
until: check_virtual_media.status == 200
5393

5494
- name: Block to rescue incase of stuck virtual media
5595
when: check_virtual_media.json.Image
5696
block:
5797
- name: "Dell - Eject any CD Virtual Media for {{ item }}"
58-
uri:
98+
ansible.builtin.uri:
5999
url: "https://{{ hostvars[item]['bmc_address'] }}/redfish/v1/Managers/iDRAC.Embedded.1/VirtualMedia/CD/Actions/VirtualMedia.EjectMedia"
60100
user: "{{ hostvars[item]['bmc_user'] }}"
61101
password: "{{ hostvars[item]['bmc_password'] }}"
@@ -72,15 +112,19 @@
72112
rescue:
73113
# Use racadm to address the failed redfish unmount of old virtual media
74114
# Runs racadm on the BMC itself: the task is delegated to the BMC address and
# connects over SSH with the BMC credentials.
# `raw` is used on purpose: it sends the command line straight over SSH and
# needs no Python interpreter on the target, whereas ansible.builtin.command
# is a regular module that must execute Python remotely.
# NOTE(review): assumes the iDRAC SSH shell offers no Python, which is why the
# pre-change code used `raw` here — confirm.
- name: "Force mount of a existing image for {{ item }}"
  ansible.builtin.raw: >-
    racadm remoteimage -c -u "" -p "" -l http://{{ http_store_host }}:{{ http_store_port }}/{{ _virtual_media_iso }}
  delegate_to: "{{ hostvars[item]['bmc_address'] }}"
  vars:
    ansible_user: "{{ hostvars[item]['bmc_user'] }}"
    ansible_password: "{{ hostvars[item]['bmc_password'] }}"
    ansible_ssh_common_args: '-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null'
81123

82124
- name: "Force unmount of the existing image for {{ item }}"
83-
raw: racadm remoteimage -d
125+
ansible.builtin.command:
126+
cmd: >-
127+
racadm remoteimage -d
84128
delegate_to: "{{ hostvars[item]['bmc_address'] }}"
85129
vars:
86130
ansible_user: "{{ hostvars[item]['bmc_user'] }}"
@@ -106,6 +150,25 @@
106150
retries: 10
107151
delay: 30
108152

153+
# POST to the iDRAC ImportSystemConfiguration OEM action with a configuration
# snippet that sets ServerBoot.1#BootOnce=Enabled and
# ServerBoot.1#FirstBootDevice=VCD-DVD. The request must be answered with
# HTTP 202; certificate validation is disabled.
- name: Dell - Set OneTimeBoot VirtualCD
  ansible.builtin.uri:
    method: POST
    url: "https://{{ hostvars[item]['bmc_address'] }}/redfish/v1/Managers/iDRAC.Embedded.1/Actions/Oem/EID_674_Manager.ImportSystemConfiguration"
    user: "{{ hostvars[item]['bmc_user'] }}"
    password: "{{ hostvars[item]['bmc_password'] }}"
    headers:
      content-type: application/json
      Accept: application/json
    body_format: json
    body:
      ShareParameters:
        Target: "ALL"
      ImportBuffer: '<SystemConfiguration><Component FQDD="iDRAC.Embedded.1"><Attribute Name="ServerBoot.1#BootOnce">Enabled</Attribute><Attribute Name="ServerBoot.1#FirstBootDevice">VCD-DVD</Attribute></Component></SystemConfiguration>'
    validate_certs: false
    status_code: 202
    return_content: true
171+
109172
- name: "DELL - Power ON for {{ item }}"
110173
community.general.redfish_command:
111174
category: Systems

ansible/vars/all.sample.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ setup_bastion_registry: false
7171
# Use in conjunction with ipv6 based clusters
7272
use_bastion_registry: false
7373

74+
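# When true, the bastion install pre-pulls the badfish container image and the
# Dell boot-iso tasks reset the iDRAC service (racreset) and wait for it to
# come back. The iDRAC job queue is cleared with badfish regardless of this setting.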
76+
reset_idrac: false
77+
7478
################################################################################
7579
# OCP node vars
7680
################################################################################

0 commit comments

Comments
 (0)