Skip to content

Commit accd1fa

Browse files
Merge pull request #119 from Yashansh-Sharma15/ocp_kdump
Enabling Kdump Automation
2 parents 72c50af + 55d39e9 commit accd1fa

File tree

8 files changed

+210
-0
lines changed

8 files changed

+210
-0
lines changed

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ This repository consists of additional ansible playbooks for the following:
5252
1. Run Openshift-tests-private testcases.
5353
1. Verify IPI day2 operations
5454
1. Deploy Openshift Data Foundation operator
55+
1. Enable Kdump on a worker node
5556

5657
## Assumptions:
5758

examples/ocp_kdump_vars.yaml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
#ocp-kdump vars
3+
4+
worker_node:  # Name of the worker node on which kdump is verified and kernel crashes are triggered (e.g. worker-0)

playbooks/ocp-kdump.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
---
2+
# Play: apply the ocp-kdump role on the bastion host.
- name: Automate Enabling Kdump
  hosts: bastion
  roles:
    - role: ocp-kdump
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
ocp-kdump
2+
=========
3+
4+
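This role enables kdump on the OpenShift worker nodes through a MachineConfig and verifies it on the selected worker node by repeatedly triggering a kernel crash.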
5+
6+
Requirements
7+
------------
8+
9+
- A healthy OCP 4.x cluster on PowerVS with a 32 GB reserve volume for the worker node.
10+
11+
Role Variables
12+
--------------
13+
| Variable | Required | Default | Comments |
14+
|--------------------------------|----------|-------------|------------------------------------------------|
15+
| worker_node | yes | | Name of the worker node on which kdump is verified and the kernel crash is triggered |
16+
17+
18+
Example Playbook
19+
----------------
20+
21+
```
22+
---
23+
# Apply the role from the bastion host, as playbooks/ocp-kdump.yml does.
- name: Automate Enabling Kdump
  hosts: bastion
  roles:
    - ocp-kdump
26+
```
27+
28+
License
29+
------------------
30+
31+
See LICENCE.txt
32+
33+
34+
Author Information
35+
------------------
36+
37+
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
#ocp-kdump vars
3+
4+
worker_node: worker-0  # Worker node on which kdump is restarted, checked and crashed
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
# roles/ocp-kdump/files/crash_trigger.yml
2+
3+
# Block (for up to 300 seconds) until port 22 on the target worker accepts
# connections, so the SSH-based crash trigger that follows can reach the node.
- name: Wait for worker node SSH to be reachable
  wait_for:
    port: 22
    host: "{{ worker_node }}"
    timeout: 300
    state: started
9+
10+
# Force a kernel panic by writing "c" to /proc/sysrq-trigger over SSH.
# Errors are ignored — presumably because the SSH session drops when the
# node crashes (confirm).
- name: Trigger kernel crash on worker node
  shell: >-
    ssh -o StrictHostKeyChecking=no core@{{ worker_node }}
    "echo c | sudo tee /proc/sysrq-trigger"
  ignore_errors: true
13+
14+
# Fixed delay between crash iterations.
- name: Pause for 15 minutes before next crash
  pause:
    minutes: 15
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
---
2+
# tasks file for playbooks/roles/ocp-kdump
3+
4+
# Check the health of Cluster Operators
5+
# Delegate the cluster health check to the check-cluster-health role.
- name: Check if cluster operators and nodes are healthy
  include_role:
    name: check-cluster-health

# Butane converts the .bu file written further down into a MachineConfig.
- name: Ensure Butane is installed
  package:
    name: butane
    state: present

# Read-only probe: a non-zero exit code already fails the task, and
# changed_when keeps it from being reported as a change on every run.
- name: Check Butane version
  command: butane --version
  register: butane_check
  changed_when: false

- name: Ensure kexec-tools is installed
  package:
    name: kexec-tools
    state: present
23+
24+
# Write the Butane source for the 99-worker-kdump MachineConfig. It adds the
# crashkernel kernel argument, installs /etc/kdump.conf and
# /etc/sysconfig/kdump, and enables kdump.service for the worker role.
# The Butane spec version defaults to 4.18.0 and can be overridden through
# kdump_butane_version when the installed butane or the cluster needs
# another spec version.
- name: Create Butane config file
  copy:
    dest: "/tmp/99-worker-kdump.bu"
    mode: '0644'
    content: |
      variant: openshift
      version: {{ kdump_butane_version | default('4.18.0') }}
      metadata:
        name: 99-worker-kdump
        labels:
          machineconfiguration.openshift.io/role: worker
      openshift:
        kernel_arguments:
          - crashkernel=2G-4G:384M,4G-16G:512M,16G-64G:1G,64G-128G:2G,128G-:4G
      storage:
        files:
          - path: /etc/kdump.conf
            mode: 0644
            overwrite: true
            contents:
              inline: |
                path /var/crash
                core_collector makedumpfile -l --message-level 7 -d 31
          - path: /etc/sysconfig/kdump
            mode: 0644
            overwrite: true
            contents:
              inline: |
                KDUMP_COMMANDLINE_REMOVE="hugepages hugepagesz slub_debug quiet log_buf_len swiotlb"
                KDUMP_COMMANDLINE_APPEND="irqpoll maxcpus=1 reset_devices cgroup_disable=memory mce=off numa=off udev.children-max=2 panic=10 rootflags=nofail acpi_no_memhotplug transparent_hugepage=never nokaslr novmcoredd hest_disable"
                KEXEC_ARGS="-s"
                KDUMP_IMG="vmlinuz"
      systemd:
        units:
          - name: kdump.service
            enabled: true
60+
61+
# Always regenerate the MachineConfig from the Butane source. A `creates:`
# guard would keep a stale /tmp/99-worker-kdump.yaml from an earlier run even
# after the .bu file has changed.
- name: Generate MachineConfig YAML using Butane
  command: "butane /tmp/99-worker-kdump.bu -o /tmp/99-worker-kdump.yaml"
65+
66+
# `oc apply` reports "unchanged" when the object already matches, so only
# flag a change when the MachineConfig was created or reconfigured.
- name: Apply MachineConfig to the OpenShift cluster
  command: "oc apply -f /tmp/99-worker-kdump.yaml"
  register: kdump_mc_apply
  changed_when: "'unchanged' not in kdump_mc_apply.stdout"
68+
69+
# List worker nodes whose STATUS column is not exactly "Ready" and retry until
# that list is empty. The comparison must be exact: a substring match on
# "Ready" also matches "NotReady" and "Ready,SchedulingDisabled", which would
# let the wait succeed while nodes are still down or cordoned.
# NOTE(review): 10 retries x 30s may be too short for a MachineConfig rollout
# that reboots nodes — confirm against observed rollout times.
- name: Wait until all worker nodes are in Ready state
  shell: |
    oc get nodes --selector='node-role.kubernetes.io/worker' --no-headers | \
      awk '$2 != "Ready" {print $1 " " $2}'
  register: worker_nodes_ready
  retries: 10
  delay: 30
  until: worker_nodes_ready.stdout == ""
  changed_when: false
77+
78+
# Restart kdump inside the node's host filesystem through a debug pod.
# A custom failed_when replaces the default return-code check, so the rc is
# tested explicitly in addition to scanning stderr for "error".
- name: Restart kdump on worker node
  shell: >
    oc debug node/{{ worker_node }} -- chroot /host bash -c 'kdumpctl restart'
  register: kdump_restart_output
  failed_when: >-
    kdump_restart_output.rc != 0
    or 'error' in kdump_restart_output.stderr.lower()
  changed_when: true
84+
85+
# Fixed one-minute delay after the kdump restart.
- name: Wait for system to settle
  pause:
    minutes: 1
88+
89+
# Check kdump status
90+
# Query kdump on the worker through a debug pod. This is a read-only command,
# so it is never reported as a change.
- name: Check kdump status on worker node
  shell: >
    oc debug node/{{ worker_node }} -- chroot /host bash -c 'kdumpctl status'
  register: kdump_status_output
  changed_when: false

- name: Debug kdump status
  debug:
    var: kdump_status_output.stdout_lines

# `fail` runs on the controller and needs no privilege escalation.
- name: Fail if kdump is not operational
  fail:
    msg: "Kdump is NOT operational"
  when:
    - "'Kdump is operational' not in kdump_status_output.stdout"
105+
106+
# The kdump service is enabled on the worker by the MachineConfig, so it is
# queried on the worker through a debug pod (like the kdumpctl tasks), not on
# the bastion this play runs on. `systemctl is-active` exits non-zero for an
# inactive unit; failed_when is disabled so the explicit message in the next
# task is what the user sees.
- name: Check if kdump service is active
  command: oc debug node/{{ worker_node }} -- chroot /host systemctl is-active kdump
  register: kdump_service_status
  changed_when: false
  failed_when: false

# Compare whole output lines so that "inactive" never matches.
- name: Fail if kdump service is not active
  fail:
    msg: "kdump service is not active"
  when: "'active' not in kdump_service_status.stdout_lines"
115+
116+
# Check the file the MachineConfig writes on the worker node, not the
# bastion's own /etc/sysconfig/kdump. The task fails (non-zero rc) as soon as
# one of the expected keys is missing.
- name: Validate essential keys exist in /etc/sysconfig/kdump
  shell: |
    oc debug node/{{ worker_node }} -- chroot /host bash -c '
      grep -q "^KDUMP_COMMANDLINE_APPEND=" /etc/sysconfig/kdump &&
      grep -q "^KDUMP_COMMANDLINE_REMOVE=" /etc/sysconfig/kdump &&
      grep -q "^KEXEC_ARGS=" /etc/sysconfig/kdump &&
      grep -q "^KDUMP_IMG=" /etc/sysconfig/kdump'
  register: sysconfig_kdump_check
  changed_when: false
125+
126+
# Check the kdump.conf delivered to the worker node by the MachineConfig
# rather than the bastion's copy. Fails (non-zero rc) when either expected
# line is missing.
- name: Validate /etc/kdump.conf contents
  shell: |
    oc debug node/{{ worker_node }} -- chroot /host bash -c '
      grep -q "^path /var/crash" /etc/kdump.conf &&
      grep -q "core_collector makedumpfile -l --message-level 7 -d 31" /etc/kdump.conf'
  register: kdump_conf_check
  changed_when: false
133+
134+
# trigger crash file
135+
# Run the tasks from crash_trigger.yml ten times in sequence (item = 1..10).
- name: Repeat crash trigger 10 times with pauses
  include_tasks: crash_trigger.yml
  loop_control:
    label: "Crash iteration {{ item }}"
  loop: "{{ range(1, 11) | list }}"
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
---
2+
3+
# Environment mapping that points KUBECONFIG at ~/.kube/config of the
# connecting user.
# NOTE(review): not referenced by the tasks visible in this role — confirm
# whether the oc tasks are meant to set `environment: "{{ ocp_kdump_env }}"`.
ocp_kdump_env:
  KUBECONFIG: "{{ ansible_env.HOME }}/.kube/config"

0 commit comments

Comments
 (0)