Skip to content

Commit a22e794

Browse files
committed
Add the steps to reboot the computes after update.
This sequence implements the reboot of the compute nodes after the update. If one or more instances have been created on the hypervisor being rebooted, they will be live-migrated to other hypervisors before the reboot and migrated back to the original hypervisor after the reboot. Some basic sanity checks are performed after the reboot, and before the migration back, to ensure that the necessary services are up and running. During the reboot we start two scripts: one monitors and logs the reboot of the hypervisors; the other logs where the instance is currently running. Closes: https://issues.redhat.com/browse/OSPRH-8937
1 parent 7c36a5a commit a22e794

File tree

7 files changed

+272
-0
lines changed

7 files changed

+272
-0
lines changed

roles/update/tasks/main.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,6 @@
7474
- not cifmw_update_run_dryrun | bool
7575
ansible.builtin.shell: |
7676
{{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh
77+
78+
- name: Reboot the compute nodes
79+
ansible.builtin.include_tasks: reboot_compute.yml
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
---
# Reboot every compute (hypervisor) node after an update. Instances are
# live-migrated away before each reboot and migrated back afterwards
# (see reboot_hypervisor.yml). Two background monitor scripts record the
# reboot progress and the instance placement during the whole sequence.
- name: Define command for OpenStack client interactions
  ansible.builtin.set_fact:
    # ">-" (folded, strip) keeps each value on a single line with no
    # trailing newline, so it can safely be used as a command prefix.
    # A "|" literal scalar would embed "\n" and split the composed
    # shell command into two separate lines.
    cifmw_update_openstack_cmd: >-
      oc rsh -n {{ cifmw_update_namespace }} openstackclient openstack
    cifmw_update_bash_cmd: >-
      oc rsh -n {{ cifmw_update_namespace }} openstackclient bash -c

- name: Register storage backend type
  # The cinder-volume host string is used later to decide whether
  # --block-migrate is needed (non-ceph backends).
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} volume service list -f json |
    jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host'
  register: storage_backend
  changed_when: false

- name: Get list of OpenStack hypervisors
  ansible.builtin.shell: |
    {{ cifmw_update_openstack_cmd }} hypervisor list -f json
  register: hypervisor_list
  changed_when: false

- name: Parse the hypervisor list to extract hostnames
  ansible.builtin.set_fact:
    hypervisor_hostnames: "{{ hypervisor_list.stdout | from_json | map(attribute='Hypervisor Hostname') | list }}"

- name: Create a reboot monitor script
  ansible.builtin.template:
    src: "monitor_servers.sh.j2"
    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_servers.sh"
    mode: "0775"

- name: Start the reboot monitor script
  # The script loops forever; detach it with nohup/& so the play is not
  # blocked, and close its fds so the task can return.
  ansible.builtin.shell:
    cmd: >-
      nohup {{ cifmw_update_artifacts_basedir }}/monitor_servers.sh
      </dev/null >/dev/null 2>&1 &

- name: Create an instance placement monitor script
  ansible.builtin.template:
    src: "monitor_vm_placement.sh.j2"
    dest: "{{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh"
    mode: "0775"

- name: Start the monitor placement script
  # Same backgrounding scheme as the reboot monitor above.
  ansible.builtin.shell:
    cmd: >-
      nohup {{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh
      </dev/null >/dev/null 2>&1 &

- name: Iterate over each hypervisor
  ansible.builtin.include_tasks: reboot_hypervisor.yml
  loop: "{{ hypervisor_hostnames }}"
  loop_control:
    loop_var: hypervisor
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
---
# Reboot a single hypervisor (loop_var "hypervisor", an FQDN, set by
# reboot_compute.yml): live-migrate its instances away, reboot it through
# an OpenStackDataPlaneDeployment CR, run sanity checks, then migrate the
# instances back.
- name: Extract short hostname from FQDN
  ansible.builtin.set_fact:
    cifmw_update_hypervisor_short_name: "{{ hypervisor.split('.')[0] }}"

- name: Display the hypervisor being rebooted
  ansible.builtin.debug:
    msg: "Rebooting {{ cifmw_update_hypervisor_short_name }}"

- name: Check active VMs on hypervisor
  # Collect the IDs of instances that must be evacuated (one per line).
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
    | jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'
  register: active_vms
  changed_when: false

- name: Evacuate VMs if they are running
  # --block-migrate is only valid without shared storage; skip it when the
  # cinder backend reports ceph.
  ansible.builtin.shell: >-
    {{ cifmw_update_bash_cmd }} ". cloudrc &&
    nova host-evacuate-live
    {% if 'ceph' not in storage_backend.stdout %}
    --block-migrate
    {% endif %}
    {{ hypervisor }}"
  when: active_vms.stdout != ''
  changed_when: true

- name: Wait for compute node to get quiesced
  # Poll until no instance is ACTIVE, PAUSED or still MIGRATING on the
  # hypervisor. Compare the jq count against exactly "0": a substring
  # find("0") would wrongly match counts like "10" or "20".
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
    | jq -r -c '[.[] | select(.Status |
    contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))]
    | length'
  register: compute_node_instances
  until: compute_node_instances.stdout | trim == "0"
  retries: 30
  delay: 5
  changed_when: false
  when:
    - active_vms.stdout != ''

- name: Reboot the hypervisor
  ansible.builtin.include_tasks: reboot_hypervisor_using_cr.yml

- name: Perform sanity checks post-reboot
  ansible.builtin.include_tasks: reboot_hypervisor_sanity_checks.yml
  vars:
    current_hypervisor: "{{ hypervisor }}"

- name: Display the instances to migrate back
  ansible.builtin.debug:
    msg: "Migrate back {{ item }} to {{ cifmw_update_hypervisor_short_name }}."
  with_items: "{{ active_vms.stdout_lines }}"

- name: Migrate back VMs post-reboot
  # Trigger the live migration, then report the instance's current host;
  # retry until the instance is back on the rebooted hypervisor.
  ansible.builtin.shell: >-
    set -o pipefail;
    {{ cifmw_update_bash_cmd }} ". cloudrc &&
    nova live-migration
    {% if 'ceph' not in storage_backend.stdout %}
    --block-migrate
    {% endif %}
    {{ item }} {{ hypervisor }}";
    {{ cifmw_update_openstack_cmd }} server show {{ item }} -f json |
    jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
  register: instance_migration_result
  until: instance_migration_result.stdout.find(hypervisor) > -1
  retries: 30
  delay: 5
  with_items: "{{ active_vms.stdout_lines }}"
  when:
    - active_vms.stdout != ''
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
---
# Post-reboot sanity checks for one hypervisor ("current_hypervisor",
# passed in via vars by reboot_hypervisor.yml): verify that nova-compute,
# ovn-controller and the OVN metadata agent all report healthy before any
# instance is migrated back.
#
# NOTE: the checks use cifmw_update_openstack_cmd, the fact defined in
# reboot_compute.yml (a bare "openstack_cmd" is undefined in this role).
- name: Display the hypervisor being checked
  ansible.builtin.debug:
    msg: "Here I'm testing the reboot for {{ current_hypervisor }}."

- name: Verify nova-compute service
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} compute service list --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("nova-compute")) | .State'
  register: nova_compute_status
  until: nova_compute_status.stdout == 'up'
  retries: 5
  delay: 30
  changed_when: false

- name: Verify ovn-controller service
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} network agent list --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("ovn-controller")) | .Alive'
  register: ovn_controller_status
  until: ovn_controller_status.stdout == 'true'
  retries: 5
  delay: 30
  changed_when: false

- name: Verify networking-ovn-metadata-agent
  ansible.builtin.shell: >-
    {{ cifmw_update_openstack_cmd }} network agent list --host {{ current_hypervisor }} -f json
    | jq -r -c '.[]
    | select(.Binary | contains("neutron-ovn-metadata-agent")) | .Alive'
  register: networking_ovn_metadata_status
  until: networking_ovn_metadata_status.stdout == 'true'
  retries: 5
  delay: 30
  changed_when: false
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
---
# Trigger the reboot of one compute node by creating an
# OpenStackDataPlaneDeployment CR that runs only the "reboot-os" service,
# limited to the current hypervisor, then wait for the deployment to
# complete. Requires cifmw_update_hypervisor_short_name (set in
# reboot_hypervisor.yml).
- name: Define necessary command prefixes for kube operations
  ansible.builtin.set_fact:
    cifmw_update_oc_cmd_prefix: "oc -n {{ cifmw_update_namespace }}"

- name: Fetch NodeSets for the OpenStackDataPlaneDeployment
  # Produces one YAML list item per nodeset (" - <name>"), spliced into
  # the CR content below.
  ansible.builtin.shell: >-
    {{ cifmw_update_oc_cmd_prefix }} get openstackdataplanenodeset -o name | awk -F'/' '{print " - " $2}'
  register: cifmw_update_node_sets
  changed_when: false

- name: Construct date string for CR name
  ansible.builtin.set_fact:
    cifmw_update_cr_date: "{{ lookup('pipe', 'date +%Y%m%d') }}"

- name: Construct CR name
  ansible.builtin.set_fact:
    cifmw_reboot_dep_name: reboot-{{ cifmw_update_hypervisor_short_name }}-{{ cifmw_update_cr_date }}

- name: Create OpenStackDataPlaneDeployment CR YAML file
  ansible.builtin.copy:
    dest: "{{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml"
    content: |
      apiVersion: dataplane.openstack.org/v1beta1
      kind: OpenStackDataPlaneDeployment
      metadata:
        name: {{ cifmw_reboot_dep_name }}
        namespace: {{ cifmw_update_namespace }}
      spec:
        nodeSets:
      {{ cifmw_update_node_sets.stdout | indent(width=2, first=true) }}
        servicesOverride:
          - reboot-os
        ansibleExtraVars:
          edpm_reboot_strategy: force
        ansibleLimit: {{ cifmw_update_hypervisor_short_name }}

- name: Apply the OpenStackDataPlaneDeployment CR to trigger a reboot
  ansible.builtin.shell: >-
    {{ cifmw_update_oc_cmd_prefix }}
    create -f {{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml

- name: Check OpenStackDataPlaneDeployment status
  ansible.builtin.command: >-
    {{ cifmw_update_oc_cmd_prefix }} get openstackdataplanedeployment
    {{ cifmw_reboot_dep_name }}
  register: deployment_status
  until: deployment_status.stdout.find('Setup complete') > -1
  retries: 60
  delay: 5
  changed_when: false
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/bin/bash
# Monitor SSH reachability of the hypervisors and log every UP/DOWN
# transition. Rendered by Ansible from monitor_servers.sh.j2 and started
# in the background by reboot_compute.yml; runs until killed.

# List of servers, rendered from the hypervisor inventory at template time
# (short hostnames only).
servers=(
{% for server in hypervisor_hostnames %}
    {{ server.split('.')[0] }}
{% endfor %}
)

# Log file to store the status changes
log_file="{{ cifmw_update_artifacts_basedir }}/reboot_server_status.log"

# Check each server over SSH and log only status *transitions*, so the
# log stays small during the long polling loop.
check_servers() {
    for server in "${servers[@]}"; do
        # BatchMode prevents password prompts; requires key-based access.
        if ssh -i {{ ansible_ssh_private_key_file }} -o BatchMode=yes -o ConnectTimeout=5 "$server" "exit" &> /dev/null; then
            # Server answered: log only the down -> up transition.
            if [ "${server_status[$server]}" == "down" ]; then
                echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is UP" | tee -a "$log_file"
                server_status[$server]="up"
            fi
        else
            # Server unreachable: log only the first failure.
            if [ "${server_status[$server]}" != "down" ]; then
                echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is DOWN" | tee -a "$log_file"
                server_status[$server]="down"
            fi
        fi
    done
}

# Initialize server status array; "unknown" means an initially-up server
# is not logged until it first goes down.
declare -A server_status
for server in "${servers[@]}"; do
    server_status[$server]="unknown"
done

# Main loop to continuously check server status
while true; do
    check_servers
    sleep 1  # Wait one second between polls
done
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/bin/bash
# Log the instance hypervisor. Useful when tracking compute reboot.
# Rendered from monitor_vm_placement.sh.j2 and started in the background
# by reboot_compute.yml; runs until killed.

export KUBECONFIG="{{ cifmw_openshift_kubeconfig }}"
export PATH="{{ cifmw_path }}"

log_file={{ cifmw_update_artifacts_basedir }}/instance_placement.log
source_file={{ cifmw_update_artifacts_basedir }}/workload_suffix
instance_prefix="instance_"

# workload_suffix defines SUFFIX, which identifies the test workload
# instance created earlier in the update job.
. "$source_file"

instance_name="${instance_prefix}${SUFFIX}"
previous_hypervisor=""

# Poll the instance placement and log only when the hosting hypervisor
# changes. Use the templated namespace for consistency with the rest of
# the role instead of a hardcoded "openstack".
while true; do
    current_hypervisor=$(oc rsh -n {{ cifmw_update_namespace }} openstackclient openstack server show "${instance_name}" -f json | jq -r -c '.["OS-EXT-SRV-ATTR:host"]')
    if [[ "$current_hypervisor" != "$previous_hypervisor" ]]; then
        echo "$(date) $instance_name $current_hypervisor" >> "$log_file"
        previous_hypervisor="$current_hypervisor"
    fi
    sleep 1
done

0 commit comments

Comments
 (0)