Skip to content

Commit c056d3c

Browse files
committed
Add the steps to reboot the computes after update.
This sequence implements reboot of the compute nodes after the update. We have one instance created. If the hypervisor being rebooted has the instance, that instance will be live-migrated to another hypervisor before the reboot and migrated back to the original hypervisor after the reboot. Some basic sanity checks are performed after the reboot and before the migration back to ensure that the necessary services are up and running. During the reboot we start two scripts. One monitors and logs the reboot of the hypervisors. The other logs where the instance is currently running. The log files can be found in `~/ci-framework-data/tests/update/` in `reboot_server_status.log` and `instance_placement.log` respectively. A note about node evacuation. We are still using node evacuation from the nova cli. This command has not been ported to the openstack cli. There's a discussion about it [on launchpad](https://bugs.launchpad.net/python-openstackclient/+bug/2055552). The official documentation mentions only the live-migration path, but as we also use live-migration in the test sequence that part is covered. We still expect customers to use the nova cli as it's way more user-friendly and is still currently working. Closes: https://issues.redhat.com/browse/OSPRH-8937
1 parent d4072ba commit c056d3c

File tree

10 files changed

+367
-1
lines changed

10 files changed

+367
-1
lines changed

roles/update/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,5 +13,6 @@ Role to run update
1313
* `cifmw_update_ping_loss_percent` : (Integer) Maximum percentage of ping loss accepted. Default to `0`. Only relevant when `cifmw_update_ping_loss_second` is not 0.
1414
* `cifmw_update_control_plane_check`: (Boolean) Activate a continuous control plane testing. Default to `False`
1515
* `cifmw_update_openstackclient_pod_timeout`: (Integer) Maximum number of seconds to wait for the openstackclient Pod to be available during control plane testing, as it is being restarted during update. Default to `10` seconds.
16-
16+
* `cifmw_update_ansible_ssh_private_key_file`: (String) Define the path to the private key file used for the compute nodes.
17+
* `cifmw_update_wait_retries_reboot`: (Integer) Number of retries to wait for a compute node reboot. One retry is done every five seconds. Default to 60, so five minutes.
1718
## Examples

roles/update/defaults/main.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,9 @@ cifmw_update_ping_start_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_s
3838
cifmw_update_ping_stop_script: "{{ cifmw_update_artifacts_basedir }}/l3_agent_stop_ping.sh"
3939

4040
## User facing
41+
cifmw_update_ansible_ssh_private_key_file: >-
42+
"{{ ansible_ssh_private_key_file | default(ansible_user_dir ~ '/.ssh/id_cifw') }}"
43+
cifmw_update_wait_retries_reboot: 60
4144

4245
cifmw_update_ping_test: false
4346
cifmw_update_create_volume: false

roles/update/molecule/default/prepare.yml

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,17 @@
2323
cifmw_installyamls_repos: "{{ ansible_user_dir }}/src/github.com/openstack-k8s-operators/install_yamls"
2424
cifmw_install_yamls_defaults:
2525
NAMESPACE: openstack
26+
cifmw_path: "{{ ansible_user_dir }}/.crc/bin:{{ ansible_user_dir }}/.crc/bin/oc:{{ ansible_user_dir }}/bin:{{ ansible_env.PATH }}"
27+
cifmw_openshift_kubeconfig: "{{ ansible_user_dir }}/.crc/machines/crc/kubeconfig"
28+
pre_tasks:
29+
- name: Ensure CRC is started
30+
async: 1800
31+
poll: 0
32+
register: _crc_start
33+
ansible.builtin.command: crc start
34+
environment:
35+
PATH: "{{ cifmw_path }}"
36+
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
2637
roles:
2738
- role: test_deps
2839
- role: ci_setup
@@ -31,4 +42,12 @@
3142
- name: Set custom cifmw PATH reusable fact
3243
ansible.builtin.set_fact:
3344
cifmw_path: "{{ ansible_user_dir }}/.crc/bin:{{ ansible_user_dir }}/.crc/bin/oc:{{ ansible_user_dir }}/bin:{{ ansible_env.PATH }}"
45+
cifmw_openshift_kubeconfig: "{{ ansible_user_dir }}/.crc/machines/crc/kubeconfig"
3446
cacheable: true
47+
- name: Check for CRC status
48+
ansible.builtin.async_status:
49+
jid: "{{ _crc_start.ansible_job_id }}"
50+
register: _crc_status
51+
until: _crc_status.finished
52+
retries: 100
53+
delay: 10

roles/update/tasks/main.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,3 +74,6 @@
7474
- not cifmw_update_run_dryrun | bool
7575
ansible.builtin.shell: |
7676
{{ cifmw_update_artifacts_basedir }}/control_plane_test_stop.sh
77+
78+
- name: Reboot the compute nodes
79+
ansible.builtin.include_tasks: reboot_computes.yml
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
- name: Define command for OpenStack client interactions
2+
ansible.builtin.set_fact:
3+
cifmw_update_openstack_cmd: >-
4+
oc rsh -n {{ cifmw_update_namespace }} openstackclient openstack
5+
6+
- name: Register storage backend type
7+
environment:
8+
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
9+
PATH: "{{ cifmw_path }}"
10+
ansible.builtin.shell: >-
11+
set -o pipefail;
12+
{{ cifmw_update_openstack_cmd }} volume service list -f json |
13+
jq -r -c '.[] | select(.Binary | contains("cinder-volume")) | .Host'
14+
register: storage_backend
15+
16+
- name: Get the list of OpenStack hypervisors
17+
environment:
18+
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
19+
PATH: "{{ cifmw_path }}"
20+
ansible.builtin.shell: |
21+
{{ cifmw_update_openstack_cmd }} hypervisor list -f json
22+
register: hypervisor_list
23+
changed_when: false
24+
25+
- name: Parse the hypervisor list to extract hostnames
26+
ansible.builtin.set_fact:
27+
hypervisor_hostnames: "{{ hypervisor_list.stdout | from_json | map(attribute='Hypervisor Hostname') | list }}"
28+
29+
- name: Create a reboot monitor servers script
30+
ansible.builtin.template:
31+
src: "monitor_servers.sh.j2"
32+
dest: "{{ cifmw_update_artifacts_basedir }}/monitor_servers.sh"
33+
mode: "0775"
34+
35+
- name: Start the monitor servers script
36+
ansible.builtin.shell: |
37+
nohup {{ cifmw_update_artifacts_basedir }}/monitor_servers.sh &> /dev/null &
38+
echo $!
39+
register: monitor_servers_job
40+
41+
- name: Create a monitor placement script
42+
ansible.builtin.template:
43+
src: "monitor_vm_placement.sh.j2"
44+
dest: "{{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh"
45+
mode: "0775"
46+
47+
- name: Start the monitor placement script
48+
ansible.builtin.shell: |
49+
nohup {{ cifmw_update_artifacts_basedir }}/monitor_vm_placement.sh &> /dev/null &
50+
echo $!
51+
register: monitor_placement_job
52+
53+
- name: Iterate over each hypervisor for the reboot sequence
54+
ansible.builtin.include_tasks: reboot_hypervisor.yml
55+
loop: "{{ hypervisor_hostnames }}"
56+
loop_control:
57+
loop_var: hypervisor
58+
59+
- name: Stop the monitor servers script if running
60+
ansible.builtin.shell: |
61+
if kill -0 {{ monitor_servers_job.stdout }} &>/dev/null; then
62+
kill {{ monitor_servers_job.stdout }}
63+
fi
64+
register: kill_result
65+
failed_when: kill_result.rc not in [0, 1] # We can still have a race
66+
# between kill -0 and
67+
# kill, even if unlikely.
68+
69+
- name: Stop the monitor placement script if running
70+
ansible.builtin.shell: |
71+
if kill -0 {{ monitor_placement_job.stdout }} &>/dev/null; then
72+
kill {{ monitor_placement_job.stdout }}
73+
fi
74+
register: kill_result
75+
failed_when: kill_result.rc not in [0, 1]
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
---
2+
- name: Extract short hostname from FQDN
3+
ansible.builtin.set_fact:
4+
cifmw_update_hypervisor_short_name: "{{ hypervisor.split('.')[0] }}"
5+
6+
- name: Current stage
7+
ansible.builtin.debug:
8+
msg: "Rebooting {{ cifmw_update_hypervisor_short_name }}"
9+
10+
- name: Define command for nova interaction
11+
ansible.builtin.set_fact:
12+
cifmw_update_bash_cmd: >-
13+
oc rsh -n {{ cifmw_update_namespace }} openstackclient bash -c
14+
15+
- name: Check active VMs on hypervisor
16+
ansible.builtin.shell: >-
17+
set -o pipefail;
18+
{{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
19+
| jq -r -c '.[] | select(.Status | contains("ACTIVE") or contains("PAUSED")) | .ID'
20+
register: active_vms
21+
changed_when: false
22+
23+
- name: Evacuate VMs if they are running
24+
environment:
25+
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
26+
PATH: "{{ cifmw_path }}"
27+
ansible.builtin.shell: >-
28+
{{ cifmw_update_bash_cmd }} ". cloudrc &&
29+
nova host-evacuate-live
30+
{% if 'ceph' not in storage_backend.stdout %}
31+
--block-migrate
32+
{% endif %}
33+
{{ hypervisor }}"
34+
when: active_vms.stdout != ''
35+
changed_when: true
36+
37+
- name: Wait for compute node to get quiesced
38+
ansible.builtin.shell: >-
39+
set -o pipefail;
40+
{{ cifmw_update_openstack_cmd }} server list --all --host {{ hypervisor }} -f json
41+
| jq -r -c '[.[] | select(.Status |
42+
contains("ACTIVE") or contains("PAUSED") or contains("MIGRATING"))]
43+
| length'
44+
register: compute_node_instances
45+
until: compute_node_instances.stdout.find("0") > -1
46+
retries: 30
47+
delay: 5
48+
when:
49+
- active_vms.stdout != ''
50+
51+
- name: Reboot the hypervisor using CR
52+
ansible.builtin.include_tasks: reboot_hypervisor_using_cr.yml
53+
54+
- name: Perform sanity checks post-reboot
55+
ansible.builtin.include_tasks: reboot_hypervisor_sanity_checks.yml
56+
vars:
57+
current_hypervisor: "{{ hypervisor }}"
58+
59+
- name: Current stage
60+
ansible.builtin.debug:
61+
msg: "Migrate back {{ item }} to {{ cifmw_update_hypervisor_short_name }}."
62+
with_items: "{{ active_vms.stdout_lines }}"
63+
64+
- name: Migrate back VMs post-reboot
65+
environment:
66+
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
67+
PATH: "{{ cifmw_path }}"
68+
ansible.builtin.shell: >-
69+
set -o pipefail;
70+
{{ cifmw_update_bash_cmd }} ". cloudrc &&
71+
nova live-migration
72+
{% if 'ceph' not in storage_backend.stdout %}
73+
--block-migrate
74+
{% endif %}
75+
{{ item }} {{ hypervisor }}";
76+
{{ cifmw_update_openstack_cmd }} server show {{ item }} -f json |
77+
jq -r -c '. | .["OS-EXT-SRV-ATTR:host"]'
78+
register: instance_migration_result
79+
until: instance_migration_result.stdout.find(hypervisor) > -1
80+
retries: 30
81+
delay: 5
82+
with_items: "{{ active_vms.stdout_lines }}"
83+
when:
84+
- active_vms.stdout != ''
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
---
2+
- name: Current stage
3+
ansible.builtin.debug:
4+
msg: |
5+
Testing the status of the services for {{ current_hypervisor }} after reboot.
6+
7+
- name: Verify nova-compute service
8+
environment:
9+
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
10+
PATH: "{{ cifmw_path }}"
11+
ansible.builtin.shell: >-
12+
set -o pipefail;
13+
{{ cifmw_update_openstack_cmd }} compute service list
14+
--host {{ current_hypervisor }} -f json
15+
| jq -r -c '.[]
16+
| select(.Binary | contains("nova-compute")) | .State'
17+
register: nova_compute_status
18+
until: nova_compute_status.stdout == 'up'
19+
retries: 30
20+
delay: 5
21+
22+
- name: Verify ovn-controller service
23+
environment:
24+
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
25+
PATH: "{{ cifmw_path }}"
26+
ansible.builtin.shell: >-
27+
set -o pipefail;
28+
{{ cifmw_update_openstack_cmd }} network agent list
29+
--host {{ current_hypervisor }} -f json
30+
| jq -r -c '.[]
31+
| select(.Binary | contains("ovn-controller")) | .Alive'
32+
register: ovn_controller_status
33+
until: ovn_controller_status.stdout == 'true'
34+
retries: 30
35+
delay: 5
36+
37+
- name: Verify networking-ovn-metadata-agent
38+
environment:
39+
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
40+
PATH: "{{ cifmw_path }}"
41+
ansible.builtin.shell: >-
42+
set -o pipefail;
43+
{{ cifmw_update_openstack_cmd }} network agent list
44+
--host {{ current_hypervisor }} -f json
45+
| jq -r -c '.[]
46+
| select(.Binary | contains("neutron-ovn-metadata-agent")) | .Alive'
47+
register: networking_ovn_metadata_status
48+
until: networking_ovn_metadata_status.stdout == 'true'
49+
retries: 30
50+
delay: 5
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
---
2+
- name: Define command prefix for OpenShift operations
3+
ansible.builtin.set_fact:
4+
cifmw_update_oc_cmd_prefix: "oc -n {{ cifmw_update_namespace }}"
5+
6+
- name: Fetch NodeSets for the OpenStackDataPlaneDeployment
7+
environment:
8+
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
9+
PATH: "{{ cifmw_path }}"
10+
ansible.builtin.shell: >-
11+
set -o pipefail;
12+
{{ cifmw_update_oc_cmd_prefix }} get openstackdataplanenodeset -o name
13+
| awk -F'/' '{print " - " $2}'
14+
register: cifmw_update_node_sets
15+
changed_when: false
16+
17+
- name: Construct date string for the CR name
18+
ansible.builtin.set_fact:
19+
cifmw_update_cr_date: "{{ lookup('pipe', 'date +%Y%m%d%H%S') }}"
20+
21+
- name: Construct the CR name
22+
ansible.builtin.set_fact:
23+
cifmw_reboot_dep_name: >-
24+
reboot-{{ cifmw_update_hypervisor_short_name }}-{{ cifmw_update_cr_date }}
25+
26+
- name: Create the OpenStackDataPlaneDeployment CR YAML file
27+
ansible.builtin.copy:
28+
dest: "{{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml"
29+
content: |
30+
apiVersion: dataplane.openstack.org/v1beta1
31+
kind: OpenStackDataPlaneDeployment
32+
metadata:
33+
name: {{ cifmw_reboot_dep_name }}
34+
namespace: {{ cifmw_update_namespace }}
35+
spec:
36+
nodeSets:
37+
{{ cifmw_update_node_sets.stdout }}
38+
servicesOverride:
39+
- reboot-os
40+
ansibleExtraVars:
41+
edpm_reboot_strategy: force
42+
ansibleLimit: {{ cifmw_update_hypervisor_short_name }}
43+
44+
- name: Apply the OpenStackDataPlaneDeployment CR to trigger a reboot
45+
environment:
46+
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
47+
PATH: "{{ cifmw_path }}"
48+
ansible.builtin.command: >-
49+
{{ cifmw_update_oc_cmd_prefix }}
50+
create -f {{ cifmw_update_artifacts_basedir }}/{{ cifmw_reboot_dep_name }}.yaml
51+
52+
- name: Check OpenStackDataPlaneDeployment status
53+
environment:
54+
KUBECONFIG: "{{ cifmw_openshift_kubeconfig }}"
55+
PATH: "{{ cifmw_path }}"
56+
ansible.builtin.command: >-
57+
{{ cifmw_update_oc_cmd_prefix }}
58+
get openstackdataplanedeployment {{ cifmw_reboot_dep_name }}
59+
register: deployment_status
60+
until: deployment_status.stdout.find('Setup complete') > -1
61+
retries: "{{ cifmw_update_wait_retries_reboot }}"
62+
delay: 5
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
#!/bin/bash
2+
3+
set -e
4+
5+
# List of servers can be input as command line arguments or hardcoded here.
6+
servers=(
7+
{% for server in hypervisor_hostnames %}
8+
{{ server.split('.')[0] }}
9+
{% endfor %}
10+
)
11+
# or, for a hardcoded list: servers=("server1" "server2" ...)
12+
13+
# Log file to store the status changes
14+
log_file="{{ cifmw_update_artifacts_basedir }}/reboot_server_status.log"
15+
16+
# Function to check server status via SSH
17+
# TODO: ping always replies even if server is down.
18+
check_servers() {
19+
for server in "${servers[@]}"; do
20+
if ssh -i {{ cifmw_update_ansible_ssh_private_key_file }} -o BatchMode=yes -o ConnectTimeout=5 "$server" "exit" &> /dev/null; then
21+
# Server is up
22+
if [ "${server_status[$server]}" == "down" ]; then
23+
echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is UP" | tee -a "$log_file"
24+
server_status[$server]="up"
25+
fi
26+
else
27+
# Server is down
28+
if [ "${server_status[$server]}" != "down" ]; then
29+
echo "$(date '+%Y-%m-%d %H:%M:%S') - $server is DOWN" | tee -a "$log_file"
30+
server_status[$server]="down"
31+
fi
32+
fi
33+
done
34+
}
35+
36+
# Initialize server status array
37+
declare -A server_status
38+
for server in "${servers[@]}"; do
39+
server_status[$server]="unknown"
40+
done
41+
42+
# Main loop to continuously check server status
43+
while true; do
44+
check_servers
45+
sleep 1 # Wait for 1 second before re-checking
46+
done

0 commit comments

Comments
 (0)