Skip to content

Commit 549efeb

Browse files
committed
AUFN CI unfinished draft
1 parent 9771e0d commit 549efeb

File tree

3 files changed

+388
-2
lines changed

3 files changed

+388
-2
lines changed

.github/workflows/deploy-aufn.yml

Lines changed: 386 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,386 @@
1+
---
2+
#
3+
4+
name: AUFN KIDDIN' ME?!
5+
6+
# Triggers: every push to the AUFN-CI branch, plus manual runs with the inputs
# below. On push-triggered runs the `inputs` context is empty, so steps that
# read inputs must cope with missing values.
#
# Repository/environment secrets read by this workflow (referenced in the job
# as `secrets.<NAME>`). They cannot be declared under `workflow_dispatch`;
# a `secrets:` mapping is only valid for `workflow_call`:
#   - BASTION_TEST_PASSWORD
#   - CLOUDS_YAML
#   - OS_APPLICATION_CREDENTIAL_ID
#   - OS_APPLICATION_CREDENTIAL_SECRET
on:
  push:
    branches:
      - AUFN-CI
  workflow_dispatch:
    inputs:
      deployment_type:
        description: Type of deployment
        type: choice
        options:
          - Test
          - Deployment
        default: Test
      lab_vm_count:
        description: Total number of Lab VMs to deploy
        type: number
        required: true
        default: 2
      reg_pwd:  # When using in workflow use ::add-mask::$ to mask the password
        description: Password for registry access
        type: string
        default: ""  # NOTE: This needs to be set at runtime via secrets
      os_image:
        description: Host OS image
        type: choice
        options:
          - Ubuntu
          - Rocky9
        default: 'Rocky9'
      aufn_branch:
        description: Which branch of AUFN to use
        type: string
        default: smslab/2023.1
      au_from_seed:
        description: Run 'A Universe From Seed'?
        type: boolean
        default: false
      debug_mode:
        description: Keep Test up to debug?
        type: boolean
        default: false
56+
57+
jobs:
  set-up-vars:
    name: Set up variables
    # `inputs` is empty on push-triggered runs, so fall back to the same
    # default that the workflow_dispatch input declares ('Test').
    environment: ${{ inputs.deployment_type || 'Test' }}
    runs-on: ubuntu-latest

    steps:
64+
# Install the command-line tools used by the later steps.
# The action is pinned to a release tag rather than the moving `main` branch
# so that an upstream change cannot silently alter this workflow.
- name: Install Package
  uses: ConorMacBride/install-package@v1
  with:
    apt: git unzip nodejs python3-pip python3-venv openssh-server openssh-client jq

# sshpass feeds the per-VM passwords to ssh in the lab VM steps.
- name: Install sshpass
  run: sudo apt-get update && sudo apt-get install -y sshpass

- name: Start the SSH service
  run: |
    sudo /etc/init.d/ssh start
75+
76+
# - name: Check if 'Deployment' Lab is already deployed
77+
# uses: softwareforgood/check-artifact-v4-existence@v0
78+
# with:
79+
# name: ${{ inputs.deployment_type }}-terraform-artifacts
80+
#
81+
# or use a ping command to check if the bastion is up
82+
#
83+
84+
# Check the repository out into a sub-directory, then lift its contents into
# the workspace root, which is where every later step runs its commands.
- name: Checkout
  uses: actions/checkout@v4
  with:
    path: repo-dir

- name: Move contents to $GITHUB_WORKSPACE
  run: |
    # dotglob: also move hidden files; nullglob: do not fail on an empty dir.
    shopt -s dotglob nullglob
    # Move into the workspace (not the home directory) so the step does what
    # its name says and later steps find the files in their working directory.
    mv repo-dir/* "${GITHUB_WORKSPACE}/"
    rmdir repo-dir
  shell: bash
92+
93+
# Write the CLOUDS_YAML secret to clouds.yaml in the working directory.
# The secret is passed through an environment variable instead of being
# templated into the script: inside an unquoted heredoc the shell would expand
# any `$` or backtick sequences contained in the secret and corrupt the file.
- name: Generate clouds.yaml
  run: |
    # Create the credentials file readable by the current user only.
    umask 077
    printf '%s\n' "${CLOUDS_YAML}" > clouds.yaml
  env:
    CLOUDS_YAML: ${{ secrets.CLOUDS_YAML }}
98+
99+
# Render terraform.tfvars from the workflow inputs.
# All dynamic values arrive as environment variables and are expanded by the
# shell inside the heredoc.
- name: Generate terraform.tfvars
  run: |
    cat << EOF > terraform.tfvars
    lab_count = ${LAB_COUNT}
    lab_net_ipv4 = "stackhpc-ipv4-aufn"
    image_id = "${LAB_IMAGE_ID}"
    image_name = "${LAB_IMAGE_NAME}"
    lab_flavor = "aufn.v1.large"
    registry_flavor = "general.v1.medium"
    boot_labs_from_volume = true
    image_user = "${LAB_IMAGE_USER}"
    allocate_floating_ips = false
    create_bastion = true
    EOF

    # Conditionally append bastion_floating_ip
    if [ "${DEPLOYMENT_TYPE}" = "Deployment" ]; then
      echo 'bastion_floating_ip = "185.45.78.149"' >> terraform.tfvars
    fi
  env:
    # Inputs are empty on push-triggered runs; the fallbacks mirror the
    # defaults declared for the workflow_dispatch inputs.
    LAB_COUNT: ${{ inputs.lab_vm_count || 2 }}
    DEPLOYMENT_TYPE: ${{ inputs.deployment_type || 'Test' }}
    # Look the image IDs up in the `vars` context (unquoted); a quoted
    # 'vars.NAME' is just a string literal.
    # NOTE(review): if LAB_OS_IMAGE_UBUNTU is unset/empty, the `&& ||` form
    # falls through to the Rocky image.
    LAB_IMAGE_ID: ${{ inputs.os_image == 'Ubuntu' && vars.LAB_OS_IMAGE_UBUNTU || vars.LAB_OS_IMAGE_ROCKY }}
    LAB_IMAGE_NAME: ${{ inputs.os_image == 'Ubuntu' && 'Ubuntu-22.04' || 'Rocky9' }}
    LAB_IMAGE_USER: ${{ inputs.os_image == 'Ubuntu' && 'ubuntu' || 'rocky' }}
122+
123+
# Initialise the working directory and show the execution plan.
# No other step in this workflow runs `terraform init`, and a fresh checkout
# has no providers installed, so it has to happen before the first plan.
# NOTE(review): assumes the `terraform` binary is available on the runner.
- name: Terraform Plan
  run: |
    terraform init -input=false
    terraform plan -input=false
  env:
    OS_CLOUD: ${{ vars.OS_CLOUD }}
127+
128+
- name: Terraform Apply
129+
id: tf_apply
130+
run: |
131+
for attempt in $(seq 5); do
132+
if terraform apply -auto-approve; then
133+
echo "Created infrastructure on attempt $attempt"
134+
exit 0
135+
fi
136+
echo "Failed to create infrastructure on attempt $attempt"
137+
sleep 10
138+
139+
# Need to add a check to see which part failed and then
140+
# taint and retry once more before declaring failure
141+
142+
terraform destroy -auto-approve
143+
sleep 60
144+
done
145+
echo "Failed to create infrastructure after $attempt attempts"
146+
exit 1
147+
env:
148+
OS_CLOUD: ${{ vars.OS_CLOUD }}
149+
150+
# Save all Terraform outputs as JSON in tf-outputs.yml.
# A plain `run:` step does not expose its stdout as a step output, so the
# JSON is redirected to the file directly rather than being relayed through
# `steps.tf_outputs.outputs.stdout` (which would be empty).
- name: Get Terraform outputs
  id: tf_outputs
  run: |
    terraform output -json > tf-outputs.yml

# ssh_list.txt is read line by line by the lab VM steps that follow.
- name: Write out Lab VMs info
  run: |
    terraform output -raw labs > ssh_list.txt
164+
165+
# Set the login password for the `rocky` user on the bastion and enable SSH
# password authentication there.
# NOTE(review): `[email protected]` is reproduced verbatim from the source; it
# looks like an obfuscated user@host placeholder - restore the real bastion
# target before use.
- name: Update bastion password authentication and set login password
  run: |
    echo "::add-mask::${reg_pwd_var}"

    # Send the password on stdin so it is never interpolated into a shell
    # command line (quotes or `$` in the password would otherwise break it).
    printf '%s\n' "${reg_pwd_var}" | ssh -o StrictHostKeyChecking=no -i default.pem [email protected] \
      'sudo passwd --stdin rocky'

    # Quoted heredoc delimiter: nothing below is expanded locally.
    ssh -o StrictHostKeyChecking=no -i default.pem [email protected] <<'EOF'
    sudo sed -i 's/PasswordAuthentication no/PasswordAuthentication yes/' /etc/ssh/sshd_config.d/50-cloud-init.conf
    sudo systemctl restart sshd
    EOF
  shell: bash
  env:
    # Use the manually supplied password when given, otherwise the secret.
    # The secret is referenced unquoted; a quoted 'secrets.NAME' is a literal.
    reg_pwd_var: ${{ inputs.reg_pwd || secrets.BASTION_TEST_PASSWORD }}
177+
178+
# Open an SSH session to every lab VM listed in ssh_list.txt, jumping through
# the bastion, to prove each one is reachable.
# Fields used from each line: 2nd = IP, 3rd = name, 5th = password.
- name: Check connection to Lab VMs
  run: |
    bastion_ip=185.45.78.149
    bastion_key="default.pem"

    while IFS= read -r line; do
      ip=$(echo "$line" | awk '{print $2}')
      name=$(echo "$line" | awk '{print $3}')
      password=$(echo "$line" | awk '{print $5}')

      echo "::add-mask::$password"

      echo "Connecting to $name at $ip via bastion..."

      # `ssh -n` detaches ssh from stdin; otherwise it would read the rest of
      # ssh_list.txt and the loop would stop after the first VM.
      sshpass -p "$password" ssh -n -o StrictHostKeyChecking=no \
        -o ProxyJump="${LAB_IMAGE_USER}@${bastion_ip}" \
        -o IdentityFile="$bastion_key" \
        "${LAB_IMAGE_USER}@${ip}" \
        'echo "Connected to $(hostname)"'
    done < ssh_list.txt
  shell: bash
  env:
    # Falls back to 'rocky' (the default os_image) when inputs are empty.
    LAB_IMAGE_USER: ${{ inputs.os_image == 'Ubuntu' && 'ubuntu' || 'rocky' }}
201+
202+
# Run a set of health checks on every lab VM and record the zero-based line
# index of each VM that fails in FAILED_VM_INDEXES (space separated), which is
# exported through GITHUB_ENV for the following steps.
# Fields used from each line of ssh_list.txt: 2nd = IP, 3rd = name,
# 5th = password.
# NOTE(review): `[email protected]` is reproduced verbatim from the source; it
# looks like an obfuscated user@host placeholder - restore the real target.
- name: Validate lab VMs setup
  run: |
    bastion_ip=185.45.78.149
    bastion_key="default.pem"
    index=0
    failed_indexes=()

    while IFS= read -r line; do
      ip=$(echo "$line" | awk '{print $2}')
      name=$(echo "$line" | awk '{print $3}')
      password=$(echo "$line" | awk '{print $5}')

      echo "::add-mask::$password"
      echo "Connecting to $name at $ip..."

      # The checks run in the remote shell, so their verdict has to travel
      # back as the ssh exit status: variables set remotely are not visible
      # here. A connection failure also counts as a failed VM.
      # The heredoc body and its terminator sit at the script's base indent
      # because the terminator must start in column 0.
      if ! sshpass -p "$password" ssh -o StrictHostKeyChecking=no \
          -o ProxyJump="${LAB_IMAGE_USER}@${bastion_ip}" \
          -o IdentityFile="$bastion_key" \
          "${LAB_IMAGE_USER}@${ip}" <<'EOF'
    taint="false"

    echo "Checking 'virsh list --all'..."
    output=$(sudo virsh list --all)
    echo "$output"

    if ! echo "$output" | grep -q 'seed.*running'; then echo "'seed' not running"; taint="true"; fi
    if ! echo "$output" | grep -q 'compute0.*shut off'; then echo "'compute0' not shut off"; taint="true"; fi
    if ! echo "$output" | grep -q 'controller0.*shut off'; then echo "'controller0' not shut off"; taint="true"; fi

    # ssh -n: keep the nested ssh from reading (and swallowing) the rest of
    # this script, which the remote shell receives on stdin.
    echo "Checking 'bifrost_deploy' container..."
    container_output=$(ssh -n [email protected] 'sudo docker ps')
    echo "$container_output"
    if ! echo "$container_output" | grep -q bifrost_deploy; then echo "Container bifrost_deploy not found running"; taint="true"; fi

    echo "Checking openssh package source..."
    pkg_output=$(ssh -n [email protected] 'sudo dnf info openssh')
    echo "$pkg_output"
    if ! echo "$pkg_output" | grep -q 'Repository *: *@System'; then echo "Package openssh not from @System"; taint="true"; fi

    echo "Checking a-seed-from-nothing.out log result..."
    if ! tail -n 10 a-seed-from-nothing.out | grep -q 'PLAY RECAP.*failed=0'; then
      echo "Ansible PLAY RECAP failed != 0"
      taint="true"
    fi

    if [ "$taint" = "true" ]; then
      echo "One or more checks failed on $HOSTNAME"
      exit 1
    fi
    echo "All checks passed on $HOSTNAME"
    EOF
      then
        failed_indexes+=("$index")
      fi
      index=$((index + 1))

    done < ssh_list.txt
    echo "FAILED_VM_INDEXES=${failed_indexes[*]}" >> "$GITHUB_ENV"
  shell: bash
  env:
    # Falls back to 'rocky' (the default os_image) when inputs are empty.
    LAB_IMAGE_USER: ${{ inputs.os_image == 'Ubuntu' && 'ubuntu' || 'rocky' }}
258+
259+
# Mark every lab VM listed in FAILED_VM_INDEXES as tainted and re-apply so
# Terraform replaces them. Does nothing when the list is empty.
- name: Taint failed lab VMs (if any)
  run: |
    if [ -z "${FAILED_VM_INDEXES}" ]; then
      echo "No failed VMs detected"
      exit 0
    fi

    for idx in $FAILED_VM_INDEXES; do
      echo "Tainting openstack_compute_instance_v2.lab[$idx]"
      terraform taint "openstack_compute_instance_v2.lab[$idx]"
    done

    echo "Re-running Terraform apply to fix failed VMs"
    terraform apply -auto-approve
  env:
    FAILED_VM_INDEXES: ${{ env.FAILED_VM_INDEXES }}
    # Same cloud selection as the Plan/Apply steps; the re-apply talks to the
    # cloud as well.
    OS_CLOUD: ${{ vars.OS_CLOUD }}
  shell: bash
276+
277+
# Refresh tf-outputs.yml and ssh_list.txt after the taint/re-apply step.
# The step id differs from the earlier "Get Terraform outputs" step because
# step ids must be unique within a job. The JSON is redirected straight to
# the file since a plain `run:` step has no `stdout` output to reference.
- name: Get Terraform outputs
  id: tf_outputs_after_retaint
  run: |
    terraform output -json > tf-outputs.yml

- name: Write out Lab VMs info
  run: |
    terraform output -raw labs > ssh_list.txt
291+
292+
# Repeat the health checks on the VMs that were re-created. If any of them
# still fails, destroy the whole deployment and fail the job.
# FAILED_VM_INDEXES holds zero-based line indexes into ssh_list.txt.
# NOTE(review): `[email protected]` is reproduced verbatim from the source; it
# looks like an obfuscated user@host placeholder - restore the real target.
- name: Re-test failed lab VMs after redeploy
  run: |
    set -euo pipefail

    bastion_ip=185.45.78.149
    bastion_key="default.pem"
    mapfile -t ssh_lines < ssh_list.txt

    for idx in $FAILED_VM_INDEXES; do
      line="${ssh_lines[$idx]}"
      ip=$(echo "$line" | awk '{print $2}')
      name=$(echo "$line" | awk '{print $3}')
      password=$(echo "$line" | awk '{print $5}')

      echo "::add-mask::$password"
      echo "Re-testing $name at $ip (index $idx)..."

      # The failure handler lives in the `then` branch after the heredoc.
      # Anything written between `<<'EOF'` and the terminator is sent to the
      # remote shell, so local clean-up commands must not be placed there.
      # The heredoc body and terminator sit at the script's base indent
      # because the terminator must start in column 0.
      if ! sshpass -p "$password" ssh -o StrictHostKeyChecking=no \
          -o ProxyJump="${LAB_IMAGE_USER}@${bastion_ip}" \
          -o IdentityFile="$bastion_key" \
          "${LAB_IMAGE_USER}@${ip}" <<'EOF'
    echo "Re-checking virsh VMs..."
    output=$(sudo virsh list --all)
    echo "$output"
    if ! echo "$output" | grep -q 'seed.*running'; then echo "'seed' not running"; exit 1; fi
    if ! echo "$output" | grep -q 'compute0.*shut off'; then echo "'compute0' not shut off"; exit 1; fi
    if ! echo "$output" | grep -q 'controller0.*shut off'; then echo "'controller0' not shut off"; exit 1; fi

    # ssh -n: keep the nested ssh from reading (and swallowing) the rest of
    # this script, which the remote shell receives on stdin.
    echo "Checking bifrost container..."
    if ! ssh -n [email protected] 'sudo docker ps' | grep -q bifrost_deploy; then
      echo "bifrost_deploy container not running"; exit 1;
    fi

    echo "Checking openssh package source..."
    if ! ssh -n [email protected] 'sudo dnf info openssh' | grep -q 'Repository *: *@System'; then
      echo "openssh not from @System"; exit 1;
    fi

    echo "Checking a-seed-from-nothing.out for Ansible success..."
    if ! tail -n 20 a-seed-from-nothing.out | grep -q 'PLAY RECAP.*failed=0'; then
      echo "Ansible PLAY RECAP shows failures"; exit 1;
    fi

    echo "All post-redeploy checks passed on $HOSTNAME"
    EOF
      then
        echo "Post-deploy check failed on $name. Destroying all infrastructure..."
        terraform destroy -auto-approve
        exit 1
      fi

    done
  shell: bash
  env:
    # Falls back to 'rocky' (the default os_image) when inputs are empty.
    LAB_IMAGE_USER: ${{ inputs.os_image == 'Ubuntu' && 'ubuntu' || 'rocky' }}
    FAILED_VM_INDEXES: ${{ env.FAILED_VM_INDEXES }}
    # Same cloud selection as the Plan/Apply steps; needed by the
    # `terraform destroy` in the failure path.
    OS_CLOUD: ${{ vars.OS_CLOUD }}
348+
349+
# Optional step: start ./a-universe-from-seed.sh inside a detached tmux
# session named `aus-run` on every lab VM listed in ssh_list.txt.
# Fields used from each line: 2nd = IP, 3rd = name, 5th = password.
- name: Run a-universe-from-seed.sh if true
  if: inputs.au_from_seed == true
  run: |
    bastion_ip=185.45.78.149
    bastion_key="default.pem"

    mapfile -t ssh_lines < ssh_list.txt

    for entry in "${ssh_lines[@]}"; do
      ip=$(awk '{print $2}' <<< "$entry")
      name=$(awk '{print $3}' <<< "$entry")
      password=$(awk '{print $5}' <<< "$entry")

      echo "::add-mask::$password"
      echo "Launching a-universe-from-seed.sh on $name at $ip in tmux..."

      # Reach the lab VM through the bastion; the script keeps running in
      # tmux after this ssh session returns.
      sshpass -p "$password" ssh \
        -o StrictHostKeyChecking=no \
        -o ProxyJump="${LAB_IMAGE_USER}@${bastion_ip}" \
        -o IdentityFile=$bastion_key \
        "${LAB_IMAGE_USER}@${ip}" \
        "tmux new-session -d -s aus-run './a-universe-from-seed.sh'"
    done
  shell: bash
  env:
    LAB_IMAGE_USER: ${{ inputs.os_image == 'Ubuntu' && 'ubuntu' || inputs.os_image == 'Rocky9' && 'rocky' }}
375+
376+
# - name: Run test workflow
377+
# if: inputs.deployment_type == 'Test'
378+
# uses: ./.github/workflows/AUFN-test.yml
379+
380+
# - name: Upload Terraform outputs
381+
# if: ${{ inputs.deployment_type == 'Deployment' || inputs.debug_mode == true }}
382+
# uses: actions/upload-artifact@v4
383+
# with:
384+
# name: ${{ inputs.deployment_type }}-terraform-artifacts
385+
386+

a-seed-from-nothing.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -101,7 +101,7 @@ cd $HOME
101101
git clone https://github.com/stackhpc/beokay.git -b master
102102

103103
# Use Beokay to bootstrap your control host.
104-
[[ -d deployment ]] || beokay/beokay.py create --base-path ~/deployment --kayobe-repo https://opendev.org/openstack/kayobe.git --kayobe-branch stable/2023.1 --kayobe-config-repo https://github.com/stackhpc/a-universe-from-nothing.git --kayobe-config-branch stable/2023.1
104+
[[ -d deployment ]] || beokay/beokay.py create --base-path ~/deployment --kayobe-repo https://opendev.org/openstack/kayobe.git --kayobe-branch unmaintained/2023.1 --kayobe-config-repo https://github.com/stackhpc/a-universe-from-nothing.git --kayobe-config-branch stable/2023.1
105105

106106
# Bump the provisioning time - it can be lengthy on virtualised storage
107107
sed -i.bak 's%^[# ]*wait_active_timeout:.*% wait_active_timeout: 5000%' ~/deployment/src/kayobe/ansible/overcloud-provision.yml

0 commit comments

Comments
 (0)