Skip to content

Commit 63d684a

Browse files
committed
prep-fog-capture: Refactor, split, re-add cobbler-provided hacks
- Configure netplan on boot for Ubuntu
- Configure NetworkManager on boot for CentOS/Rocky
- No more rc.local

Signed-off-by: David Galloway <david.galloway@ibm.com>
1 parent 7d6cf93 commit 63d684a

File tree

11 files changed

+705
-173
lines changed

11 files changed

+705
-173
lines changed

tools/prep-fog-capture.yml

Lines changed: 6 additions & 173 deletions
Original file line numberDiff line numberDiff line change
@@ -1,181 +1,14 @@
11
---
2-
### This standalone playbook can be used to prep a COBBLER-IMAGED testnode
2+
### This role is used to prep a {FOG|MAAS}-IMAGED testnode
33
### so that it can be used to capture an OS image for FOG.
44
### This playbook is needed for a few reasons:
55
### - NIC configs get hard coded into the captured FOG images so nodes reimaged by FOG don't come up with network
6+
### - SSH host keys need to be deleted
7+
### - apt and cloud-init services need to be disabled
68

79
- hosts:
810
- testnodes
9-
become: true
11+
roles:
12+
- prep-fog-capture
1013
gather_facts: false
11-
tasks:
12-
13-
# (Missing in RHEL8)
14-
- name: Check for /usr/bin/python
15-
shell: echo marco
16-
register: polo
17-
ignore_errors: true
18-
19-
- name: Set ansible_python_interpreter=/usr/bin/python3
20-
set_fact:
21-
ansible_python_interpreter: /usr/bin/python3
22-
when: polo is failed
23-
24-
# Now that we know where python is, we can gather_facts
25-
- setup:
26-
27-
# We need to leave /.cephlab_rc_local or else each FOG reimage would tell Cobbler to run ceph-cm-ansible
28-
- name: Remove lock files and udev rules
29-
file:
30-
path: "{{ item }}"
31-
state: absent
32-
with_items:
33-
- /etc/udev/rules.d/70-persistent-net.rules
34-
- /.cephlab_net_configured
35-
- /ceph-qa-ready
36-
37-
- name: Get list of ifcfg scripts from host used to capture image
38-
shell: "ls -1 /etc/sysconfig/network-scripts/ifcfg-* | grep -v ifcfg-lo"
39-
register: ifcfg_scripts
40-
when: ansible_os_family == "RedHat"
41-
ignore_errors: true
42-
43-
- name: Get list of ifcfg scripts from host used to capture image
44-
shell: "ls -1 /etc/sysconfig/network/ifcfg-* | grep -v ifcfg-lo"
45-
register: ifcfg_scripts
46-
when: ansible_os_family == "Suse"
47-
ignore_errors: true
48-
49-
- name: Delete ifcfg scripts
50-
file:
51-
path: "{{ item }}"
52-
state: absent
53-
with_items: "{{ ifcfg_scripts.stdout_lines|default([]) }}"
54-
when: ifcfg_scripts is defined
55-
56-
- name: Remove /var/lib/ceph mountpoint from fstab
57-
shell: sed -i '/\/var\/lib\/ceph/d' /etc/fstab
58-
59-
- name: Unmount /var/lib/ceph
60-
ansible.posix.mount:
61-
path: /var/lib/ceph
62-
state: unmounted
63-
64-
- name: Install one-shot service to regenerate SSH host keys on first boot
65-
copy:
66-
dest: /etc/systemd/system/regen-ssh-hostkeys.service
67-
owner: root
68-
group: root
69-
mode: '0644'
70-
content: |
71-
[Unit]
72-
Description=Regenerate SSH host keys on first boot
73-
ConditionPathExists=!/etc/ssh/ssh_host_ed25519_key
74-
Before=ssh.service
75-
76-
[Service]
77-
Type=oneshot
78-
ExecStart=/usr/bin/ssh-keygen -A
79-
ExecStartPost=/bin/systemctl disable regen-ssh-hostkeys.service
80-
81-
[Install]
82-
WantedBy=multi-user.target
83-
84-
- name: Reload systemd daemon
85-
systemd:
86-
daemon_reload: true
87-
88-
- name: Enable regen-ssh-hostkeys.service
89-
systemd:
90-
name: regen-ssh-hostkeys.service
91-
enabled: true
92-
93-
- name: Get list of SSH host keys
94-
shell: "ls -1 /etc/ssh/ssh_host_*"
95-
register: ssh_host_keys
96-
ignore_errors: true
97-
98-
# Key regeneration is done automatically on CentOS firstboot.
99-
# For Ubuntu, we'll add `dpkg-reconfigure openssh-server` to rc.local
100-
- name: Delete SSH host keys so they're generated during firstboot on cloned machines
101-
file:
102-
path: "{{ item }}"
103-
state: absent
104-
with_items: "{{ ssh_host_keys.stdout_lines|default([]) }}"
105-
when: ssh_host_keys is defined
106-
107-
- name: Unsubscribe RHEL
108-
command: subscription-manager unregister
109-
when: ansible_distribution == "RedHat"
110-
failed_when: false
111-
112-
# A file gets leftover when a testnode is registered with Satellite that caused
113-
# each registered subsequent testnode to report the wrong hostname
114-
- name: Clean up katello facts
115-
file:
116-
path: /etc/rhsm/facts/katello.facts
117-
state: absent
118-
when: ansible_distribution == "RedHat"
119-
120-
# https://bugzilla.redhat.com/show_bug.cgi?id=1814337
121-
- name: Disable dnf-makecache service
122-
service:
123-
name: dnf-makecache.timer
124-
state: stopped
125-
enabled: no
126-
when:
127-
- ansible_os_family == "RedHat"
128-
- ansible_distribution_major_version|int >= 8
129-
130-
# Hopefully fixes https://github.com/ceph/ceph-cm-ansible/pull/544#issuecomment-599076564
131-
- name: Clean DNF cache
132-
shell: "dnf clean all && rm -rf /var/cache/dnf/*"
133-
when:
134-
- ansible_os_family == "RedHat"
135-
- ansible_distribution_major_version|int >= 8
136-
137-
- set_fact:
138-
ntp_service: ntp
139-
when: ansible_os_family == "Debian"
140-
141-
- set_fact:
142-
ntp_service: ntpd
143-
when: ansible_os_family == "RedHat" and ansible_distribution_major_version|int <= 7
144-
145-
- set_fact:
146-
ntp_service: chronyd
147-
when: (ansible_os_family == "RedHat" and ansible_distribution_major_version|int >= 8) or
148-
ansible_os_family == "Suse"
149-
150-
- name: "Stop {{ ntp_service }} service"
151-
service:
152-
name: "{{ ntp_service }}"
153-
state: stopped
154-
when: '"ntp" in ntp_service'
155-
156-
# The theory here is although we do have the ntp service running on boot,
157-
# if the time is off, it slowly drifts back in sync. Since our testnodes
158-
# are ephemeral, they don't ever have enough time to correctly drift
159-
# back to the correct time. So we'll force it in the captured OS images.
160-
- name: Install ntpdate command if missing
161-
package:
162-
name: ntpdate
163-
state: present
164-
when: '"ntp" in ntp_service'
165-
166-
- name: Force time synchronization using stepping | ntp
167-
command: "ntpdate -b {{ ntp_servers|join(' ') }}"
168-
when: '"ntp" in ntp_service'
169-
170-
- name: "Start {{ ntp_service }}"
171-
service:
172-
name: "{{ ntp_service }}"
173-
state: started
174-
175-
# chronyd needs to be started in order to force time sync. This differs from ntpd.
176-
- name: Force time synchronization using stepping | chrony
177-
command: chronyc -a makestep
178-
when: '"chrony" in ntp_service'
179-
180-
- name: Sync the hardware clock
181-
command: "hwclock --systohc"
14+
become: true
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Runs /usr/local/sbin/cephlab-set-hostname.sh once per boot.
[Unit]
Description=Ceph Lab hostname configuration
# After= only orders this unit relative to the listed targets; it does not
# start them. network-online.target is only part of the boot transaction when
# some unit pulls it in, so it is listed in Wants= as well as After=.
After=network-online.target nss-lookup.target
Wants=network-online.target nss-lookup.target

[Service]
# oneshot + RemainAfterExit: the unit is considered active once the script has
# exited successfully, so it is not started again during the same boot.
Type=oneshot
ExecStart=/usr/local/sbin/cephlab-set-hostname.sh
RemainAfterExit=yes

[Install]
WantedBy=multi-user.target
Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
#!/usr/bin/env bash
# Wait for /.cephlab_net_configured, then set hostname + /etc/hostname + /etc/hosts
# Portable across Ubuntu / CentOS Stream / Rocky (GNU userland).
set -euo pipefail

# Log destination and the two flag files this script looks at.
LOG="/var/log/cephlab-set-hostname.log"
WAIT_FOR_FILE="/.cephlab_net_configured"
HOSTNAME_IS_SET_FILE="/.cephlab_hostname_set"

# Polling interval and overall budget (seconds) for the flag-file wait.
SLEEP_SECONDS="1"
MAX_WAIT_SECONDS="300"

# Nameserver precedence: $NAMESERVER from the environment, then the first
# positional argument, then the built-in default.
DEFAULT_NAMESERVER="10.20.192.11"
NAMESERVER="${NAMESERVER:-${1:-${DEFAULT_NAMESERVER}}}"

# Make sure the log file exists with mode 0644 before redirecting into it.
touch "$LOG"
chmod 0644 "$LOG"

# From here on, all stdout/stderr is appended to the log file and, when the
# serial console /dev/ttyS1 is writable, to that device too. tee also copies
# everything to the original stdout (presumably the journal when started from
# the systemd unit).
log_sinks=("$LOG")
if [[ -w /dev/ttyS1 ]]; then
  log_sinks+=(/dev/ttyS1)
fi
exec > >(tee -a "${log_sinks[@]}") 2>&1
26+
27+
# log MESSAGE...
# Writes one line to stderr: a UTC timestamp truncated to millisecond
# precision, the "cephlab-set-hostname:" tag, then the arguments joined by
# spaces.
log() {
  printf '%s cephlab-set-hostname: %s\n' \
    "$(date -u +%FT%T.%N | cut -c1-23)" \
    "$*" >&2
}
30+
31+
# Exit if this isn't the first boot and hostname is already set
if [[ -f "${HOSTNAME_IS_SET_FILE}" ]]; then
  log "We've already set the hostname before. Exiting..."
  exit 0
fi

# Wait for /.cephlab_net_configured, polling every SLEEP_SECONDS and giving up
# with a non-zero exit once MAX_WAIT_SECONDS have elapsed.
log "Waiting for ${WAIT_FOR_FILE} (up to ${MAX_WAIT_SECONDS}s)..."
end=$((SECONDS + MAX_WAIT_SECONDS))
while [[ ! -f "${WAIT_FOR_FILE}" ]]; do
  if (( SECONDS >= end )); then
    log "Timed out waiting for ${WAIT_FOR_FILE}. Exiting."
    exit 1
  fi
  sleep "${SLEEP_SECONDS}"
done
log "Flag file present. Proceeding."

# Gather IPv4 addresses: up to 10 attempts, pausing 1s only between attempts.
attempts=0
myips=""
while (( attempts < 10 )); do
  # Print global-scope IPv4 addresses, one per line (excludes 127/8)
  # Works across iproute2 versions on Ubuntu/CentOS/Rocky.
  # "|| true": with `set -e -o pipefail` a failing pipeline would otherwise
  # abort the script here instead of letting the loop retry.
  myips="$(ip -4 -o addr show scope global 2>/dev/null | awk '$2 != "docker0" {print $4}' | cut -d/ -f1 || true)"
  if [[ -n "${myips}" ]]; then
    break
  fi
  attempts=$((attempts + 1))
  # No point sleeping after the final attempt.
  if (( attempts < 10 )); then
    sleep 1
  fi
done

if [[ -z "${myips}" ]]; then
  log "No non-loopback IPv4 addresses found. Nothing to do."
  exit 0
fi
64+
65+
# Reverse lookup helper.
# Usage: reverse_lookup IP NAMESERVER
# Prints the resolved name (trailing dot stripped) on stdout, or an empty line
# when nothing was resolved. Lookup failures are swallowed (best-effort).
# The tool is chosen by availability: dig if installed, else host, else getent.
# The getent branch does not use NAMESERVER.
reverse_lookup() {
  local ip="$1"
  local ns="$2"
  local name=""

  if command -v dig >/dev/null 2>&1; then
    # dig can print diagnostics such as ";; connection timed out; no servers
    # could be reached" on stdout even with +short; drop ';' lines so they are
    # never mistaken for a hostname.
    # +short yields trailing dot; strip it
    name="$(dig +time=1 +tries=1 +short -x "${ip}" @"${ns}" 2>/dev/null | grep -v '^;' | head -n1 | sed 's/\.$//' || true)"
  elif command -v host >/dev/null 2>&1; then
    # host -W 1 sets timeout (bind-utils)
    name="$(host -W 1 "${ip}" "${ns}" 2>/dev/null | awk '/domain name pointer/ {print $5}' | sed 's/\.$//' | head -n1 || true)"
  elif command -v getent >/dev/null 2>&1; then
    # getent hosts does forward lookups usually; reverse may work depending on NSS/DNS config
    name="$(getent hosts "${ip}" 2>/dev/null | awk '{print $2}' | head -n1 || true)"
  fi

  echo "${name}"
}
84+
85+
# set_hostname FQDN
# Applies FQDN as the system hostname: via `hostnamectl set-hostname` when
# hostnamectl is available, otherwise via the plain `hostname` command.
set_hostname() {
  local fqdn="$1"

  if ! command -v hostnamectl >/dev/null 2>&1; then
    hostname "${fqdn}"
    return
  fi

  hostnamectl set-hostname "${fqdn}"
}
94+
95+
# Act on the first local address from which the nameserver answers a ping:
# reverse-resolve it, apply the name, persist it, then stop. Addresses that
# cannot reach the nameserver are skipped.
for ip in ${myips}; do
  log "Testing nameserver reachability from ${ip} -> ${NAMESERVER} ..."
  if ! timeout 1s ping -I "${ip}" -nq -c1 "${NAMESERVER}" >/dev/null 2>&1; then
    continue
  fi

  log "Nameserver reachable from ${ip}. Doing reverse lookup..."
  newhostname="$(reverse_lookup "${ip}" "${NAMESERVER}")"

  # An empty answer ends the run without touching anything (and without
  # creating the "hostname set" marker).
  if [[ -z "${newhostname}" ]]; then
    log "Reverse lookup returned empty hostname. Not changing anything."
    exit 0
  fi

  log "Resolved ${ip} -> ${newhostname}"

  # Runtime hostname first (hostnamectl / hostname)...
  set_hostname "${newhostname}"

  # ...then the short name (everything before the first dot) for /etc/hosts.
  shorthostname="${newhostname%%.*}"

  # /etc/hostname stores the FQDN.
  echo "${newhostname}" > /etc/hostname

  log "Rewriting /etc/hosts from scratch"

  cat > /etc/hosts <<EOF
127.0.0.1 localhost
${ip} ${newhostname} ${shorthostname}

# IPv6
::1 localhost ip6-localhost ip6-loopback
ff02::1 ip6-allnodes
ff02::2 ip6-allrouters
EOF

  log "Hostname updated: $(hostname) ; /etc/hostname + /etc/hosts rewritten."
  # Marker checked at startup so later boots exit early.
  touch "$HOSTNAME_IS_SET_FILE"
  exit 0
done

# Reached only when no address could ping the nameserver.
log "No IP could reach nameserver ${NAMESERVER}. Nothing changed."
exit 1
137+
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# Runs /usr/local/sbin/netplan-from-link.sh once per boot.
[Unit]
Description=Write netplan from link carrier once
# Pull in systemd-networkd and start only after it and the local filesystems.
Wants=systemd-networkd.service
After=systemd-networkd.service local-fs.target

[Service]
Type=oneshot
RemainAfterExit=yes
ExecStart=/usr/local/sbin/netplan-from-link.sh
# Send the script's stdout and stderr to both the journal and the console.
StandardOutput=journal+console
StandardError=journal+console

[Install]
WantedBy=multi-user.target

0 commit comments

Comments
 (0)