Skip to content

Commit 9b79967

Browse files
author
Harish Kumar
committed
Add LUKS recovery key to vault
* Add LUKS recovery keys to Vault as part of mangosctl bootstrap or enroll, for the LUKS volumes on each node * Add validation steps to self_test.sh * Add recovery_test.sh, which runs as part of the integration tests in run_tests.sh * Add machine-id entity metadata for each node in Vault * Add a policy so that a node token can create (but not update or read) its recovery key in the Vault KV store
1 parent 181f61c commit 9b79967

File tree

8 files changed

+208
-11
lines changed

8 files changed

+208
-11
lines changed

.github/workflows/build.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,6 @@ jobs:
105105
- name: Test it
106106
run: |
107107
#!/bin/bash
108-
set -x
109108
set -e
110109
sudo apt-get update -y
111110
# mkosi doesn't pick this up from the tools dir for some reason

.github/workflows/pr.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,6 @@ jobs:
7171
- name: Test it
7272
run: |
7373
#!/bin/bash
74-
set -x
7574
set -e
7675
sudo apt-get update -y
7776
# mkosi doesn't pick this up from the tools dir for some reason

mkosi.images/base/mkosi.build.chroot

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
#!/bin/sh
2-
set -x
32

43
cp -r ${SRCDIR}/resources/mangosctl ${BUILDDIR}
54
cd ${BUILDDIR}/mangosctl

mkosi.images/base/mkosi.extra/usr/share/mangos/self_test.sh

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ BASE_URL=${BASE_URL:-http://10.0.2.2:8081}
33
export BASE_URL
44

55
set -e
6-
set -x
76

87
trap 'journalctl -n 1000 --no-pager' ERR
98
systemctl is-active [email protected]
@@ -26,3 +25,36 @@ do
2625
sleep 10
2726
echo "Trying again. $tries tries left"
2827
done
28+
29+
echo "===> Validating Recovery Keys"
30+
machine_id=$(cat /etc/machine-id)
31+
32+
# Auto-detect LUKS partitions
33+
luks_partitions=$(lsblk -nlo NAME,TYPE,FSTYPE | awk '$2 == "part" && $3 == "crypto_LUKS" {print $1}' | tr '\n' ' ')
34+
35+
if [ -z "$luks_partitions" ]; then
36+
echo "No LUKS partitions found, skipping recovery key validation"
37+
else
38+
# Test 1: Verify recovery keys exist in Vault
39+
for device in $luks_partitions; do
40+
partition=$(lsblk -nlo PARTLABEL /dev/$device | tr -d '\n')
41+
if ! mangosctl sudo -- vault kv get "secrets/mangos/recovery-keys/${machine_id}/${partition}" >/dev/null 2>&1; then
42+
echo "ERROR: Recovery key not found in Vault for ${partition}"
43+
exit 1
44+
fi
45+
echo "Recovery key for ${partition}: OK"
46+
done
47+
48+
# Test 2: Verify LUKS has multiple keyslots (TPM + recovery)
49+
for device in $luks_partitions; do
50+
partition=$(lsblk -nlo PARTLABEL /dev/$device | tr -d '\n')
51+
# Count the LUKS2 keyslots on the device. grep -c always prints a count (even "0") but
# exits 1 when nothing matches, so only neutralise the exit status; echoing a fallback
# here would append a second "0" and make the numeric comparison below error out.
slots=$(cryptsetup luksDump "/dev/$device" 2>/dev/null | grep -c "^ [0-9]: luks2" || true)
52+
if [ "$slots" -lt 2 ]; then
53+
echo "ERROR: ${partition} has only ${slots} keyslot(s), expected at least 2 (TPM + recovery)"
54+
exit 1
55+
fi
56+
echo "LUKS keyslots for ${partition}: ${slots} OK"
57+
done
58+
59+
echo "Recovery key validation: PASSED"
60+
fi

mkosi.images/terraform/share/terraform/pki.tf

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -264,6 +264,7 @@ resource "vault_cert_auth_backend_role" "node" {
264264
vault_policy.node-cert-self-renew.name,
265265
vault_policy.ssh-host-self-signer.name,
266266
vault_policy.consul-gossip.name,
267+
vault_policy.node-recovery-keys.name,
267268
]
268269
}
269270

@@ -277,6 +278,22 @@ resource "vault_policy" "node-cert-self-renew" {
277278
EOP
278279
}
279280

281+
resource "vault_policy" "node-recovery-keys" {
282+
name = "node-recovery-keys"
283+
284+
policy = <<-EOP
285+
# Allow nodes to create recovery keys for their own machine-id only (write-once, no read/update)
286+
# No read allowed because node does not need to read its own recovery key,
287+
# it is only needed to be read by admins to recover the node
288+
# No update allowed to avoid compromised node may update recovery key
289+
# Any update of recovery keys (even for rotating recovery keys) need admin actions
290+
# For which admin key should be used
291+
path "secrets/mangos/recovery-keys/{{identity.entity.metadata.machine_id}}/*" {
292+
capabilities = ["create"]
293+
}
294+
EOP
295+
}
296+
280297
resource "vault_identity_group" "vault-servers" {
281298
name = "vault-servers"
282299
type = "internal"

recovery_test.sh

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#!/bin/bash
2+
set -e
3+
4+
machine_id=$(cat /etc/machine-id)
5+
6+
# Auto-detect first LUKS partition for testing
7+
test_device=$(lsblk -nlo NAME,TYPE,FSTYPE | awk '$2 == "part" && $3 == "crypto_LUKS" {print $1; exit}')
8+
if [ -z "$test_device" ]; then
9+
echo "No LUKS partitions found, skipping recovery test"
10+
exit 0
11+
fi
12+
13+
test_partition=$(lsblk -nlo PARTLABEL /dev/$test_device | tr -d '\n')
14+
mapper_name=$(lsblk -nlo NAME /dev/$test_device | awk 'NR==2')
15+
16+
echo "Testing recovery unlock for ${test_partition} (device: ${test_device}, mapper: ${mapper_name})..."
17+
18+
# Get recovery key from Vault
19+
recovery_key=$(mangosctl sudo -- vault kv get -field=key "secrets/mangos/recovery-keys/${machine_id}/${test_partition}")
20+
21+
# Find TPM keyslot number
22+
tpm_slot=$(cryptsetup luksDump /dev/$test_device | \
23+
awk '/Tokens:/,/Keyslots:/ {if (/systemd-tpm2/) found=1; if (found && /^ [0-9]+:/) {print $1; exit}}' | \
24+
tr -d ':')
25+
26+
echo "Removing TPM keyslot ${tpm_slot} (simulating TPM failure)..."
27+
PASSWORD="$recovery_key" systemd-cryptenroll --wipe-slot=tpm2 /dev/$test_device
28+
29+
30+
# Get mount point for this partition
31+
mount_point=$(findmnt -n -o TARGET /dev/mapper/$mapper_name)
32+
33+
# Unmount and close
34+
if [ -n "$mount_point" ]; then
35+
systemctl stop $(systemd-escape -p --suffix=mount "$mount_point")
36+
fi
37+
cryptsetup close $mapper_name
38+
39+
# THE CRITICAL TEST: Unlock with recovery key
40+
echo "Unlocking with recovery key..."
41+
echo -n "$recovery_key" | systemd-cryptsetup attach $mapper_name /dev/$test_device -
42+
43+
# Remount
44+
if [ -n "$mount_point" ]; then
45+
mount /dev/mapper/$mapper_name "$mount_point"
46+
fi
47+
48+
# Verify device is accessible
49+
if [ ! -b /dev/mapper/$mapper_name ]; then
50+
echo "ERROR: Device not accessible after recovery"
51+
exit 1
52+
fi
53+
54+
echo "Data accessible after recovery: OK"
55+
56+
# Re-enroll TPM (cleanup for future tests)
57+
echo "Re-enrolling TPM keyslot..."
58+
echo -n "$recovery_key" | systemd-cryptenroll /dev/$test_device \
59+
--tpm2-device=auto \
60+
--tpm2-pcrs=7 \
61+
--tpm2-public-key-pcrs=11 \
62+
--unlock-key-file=/dev/stdin
63+
64+
echo "Recovery test: PASSED"

resources/mangosctl/mangosctl.sh

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,62 @@ do_install() {
320320
fi
321321
}
322322

323+
# Enroll recovery keys for encrypted partitions and store them in Vault
324+
enroll_recovery_keys() {
325+
local vault_token="$1"
326+
local machine_id="$(cat /etc/machine-id)"
327+
local found_any=0
328+
329+
# Find all LUKS-encrypted partitions
330+
local devices=($(lsblk -ln -o NAME,TYPE,FSTYPE | awk '$2=="part" && $3=="crypto_LUKS" {print "/dev/"$1}'))
331+
332+
for device in "${devices[@]}"; do
333+
local partlabel=$(lsblk -n -o PARTLABEL "$device" 2>/dev/null | tr -d ' \n\r\t')
334+
335+
# Skip if no valid partition label
336+
if [ -z "$partlabel" ]; then
337+
continue
338+
fi
339+
340+
# Skip if recovery key already exists in Vault
341+
if VAULT_TOKEN="${vault_token}" vault kv get "secrets/mangos/recovery-keys/${machine_id}/${partlabel}" >/dev/null 2>&1; then
342+
continue
343+
fi
344+
345+
found_any=1
346+
step "Enrolling recovery key for ${partlabel}"
347+
348+
# Generate and enroll recovery key (systemd-cryptenroll generates and prints the key)
349+
# Use TPM to unlock the device, then enroll a new recovery key
350+
local output=$(systemd-cryptenroll "${device}" --recovery-key --unlock-tpm2-device=auto 2>&1)
351+
352+
# Extract recovery key - format: 8 groups of 8 lowercase alphanumeric characters, separated by dashes
353+
# Example: etklvner-lblhnbgl-kdtnujtk-ikjlgbur-lnlrjrrc-iuikkidg-feientnn-dkjeeuft
354+
LUKS_RECOVERY_KEY_REGEX='[a-z0-9]{8}(-[a-z0-9]{8}){7}'
355+
local recovery_key=$(echo "$output" | grep -oE "${LUKS_RECOVERY_KEY_REGEX}" | head -n 1)
356+
357+
if [ -n "$recovery_key" ] && [[ "$recovery_key" =~ ^${LUKS_RECOVERY_KEY_REGEX}$ ]]; then
358+
VAULT_TOKEN="${vault_token}" vault kv put "secrets/mangos/recovery-keys/${machine_id}/${partlabel}" \
359+
key="${recovery_key}" hostname="${HOSTNAME}" device="${device}" created="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
360+
if [ $? -eq 0 ]; then
361+
greenln Success
362+
else
363+
red "Failed to store in Vault"
364+
echo
365+
fi
366+
else
367+
red "Failed to enroll or extract recovery key"
368+
echo
369+
fi
370+
done
371+
372+
if [ $found_any -eq 0 ]; then
373+
echo " > All recovery keys already enrolled"
374+
else
375+
echo " > Recovery keys enrolled and stored in Vault"
376+
fi
377+
}
378+
323379
do_enroll() {
324380
declare -A groups
325381

@@ -433,6 +489,11 @@ do_enroll() {
433489
entity_name=$(vault write -field=name identity/lookup/entity alias_name=${HOSTNAME}.mangos alias_mount_accessor=${node_auth_accessor})
434490
echo $entity_name
435491

492+
step "Setting machine-id as entity metadata"
493+
machine_id=$(cat /etc/machine-id)
494+
vault write identity/entity/name/${entity_name} metadata=machine_id="${machine_id}"
495+
greenln Success
496+
436497
for group in ${!groups[@]}
437498
do
438499
do_step "Adding host to group '${group}'" chronic do_entity addgroup ${entity_name} ${group}
@@ -553,6 +614,8 @@ do_enroll() {
553614
greenln Success
554615

555616
do_step "Reloading confexts" chronic systemd-confext refresh --mutable=auto
617+
618+
do_step "Enrolling recovery keys for encrypted partitions" enroll_recovery_keys "${NODE_VAULT_TOKEN}"
556619
}
557620

558621
do_group() {
@@ -987,6 +1050,12 @@ do_bootstrap() {
9871050
NOMAD_TOKEN="${nomad_mgmt_token}" \
9881051
CONSUL_HTTP_TOKEN=${consul_mgmt_token} \
9891052
do_step "Final Terraform run" run_terraform_apply
1053+
1054+
echo
1055+
echo "Bootstrap complete! Next steps:"
1056+
echo " 1. Run: mangosctl sudo enroll -g vault-server -g consul-server -g nomad-server 127.0.0.1"
1057+
echo " 2. This will enroll the bootstrap node's identity and recovery keys"
1058+
echo
9901059
}
9911060

9921061
set_agent_token() {

run_tests.sh

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -258,16 +258,18 @@ chmod +x "${tmpdir}/is_ready.sh"
258258

259259
# Exit status 130 means killed by signal 2 (SIGINT)
260260
step 'Waiting for installed OS to be ready'
261-
$systemd_run -u "mangos-test-${testid}-socat" -d -p SuccessExitStatus=130 -q --wait -- mkosi --debug sandbox -- socat VSOCK-LISTEN:23433,fork,socktype=5 EXEC:"${tmpdir}/is_ready.sh"
262-
report_outcome
261+
$systemd_run -u "mangos-test-${testid}-socat" -d -p SuccessExitStatus=130 -q --wait -- \
262+
mkosi --debug sandbox -- socat VSOCK-LISTEN:23433,fork,socktype=5 EXEC:"${tmpdir}/is_ready.sh"
263+
263264

264265
step ssh into VM
266+
265267
if $systemd_run -d --wait -q -p StandardOutput=journal -- ssh -i ./mkosi.key \
266-
-o UserKnownHostsFile=/dev/null \
267-
-o StrictHostKeyChecking=no \
268-
-o LogLevel=ERROR \
269-
-o ProxyCommand="mkosi sandbox -- socat - VSOCK-CONNECT:42:%p" \
270-
root@mkosi 'mangosctl --base-url=http://10.0.2.2:8081 updatectl add-overrides ; /usr/share/mangos/self_test.sh'
268+
-o UserKnownHostsFile=/dev/null \
269+
-o StrictHostKeyChecking=no \
270+
-o LogLevel=ERROR \
271+
-o ProxyCommand="mkosi sandbox -- socat - VSOCK-CONNECT:42:%p" \
272+
root@mkosi 'mangosctl --base-url=http://10.0.2.2:8081 updatectl add-overrides ; /usr/share/mangos/self_test.sh'
271273
then
272274
success
273275
$systemd_run -u "mangos-test-${testid}-result" -q -- echo "Mangos test ${testid} succeeded"
@@ -276,3 +278,19 @@ else
276278
$systemd_run -u "mangos-test-${testid}-result" -q -- echo "Mangos test ${testid} failed"
277279
exit 1
278280
fi
281+
282+
step 'Testing LUKS recovery functionality'
283+
if $systemd_run -d --wait -q -p StandardOutput=journal -- ssh -i ./mkosi.key \
284+
-o UserKnownHostsFile=/dev/null \
285+
-o StrictHostKeyChecking=no \
286+
-o LogLevel=ERROR \
287+
-o ProxyCommand="mkosi sandbox -- socat - VSOCK-CONNECT:42:%p" \
288+
root@mkosi bash -s < ./recovery_test.sh
289+
then
290+
success
291+
$systemd_run -u "mangos-test-${testid}-result" -q -- echo "Mangos test ${testid} succeeded"
292+
else
293+
failure
294+
$systemd_run -u "mangos-test-${testid}-result" -q -- echo "Recovery test failed"
295+
exit 1
296+
fi

0 commit comments

Comments
 (0)