Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion .github/workflows/build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,6 @@ jobs:
- name: Test it
run: |
#!/bin/bash
set -x
set -e
sudo apt-get update -y
# mkosi doesn't pick this up from the tools dir for some reason
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,6 @@ jobs:
- name: Test it
run: |
#!/bin/bash
set -x
set -e
sudo apt-get update -y
# mkosi doesn't pick this up from the tools dir for some reason
Expand Down
1 change: 0 additions & 1 deletion mkosi.images/base/mkosi.build.chroot
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/bin/sh
set -x

cp -r ${SRCDIR}/resources/mangosctl ${BUILDDIR}
cd ${BUILDDIR}/mangosctl
Expand Down
3 changes: 3 additions & 0 deletions mkosi.images/base/mkosi.conf
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@ Packages=
# CA certificates
ca-certificates

# LUKS support
cryptsetup

# systemd's DNS resolver
systemd-resolved

Expand Down
87 changes: 87 additions & 0 deletions mkosi.images/base/mkosi.extra/usr/share/mangos/recovery_test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/bin/bash
#
# LUKS recovery smoke test: for every LUKS partition, simulate the loss of the
# TPM keyslot and check that the volume can still be unlocked with the
# recovery key stored in Vault.

set -e
# xtrace (`set -x`) is deliberately NOT enabled here: this script reads LUKS
# recovery keys into shell variables, and tracing would copy them verbatim
# into the test logs.

# The machine ID is part of the Vault path under which recovery keys live.
machine_id="$(cat /etc/machine-id)"

#######################################
# Simulate a TPM failure on one LUKS partition and check that the recovery key
# stored in Vault still unlocks it. The TPM keyslot is re-enrolled afterwards
# so that the test can be repeated.
# Globals:
#   machine_id (read) - part of the Vault path of the recovery key
# Arguments:
#   $1 - LUKS block device to test (for example /dev/sda3)
# Returns:
#   0 when the volume was unlocked with the recovery key and the TPM keyslot
#   was re-enrolled, non-zero otherwise
#######################################
test_recovery_key() {
  local test_device="$1"
  local test_partition mapper_name recovery_key tpm_slot mount_point
  local restore_xtrace=0

  # The recovery key is a secret: make sure xtrace cannot echo it into the
  # logs, whatever the caller enabled.
  if [[ $- == *x* ]]; then
    restore_xtrace=1
    set +x
  fi

  test_partition="$(lsblk -nlo PARTLABEL "${test_device}" | tr -d '\n')"
  # The second row lsblk prints for the partition is its child, i.e. the
  # opened device-mapper volume.
  mapper_name="$(lsblk -nlo NAME "${test_device}" | awk 'NR==2')"

  echo "Testing recovery unlock for ${test_partition} (device: ${test_device}, mapper: ${mapper_name})..."
  echo ""

  # Check every precondition BEFORE touching the LUKS header, so that an
  # early failure cannot leave the volume without its TPM keyslot.
  if [[ ! -b "/dev/mapper/${mapper_name}" ]]; then
    echo "ERROR: Device not found - /dev/mapper/${mapper_name}"
    return 1
  fi

  # Get recovery key from Vault
  recovery_key="$(mangosctl sudo -- vault kv get -field=key \
    "secrets/mangos/recovery-keys/${machine_id}/${test_partition}")" || return 1
  if [[ -z "${recovery_key}" ]]; then
    echo "ERROR: Empty recovery key returned by Vault for ${test_partition}"
    return 1
  fi

  # Informational only: look up the keyslot bound to the systemd-tpm2 token.
  # luksDump lists the bound slot on a "Keyslot:" line below the token header;
  # the token header's own number is the token index, not the keyslot.
  tpm_slot="$(cryptsetup luksDump "${test_device}" \
    | awk '/^Tokens:/ { in_tokens = 1 }
           in_tokens && /systemd-tpm2/ { found = 1 }
           found && $1 == "Keyslot:" { print $2; exit }')"

  echo "Removing TPM keyslot ${tpm_slot:-unknown} (simulating TPM failure)..."
  # The key is supplied on stdin (--unlock-key-file=/dev/stdin) so that
  # systemd-cryptenroll does not prompt and the key never appears in argv.
  printf '%s' "${recovery_key}" \
    | systemd-cryptenroll --wipe-slot=tpm2 --unlock-key-file=/dev/stdin "${test_device}"

  # Get mount point for this partition (empty when it is not mounted)
  mount_point="$(findmnt -n -o TARGET "/dev/mapper/${mapper_name}" || true)"

  # Unmount and close
  if [[ -n "${mount_point}" ]]; then
    systemctl stop "$(systemd-escape -p --suffix=mount "${mount_point}")"
  fi

  cryptsetup close "${mapper_name}"

  # THE CRITICAL TEST: Unlock with recovery key
  echo "Unlocking with recovery key..."
  # NOTE(review): '-' is passed as the key-file argument exactly as before;
  # confirm that systemd-cryptsetup really consumes the passphrase from stdin
  # in this mode (otherwise /dev/stdin may be what is wanted here).
  printf '%s' "${recovery_key}" \
    | systemd-cryptsetup attach "${mapper_name}" "${test_device}" -

  # Remount
  if [[ -n "${mount_point}" ]]; then
    mount "/dev/mapper/${mapper_name}" "${mount_point}"
  fi

  # Verify device is accessible
  if [[ ! -b "/dev/mapper/${mapper_name}" ]]; then
    echo "ERROR: Device not accessible after recovery"
    return 1
  fi

  echo "Data accessible after recovery: OK"

  # Re-enroll TPM (cleanup for future tests), again feeding the key on stdin.
  echo "Re-enrolling TPM keyslot..."
  printf '%s' "${recovery_key}" \
    | systemd-cryptenroll "${test_device}" \
        --tpm2-device=auto \
        --tpm2-pcrs=7 \
        --tpm2-public-key-pcrs=11 \
        --unlock-key-file=/dev/stdin

  echo "Recovery test: PASSED"

  if [[ "${restore_xtrace}" -eq 1 ]]; then
    set -x
  fi
}

# Auto-detect every LUKS partition and run the recovery test on each of them.
# lsblk prints one "NAME TYPE FSTYPE" row per device; awk keeps the partitions
# whose filesystem type is crypto_LUKS and turns the name into a /dev path.
devices="$(lsblk -ln -o NAME,TYPE,FSTYPE | awk '$2=="part" && $3=="crypto_LUKS" {print "/dev/"$1}' | tr '\n' ' ')"

echo "> LUKS-encrypted devices found: $devices"

# Intentionally unquoted: ${devices} is a space-separated list of /dev paths.
for test_device in ${devices}; do
  echo "> Testing device: ${test_device}"
  test_recovery_key "${test_device}"
done

# Reaching this point means no test failed (the script runs under `set -e`),
# so the script ends with status 0 instead of a forced debugging failure.
echo "> All recovery tests completed successfully."
45 changes: 44 additions & 1 deletion mkosi.images/base/mkosi.extra/usr/share/mangos/self_test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ BASE_URL=${BASE_URL:-http://10.0.2.2:8081}
export BASE_URL

set -e
set -x

trap 'journalctl -n 1000 --no-pager' ERR
systemctl is-active systemd-veritysetup@root.service
Expand All @@ -26,3 +25,47 @@ do
sleep 10
echo "Trying again. $tries tries left"
done

echo "===> Validating Recovery Keys"
machine_id=$(cat /etc/machine-id)

# Count the LUKS2 keyslots in `cryptsetup luksDump` output read from stdin.
# Always prints exactly one integer: `grep -c` already prints 0 when nothing
# matches (while exiting 1), so its status is neutralised instead of echoing
# a second "0" that would break the numeric comparison below.
count_luks2_keyslots() {
  grep -cE '^[[:space:]]+[0-9]+: luks2' || true
}

# Auto-detect LUKS partitions (bare kernel names such as "sda3")
luks_partitions=$(lsblk -nlo NAME,TYPE,FSTYPE | awk '$2 == "part" && $3 == "crypto_LUKS" {print $1}' | tr '\n' ' ')

if [ -z "$luks_partitions" ]; then
  echo "No LUKS partitions found, skipping recovery key validation"
else
  # Test 1: Verify recovery keys exist in Vault
  for device in $luks_partitions; do
    partition=$(lsblk -nlo PARTLABEL "/dev/${device}" | tr -d '\n')
    if ! mangosctl sudo -- vault kv get "secrets/mangos/recovery-keys/${machine_id}/${partition}" >/dev/null 2>&1; then
      echo "ERROR: Recovery key not found in Vault for ${partition}"
      exit 1
    fi
    echo "Recovery key for ${partition}: OK"
  done

  # Test 2: Verify LUKS has multiple keyslots (TPM + recovery)
  for device in $luks_partitions; do
    partition=$(lsblk -nlo PARTLABEL "/dev/${device}" | tr -d '\n')
    # A failing luksDump yields no output and is therefore counted as 0 slots.
    slots=$(cryptsetup luksDump "/dev/${device}" 2>/dev/null | count_luks2_keyslots)
    if [ "${slots:-0}" -lt 2 ]; then
      echo "ERROR: ${partition} has only ${slots:-0} keyslot(s), expected at least 2 (TPM + recovery)"
      exit 1
    fi
    echo "LUKS keyslots for ${partition}: ${slots} OK"
  done

  echo "Recovery key validation: PASSED"
fi

# Run the end-to-end LUKS recovery scenario. A non-zero exit status from the
# recovery script fails the whole self-test.
echo 'Testing LUKS recovery functionality'

if ! /usr/share/mangos/recovery_test.sh; then
  echo "LUKS recovery test: FAILED"
  exit 1
fi
echo "LUKS recovery test: PASSED"

echo "All self-tests completed successfully."
17 changes: 17 additions & 0 deletions mkosi.images/terraform/share/terraform/pki.tf
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,7 @@ resource "vault_cert_auth_backend_role" "node" {
vault_policy.node-cert-self-renew.name,
vault_policy.ssh-host-self-signer.name,
vault_policy.consul-gossip.name,
vault_policy.node-recovery-keys.name,
]
}

Expand All @@ -277,6 +278,22 @@ resource "vault_policy" "node-cert-self-renew" {
EOP
}

# Vault policy that lets a node escrow its LUKS recovery keys.
# It is listed among the policies of the "node" cert-auth role, and its path is
# templated on the "machine_id" metadata of the authenticated entity, so a
# node can only write below its own machine-id. Only "create" is granted.
# NOTE(review): the path has no "data/" segment; if the "secrets" mount is a
# KV v2 engine the ACL path would need to be secrets/data/mangos/... - confirm
# which KV version the mount uses.
resource "vault_policy" "node-recovery-keys" {
name = "node-recovery-keys"

policy = <<-EOP
# Allow nodes to create recovery keys for their own machine-id only (write-once, no read/update)
# No read allowed because node does not need to read its own recovery key,
# it is only needed to be read by admins to recover the node
# No update allowed to avoid compromised node may update recovery key
# Any update of recovery keys (even for rotating recovery keys) need admin actions
# For which admin key should be used
path "secrets/mangos/recovery-keys/{{identity.entity.metadata.machine_id}}/*" {
capabilities = ["create"]
}
EOP
}

resource "vault_identity_group" "vault-servers" {
name = "vault-servers"
type = "internal"
Expand Down
2 changes: 1 addition & 1 deletion mkosi.images/vault/mkosi.version
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.21.2-1
1.21.2
105 changes: 98 additions & 7 deletions resources/mangosctl/mangosctl.sh
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,95 @@ do_install() {
fi
}

#######################################
# Enroll a recovery key on every LUKS partition that does not have one yet and
# store it in Vault under
#   secrets/mangos/recovery-keys/<machine-id>/<partlabel>
# A marker file per partition records success, so re-running the function
# skips partitions that were already handled.
# Globals:
#   HOSTNAME (read) - stored alongside the key
# Arguments:
#   $1 - Vault token used to store the keys
# Returns:
#   0 when every pending partition was enrolled (or nothing had to be done),
#   1 when a key could not be generated or stored for at least one partition
#######################################
enroll_recovery_keys() {
  local vault_token="$1"
  local marker_dir="/var/lib/mangos/luks-recovery-keys-enrolled"
  # Recovery key format: 8 groups of 8 lowercase alphanumerics, separated by
  # dashes. Example:
  # etklvner-lblhnbgl-kdtnujtk-ikjlgbur-lnlrjrrc-iuikkidg-feientnn-dkjeeuft
  local recovery_key_regex='[a-z0-9]{8}(-[a-z0-9]{8}){7}'
  local machine_id devices device partlabel marker_file output recovery_key
  local found_any=0
  local failures=0

  # Declaration and assignment are kept apart so that a failing command is
  # not hidden behind the exit status of `local`.
  machine_id="$(cat /etc/machine-id)" || return 1
  mkdir -p "${marker_dir}" || return 1

  # Find all LUKS-encrypted partitions
  devices="$(lsblk -ln -o NAME,TYPE,FSTYPE | awk '$2=="part" && $3=="crypto_LUKS" {print "/dev/"$1}' | tr '\n' ' ')"

  echo "> LUKS-encrypted devices found: $devices"

  # Intentionally unquoted: ${devices} is a space-separated list of /dev paths.
  for device in ${devices}; do
    echo "> Processing device: ${device}"
    partlabel="$(lsblk -n -o PARTLABEL "${device}" 2>/dev/null | tr -d ' \n\r\t')"

    # Skip if no valid partition label (it is the last element of the Vault path)
    if [[ -z "${partlabel}" ]]; then
      echo "> Device ${device} has no PARTLABEL, skipping"
      continue
    fi

    # Skip if already enrolled
    marker_file="${marker_dir}/${partlabel}"
    if [[ -f "${marker_file}" ]]; then
      echo "> Recovery key for ${partlabel} already enrolled, skipping"
      continue
    fi

    found_any=1
    step "Enrolling recovery key for ${partlabel}"

    # systemd-cryptenroll generates the key and prints it; the volume is
    # unlocked through the TPM. stderr is captured as well so that it can be
    # shown when something goes wrong.
    recovery_key=""
    if output="$(systemd-cryptenroll "${device}" --recovery-key --unlock-tpm2-device=auto 2>&1)"; then
      recovery_key="$(grep -oE "${recovery_key_regex}" <<<"${output}" | head -n 1)" || true
    fi

    if [[ -z "${recovery_key}" ]]; then
      red "Failed to extract recovery key. cryptenroll output:"
      echo "${output}"
      failures=$((failures + 1))
      continue
    fi

    # The key is fed on stdin (key=-) so it never shows up in the process list.
    if printf '%s' "${recovery_key}" \
      | VAULT_TOKEN="${vault_token}" vault kv put \
          "secrets/mangos/recovery-keys/${machine_id}/${partlabel}" \
          key=- hostname="${HOSTNAME}" device="${device}" \
          created="$(date -u +%Y-%m-%dT%H:%M:%SZ)"; then
      greenln Success
      touch "${marker_file}"
    else
      red "Failed to store in Vault"
      failures=$((failures + 1))
    fi
  done

  # Report failures instead of claiming success: a partition without an
  # escrowed key cannot be recovered later.
  if [[ "${failures}" -gt 0 ]]; then
    echo "> ${failures} recovery key(s) could not be enrolled or stored" >&2
    return 1
  fi

  if [[ "${found_any}" -eq 0 ]]; then
    echo "> All recovery keys already enrolled"
  else
    echo "> Recovery keys enrolled and stored in Vault"
  fi
}

#######################################
# Record this node's machine-id as "machine_id" metadata on its Vault identity
# entity, keeping whatever metadata the entity already has.
# Globals:
#   HOSTNAME (read) - "<HOSTNAME>.mangos" is the alias used for the lookup
#   node_auth_accessor, entity_name, machine_id (written)
#     NOTE(review): these stay global on purpose - do_enroll used to set
#     node_auth_accessor and entity_name itself, so later steps may still read
#     them; confirm before making them local.
# Returns:
#   0 on success, 1 as soon as a lookup, the merge or the write fails
#######################################
write_machine_id_metadata() {
  local current_metadata new_metadata k v
  local -a metadata_args=()

  # Every step is checked explicitly: when this function runs in a context
  # where errexit is ignored, an unchecked failure would otherwise carry on
  # with empty values and still print "Success".
  step "Getting mount accessor for node-cert"
  node_auth_accessor="$(vault read -field=accessor sys/auth/node-cert)" || return 1
  if [[ -z "${node_auth_accessor}" ]]; then
    echo "> Could not determine the mount accessor of node-cert" >&2
    return 1
  fi

  step "Looking up entity name for this node"
  entity_name="$(vault write -field=name identity/lookup/entity \
    alias_name="${HOSTNAME}.mangos" \
    alias_mount_accessor="${node_auth_accessor}")" || return 1
  if [[ -z "${entity_name}" ]]; then
    # Without a name the paths below would degrade to identity/entity/name/.
    echo "> No Vault entity found for ${HOSTNAME}.mangos" >&2
    return 1
  fi

  step "Setting machine-id as entity metadata"
  machine_id="$(cat /etc/machine-id)" || return 1

  # Read current metadata, merge with new machine_id, and write back
  current_metadata="$(vault read -format=json "identity/entity/name/${entity_name}" \
    | jq -r '.data.metadata // {}')" || return 1
  new_metadata="$(jq --arg mid "${machine_id}" '. + {machine_id: $mid}' <<<"${current_metadata}")" \
    || return 1
  if [[ -z "${new_metadata}" ]]; then
    echo "> Could not build the metadata for entity ${entity_name}" >&2
    return 1
  fi

  # Convert JSON to key=value arguments for Vault CLI
  # (one "metadata=<key>=<value>" argument per entry)
  while IFS='=' read -r k v; do
    metadata_args+=("metadata=${k}=${v}")
  done < <(jq -r 'to_entries|map("\(.key)=\(.value|tostring)")|.[]' <<<"${new_metadata}")

  vault write "identity/entity/name/${entity_name}" "${metadata_args[@]}" || return 1
  greenln Success
}

do_enroll() {
declare -A groups

Expand Down Expand Up @@ -425,13 +514,7 @@ do_enroll() {
NODE_VAULT_TOKEN=$(vault login -method=cert -path=node-cert -client-cert=/var/lib/mangos/mangos.crt -client-key=<(systemd-creds decrypt ${confext_dir}/etc/credstore.encrypted/mangos.key) -token-only)
greenln Success

step "Getting mount accessor for node-cert"
node_auth_accessor=$(vault read -field=accessor sys/auth/node-cert)
echo $node_auth_accessor

step "Looking up entity name for this node"
entity_name=$(vault write -field=name identity/lookup/entity alias_name=${HOSTNAME}.mangos alias_mount_accessor=${node_auth_accessor})
echo $entity_name
do_step "Writing machine ID metadata to Vault" write_machine_id_metadata

for group in ${!groups[@]}
do
Expand Down Expand Up @@ -553,6 +636,8 @@ do_enroll() {
greenln Success

do_step "Reloading confexts" chronic systemd-confext refresh --mutable=auto

do_step "Enrolling recovery keys for encrypted partitions" enroll_recovery_keys "${NODE_VAULT_TOKEN}"
}

do_group() {
Expand Down Expand Up @@ -987,6 +1072,12 @@ do_bootstrap() {
NOMAD_TOKEN="${nomad_mgmt_token}" \
CONSUL_HTTP_TOKEN=${consul_mgmt_token} \
do_step "Final Terraform run" run_terraform_apply

echo
echo "Bootstrap complete! Next steps:"
echo " 1. Run: mangosctl sudo enroll -g vault-server -g consul-server -g nomad-server 127.0.0.1"
echo " 2. This will enroll the bootstrap node's identity and recovery keys"
echo
}

set_agent_token() {
Expand Down
Loading