Skip to content

Commit 7d25ba1

Browse files
committed
fix: storage node provisioning — wipe all disks, stable disk paths, verify volume injection
Three fixes for storage node provisioning: 1. WipeAllDisks for storage nodes: Old Talos installs on non-OS NVMe disks cause boot loops — the server boots from the stale install instead of the new one. Storage nodes (ephemeralSize set) now wipe ALL NVMe disks during fresh provision. Compute nodes still use WipeOSDisk to preserve Ceph data on other disks. 2. Stable /dev/disk/by-id/ paths: NVMe device names (/dev/nvme0n1) swap between rescue mode and Talos boot due to different PCI probe order. ResolveStableDiskPath resolves the bare device to its by-id symlink (based on serial number). This stable path is used for both the Talos installer in rescue AND injected into the machineconfig (machine.install.disk) so Talos references the correct physical disk at boot. The resolved path is persisted in status.resolvedInstallDisk for use during stateApplyConfig. 3. Verified injectStorageVolumes: Already called correctly in stateApplyConfig when ephemeralSize is set, with proper log message. Also fixes pre-existing test failures where injectVLANConfig and injectIPv6Config tests were missing the primaryMAC argument added when those functions switched to deviceSelector-based NIC matching.
1 parent 6cb39f1 commit 7d25ba1

File tree

4 files changed

+169
-24
lines changed

4 files changed

+169
-24
lines changed

api/v1alpha1/hetznerrobotmachine_types.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,13 @@ type HetznerRobotMachineStatus struct {
9393
// +optional
9494
PrimaryMAC string `json:"primaryMAC,omitempty"`
9595

96+
// ResolvedInstallDisk is the stable /dev/disk/by-id/ path resolved during rescue.
97+
// NVMe device names (/dev/nvme0n1) swap between rescue and Talos boot due to
98+
// different PCI probe order. This stable path ensures both the installer and
99+
// Talos machineconfig reference the same physical disk.
100+
// +optional
101+
ResolvedInstallDisk string `json:"resolvedInstallDisk,omitempty"`
102+
96103
// FailureReason is a brief string indicating why this machine failed.
97104
// +optional
98105
FailureReason *string `json:"failureReason,omitempty"`

controllers/hetznerrobotmachine_controller.go

Lines changed: 56 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -562,25 +562,65 @@ func (r *HetznerRobotMachineReconciler) stateInstallTalos(
562562
"configured", configuredDisk, "resolved", installDisk, "ip", serverIP)
563563
}
564564

565-
// Wipe only the OS install disk — Ceph OSD data on other disks must survive
566-
// reprovision. Wiping all disks would destroy storage cluster data.
567-
logger.Info("Wiping OS disk on server", "ip", serverIP, "disk", installDisk)
568-
if out, err := sshClient.WipeOSDisk(installDisk); err != nil {
569-
return ctrl.Result{}, fmt.Errorf("wipe OS disk %s on %s: %w\nOutput: %s", installDisk, serverIP, err, out)
565+
// Resolve the install disk to a stable /dev/disk/by-id/ path.
566+
// NVMe device names (/dev/nvme0n1) swap between rescue and Talos boot due to
567+
// different PCI probe order. The by-id path references the physical disk by
568+
// serial number, so both the installer in rescue AND Talos at boot will
569+
// reference the same physical disk regardless of enumeration order.
570+
stableDisk, err := sshClient.ResolveStableDiskPath(installDisk)
571+
if err != nil {
572+
// Non-fatal: fall back to unstable path
573+
logger.Info("Could not resolve stable disk path, using bare device name",
574+
"disk", installDisk, "error", err)
575+
stableDisk = installDisk
576+
}
577+
if stableDisk != installDisk {
578+
logger.Info("Resolved install disk to stable by-id path",
579+
"bare", installDisk, "stable", stableDisk, "ip", serverIP)
580+
}
581+
582+
// Persist the stable disk path in status so stateApplyConfig can inject it
583+
// into the Talos machineconfig. This ensures Talos uses the by-id path for
584+
// upgrades, surviving NVMe enumeration changes across reboots.
585+
hrm.Status.ResolvedInstallDisk = stableDisk
586+
587+
// Storage nodes (ephemeralSize set): wipe ALL NVMe disks for fresh provision.
588+
// Old Talos installs on OTHER disks cause boot loops — the server boots from
589+
// the old install instead of the new one. This is safe because fresh storage
590+
// nodes have no existing Ceph data.
591+
//
592+
// Compute nodes (no ephemeralSize): wipe ONLY the OS install disk.
593+
// Ceph OSD data on other disks must survive reprovision.
594+
isStorageNode := hrm.Spec.EphemeralSize != ""
595+
if isStorageNode {
596+
logger.Info("Storage node: wiping ALL NVMe disks for fresh provision",
597+
"ip", serverIP, "installDisk", stableDisk, "ephemeralSize", hrm.Spec.EphemeralSize)
598+
if out, err := sshClient.WipeAllDisks(stableDisk); err != nil {
599+
return ctrl.Result{}, fmt.Errorf("wipe all disks on %s: %w\nOutput: %s", serverIP, err, out)
600+
} else {
601+
logger.Info("All NVMe disks wiped", "ip", serverIP, "output", out)
602+
}
570603
} else {
571-
logger.Info("OS disk wiped", "ip", serverIP, "disk", installDisk, "output", out)
604+
logger.Info("Compute node: wiping OS disk only", "ip", serverIP, "disk", stableDisk)
605+
if out, err := sshClient.WipeOSDisk(installDisk); err != nil {
606+
return ctrl.Result{}, fmt.Errorf("wipe OS disk %s on %s: %w\nOutput: %s", installDisk, serverIP, err, out)
607+
} else {
608+
logger.Info("OS disk wiped", "ip", serverIP, "disk", installDisk, "output", out)
609+
}
572610
}
573611

574612
factoryURL := hrc.Spec.TalosFactoryBaseURL
575613
if factoryURL == "" {
576614
factoryURL = talosFactoryDefaultBaseURL
577615
}
578616

617+
// Use stable by-id path for the Talos installer so the correct physical disk
618+
// is targeted regardless of NVMe enumeration order at boot.
579619
if err := sshClient.InstallTalos(
580620
factoryURL,
581621
hrm.Spec.TalosSchematic,
582622
hrm.Spec.TalosVersion,
583-
installDisk,
623+
stableDisk,
584624
); err != nil {
585625
return ctrl.Result{}, fmt.Errorf("install Talos on %s: %w", serverIP, err)
586626
}
@@ -1171,9 +1211,16 @@ func (r *HetznerRobotMachineReconciler) stateApplyConfig(
11711211

11721212
// Inject install disk into machineconfig — CAPT doesn't include it,
11731213
// but Talos requires machine.install.disk to be set.
1174-
installDisk := hrm.Spec.InstallDisk
1214+
// Prefer the stable /dev/disk/by-id/ path resolved during rescue install.
1215+
// This ensures Talos references the correct physical disk regardless of
1216+
// NVMe enumeration order (which can differ between rescue and Talos boot).
1217+
installDisk := hrm.Status.ResolvedInstallDisk
11751218
if installDisk == "" {
1176-
installDisk = "/dev/nvme0n1"
1219+
// Fallback for machines provisioned before this fix was deployed
1220+
installDisk = hrm.Spec.InstallDisk
1221+
if installDisk == "" {
1222+
installDisk = "/dev/nvme0n1"
1223+
}
11771224
}
11781225
bootstrapData, err = injectInstallDisk(bootstrapData, installDisk)
11791226
if err != nil {

controllers/inject_test.go

Lines changed: 22 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ cluster:
6464
PrefixLength: 24,
6565
}
6666

67-
result, err := injectVLANConfig(input, vlanCfg, "10.10.0.1")
67+
result, err := injectVLANConfig(input, vlanCfg, "10.10.0.1", "aa:bb:cc:dd:ee:ff")
6868
if err != nil {
6969
t.Fatalf("injectVLANConfig failed: %v", err)
7070
}
@@ -83,8 +83,10 @@ cluster:
8383
}
8484

8585
iface := interfaces[0].(map[string]interface{})
86-
if iface["interface"] != "enp193s0f0np0" {
87-
t.Errorf("expected interface enp193s0f0np0, got %v", iface["interface"])
86+
// injectVLANConfig uses deviceSelector by MAC, not interface name
87+
selector := iface["deviceSelector"].(map[string]interface{})
88+
if selector["hardwareAddr"] != "aa:bb:cc:dd:ee:ff" {
89+
t.Errorf("expected deviceSelector.hardwareAddr aa:bb:cc:dd:ee:ff, got %v", selector["hardwareAddr"])
8890
}
8991
if iface["dhcp"] != true {
9092
t.Errorf("expected dhcp true, got %v", iface["dhcp"])
@@ -110,7 +112,7 @@ func TestInjectVLANConfig_NilConfig(t *testing.T) {
110112
input := []byte(`machine:
111113
type: controlplane
112114
`)
113-
result, err := injectVLANConfig(input, nil, "10.10.0.1")
115+
result, err := injectVLANConfig(input, nil, "10.10.0.1", "aa:bb:cc:dd:ee:ff")
114116
if err != nil {
115117
t.Fatalf("injectVLANConfig failed: %v", err)
116118
}
@@ -124,7 +126,7 @@ func TestInjectVLANConfig_EmptyInternalIP(t *testing.T) {
124126
type: controlplane
125127
`)
126128
vlanCfg := &infrav1.VLANConfig{ID: 4000, Interface: "eth0"}
127-
result, err := injectVLANConfig(input, vlanCfg, "")
129+
result, err := injectVLANConfig(input, vlanCfg, "", "aa:bb:cc:dd:ee:ff")
128130
if err != nil {
129131
t.Fatalf("injectVLANConfig failed: %v", err)
130132
}
@@ -134,11 +136,12 @@ func TestInjectVLANConfig_EmptyInternalIP(t *testing.T) {
134136
}
135137

136138
func TestInjectVLANConfig_MergeExistingInterface(t *testing.T) {
137-
// Existing config already has the interface with some settings
139+
// Existing config already has the interface matched by deviceSelector MAC
138140
input := []byte(`machine:
139141
network:
140142
interfaces:
141-
- interface: enp193s0f0np0
143+
- deviceSelector:
144+
hardwareAddr: "aa:bb:cc:dd:ee:ff"
142145
mtu: 9000
143146
`)
144147
vlanCfg := &infrav1.VLANConfig{
@@ -147,7 +150,7 @@ func TestInjectVLANConfig_MergeExistingInterface(t *testing.T) {
147150
PrefixLength: 24,
148151
}
149152

150-
result, err := injectVLANConfig(input, vlanCfg, "10.10.0.2")
153+
result, err := injectVLANConfig(input, vlanCfg, "10.10.0.2", "aa:bb:cc:dd:ee:ff")
151154
if err != nil {
152155
t.Fatalf("injectVLANConfig failed: %v", err)
153156
}
@@ -338,7 +341,7 @@ func TestInjectVLANConfig_DefaultPrefixLength(t *testing.T) {
338341
// PrefixLength not set — should default to 24
339342
}
340343

341-
result, err := injectVLANConfig(input, vlanCfg, "10.10.0.3")
344+
result, err := injectVLANConfig(input, vlanCfg, "10.10.0.3", "aa:bb:cc:dd:ee:ff")
342345
if err != nil {
343346
t.Fatalf("injectVLANConfig failed: %v", err)
344347
}
@@ -366,7 +369,7 @@ func TestInjectIPv6Config(t *testing.T) {
366369
input := []byte(`machine:
367370
type: controlplane
368371
`)
369-
result, err := injectIPv6Config(input, "2a01:4f8:271:3b49::", "enp193s0f0np0")
372+
result, err := injectIPv6Config(input, "2a01:4f8:271:3b49::", "aa:bb:cc:dd:ee:ff", "10.10.0.1")
370373
if err != nil {
371374
t.Fatalf("injectIPv6Config failed: %v", err)
372375
}
@@ -385,8 +388,10 @@ func TestInjectIPv6Config(t *testing.T) {
385388
}
386389

387390
iface := interfaces[0].(map[string]interface{})
388-
if iface["interface"] != "enp193s0f0np0" {
389-
t.Errorf("expected interface enp193s0f0np0, got %v", iface["interface"])
391+
// Uses deviceSelector by MAC, not interface name
392+
deviceSelector := iface["deviceSelector"].(map[string]interface{})
393+
if deviceSelector == nil {
394+
t.Errorf("expected deviceSelector, got nil")
390395
}
391396

392397
addrs := iface["addresses"].([]interface{})
@@ -414,7 +419,7 @@ func TestInjectIPv6Config_Empty(t *testing.T) {
414419
input := []byte(`machine:
415420
type: controlplane
416421
`)
417-
result, err := injectIPv6Config(input, "", "enp193s0f0np0")
422+
result, err := injectIPv6Config(input, "", "aa:bb:cc:dd:ee:ff", "10.10.0.1")
418423
if err != nil {
419424
t.Fatalf("injectIPv6Config failed: %v", err)
420425
}
@@ -424,15 +429,17 @@ func TestInjectIPv6Config_Empty(t *testing.T) {
424429
}
425430

426431
func TestInjectIPv6Config_MergeExistingInterface(t *testing.T) {
432+
// Existing config uses deviceSelector with MAC — injectIPv6Config matches by hardwareAddr
427433
input := []byte(`machine:
428434
network:
429435
interfaces:
430-
- interface: enp193s0f0np0
436+
- deviceSelector:
437+
hardwareAddr: aa:bb:cc:dd:ee:ff
431438
dhcp: true
432439
addresses:
433440
- 10.0.0.1/24
434441
`)
435-
result, err := injectIPv6Config(input, "2a01:4f8::", "enp193s0f0np0")
442+
result, err := injectIPv6Config(input, "2a01:4f8::", "aa:bb:cc:dd:ee:ff", "10.10.0.1")
436443
if err != nil {
437444
t.Fatalf("injectIPv6Config failed: %v", err)
438445
}

pkg/sshrescue/client.go

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,90 @@ func (c *Client) WipeOSDisk(disk string) (string, error) {
196196
return c.Run(cmd)
197197
}
198198

199+
// WipeAllDisks wipes ALL NVMe disks on the server for a fresh provision.
200+
// Used for storage nodes where old Talos installs on ANY disk cause boot loops —
201+
// the server boots from the old install instead of the new one. Unlike WipeOSDisk,
202+
// this does NOT check for ceph_bluestore because it is only called during fresh
203+
// provisioning when no Ceph data exists yet.
204+
//
205+
// The installDisk parameter is logged but all NVMe disks are wiped regardless.
206+
func (c *Client) WipeAllDisks(installDisk string) (string, error) {
207+
// List all NVMe block devices
208+
listCmd := `lsblk --noheadings --nodeps --paths --output NAME,TYPE | grep disk | grep nvme | awk '{print $1}'`
209+
out, err := c.Run(listCmd)
210+
if err != nil {
211+
return out, fmt.Errorf("list NVMe disks: %w", err)
212+
}
213+
214+
disks := strings.Fields(strings.TrimSpace(out))
215+
if len(disks) == 0 {
216+
return "", fmt.Errorf("no NVMe disks found")
217+
}
218+
219+
var results []string
220+
for _, disk := range disks {
221+
cmd := fmt.Sprintf(
222+
`set -e; `+
223+
`echo "Wiping disk %[1]s..."; `+
224+
`wipefs -af %[1]q 2>/dev/null || true; `+
225+
`sgdisk --zap-all %[1]q 2>/dev/null || true; `+
226+
`dd if=/dev/zero of=%[1]q bs=1M count=100 conv=notrunc 2>/dev/null || true; `+
227+
`blkdiscard %[1]q 2>/dev/null || true; `+
228+
`sync; `+
229+
`echo "Disk %[1]s wiped"`,
230+
disk,
231+
)
232+
wipeOut, wipeErr := c.Run(cmd)
233+
if wipeErr != nil {
234+
return wipeOut, fmt.Errorf("wipe disk %s: %w", disk, wipeErr)
235+
}
236+
results = append(results, strings.TrimSpace(wipeOut))
237+
}
238+
239+
summary := fmt.Sprintf("Wiped %d NVMe disks (install=%s): %s", len(disks), installDisk, strings.Join(disks, ", "))
240+
results = append(results, summary)
241+
return strings.Join(results, "\n"), nil
242+
}
243+
244+
// ResolveStableDiskPath resolves a bare NVMe device path (e.g. /dev/nvme0n1)
245+
// to its stable /dev/disk/by-id/ path using the disk's serial number.
246+
// This is critical because NVMe device names swap between rescue mode and Talos
247+
// boot due to different PCI probe order. The by-id path references the physical
248+
// disk deterministically regardless of enumeration order.
249+
//
250+
// If the disk is already a /dev/disk/by-id/ path, it is returned as-is.
251+
// Returns the original disk path if resolution fails (best-effort).
252+
func (c *Client) ResolveStableDiskPath(disk string) (string, error) {
253+
// Already a stable path — nothing to resolve
254+
if strings.HasPrefix(disk, "/dev/disk/by-id/") {
255+
return disk, nil
256+
}
257+
258+
// Get the basename (e.g. "nvme0n1" from "/dev/nvme0n1")
259+
basename := disk
260+
if idx := strings.LastIndex(disk, "/"); idx >= 0 {
261+
basename = disk[idx+1:]
262+
}
263+
264+
// Find the by-id symlink that points to this device (excluding partition entries)
265+
cmd := fmt.Sprintf(
266+
`ls -la /dev/disk/by-id/ 2>/dev/null | grep -E '%s$' | grep nvme | grep -v part | awk '{print $9}' | head -1`,
267+
basename,
268+
)
269+
out, err := c.Run(cmd)
270+
if err != nil {
271+
return disk, nil // best-effort: return original on failure
272+
}
273+
274+
byIDName := strings.TrimSpace(out)
275+
if byIDName == "" {
276+
return disk, nil // no by-id link found, return original
277+
}
278+
279+
stablePath := "/dev/disk/by-id/" + byIDName
280+
return stablePath, nil
281+
}
282+
199283
// InstallTalos installs Talos Linux on the server using the official OCI
200284
// installer binary inside a minimal namespace created by unshare(1).
201285
//

0 commit comments

Comments
 (0)