Skip to content

Commit 03ed605

Browse files
authored
🐛 Script to detect invalid WWN/RAID configuration of hbmh. (#1164)
🐛 Detect invalid WWN configuration of hbmh.
1 parent 503e30e commit 03ed605

File tree

8 files changed

+178
-6
lines changed

8 files changed

+178
-6
lines changed

api/v1beta1/conditions_const.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,10 @@ const (
177177
CloudInitNotInstalledReason = "CloudInitNotInstalled"
178178
// ServerNotFoundReason indicates that a bare metal server could not be found.
179179
ServerNotFoundReason = "ServerNotFound"
180+
// LinuxOnOtherDiskFoundReason indicates that the server can't be provisioned on the given WWN, since the reboot would fail.
181+
LinuxOnOtherDiskFoundReason = "LinuxOnOtherDiskFound"
182+
// SSHToRescueSystemFailedReason indicates that the rescue system can't be reached via ssh.
183+
SSHToRescueSystemFailedReason = "SSHToRescueSystemFailed"
180184
)
181185

182186
const (

controllers/hetznerbaremetalhost_controller_test.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -891,4 +891,5 @@ name="eth0" model="Realtek Semiconductor Co., Ltd. RTL8111/8168/8411 PCI Express
891891
sshClient.On("ExecuteInstallImage", mock.Anything).Return(sshclient.Output{})
892892
sshClient.On("Reboot").Return(sshclient.Output{})
893893
sshClient.On("GetCloudInitOutput").Return(sshclient.Output{StdOut: "dummy content of /var/log/cloud-init-output.log"})
894+
sshClient.On("DetectLinuxOnAnotherDisk", mock.Anything).Return(sshclient.Output{})
894895
}

hack/filter-caph-controller-manager-logs.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020

2121
keys_to_skip = ['controller', 'controllerGroup', 'controllerKind', 'reconcileID',
2222
'HetznerCluster', 'Cluster',
23-
'namespace', 'name', 'Machine', 'stack', 'stacktrace']
23+
'namespace', 'name', 'Machine', 'stack', 'stacktrace',
24+
'logger',
25+
]
2426

2527
rows_to_skip = [
2628
'controller-runtime.webhook', 'certwatcher/certwatcher', 'Registering a validating webhook',

pkg/services/baremetal/client/mocks/ssh/Client.go

Lines changed: 42 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
#!/bin/bash
2+
# Copyright 2024 The Kubernetes Authors.
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
16+
set -euo pipefail
17+
18+
trap 'echo "Warning: A command has failed. Exiting the script. Line was ($0:$LINENO): $(sed -n "${LINENO}p" "$0")"; exit 3' ERR
19+
20+
function usage() {
21+
echo "$0 wwn1 [wwn2 ...]"
22+
echo " Check if there is a Linux partition, but skip all WWNs given as arguments"
23+
echo " Background: If we provision a disk, then there must not be a Linux OS on an other partition"
24+
echo " otherwise it is likely that the old OS gets booted, and not the new OS."
25+
echo " Exit 0: If there is no Linux installation found."
26+
echo " Exit 1: There is a Linux on a different disk."
27+
echo " Exit 3: Unexpected error."
28+
echo "Existing WWNs:"
29+
lsblk -oNAME,WWN | grep -vi loop || true
30+
}
31+
32+
if [ $# -eq 0 ]; then
33+
echo "Error: No WWN was provided."
34+
echo
35+
usage
36+
exit 3
37+
fi
38+
39+
# Iterate over all input arguments
40+
for wwn in "$@"; do
41+
# -x -F: match the whole line literally, so the WWN is quoted and characters like "." are not treated as regex metacharacters.
if ! lsblk -l -oWWN | grep -qxF -- "$wwn"; then
42+
echo "$wwn is not a WWN of this machine"
43+
echo
44+
usage
45+
exit 3
46+
fi
47+
done
48+
fail=0
49+
while read name wwn type parttype; do
50+
if [[ " $* " == *" $wwn "* ]]; then
51+
#echo "ok: skipping $name $wwn, since it was an argument to the script."
52+
continue
53+
fi
54+
# Sort the directory entries, because the regex checks that follow depend on their order.
# "|" binds tighter than "||": without the brace group, tr/sort would only process the output of "true".
root_directory_content=$({ grub-fstest "/dev/$name" ls / 2>/dev/null || true; } | tr ' ' '\n' | sort | tr '\n' ' ')
55+
if [[ $root_directory_content =~ .*boot/.*etc/.* ]]; then
56+
echo "FAIL: $name $wwn partitionType=$parttype looks like a Linux root partition on another disk."
57+
fail=1
58+
continue
59+
fi
60+
if [[ $root_directory_content =~ .*initrd.*vmlinuz.* ]]; then
61+
echo "FAIL: $name $wwn partitionType=$parttype looks like a Linux /boot partition on another disk."
62+
fail=1
63+
continue
64+
fi
65+
#echo "ok: $name $wwn $parttype, does not look like root Linux partition."
66+
done < <(lsblk -r -oNAME,WWN,TYPE,PARTTYPENAME | grep -v NAME | grep -i part)
67+
if [ $fail -eq 1 ]; then
68+
exit 1
69+
fi
70+
echo "Looks good. No Linux root partition on other devices"

pkg/services/baremetal/client/ssh/ssh_client.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ package sshclient
2020
import (
2121
"bufio"
2222
"bytes"
23+
_ "embed"
2324
"encoding/base64"
2425
"errors"
2526
"fmt"
@@ -35,6 +36,9 @@ const (
3536
sshTimeOut time.Duration = 5 * time.Second
3637
)
3738

39+
//go:embed detect-linux-on-another-disk.sh
40+
var detectLinuxOnAnotherDiskShellScript string
41+
3842
var downloadFromOciShellScript = `#!/bin/bash
3943
4044
# Copyright 2023 The Kubernetes Authors.
@@ -200,6 +204,7 @@ type Client interface {
200204
CleanCloudInitInstances() Output
201205
ResetKubeadm() Output
202206
UntarTGZ() Output
207+
DetectLinuxOnAnotherDisk(sliceOfWwns []string) Output
203208
}
204209

205210
// Factory is the interface for creating new Client objects.
@@ -482,6 +487,13 @@ func (c *sshClient) ResetKubeadm() Output {
482487
return output
483488
}
484489

490+
func (c *sshClient) DetectLinuxOnAnotherDisk(sliceOfWwns []string) Output {
491+
return c.runSSH(fmt.Sprintf(`cat <<'EOF' | bash -s -- %s
492+
%s
493+
EOF
494+
`, strings.Join(sliceOfWwns, " "), detectLinuxOnAnotherDiskShellScript))
495+
}
496+
485497
func (c *sshClient) UntarTGZ() Output {
486498
fileName := "/installimage.tgz"
487499
data, err := os.ReadFile(fileName)

pkg/services/baremetal/client/ssh/ssh_client_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,17 @@ func Test_removeUselessLinesFromCloudInitOutput(t *testing.T) {
2626
want string
2727
}{
2828
{
29-
name: "ignore: 10000K .......... .......... .......... .......... .......... 6%!M(MISSING) 1s",
29+
name: "ignore: 10000K ...",
3030
s: "foo\n 10000K .......... .......... .......... .......... .......... 6%!M(MISSING) 1s\nbar",
3131
want: "foo\nbar",
3232
},
3333
{
34-
name: "ignore: ^10000K .......... .......... .......... .......... .......... 6%!M(MISSING) 1s",
34+
name: "ignore: ^10000K ...2",
3535
s: "foo\n10000K .......... .......... .......... .......... .......... 6%!M(MISSING) 1s\nbar",
3636
want: "foo\nbar",
3737
},
3838
{
39-
name: "ignore: Get:17 http://archive.ubuntu.com/ubuntu focal/universe Translation-en [5,124 kB[]",
39+
name: "ignore: Get:17 http://...",
4040
s: "foo\nGet:17 http://archive.ubuntu.com/ubuntu focal/universe Translation-en [5,124 kB[]\nbar",
4141
want: "foo\nbar",
4242
},

pkg/services/baremetal/host/host.go

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
"time"
3030

3131
"github.com/syself/hrobot-go/models"
32+
"golang.org/x/crypto/ssh"
3233
corev1 "k8s.io/api/core/v1"
3334
apierrors "k8s.io/apimachinery/pkg/api/errors"
3435
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -514,7 +515,8 @@ func (s *Service) actionRegistering() actionResult {
514515

515516
// Check hostname with sshClient
516517
out := sshClient.GetHostName()
517-
if trimLineBreak(out.StdOut) != rescue {
518+
hostName := trimLineBreak(out.StdOut)
519+
if hostName != rescue {
518520
// give the reboot some time until it takes effect
519521
if s.hasJustRebooted() {
520522
return actionContinue{delay: 2 * time.Second}
@@ -951,14 +953,53 @@ func (s *Service) actionImageInstalling() actionResult {
951953

952954
s.scope.HetznerBareMetalHost.Spec.Status.SSHStatus.OSKey = &sshKey
953955

956+
// If there is a Linux OS on another disk, then the reboot after the provisioning
957+
// will likely fail, because the machine boots into the other operating system.
958+
// We want to detect that early, and not start the provisioning process.
959+
out := sshClient.DetectLinuxOnAnotherDisk(s.scope.HetznerBareMetalHost.Spec.RootDeviceHints.ListOfWWN())
960+
if out.Err != nil {
961+
var exitErr *ssh.ExitError
962+
if errors.As(out.Err, &exitErr) && exitErr.ExitStatus() > 0 {
963+
// The script exited with a non-zero status (Linux found on another disk, or an unexpected script error). This is treated as a permanent error.
964+
msg := fmt.Sprintf("DetectLinuxOnAnotherDisk failed (permanent error): %s. StdErr: %s (%s)",
965+
out.StdOut, out.StdErr, out.Err.Error())
966+
conditions.MarkFalse(
967+
s.scope.HetznerBareMetalHost,
968+
infrav1.ProvisionSucceededCondition,
969+
infrav1.LinuxOnOtherDiskFoundReason,
970+
clusterv1.ConditionSeverityError,
971+
msg,
972+
)
973+
record.Warn(s.scope.HetznerBareMetalHost, infrav1.LinuxOnOtherDiskFoundReason, msg)
974+
s.scope.HetznerBareMetalHost.SetError(infrav1.PermanentError, msg)
975+
return actionStop{}
976+
}
977+
978+
// Some other error like connection timeout. Retry again later.
979+
// This often happens during provisioning.
980+
msg := fmt.Sprintf("DetectLinuxOnAnotherDisk failed (will retry): %s. StdErr: %s (%s)",
981+
out.StdOut, out.StdErr, out.Err.Error())
982+
conditions.MarkFalse(
983+
s.scope.HetznerBareMetalHost,
984+
infrav1.ProvisionSucceededCondition,
985+
infrav1.SSHToRescueSystemFailedReason,
986+
clusterv1.ConditionSeverityInfo,
987+
msg,
988+
)
989+
record.Event(s.scope.HetznerBareMetalHost, infrav1.SSHToRescueSystemFailedReason, msg)
990+
return actionContinue{
991+
delay: 10 * time.Second,
992+
}
993+
}
994+
954995
autoSetupInput, actionRes := s.createAutoSetupInput(sshClient)
955996
if actionRes != nil {
956997
return actionRes
957998
}
958999

9591000
autoSetup := buildAutoSetup(s.scope.HetznerBareMetalHost.Spec.Status.InstallImage, autoSetupInput)
9601001

961-
out := sshClient.CreateAutoSetup(autoSetup)
1002+
out = sshClient.CreateAutoSetup(autoSetup)
9621003
if out.Err != nil {
9631004
return actionError{err: fmt.Errorf("failed to create autosetup: %q %q %w", out.StdOut, out.StdErr, out.Err)}
9641005
}

0 commit comments

Comments
 (0)