Skip to content

Commit e2b7539

Browse files
emosbaugh and sgalsaleh authored
feat(ha): node local loadbalancing (#2012)
* feat(ha): node local loadbalancing * f * f * f * f * f * f * f * f * f * test with new kots binary * no-op * fix kots version * no-op * f * no-op * keep noop * f * f * f * f * f * f * f * f * f --------- Co-authored-by: Salah Aldeen Al Saleh <[email protected]>
1 parent 329783d commit e2b7539

File tree

20 files changed

+351
-84
lines changed

20 files changed

+351
-84
lines changed

.github/workflows/ci.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ jobs:
113113
- name: Checkout
114114
uses: actions/checkout@v4
115115
- name: Make manifests
116+
env:
117+
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
116118
run: make -C operator manifests
117119
- name: Check CRDs
118120
run: |

cmd/buildtools/k0s.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,12 @@ var k0sImageComponents = map[string]addonComponent{
7676
return fmt.Sprintf("registry.k8s.io/pause:%s", opts.upstreamVersion.Original()), nil
7777
},
7878
},
79+
"quay.io/k0sproject/envoy-distroless": {
80+
name: "envoy-distroless",
81+
getWolfiPackageName: func(opts addonComponentOptions) string {
82+
return fmt.Sprintf("envoy-%d.%d", opts.upstreamVersion.Major(), opts.upstreamVersion.Minor())
83+
},
84+
},
7985
}
8086

8187
var updateK0sImagesCommand = &cli.Command{

cmd/buildtools/utils.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package main
22

33
import (
44
"bufio"
5+
"bytes"
56
"context"
67
"errors"
78
"fmt"
@@ -145,10 +146,12 @@ func ResolveApkoPackageVersion(componentName, packageName, packageVersion string
145146
fmt.Sprintf("PACKAGE_NAME=%s", packageName),
146147
fmt.Sprintf("PACKAGE_VERSION=%s", packageVersion),
147148
}
149+
var errBuf bytes.Buffer
148150
cmd := exec.Command("make", args...)
151+
cmd.Stderr = &errBuf
149152
out, err := cmd.Output()
150153
if err != nil {
151-
return "", fmt.Errorf("run command: %w: %s", err, string(out))
154+
return "", fmt.Errorf("run command: %w: %s", err, errBuf.String())
152155
}
153156
return strings.TrimSpace(string(out)), nil
154157
}
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# source: https://github.com/chainguard-images/images/blob/bea234042585fd6db129bc2c836aad4937f55799/images/envoy/config/main.tf
2+
contents:
3+
repositories:
4+
- https://packages.wolfi.dev/os
5+
keyring:
6+
- https://packages.wolfi.dev/os/wolfi-signing.rsa.pub
7+
packages:
8+
- envoy~${PACKAGE_VERSION}
9+
- envoy-config~${PACKAGE_VERSION}
10+
- envoy-oci-entrypoint~${PACKAGE_VERSION}
11+
- su-exec
12+
13+
accounts:
14+
groups:
15+
- groupname: nonroot
16+
gid: 65532
17+
users:
18+
- username: nonroot
19+
uid: 65532
20+
gid: 65532
21+
run-as: 65532
22+
23+
paths:
24+
- path: /etc/envoy
25+
type: directory
26+
uid: 65532
27+
gid: 65532
28+
permissions: 0o755
29+
30+
entrypoint:
31+
command: /var/lib/envoy/init/envoy-entrypoint.sh

e2e/cluster/docker/cluster.go

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -13,21 +13,23 @@ import (
1313
type Cluster struct {
1414
Nodes []*Container
1515

16-
t *testing.T
16+
t *testing.T
17+
supportBundleNodeIndex int
1718
}
1819

1920
type ClusterInput struct {
20-
T *testing.T
21-
Nodes int
22-
Distro string
23-
LicensePath string
24-
ECBinaryPath string
25-
ECReleaseBuilderPath string
26-
K0sDir string
21+
T *testing.T
22+
Nodes int
23+
Distro string
24+
LicensePath string
25+
ECBinaryPath string
26+
ECReleaseBuilderPath string
27+
K0sDir string
28+
SupportBundleNodeIndex int
2729
}
2830

2931
func NewCluster(in *ClusterInput) *Cluster {
30-
c := &Cluster{t: in.T}
32+
c := &Cluster{t: in.T, supportBundleNodeIndex: in.SupportBundleNodeIndex}
3133

3234
c.Nodes = make([]*Container, in.Nodes)
3335

@@ -170,16 +172,17 @@ func (c *Cluster) generateSupportBundle(envs ...map[string]string) {
170172
}(i, &wg)
171173
}
172174

173-
c.t.Logf("%s: generating cluster support bundle from node 0", time.Now().Format(time.RFC3339))
174-
if stdout, stderr, err := c.RunCommandOnNode(0, []string{"collect-support-bundle-cluster.sh"}, envs...); err != nil {
175+
node := c.Nodes[c.supportBundleNodeIndex]
176+
c.t.Logf("%s: generating cluster support bundle from node %d", time.Now().Format(time.RFC3339), c.supportBundleNodeIndex)
177+
if stdout, stderr, err := c.RunCommandOnNode(c.supportBundleNodeIndex, []string{"collect-support-bundle-cluster.sh"}, envs...); err != nil {
175178
c.t.Logf("stdout: %s", stdout)
176179
c.t.Logf("stderr: %s", stderr)
177-
c.t.Logf("fail to generate cluster support from node %d bundle: %v", 0, err)
180+
c.t.Logf("fail to generate cluster support from node %d bundle: %v", c.supportBundleNodeIndex, err)
178181
} else {
179-
c.t.Logf("%s: copying cluster support bundle from node 0 to local machine", time.Now().Format(time.RFC3339))
180-
src := fmt.Sprintf("%s:cluster.tar.gz", c.Nodes[0].GetName())
182+
c.t.Logf("%s: copying cluster support bundle from node %d to local machine", time.Now().Format(time.RFC3339), c.supportBundleNodeIndex)
183+
src := fmt.Sprintf("%s:cluster.tar.gz", node.GetName())
181184
dst := "support-bundle-cluster.tar.gz"
182-
if stdout, stderr, err := c.Nodes[0].CopyFile(src, dst); err != nil {
185+
if stdout, stderr, err := node.CopyFile(src, dst); err != nil {
183186
c.t.Logf("stdout: %s", stdout)
184187
c.t.Logf("stderr: %s", stderr)
185188
c.t.Logf("fail to generate cluster support bundle from node 0: %v", err)

e2e/cluster/lxd/cluster.go

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@ type ClusterInput struct {
7070
WithProxy bool
7171
id string
7272
AdditionalFiles []File
73+
SupportBundleNodeIndex int
7374
}
7475

7576
// File holds information about a file that must be uploaded to a node.
@@ -88,12 +89,13 @@ type Dir struct {
8889
// Cluster is returned when a cluster is created. Contain a list of all node
8990
// names and the cluster id.
9091
type Cluster struct {
91-
Nodes []string
92-
IPs []string
93-
network string
94-
id string
95-
T *testing.T
96-
Proxy string
92+
Nodes []string
93+
IPs []string
94+
network string
95+
id string
96+
T *testing.T
97+
Proxy string
98+
supportBundleNodeIndex int
9799
}
98100

99101
// Destroy destroys a cluster pointed by the id property.
@@ -210,9 +212,10 @@ func NewCluster(in *ClusterInput) *Cluster {
210212
in.network = <-networkaddr
211213

212214
out := &Cluster{
213-
T: in.T,
214-
network: in.network,
215-
id: in.id,
215+
T: in.T,
216+
network: in.network,
217+
id: in.id,
218+
supportBundleNodeIndex: in.SupportBundleNodeIndex,
216219
}
217220
out.T.Cleanup(out.Destroy)
218221

@@ -1174,10 +1177,10 @@ func (c *Cluster) generateSupportBundle(envs ...map[string]string) {
11741177
}(i, &wg)
11751178
}
11761179

1177-
node := c.Nodes[0]
1180+
node := c.Nodes[c.supportBundleNodeIndex]
11781181
c.T.Logf("%s: generating cluster support bundle from node %s", time.Now().Format(time.RFC3339), node)
11791182
line := []string{"collect-support-bundle-cluster.sh"}
1180-
if stdout, stderr, err := c.RunCommandOnNode(0, line, envs...); err != nil {
1183+
if stdout, stderr, err := c.RunCommandOnNode(c.supportBundleNodeIndex, line, envs...); err != nil {
11811184
c.T.Logf("stdout: %s", stdout)
11821185
c.T.Logf("stderr: %s", stderr)
11831186
c.T.Logf("fail to generate cluster support from node %s bundle: %v", node, err)

e2e/install_test.go

Lines changed: 39 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1851,11 +1851,12 @@ func TestMultiNodeAirgapUpgradePreviousStable(t *testing.T) {
18511851
// for them to report ready. Runs additional high availability validations afterwards.
18521852
func TestMultiNodeHAInstallation(t *testing.T) {
18531853
tc := docker.NewCluster(&docker.ClusterInput{
1854-
T: t,
1855-
Nodes: 4,
1856-
Distro: "debian-bookworm",
1857-
LicensePath: "license.yaml",
1858-
ECBinaryPath: "../output/bin/embedded-cluster",
1854+
T: t,
1855+
Nodes: 4,
1856+
Distro: "debian-bookworm",
1857+
LicensePath: "license.yaml",
1858+
ECBinaryPath: "../output/bin/embedded-cluster",
1859+
SupportBundleNodeIndex: 2,
18591860
})
18601861
defer tc.Cleanup()
18611862

@@ -1943,24 +1944,30 @@ func TestMultiNodeHAInstallation(t *testing.T) {
19431944
}
19441945

19451946
bin := strings.Split(command, " ")[0]
1946-
t.Logf("%s: resetting controller node 2", time.Now().Format(time.RFC3339))
1947-
stdout, stderr, err = tc.RunCommandOnNode(2, []string{bin, "reset", "--yes"})
1947+
t.Logf("%s: resetting controller node 0", time.Now().Format(time.RFC3339))
1948+
stdout, stderr, err = tc.RunCommandOnNode(0, []string{bin, "reset", "--yes"})
19481949
if err != nil {
1949-
t.Fatalf("fail to remove controller node 2: %v: %s: %s", err, stdout, stderr)
1950+
t.Fatalf("fail to remove controller node 0: %v: %s: %s", err, stdout, stderr)
19501951
}
19511952
if !strings.Contains(stdout, "High-availability clusters must maintain at least three controller nodes") {
19521953
t.Errorf("reset output does not contain the ha warning")
19531954
t.Logf("stdout: %s\nstderr: %s", stdout, stderr)
19541955
}
19551956

1956-
stdout, stderr, err = tc.RunCommandOnNode(0, []string{"check-nodes-removed.sh", "3"})
1957+
stdout, stderr, err = tc.RunCommandOnNode(2, []string{"check-nodes-removed.sh", "3"})
19571958
if err != nil {
1958-
t.Fatalf("fail to remove worker node 0: %v: %s: %s", err, stdout, stderr)
1959+
t.Fatalf("fail to check nodes removed: %v: %s: %s", err, stdout, stderr)
1960+
}
1961+
1962+
t.Logf("%s: checking nllb", time.Now().Format(time.RFC3339))
1963+
line = []string{"check-nllb.sh"}
1964+
if stdout, stderr, err := tc.RunCommandOnNode(2, line); err != nil {
1965+
t.Fatalf("fail to check nllb: %v: %s: %s", err, stdout, stderr)
19591966
}
19601967

19611968
t.Logf("%s: checking installation state after upgrade", time.Now().Format(time.RFC3339))
19621969
line = []string{"check-postupgrade-state.sh", k8sVersion(), ecUpgradeTargetVersion()}
1963-
if stdout, stderr, err := tc.RunCommandOnNode(0, line); err != nil {
1970+
if stdout, stderr, err := tc.RunCommandOnNode(2, line); err != nil {
19641971
t.Fatalf("fail to check postupgrade state: %v: %s: %s", err, stdout, stderr)
19651972
}
19661973

@@ -1993,6 +2000,7 @@ func TestMultiNodeAirgapHAInstallation(t *testing.T) {
19932000
WithProxy: true,
19942001
AirgapInstallBundlePath: airgapInstallBundlePath,
19952002
AirgapUpgradeBundlePath: airgapUpgradeBundlePath,
2003+
SupportBundleNodeIndex: 2,
19962004
})
19972005
defer tc.Cleanup()
19982006

@@ -2023,10 +2031,7 @@ func TestMultiNodeAirgapHAInstallation(t *testing.T) {
20232031
if _, _, err := tc.RunCommandOnNode(0, line); err != nil {
20242032
t.Fatalf("fail to remove airgap bundle on node %s: %v", tc.Nodes[0], err)
20252033
}
2026-
line = []string{"rm", "/usr/local/bin/embedded-cluster"}
2027-
if _, _, err := tc.RunCommandOnNode(0, line); err != nil {
2028-
t.Fatalf("fail to remove embedded-cluster binary on node %s: %v", tc.Nodes[0], err)
2029-
}
2034+
// do not remove the embedded-cluster binary as it is used for reset
20302035

20312036
if _, _, err := tc.SetupPlaywrightAndRunTest("deploy-app"); err != nil {
20322037
t.Fatalf("fail to run playwright test deploy-app: %v", err)
@@ -2054,8 +2059,8 @@ func TestMultiNodeAirgapHAInstallation(t *testing.T) {
20542059
t.Fatalf("fail to prepare airgap files on node 1: %v", err)
20552060
}
20562061
t.Logf("%s: joining node 1 to the cluster as a worker", time.Now().Format(time.RFC3339))
2057-
if _, _, err := tc.RunCommandOnNode(1, strings.Split(command, " ")); err != nil {
2058-
t.Fatalf("fail to join node 1 to the cluster as a worker: %v", err)
2062+
if stdout, stderr, err := tc.RunCommandOnNode(1, strings.Split(command, " ")); err != nil {
2063+
t.Fatalf("fail to join node 1 to the cluster as a worker: %v: %s: %s", err, stdout, stderr)
20592064
}
20602065
// remove the airgap bundle and binary after joining
20612066
line = []string{"rm", "/assets/release.airgap"}
@@ -2091,7 +2096,10 @@ func TestMultiNodeAirgapHAInstallation(t *testing.T) {
20912096
if _, _, err := tc.RunCommandOnNode(2, line); err != nil {
20922097
t.Fatalf("fail to remove airgap bundle on node 2: %v", err)
20932098
}
2094-
// don't remove the embedded-cluster binary as it is used for reset
2099+
line = []string{"rm", "/usr/local/bin/embedded-cluster"}
2100+
if _, _, err := tc.RunCommandOnNode(2, line); err != nil {
2101+
t.Fatalf("fail to remove embedded-cluster binary on node 2: %v", err)
2102+
}
20952103

20962104
// join another controller in HA mode
20972105
stdout, stderr, err = tc.RunPlaywrightTest("get-join-controller-command")
@@ -2163,27 +2171,32 @@ func TestMultiNodeAirgapHAInstallation(t *testing.T) {
21632171
}
21642172

21652173
bin := strings.Split(command, " ")[0]
2166-
t.Logf("%s: resetting controller node 2 with bin %q", time.Now().Format(time.RFC3339), bin)
2167-
stdout, stderr, err = tc.RunCommandOnNode(2, []string{bin, "reset", "--yes"})
2174+
t.Logf("%s: resetting controller node 0 with bin %q", time.Now().Format(time.RFC3339), bin)
2175+
stdout, stderr, err = tc.RunCommandOnNode(0, []string{bin, "reset", "--yes"})
21682176
if err != nil {
21692177
t.Logf("stdout: %s\nstderr: %s", stdout, stderr)
2170-
t.Fatalf("fail to remove controller node %s:", err)
2178+
t.Fatalf("fail to remove controller node 0 %s:", err)
21712179
}
21722180
if !strings.Contains(stdout, "High-availability clusters must maintain at least three controller nodes") {
21732181
t.Errorf("reset output does not contain the ha warning")
21742182
t.Logf("stdout: %s\nstderr: %s", stdout, stderr)
21752183
}
21762184

2177-
stdout, _, err = tc.RunCommandOnNode(0, []string{"check-nodes-removed.sh", "3"})
2185+
stdout, _, err = tc.RunCommandOnNode(2, []string{"check-nodes-removed.sh", "3"})
21782186
if err != nil {
2179-
t.Log(stdout)
2180-
t.Fatalf("fail to remove worker node %s:", err)
2187+
t.Fatalf("fail to check nodes removed: %v: %s: %s", err, stdout, stderr)
2188+
}
2189+
2190+
t.Logf("%s: checking nllb", time.Now().Format(time.RFC3339))
2191+
line = []string{"check-nllb.sh"}
2192+
if stdout, stderr, err := tc.RunCommandOnNode(2, line); err != nil {
2193+
t.Fatalf("fail to check nllb: %v: %s: %s", err, stdout, stderr)
21812194
}
21822195

21832196
t.Logf("%s: checking installation state after upgrade", time.Now().Format(time.RFC3339))
21842197
line = []string{"check-postupgrade-state.sh", k8sVersion(), ecUpgradeTargetVersion()}
2185-
if _, _, err := tc.RunCommandOnNode(0, line); err != nil {
2186-
t.Fatalf("fail to check postupgrade state: %v", err)
2198+
if stdout, stderr, err := tc.RunCommandOnNode(2, line); err != nil {
2199+
t.Fatalf("fail to check postupgrade state: %v: %s: %s", err, stdout, stderr)
21872200
}
21882201

21892202
t.Logf("%s: test complete", time.Now().Format(time.RFC3339))

e2e/reset_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -109,7 +109,7 @@ func TestMultiNodeReset(t *testing.T) {
109109

110110
stdout, stderr, err = tc.RunCommandOnNode(0, []string{"check-nodes-removed.sh", "2"})
111111
if err != nil {
112-
t.Fatalf("fail to remove worker node 0: %v: %s: %s", err, stdout, stderr)
112+
t.Fatalf("fail to check nodes removed: %v: %s: %s", err, stdout, stderr)
113113
}
114114

115115
t.Logf("%s: checking installation state", time.Now().Format(time.RFC3339))

e2e/scripts/check-nllb.sh

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
#!/bin/bash
2+
3+
# This script is meant to be run after deleting the first control plane node to ensure the cluster
4+
# is still functional.
5+
6+
set -euxo pipefail
7+
8+
DIR=/usr/local/bin
9+
. $DIR/common.sh
10+
11+
function main() {
12+
local worker_node=
13+
worker_node=$(kubectl get nodes -l node-role.kubernetes.io/control-plane!=true -oname | awk -F/ 'NR==1{print $2}')
14+
15+
if [ -z "$worker_node" ]; then
16+
echo "No worker node found"
17+
exit 1
18+
fi
19+
20+
local kotsadm_image=
21+
kotsadm_image=$(kubectl -n kotsadm get deploy kotsadm -o jsonpath='{.spec.template.spec.containers[0].image}')
22+
23+
if [ -z "$kotsadm_image" ]; then
24+
echo "No kotsadm image found"
25+
exit 1
26+
fi
27+
28+
# run the pod on a worker node
29+
kubectl run test-nllb --image "$kotsadm_image" \
30+
--overrides='{"spec": { "nodeSelector": {"kubernetes.io/hostname": "'"$worker_node"'"}}}' --command -- sleep infinity
31+
32+
# wait for the pod to be running
33+
if ! kubectl wait --for=condition=ready pod/test-nllb --timeout=1m; then
34+
echo "Pod test-nllb did not become ready"
35+
kubectl describe pod test-nllb
36+
exit 1
37+
fi
38+
}
39+
40+
main "$@"

e2e/scripts/restore-installation-airgap.exp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ expect {
9292
}
9393

9494
expect {
95-
-timeout 210 "Velero is ready!" {}
95+
-timeout 300 "Velero is ready!" {}
9696
timeout {
9797
puts "\n\nFailed to wait for Velero to be ready."
9898
exit 1

0 commit comments

Comments
 (0)