Skip to content

Commit 813959b

Browse files
authored
fix(seaweedfs): raft is flakey causing HA registry instability (#3049)
1 parent 1f1669f commit 813959b

File tree

9 files changed

+617
-48
lines changed

9 files changed

+617
-48
lines changed

.github/workflows/ci.yaml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@ jobs:
128128
- int-tests-api
129129
- int-tests-kind
130130
- int-tests-kind-ha-registry
131+
- int-tests-kind-ha-registry-disable-hashi-raft
131132
if: ${{ !cancelled() }}
132133
steps:
133134
# https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context
@@ -140,6 +141,9 @@ jobs:
140141
- name: fail if int-tests-kind-ha-registry job was not successful
141142
if: needs.int-tests-kind-ha-registry.result != 'success' && needs.int-tests-kind-ha-registry.result != 'skipped'
142143
run: exit 1
144+
- name: fail if int-tests-kind-ha-registry-disable-hashi-raft job was not successful
145+
if: needs.int-tests-kind-ha-registry-disable-hashi-raft.result != 'success' && needs.int-tests-kind-ha-registry-disable-hashi-raft.result != 'skipped'
146+
run: exit 1
143147
- name: succeed if everything else passed
144148
run: echo "Integration tests succeeded"
145149

@@ -231,6 +235,55 @@ jobs:
231235
export VERSION=${{ needs.output-vars.outputs.ec_version }}
232236
make -C tests/integration/kind test-registry RUN=TestRegistry_EnableHAAirgap
233237
238+
int-tests-kind-ha-registry-disable-hashi-raft:
239+
name: Integration tests (kind) HA registry migrate seaweed raft
240+
runs-on: ubuntu-latest
241+
needs:
242+
- output-vars
243+
- should-run-int-tests-kind
244+
if: needs.should-run-int-tests-kind.outputs.run == 'true'
245+
steps:
246+
- name: Checkout
247+
uses: actions/checkout@v5
248+
- name: Setup go
249+
uses: actions/setup-go@v6
250+
with:
251+
go-version-file: go.mod
252+
cache-dependency-path: "**/*.sum"
253+
- name: Free up runner disk space # this is much faster than .github/actions/free-disk-space
254+
run: |
255+
df -h
256+
sudo rm -rf \
257+
/usr/share/swift \
258+
/usr/share/dotnet \
259+
/usr/lib/jvm \
260+
/usr/local/share/boost \
261+
/usr/local/lib/heroku \
262+
/usr/local/julia* \
263+
/usr/local/.ghcup \
264+
/usr/local/share/powershell \
265+
/usr/local/bin/aliyun \
266+
/usr/local/bin/azcopy \
267+
/usr/local/bin/bicep \
268+
/usr/local/bin/cpack \
269+
/usr/local/bin/hub \
270+
/usr/local/bin/minikube \
271+
/usr/local/bin/packer \
272+
/usr/local/bin/pulumi* \
273+
/usr/local/bin/sam \
274+
/usr/local/bin/stack \
275+
/usr/local/bin/terraform \
276+
/usr/local/bin/oc
277+
df -h
278+
- name: Install kind
279+
uses: helm/kind-action@7cd7463a0995e35ab5d0f2c119f892514f3a3778
280+
with:
281+
install_only: true
282+
- name: Run tests
283+
run: |
284+
export VERSION=${{ needs.output-vars.outputs.ec_version }}
285+
make -C tests/integration/kind test-registry RUN=TestRegistry_DisableHashiRaft
286+
234287
dryrun-tests:
235288
name: Dryrun tests
236289
runs-on: ubuntu-latest

pkg/addons/seaweedfs/static/values.tpl.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@ master:
1313
nodeSelector: ~
1414
disableHttp: true
1515
volumeSizeLimitMB: 30000
16+
raftHashicorp: true
17+
raftBootstrap: true # masters would crash when raftHashicorp=true and raftBootstrap=false
1618
data:
1719
hostPathPrefix: "/var/lib/embedded-cluster/seaweedfs/ssd"
1820
logs:

pkg/addons/seaweedfs/upgrade.go

Lines changed: 79 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package seaweedfs
33
import (
44
"context"
55
"fmt"
6+
"strings"
67
"time"
78

89
"github.com/Masterminds/semver/v3"
@@ -52,6 +53,23 @@ func (s *SeaweedFS) Upgrade(
5253
}
5354
}
5455

56+
// When upgrading from a previous version, we need to disable hashicorp raft as a rolling
57+
// update will fail if toggling raft implementation.
58+
shouldDisableHashicorpRaft, err := s.shouldDisableRaftHashicorp(ctx, kcli)
59+
if err != nil {
60+
return fmt.Errorf("checking if raft hashicorp should be disabled: %w", err)
61+
}
62+
if shouldDisableHashicorpRaft {
63+
logrus.Debug("Setting master.raftHashicorp=false and master.raftBootstrap=false")
64+
if err := helm.SetValue(values, "master.raftHashicorp", false); err != nil {
65+
return fmt.Errorf("setting master.raftHashicorp: %w", err)
66+
}
67+
if err := helm.SetValue(values, "master.raftBootstrap", false); err != nil {
68+
return fmt.Errorf("setting master.raftBootstrap: %w", err)
69+
}
70+
logrus.Debug("master.raftHashicorp=false and master.raftBootstrap=false set")
71+
}
72+
5573
_, err = hcli.Upgrade(ctx, helm.UpgradeOptions{
5674
ReleaseName: s.ReleaseName(),
5775
ChartPath: s.ChartLocation(domains),
@@ -73,39 +91,20 @@ func (s *SeaweedFS) Upgrade(
7391
func (s *SeaweedFS) needsScalingRestart(ctx context.Context, kcli client.Client) (bool, error) {
7492
logrus.Debug("Checking if scaling fix is needed for upgrade from pre-2.7.3")
7593

76-
// Get the latest installation to use for getting the previous one
77-
latest, err := kubeutils.GetLatestInstallation(ctx, kcli)
78-
if err != nil {
79-
return false, fmt.Errorf("getting latest installation: %w", err)
80-
}
81-
82-
// Get the previous installation to check version
83-
previous, err := kubeutils.GetPreviousInstallation(ctx, kcli, latest)
94+
prevVersion, err := getPreviousECVersion(ctx, kcli)
8495
if err != nil {
85-
var errNotFound kubeutils.ErrInstallationNotFound
86-
if errors.As(err, &errNotFound) {
87-
logrus.Debug("No previous installation found, no scaling fix needed")
88-
return false, nil // No previous installation means no upgrade, no scaling fix needed
89-
}
90-
return false, fmt.Errorf("getting previous installation: %w", err)
91-
}
92-
93-
if previous == nil || previous.Spec.Config == nil || previous.Spec.Config.Version == "" {
94-
return false, errors.New("previous installation has no version config")
95-
}
96-
97-
// Parse previous version
98-
prevVersion, err := semver.NewVersion(previous.Spec.Config.Version)
99-
if err != nil {
100-
return false, fmt.Errorf("parsing previous version %s: %w", previous.Spec.Config.Version, err)
96+
return false, fmt.Errorf("get previous installation: %w", err)
97+
} else if prevVersion == nil {
98+
logrus.Debug("No previous version found, no scaling fix needed")
99+
return false, nil
101100
}
102101

103102
// Only restart if upgrading from < 2.7.3
104103
if !lessThanECVersion273(prevVersion) {
105-
logrus.Debugf("Previous version %s >= 2.7.3, no scaling fix needed", prevVersion.String())
104+
logrus.Debugf("Previous version %s >= 2.7.3, no scaling fix needed", prevVersion)
106105
return false, nil
107106
}
108-
logrus.Debugf("Previous version %s < 2.7.3, checking StatefulSet configuration", prevVersion.String())
107+
logrus.Debugf("Previous version %s < 2.7.3, checking StatefulSet configuration", prevVersion)
109108

110109
// Check if SeaweedFS StatefulSet exists and check current replica configuration
111110
var sts appsv1.StatefulSet
@@ -141,12 +140,65 @@ func (s *SeaweedFS) needsScalingRestart(ctx context.Context, kcli client.Client)
141140
return false, nil
142141
}
143142

143+
func getPreviousECVersion(ctx context.Context, kcli client.Client) (*semver.Version, error) {
144+
latest, err := kubeutils.GetLatestInstallation(ctx, kcli)
145+
if err != nil {
146+
return nil, fmt.Errorf("get latest installation: %w", err)
147+
}
148+
previous, err := kubeutils.GetPreviousInstallation(ctx, kcli, latest)
149+
if err != nil {
150+
var errNotFound kubeutils.ErrInstallationNotFound
151+
if errors.As(err, &errNotFound) {
152+
return nil, nil
153+
}
154+
return nil, fmt.Errorf("get previous installation: %w", err)
155+
}
156+
if previous.Spec.Config == nil || previous.Spec.Config.Version == "" {
157+
return nil, errors.New("previous installation has no version config")
158+
}
159+
sv, err := semver.NewVersion(previous.Spec.Config.Version)
160+
if err != nil {
161+
return nil, fmt.Errorf("parse previous version %s: %w", previous.Spec.Config.Version, err)
162+
}
163+
return sv, nil
164+
}
165+
166+
var version273 = semver.MustParse("2.7.3")
167+
144168
// lessThanECVersion273 checks if a version is less than 2.7.3
145169
func lessThanECVersion273(ver *semver.Version) bool {
146-
version273 := semver.MustParse("2.7.3")
147170
return ver.LessThan(version273)
148171
}
149172

173+
// shouldDisableRaftHashicorp checks to see if there is a previous statefulset without
174+
// -raftHashicorp argument
175+
func (s *SeaweedFS) shouldDisableRaftHashicorp(ctx context.Context, kcli client.Client) (bool, error) {
176+
logrus.Debug("Checking if hashicorp raft should be disabled")
177+
178+
var sts appsv1.StatefulSet
179+
nsn := client.ObjectKey{Namespace: s.Namespace(), Name: "seaweedfs-master"}
180+
if err := kcli.Get(ctx, nsn, &sts); client.IgnoreNotFound(err) != nil {
181+
return false, fmt.Errorf("get seaweedfs master statefulset: %w", err)
182+
} else if err != nil {
183+
// not found, so no previous statefulset
184+
logrus.Debug("No previous statefulset found, do not disable raft hashicorp")
185+
return false, nil
186+
}
187+
// check if the seaweedfs container has the -raftHashicorp argument
188+
for _, container := range sts.Spec.Template.Spec.Containers {
189+
if container.Name == "seaweedfs" {
190+
for _, arg := range append(container.Command, container.Args...) {
191+
if strings.Contains(arg, "-raftHashicorp") {
192+
logrus.Debug("Raft hashicorp is enabled, do not disable it")
193+
return false, nil
194+
}
195+
}
196+
}
197+
}
198+
logrus.Debug("Raft hashicorp is disabled, disable it")
199+
return true, nil
200+
}
201+
150202
// scaleStatefulSet directly scales the StatefulSet to the target replica count
151203
func (s *SeaweedFS) scaleStatefulSet(ctx context.Context, kcli client.Client, replicas int32) error {
152204
// Get the current StatefulSet

0 commit comments

Comments
 (0)