Commit 3f246d6

roachtest: ensure valid suspect duration in mixed version decommission
In cockroachdb#106859, the `decommission/mixed-versions` test was updated to properly support the decommission pre-checks introduced in 23.1. In doing so, however, an inadvertent bug was introduced into the test via the `server.time_after_store_suspect` setting. While this setting can be used to shorten the time a store is considered suspect after a node restart, its handling differs between 23.1 (the current predecessor major version) and 23.2: 23.2 requires a minimum of 10s and otherwise reverts to the default of 30s, whereas the predecessor version performs no such validation when the setting is overridden. This change corrects that mistake by setting the value to the 23.2 minimum and waiting out the "suspect" time after restart before attempting decommission.

Fixes: cockroachdb#107150.

Release note: None
1 parent 69bc4c6 commit 3f246d6
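
For context, a minimal standalone sketch of the pattern the fix relies on: pin `server.time_after_store_suspect` to the 10s minimum that 23.2 accepts, then wait out that window after the node restart before attempting decommission. This is not the roachtest code itself; the driver choice, connection string, and surrounding flow are illustrative assumptions.

```go
package main

import (
	"database/sql"
	"fmt"
	"log"
	"time"

	_ "github.com/lib/pq" // any Postgres-wire driver works against CockroachDB; this choice is an assumption
)

// The 23.2 minimum: anything lower fails validation there and silently
// falls back to the 30s default, even if a 23.1 node accepted the override.
const suspectDuration = 10 * time.Second

func main() {
	// Placeholder connection string for one node of the test cluster.
	db, err := sql.Open("postgres",
		"postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Shorten the suspect window to the smallest value 23.2 accepts, so the
	// override behaves the same on the predecessor and the current version.
	stmt := fmt.Sprintf(
		"SET CLUSTER SETTING server.time_after_store_suspect = '%s'", suspectDuration)
	if _, err := db.Exec(stmt); err != nil {
		log.Fatal(err)
	}

	// ...restart the node under test here (in the real roachtest this goes
	// through the cluster harness)...

	// Wait out the suspect window so the restarted store is no longer suspect
	// when the decommission pre-checks run.
	time.Sleep(suspectDuration)
	log.Printf("suspect window (%s) elapsed; decommission can proceed", suspectDuration)
}
```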

File tree

1 file changed: +10 −11 lines

pkg/cmd/roachtest/tests/mixed_version_decommission.go

Lines changed: 10 additions & 11 deletions
@@ -40,12 +40,13 @@ func runDecommissionMixedVersions(
 
 	h := newDecommTestHelper(t, c)
 
-	// The v20.2 CLI can only be run against servers running v20.2. For this
-	// reason, we grab a handle on a specific server slated for an upgrade.
 	pinnedUpgrade := h.getRandNode()
 	t.L().Printf("pinned n%d for upgrade", pinnedUpgrade)
 
-	const suspectDuration = 5 * time.Second
+	// NB: The suspect duration must be at least 10s, as versions 23.2 and
+	// beyond will reset to the default of 30s if it fails validation, even if
+	// set by a previous version.
+	const suspectDuration = 10 * time.Second
 
 	allNodes := c.All()
 	u := newVersionUpgradeTest(c,
@@ -61,7 +62,7 @@ func runDecommissionMixedVersions(
 
 		preloadDataStep(pinnedUpgrade),
 
-		// We upgrade a pinnedUpgrade and one other random node of the cluster to v20.2.
+		// We upgrade a pinned node and one other random node of the cluster to the current version.
 		binaryUpgradeStep(c.Node(pinnedUpgrade), clusterupgrade.MainVersion),
 		binaryUpgradeStep(c.Node(h.getRandNodeOtherThan(pinnedUpgrade)), clusterupgrade.MainVersion),
 		checkAllMembership(pinnedUpgrade, "active"),
@@ -101,7 +102,7 @@ func runDecommissionMixedVersions(
 		// Note also that this has to remain the last step unless we want this test to
 		// handle the fact that the decommissioned node will no longer be able
 		// to communicate with the cluster (i.e. most commands against it will fail).
-		// This is also why we're making sure to avoid decommissioning pinnedUpgrade
+		// This is also why we're making sure to avoid decommissioning the pinned node
 		// itself, as we use it to check the membership after.
 		fullyDecommissionStep(h.getRandNodeOtherThan(pinnedUpgrade), h.getRandNode(), ""),
 		checkOneMembership(pinnedUpgrade, "decommissioned"),
@@ -181,7 +182,7 @@ func fullyDecommissionStep(target, from int, binaryVersion string) versionStep {
 
 // checkOneDecommissioning checks against the `decommissioning` column in
 // crdb_internal.gossip_liveness, asserting that only one node is marked as
-// decommissioning. This check can be run against both v20.1 and v20.2 servers.
+// decommissioning.
 func checkOneDecommissioning(from int) versionStep {
 	return func(ctx context.Context, t test.Test, u *versionUpgradeTest) {
 		// We use a retry block here (and elsewhere) because we're consulting
@@ -214,7 +215,7 @@ func checkOneDecommissioning(from int) versionStep {
 
 // checkNoDecommissioning checks against the `decommissioning` column in
 // crdb_internal.gossip_liveness, asserting that only no nodes are marked as
-// decommissioning. This check can be run against both v20.1 and v20.2 servers.
+// decommissioning.
 func checkNoDecommissioning(from int) versionStep {
 	return func(ctx context.Context, t test.Test, u *versionUpgradeTest) {
 		if err := retry.ForDuration(testutils.DefaultSucceedsSoonDuration, func() error {
@@ -237,8 +238,7 @@ func checkNoDecommissioning(from int) versionStep {
 
 // checkOneMembership checks against the `membership` column in
 // crdb_internal.gossip_liveness, asserting that only one node is marked with
-// the specified membership status. This check can be only be run against
-// servers running v20.2 and beyond.
+// the specified membership status.
 func checkOneMembership(from int, membership string) versionStep {
 	return func(ctx context.Context, t test.Test, u *versionUpgradeTest) {
 		if err := retry.ForDuration(testutils.DefaultSucceedsSoonDuration, func() error {
@@ -268,8 +268,7 @@ func checkOneMembership(from int, membership string) versionStep {
 
 // checkAllMembership checks against the `membership` column in
 // crdb_internal.gossip_liveness, asserting that all nodes are marked with
-// the specified membership status. This check can be only be run against
-// servers running v20.2 and beyond.
+// the specified membership status.
 func checkAllMembership(from int, membership string) versionStep {
 	return func(ctx context.Context, t test.Test, u *versionUpgradeTest) {
 		if err := retry.ForDuration(testutils.DefaultSucceedsSoonDuration, func() error {
