Skip to content

Commit 956b1b8

Browse files
committed
roachtest: deflake multi-store-remove
The test was checking on replication by asserting that the number of ranges times the replication factor is equal to the number of replicas. Replicas can persist after no longer being part of a range (waiting for replicaGC), which is why this test was flaky. I first tried to fix it by lowering the replicaGC queue's interval at which it checks ranges (defaults to 12h), but looking more closely at what the test was doing, I decided to change that instead by no longer caring about what replicas each store reports. What matters is what the meta ranges report, and now the test checks replication similar to how many other tests do, by querying ranges_no_leases. Fixes cockroachdb#146435. Fixes cockroachdb#147763. Fixes cockroachdb#156447. Epic: none
1 parent 9bcc3fd commit 956b1b8

File tree

1 file changed

+28
-23
lines changed

1 file changed

+28
-23
lines changed

pkg/cmd/roachtest/tests/multi_store_remove.go

Lines changed: 28 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,15 @@ import (
1313
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
1414
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
1515
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
16+
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
1617
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
1718
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
1819
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
20+
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
1921
"github.com/cockroachdb/cockroach/pkg/util/retry"
2022
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
2123
"github.com/cockroachdb/errors"
24+
"github.com/stretchr/testify/require"
2225
)
2326

2427
const (
@@ -68,6 +71,7 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
6871
)
6972

7073
c.Start(ctx, t.L(), startOpts, startSettings, c.Range(1, 3))
74+
require.NoError(t, roachtestutil.WaitFor3XReplication(ctx, t.L(), c.Conn(ctx, t.L(), 1)))
7175

7276
// Confirm that there are 6 stores live.
7377
t.Status("store setup")
@@ -134,6 +138,9 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
134138
if err := c.StartE(ctx, t.L(), startOpts, startSettings, node); err != nil {
135139
t.Fatalf("restarting node: %s", err)
136140
}
141+
// Example: with 12 stores per node, since we removed n1's last one, we did remove
142+
// s12.
143+
removedStoreID := multiStoreStoresPerNode
137144

138145
// Wait for the store to be marked as dead.
139146
t.Status("awaiting store death")
@@ -151,32 +158,30 @@ func runMultiStoreRemove(ctx context.Context, t test.Test, c cluster.Cluster) {
151158
t.Fatalf("awaiting store death: %s", err)
152159
}
153160

154-
// Wait for up-replication.
155-
// NOTE: At the time of writing, under-replicated ranges are computed using
156-
// node liveness, rather than store liveness, so we instead compare the range
157-
// count to the current per-store replica count to compute whether all there
158-
// are still under-replicated ranges from the dead store.
159-
// TODO(travers): Once #123561 is solved, re-work this.
160-
t.Status("awaiting up-replication")
161-
tStart := timeutil.Now()
162-
var oldReplicas int
161+
// Wait until no ranges remain on the removed store.
162+
t.Status("awaiting range relocation")
163+
sr := sqlutils.MakeSQLRunner(conn)
163164
for {
164-
var ranges, replicas int
165-
if err := conn.QueryRowContext(ctx,
166-
`SELECT
167-
(SELECT count(1) FROM crdB_internal.ranges) AS ranges
168-
, (SELECT sum(range_count) FROM crdb_internal.kv_store_status) AS replicas`,
169-
).Scan(&ranges, &replicas); err != nil {
170-
t.Fatalf("replication status: %s", err)
171-
}
172-
if replicas == 3*ranges {
173-
t.L().Printf("up-replication complete")
165+
res := sr.QueryStr(t, `
166+
SELECT
167+
(SELECT count(*) FROM crdb_internal.ranges_no_leases
168+
WHERE $1::INT = ANY(replicas)) AS cnt,
169+
ARRAY(
170+
SELECT range_id
171+
FROM crdb_internal.ranges_no_leases
172+
WHERE $1::INT = ANY(replicas)
173+
ORDER BY range_id
174+
LIMIT 10) AS sample;
175+
`, removedStoreID)
176+
cnt := res[0][0]
177+
sample := res[0][1]
178+
179+
if cnt == "0" {
180+
t.L().Printf("all ranges moved off removed store")
174181
break
175182
}
176-
if timeutil.Since(tStart) > 30*time.Second || oldReplicas != replicas {
177-
t.L().Printf("still waiting for replication (%d / %d)", replicas, 3*ranges)
178-
}
179-
oldReplicas = replicas
183+
184+
t.L().Printf("%s ranges remain on removed store; for example: %s", cnt, sample)
180185
time.Sleep(5 * time.Second)
181186
}
182187
t.Status("done")

0 commit comments

Comments
 (0)