Skip to content

Commit db2c533

Browse files
[release-22.0] Fix deadlock in ValidateKeyspace (#18114) (#18117)
Signed-off-by: Manan Gupta <[email protected]> Co-authored-by: vitess-bot[bot] <108069721+vitess-bot[bot]@users.noreply.github.com>
1 parent 0aa6ff5 commit db2c533

File tree

2 files changed

+122
-9
lines changed

2 files changed

+122
-9
lines changed

go/vt/vtctl/grpcvtctldserver/server.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5078,8 +5078,17 @@ func (s *VtctldServer) ValidateShard(ctx context.Context, req *vtctldatapb.Valid
50785078

50795079
var (
50805080
wg sync.WaitGroup
5081-
results = make(chan string, len(aliases))
5081+
results = make(chan string, len(aliases)+1)
50825082
)
5083+
// Start processing results immediately, so that we
5084+
// don't end up blocking on writes.
5085+
done := make(chan bool)
5086+
go func() {
5087+
for result := range results {
5088+
resp.Results = append(resp.Results, result)
5089+
}
5090+
done <- true
5091+
}()
50835092

50845093
for _, alias := range aliases {
50855094
wg.Add(1)
@@ -5184,14 +5193,6 @@ func (s *VtctldServer) ValidateShard(ctx context.Context, req *vtctldatapb.Valid
51845193
pingTablets(ctx, tabletMap, results) // done async, using the waitgroup declared above in the main method body.
51855194
}
51865195

5187-
done := make(chan bool)
5188-
go func() {
5189-
for result := range results {
5190-
resp.Results = append(resp.Results, result)
5191-
}
5192-
done <- true
5193-
}()
5194-
51955196
wg.Wait()
51965197
close(results)
51975198
<-done

go/vt/vtctl/grpcvtctldserver/server_test.go

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222
"fmt"
2323
"io"
2424
"os"
25+
"path"
2526
"slices"
2627
"sort"
2728
"strings"
@@ -14398,6 +14399,117 @@ func TestValidateKeyspace(t *testing.T) {
1439814399
}
1439914400
}
1440014401

14402+
// This test validates the bug described in https://github.com/vitessio/vitess/issues/18113.
14403+
func TestValidateKeyspaceErrors(t *testing.T) {
14404+
ctx, cancel := context.WithCancel(context.Background())
14405+
defer cancel()
14406+
cell := "zone-1"
14407+
ks := "ks"
14408+
shard := "-"
14409+
// Setup the topo server and the vtctld server.
14410+
ts := memorytopo.NewServer(ctx, cell)
14411+
tmc := &testutil.TabletManagerClient{}
14412+
vtctldServer := NewTestVtctldServer(ts, tmc)
14413+
conn, err := ts.ConnForCell(ctx, cell)
14414+
require.NoError(t, err)
14415+
14416+
// Create the keyspace and shard record
14417+
_, err = ts.GetOrCreateShard(ctx, ks, shard)
14418+
require.NoError(t, err)
14419+
14420+
// verify ValidateKeyspace finds all the issues without hanging
14421+
resp := validateKeyspace(t, vtctldServer, ctx, ks)
14422+
// We expect two results -
14423+
// 1. no primary for shard ks/-
14424+
// 2. no primary in shard record ks/-
14425+
require.Len(t, resp.GetResultsByShard()[shard].Results, 2)
14426+
14427+
// Next we test the worst case for ValidateKeyspace wherein everything creates
14428+
// issues to ensure we never block on writing to the results field.
14429+
14430+
// We first create all tablets. We intentionally create a mismatch in the tablet alias
14431+
// file name and the tablet alias actually stored so that we see the error in Validate.
14432+
numTablets := 10
14433+
tablets := make([]*topodatapb.Tablet, numTablets)
14434+
for i := 0; i < numTablets; i++ {
14435+
typ := topodatapb.TabletType_REPLICA
14436+
if i == 0 {
14437+
typ = topodatapb.TabletType_PRIMARY
14438+
}
14439+
tablets[i] = &topodatapb.Tablet{
14440+
Keyspace: ks,
14441+
Shard: shard,
14442+
Type: typ,
14443+
Alias: &topodatapb.TabletAlias{
14444+
Cell: cell,
14445+
Uid: uint32(100 + i),
14446+
},
14447+
}
14448+
err = ts.CreateTablet(ctx, tablets[i])
14449+
require.NoError(t, err)
14450+
14451+
// We're explicitly messing up the record, so we can't use
14452+
// the UpdateTablet function as it doesn't not allow this.
14453+
tabletPath := path.Join(topo.TabletsPath, topoproto.TabletAliasString(tablets[i].Alias), topo.TabletFile)
14454+
tablets[i].Alias.Uid += 100
14455+
data, err := tablets[i].MarshalVT()
14456+
require.NoError(t, err)
14457+
_, err = conn.Update(ctx, tabletPath, data, nil)
14458+
require.NoError(t, err)
14459+
// Reset the values for later use.
14460+
tablets[i].Alias.Uid -= 100
14461+
}
14462+
14463+
// verify ValidateKeyspace finds all the issues without hanging
14464+
resp = validateKeyspace(t, vtctldServer, ctx, ks)
14465+
// This time we expect the following results -
14466+
// 1. primary mismatch for shard ks/-: found zone-1-0000000100, expected <nil>
14467+
// 2. no primary in shard record ks/-
14468+
// 3. topo.Validate for all numTablets tablets.
14469+
// 4. tabletmanager ping for all numTablets tablets with a general error.
14470+
require.Len(t, resp.GetResultsByShard()[shard].Results, 2*numTablets+2, strings.Join(resp.GetResultsByShard()[shard].Results, ","))
14471+
14472+
// We now fix the shard record so that we can get even more errors.
14473+
_, err = ts.UpdateShardFields(ctx, ks, shard, func(info *topo.ShardInfo) error {
14474+
info.PrimaryAlias = tablets[0].Alias
14475+
return nil
14476+
})
14477+
require.NoError(t, err)
14478+
14479+
tmc.GetReplicasResults = map[string]struct {
14480+
Replicas []string
14481+
Error error
14482+
}{
14483+
"zone-1-0000000200": {
14484+
// We return numTablets-1 values for the replica tablets.
14485+
Replicas: make([]string, numTablets-1),
14486+
},
14487+
}
14488+
14489+
// verify ValidateKeyspace finds all the issues without hanging
14490+
resp = validateKeyspace(t, vtctldServer, ctx, ks)
14491+
// This time we expect the following results results -
14492+
// 1. topo.Validate for all numTablets tablets.
14493+
// 2. Failure in resolving the hostname for all numTablets tablets.
14494+
// 3. Replicas not being in the replication graph (since we couldn't get their hostnames). This is for all the replicas -> numTablets - 1
14495+
// 4. Resolving hostname again for all the replicas -> numTablets - 1
14496+
// 5. tabletmanager ping for all numTablets tablets with a general error.
14497+
require.Len(t, resp.GetResultsByShard()[shard].Results, 5*numTablets-2, strings.Join(resp.GetResultsByShard()[shard].Results, ","))
14498+
}
14499+
14500+
// validateKeyspace is a helper function for testing.
14501+
func validateKeyspace(t *testing.T, vtctldServer *VtctldServer, ctx context.Context, ks string) *vtctldatapb.ValidateKeyspaceResponse {
14502+
t.Helper()
14503+
resp, err := vtctldServer.ValidateKeyspace(ctx, &vtctldatapb.ValidateKeyspaceRequest{
14504+
Keyspace: ks,
14505+
PingTablets: true,
14506+
})
14507+
require.NoError(t, err)
14508+
require.NotNil(t, resp)
14509+
require.NotNil(t, resp.GetResultsByShard())
14510+
return resp
14511+
}
14512+
1440114513
func TestValidateShard(t *testing.T) {
1440214514
t.Parallel()
1440314515

0 commit comments

Comments
 (0)