Skip to content

Commit 7bdb74f

Browse files
[release-20.0] Filter out tablets with unknown replication lag when electing a new primary (vitessio#18004) (vitessio#18074)
Signed-off-by: Florent Poinsard <[email protected]> Co-authored-by: Florent Poinsard <[email protected]>
1 parent 794c89a commit 7bdb74f

File tree

2 files changed

+153
-15
lines changed

2 files changed

+153
-15
lines changed

go/vt/vtctl/reparentutil/util.go

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -124,12 +124,16 @@ func ElectNewPrimary(
124124
tb := tablet
125125
errorGroup.Go(func() error {
126126
// find and store the positions for the tablet
127-
pos, replLag, err := findPositionAndLagForTablet(groupCtx, tb, logger, tmc, waitReplicasTimeout)
127+
pos, replLag, replUnknown, err := findPositionAndLagForTablet(groupCtx, tb, logger, tmc, waitReplicasTimeout)
128128
mu.Lock()
129129
defer mu.Unlock()
130130
if err == nil && (tolerableReplLag == 0 || tolerableReplLag >= replLag) {
131-
validTablets = append(validTablets, tb)
132-
tabletPositions = append(tabletPositions, pos)
131+
if replUnknown {
132+
reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v position known but unknown replication status", topoproto.TabletAliasString(tablet.Alias)))
133+
} else {
134+
validTablets = append(validTablets, tb)
135+
tabletPositions = append(tabletPositions, pos)
136+
}
133137
} else {
134138
reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v has %v replication lag which is more than the tolerable amount", topoproto.TabletAliasString(tablet.Alias), replLag))
135139
}
@@ -158,7 +162,7 @@ func ElectNewPrimary(
158162

159163
// findPositionAndLagForTablet processes the replication position and lag for a single tablet and
160164
// returns it. It is safe to call from multiple goroutines.
161-
func findPositionAndLagForTablet(ctx context.Context, tablet *topodatapb.Tablet, logger logutil.Logger, tmc tmclient.TabletManagerClient, waitTimeout time.Duration) (replication.Position, time.Duration, error) {
165+
func findPositionAndLagForTablet(ctx context.Context, tablet *topodatapb.Tablet, logger logutil.Logger, tmc tmclient.TabletManagerClient, waitTimeout time.Duration) (replication.Position, time.Duration, bool, error) {
162166
logger.Infof("getting replication position from %v", topoproto.TabletAliasString(tablet.Alias))
163167

164168
ctx, cancel := context.WithTimeout(ctx, waitTimeout)
@@ -169,10 +173,10 @@ func findPositionAndLagForTablet(ctx context.Context, tablet *topodatapb.Tablet,
169173
sqlErr, isSQLErr := sqlerror.NewSQLErrorFromError(err).(*sqlerror.SQLError)
170174
if isSQLErr && sqlErr != nil && sqlErr.Number() == sqlerror.ERNotReplica {
171175
logger.Warningf("no replication statue from %v, using empty gtid set", topoproto.TabletAliasString(tablet.Alias))
172-
return replication.Position{}, 0, nil
176+
return replication.Position{}, 0, false, nil
173177
}
174178
logger.Warningf("failed to get replication status from %v, ignoring tablet: %v", topoproto.TabletAliasString(tablet.Alias), err)
175-
return replication.Position{}, 0, err
179+
return replication.Position{}, 0, false, err
176180
}
177181

178182
// Use the relay log position if available, otherwise use the executed GTID set (binary log position).
@@ -183,10 +187,10 @@ func findPositionAndLagForTablet(ctx context.Context, tablet *topodatapb.Tablet,
183187
pos, err := replication.DecodePosition(positionString)
184188
if err != nil {
185189
logger.Warningf("cannot decode replica position %v for tablet %v, ignoring tablet: %v", positionString, topoproto.TabletAliasString(tablet.Alias), err)
186-
return replication.Position{}, 0, err
190+
return replication.Position{}, 0, status.ReplicationLagUnknown, err
187191
}
188192

189-
return pos, time.Second * time.Duration(status.ReplicationLagSeconds), nil
193+
return pos, time.Second * time.Duration(status.ReplicationLagSeconds), status.ReplicationLagUnknown, nil
190194
}
191195

192196
// FindCurrentPrimary returns the current primary tablet of a shard, if any. The

go/vt/vtctl/reparentutil/util_test.go

Lines changed: 141 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,114 @@ func TestElectNewPrimary(t *testing.T) {
137137
},
138138
errContains: nil,
139139
},
140+
{
141+
name: "more advanced replica has an unknown replication lag",
142+
tmc: &chooseNewPrimaryTestTMClient{
143+
// zone1-101 is behind zone1-102 bug zone1-102 has an unknown replication lag, hence picking zone1-101
144+
replicationStatuses: map[string]*replicationdatapb.Status{
145+
"zone1-0000000101": {
146+
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1",
147+
ReplicationLagSeconds: 10,
148+
},
149+
"zone1-0000000102": {
150+
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-5",
151+
ReplicationLagUnknown: true,
152+
},
153+
},
154+
},
155+
tolerableReplLag: 50 * time.Second,
156+
shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
157+
PrimaryAlias: &topodatapb.TabletAlias{
158+
Cell: "zone1",
159+
Uid: 100,
160+
},
161+
}, nil),
162+
tabletMap: map[string]*topo.TabletInfo{
163+
"primary": {
164+
Tablet: &topodatapb.Tablet{
165+
Alias: &topodatapb.TabletAlias{
166+
Cell: "zone1",
167+
Uid: 100,
168+
},
169+
Type: topodatapb.TabletType_PRIMARY,
170+
},
171+
},
172+
"replica1": {
173+
Tablet: &topodatapb.Tablet{
174+
Alias: &topodatapb.TabletAlias{
175+
Cell: "zone1",
176+
Uid: 101,
177+
},
178+
Type: topodatapb.TabletType_REPLICA,
179+
},
180+
},
181+
"replica2": {
182+
Tablet: &topodatapb.Tablet{
183+
Alias: &topodatapb.TabletAlias{
184+
Cell: "zone1",
185+
Uid: 102,
186+
},
187+
Type: topodatapb.TabletType_REPLICA,
188+
},
189+
},
190+
},
191+
avoidPrimaryAlias: &topodatapb.TabletAlias{
192+
Cell: "zone1",
193+
Uid: 0,
194+
},
195+
expected: &topodatapb.TabletAlias{
196+
Cell: "zone1",
197+
Uid: 101,
198+
},
199+
errContains: nil,
200+
},
201+
{
202+
name: "replica with unknown replication lag",
203+
tmc: &chooseNewPrimaryTestTMClient{
204+
// zone1-101 is behind zone1-102 bug zone1-102 has an unknown replication lag, hence picking zone1-101
205+
replicationStatuses: map[string]*replicationdatapb.Status{
206+
"zone1-0000000101": {
207+
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1",
208+
ReplicationLagUnknown: true,
209+
},
210+
},
211+
},
212+
tolerableReplLag: 50 * time.Second,
213+
shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
214+
PrimaryAlias: &topodatapb.TabletAlias{
215+
Cell: "zone1",
216+
Uid: 100,
217+
},
218+
}, nil),
219+
tabletMap: map[string]*topo.TabletInfo{
220+
"primary": {
221+
Tablet: &topodatapb.Tablet{
222+
Alias: &topodatapb.TabletAlias{
223+
Cell: "zone1",
224+
Uid: 100,
225+
},
226+
Type: topodatapb.TabletType_PRIMARY,
227+
},
228+
},
229+
"replica1": {
230+
Tablet: &topodatapb.Tablet{
231+
Alias: &topodatapb.TabletAlias{
232+
Cell: "zone1",
233+
Uid: 101,
234+
},
235+
Type: topodatapb.TabletType_REPLICA,
236+
},
237+
},
238+
},
239+
avoidPrimaryAlias: &topodatapb.TabletAlias{
240+
Cell: "zone1",
241+
Uid: 0,
242+
},
243+
expected: nil,
244+
errContains: []string{
245+
"zone1-0000000101 position known but unknown replication status",
246+
},
247+
},
140248
{
141249
name: "new primary alias provided - no tolerable replication lag",
142250
tolerableReplLag: 0,
@@ -751,12 +859,13 @@ func TestFindPositionForTablet(t *testing.T) {
751859
ctx := context.Background()
752860
logger := logutil.NewMemoryLogger()
753861
tests := []struct {
754-
name string
755-
tmc *testutil.TabletManagerClient
756-
tablet *topodatapb.Tablet
757-
expectedPosition string
758-
expectedLag time.Duration
759-
expectedErr string
862+
name string
863+
tmc *testutil.TabletManagerClient
864+
tablet *topodatapb.Tablet
865+
expectedPosition string
866+
expectedLag time.Duration
867+
expectedErr string
868+
expectedUnknownReplLag bool
760869
}{
761870
{
762871
name: "executed gtid set",
@@ -846,12 +955,36 @@ func TestFindPositionForTablet(t *testing.T) {
846955
},
847956
},
848957
expectedErr: `parse error: unknown GTIDSet flavor ""`,
958+
}, {
959+
name: "unknown replication lag",
960+
tmc: &testutil.TabletManagerClient{
961+
ReplicationStatusResults: map[string]struct {
962+
Position *replicationdatapb.Status
963+
Error error
964+
}{
965+
"zone1-0000000100": {
966+
Position: &replicationdatapb.Status{
967+
RelayLogPosition: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
968+
ReplicationLagUnknown: true,
969+
},
970+
},
971+
},
972+
},
973+
tablet: &topodatapb.Tablet{
974+
Alias: &topodatapb.TabletAlias{
975+
Cell: "zone1",
976+
Uid: 100,
977+
},
978+
},
979+
expectedLag: 0,
980+
expectedPosition: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
981+
expectedUnknownReplLag: true,
849982
},
850983
}
851984

852985
for _, test := range tests {
853986
t.Run(test.name, func(t *testing.T) {
854-
pos, lag, err := findPositionAndLagForTablet(ctx, test.tablet, logger, test.tmc, 10*time.Second)
987+
pos, lag, replUnknown, err := findPositionAndLagForTablet(ctx, test.tablet, logger, test.tmc, 10*time.Second)
855988
if test.expectedErr != "" {
856989
require.EqualError(t, err, test.expectedErr)
857990
return
@@ -860,6 +993,7 @@ func TestFindPositionForTablet(t *testing.T) {
860993
posString := replication.EncodePosition(pos)
861994
require.Equal(t, test.expectedPosition, posString)
862995
require.Equal(t, test.expectedLag, lag)
996+
require.Equal(t, test.expectedUnknownReplLag, replUnknown)
863997
})
864998
}
865999
}

0 commit comments

Comments
 (0)