Skip to content

Commit 47e7bb5

Browse files
[release-21.0] Filter out tablets with unknown replication lag when electing a new primary (#18004) (#18075)
Signed-off-by: Florent Poinsard <[email protected]> Co-authored-by: Florent Poinsard <[email protected]>
1 parent f2cdfa6 commit 47e7bb5

File tree

2 files changed

+154
-16
lines changed

2 files changed

+154
-16
lines changed

go/vt/vtctl/reparentutil/util.go

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -126,13 +126,17 @@ func ElectNewPrimary(
126126
tb := tablet
127127
errorGroup.Go(func() error {
128128
// find and store the positions for the tablet
129-
pos, replLag, err := findPositionAndLagForTablet(groupCtx, tb, logger, tmc, opts.WaitReplicasTimeout)
129+
pos, replLag, replUnknown, err := findPositionAndLagForTablet(groupCtx, tb, logger, tmc, opts.WaitReplicasTimeout)
130130
mu.Lock()
131131
defer mu.Unlock()
132132
if err == nil && (opts.TolerableReplLag == 0 || opts.TolerableReplLag >= replLag) {
133-
validTablets = append(validTablets, tb)
134-
tabletPositions = append(tabletPositions, pos)
135-
innodbBufferPool = append(innodbBufferPool, innodbBufferPoolData[topoproto.TabletAliasString(tb.Alias)])
133+
if replUnknown {
134+
reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v position known but unknown replication status", topoproto.TabletAliasString(tablet.Alias)))
135+
} else {
136+
validTablets = append(validTablets, tb)
137+
tabletPositions = append(tabletPositions, pos)
138+
innodbBufferPool = append(innodbBufferPool, innodbBufferPoolData[topoproto.TabletAliasString(tb.Alias)])
139+
}
136140
} else {
137141
reasonsToInvalidate.WriteString(fmt.Sprintf("\n%v has %v replication lag which is more than the tolerable amount", topoproto.TabletAliasString(tablet.Alias), replLag))
138142
}
@@ -161,7 +165,7 @@ func ElectNewPrimary(
161165

162166
// findPositionAndLagForTablet processes the replication position and lag for a single tablet and
163167
// returns it. It is safe to call from multiple goroutines.
164-
func findPositionAndLagForTablet(ctx context.Context, tablet *topodatapb.Tablet, logger logutil.Logger, tmc tmclient.TabletManagerClient, waitTimeout time.Duration) (replication.Position, time.Duration, error) {
168+
func findPositionAndLagForTablet(ctx context.Context, tablet *topodatapb.Tablet, logger logutil.Logger, tmc tmclient.TabletManagerClient, waitTimeout time.Duration) (replication.Position, time.Duration, bool, error) {
165169
logger.Infof("getting replication position from %v", topoproto.TabletAliasString(tablet.Alias))
166170

167171
ctx, cancel := context.WithTimeout(ctx, waitTimeout)
@@ -172,10 +176,10 @@ func findPositionAndLagForTablet(ctx context.Context, tablet *topodatapb.Tablet,
172176
sqlErr, isSQLErr := sqlerror.NewSQLErrorFromError(err).(*sqlerror.SQLError)
173177
if isSQLErr && sqlErr != nil && sqlErr.Number() == sqlerror.ERNotReplica {
174178
logger.Warningf("no replication statue from %v, using empty gtid set", topoproto.TabletAliasString(tablet.Alias))
175-
return replication.Position{}, 0, nil
179+
return replication.Position{}, 0, false, nil
176180
}
177181
logger.Warningf("failed to get replication status from %v, ignoring tablet: %v", topoproto.TabletAliasString(tablet.Alias), err)
178-
return replication.Position{}, 0, err
182+
return replication.Position{}, 0, false, err
179183
}
180184

181185
// Use the relay log position if available, otherwise use the executed GTID set (binary log position).
@@ -186,10 +190,10 @@ func findPositionAndLagForTablet(ctx context.Context, tablet *topodatapb.Tablet,
186190
pos, err := replication.DecodePosition(positionString)
187191
if err != nil {
188192
logger.Warningf("cannot decode replica position %v for tablet %v, ignoring tablet: %v", positionString, topoproto.TabletAliasString(tablet.Alias), err)
189-
return replication.Position{}, 0, err
193+
return replication.Position{}, 0, status.ReplicationLagUnknown, err
190194
}
191195

192-
return pos, time.Second * time.Duration(status.ReplicationLagSeconds), nil
196+
return pos, time.Second * time.Duration(status.ReplicationLagSeconds), status.ReplicationLagUnknown, nil
193197
}
194198

195199
// FindCurrentPrimary returns the current primary tablet of a shard, if any. The

go/vt/vtctl/reparentutil/util_test.go

Lines changed: 141 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,114 @@ func TestElectNewPrimary(t *testing.T) {
139139
},
140140
errContains: nil,
141141
},
142+
{
143+
name: "more advanced replica has an unknown replication lag",
144+
tmc: &chooseNewPrimaryTestTMClient{
145+
// zone1-101 is behind zone1-102 bug zone1-102 has an unknown replication lag, hence picking zone1-101
146+
replicationStatuses: map[string]*replicationdatapb.Status{
147+
"zone1-0000000101": {
148+
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1",
149+
ReplicationLagSeconds: 10,
150+
},
151+
"zone1-0000000102": {
152+
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1-5",
153+
ReplicationLagUnknown: true,
154+
},
155+
},
156+
},
157+
tolerableReplLag: 50 * time.Second,
158+
shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
159+
PrimaryAlias: &topodatapb.TabletAlias{
160+
Cell: "zone1",
161+
Uid: 100,
162+
},
163+
}, nil),
164+
tabletMap: map[string]*topo.TabletInfo{
165+
"primary": {
166+
Tablet: &topodatapb.Tablet{
167+
Alias: &topodatapb.TabletAlias{
168+
Cell: "zone1",
169+
Uid: 100,
170+
},
171+
Type: topodatapb.TabletType_PRIMARY,
172+
},
173+
},
174+
"replica1": {
175+
Tablet: &topodatapb.Tablet{
176+
Alias: &topodatapb.TabletAlias{
177+
Cell: "zone1",
178+
Uid: 101,
179+
},
180+
Type: topodatapb.TabletType_REPLICA,
181+
},
182+
},
183+
"replica2": {
184+
Tablet: &topodatapb.Tablet{
185+
Alias: &topodatapb.TabletAlias{
186+
Cell: "zone1",
187+
Uid: 102,
188+
},
189+
Type: topodatapb.TabletType_REPLICA,
190+
},
191+
},
192+
},
193+
avoidPrimaryAlias: &topodatapb.TabletAlias{
194+
Cell: "zone1",
195+
Uid: 0,
196+
},
197+
expected: &topodatapb.TabletAlias{
198+
Cell: "zone1",
199+
Uid: 101,
200+
},
201+
errContains: nil,
202+
},
203+
{
204+
name: "replica with unknown replication lag",
205+
tmc: &chooseNewPrimaryTestTMClient{
206+
// zone1-101 is behind zone1-102 bug zone1-102 has an unknown replication lag, hence picking zone1-101
207+
replicationStatuses: map[string]*replicationdatapb.Status{
208+
"zone1-0000000101": {
209+
Position: "MySQL56/3E11FA47-71CA-11E1-9E33-C80AA9429562:1",
210+
ReplicationLagUnknown: true,
211+
},
212+
},
213+
},
214+
tolerableReplLag: 50 * time.Second,
215+
shardInfo: topo.NewShardInfo("testkeyspace", "-", &topodatapb.Shard{
216+
PrimaryAlias: &topodatapb.TabletAlias{
217+
Cell: "zone1",
218+
Uid: 100,
219+
},
220+
}, nil),
221+
tabletMap: map[string]*topo.TabletInfo{
222+
"primary": {
223+
Tablet: &topodatapb.Tablet{
224+
Alias: &topodatapb.TabletAlias{
225+
Cell: "zone1",
226+
Uid: 100,
227+
},
228+
Type: topodatapb.TabletType_PRIMARY,
229+
},
230+
},
231+
"replica1": {
232+
Tablet: &topodatapb.Tablet{
233+
Alias: &topodatapb.TabletAlias{
234+
Cell: "zone1",
235+
Uid: 101,
236+
},
237+
Type: topodatapb.TabletType_REPLICA,
238+
},
239+
},
240+
},
241+
avoidPrimaryAlias: &topodatapb.TabletAlias{
242+
Cell: "zone1",
243+
Uid: 0,
244+
},
245+
expected: nil,
246+
errContains: []string{
247+
"zone1-0000000101 position known but unknown replication status",
248+
},
249+
},
142250
{
143251
name: "new primary alias provided - no tolerable replication lag",
144252
tolerableReplLag: 0,
@@ -881,12 +989,13 @@ func TestFindPositionForTablet(t *testing.T) {
881989
ctx := context.Background()
882990
logger := logutil.NewMemoryLogger()
883991
tests := []struct {
884-
name string
885-
tmc *testutil.TabletManagerClient
886-
tablet *topodatapb.Tablet
887-
expectedPosition string
888-
expectedLag time.Duration
889-
expectedErr string
992+
name string
993+
tmc *testutil.TabletManagerClient
994+
tablet *topodatapb.Tablet
995+
expectedPosition string
996+
expectedLag time.Duration
997+
expectedErr string
998+
expectedUnknownReplLag bool
890999
}{
8911000
{
8921001
name: "executed gtid set",
@@ -976,12 +1085,36 @@ func TestFindPositionForTablet(t *testing.T) {
9761085
},
9771086
},
9781087
expectedErr: `parse error: unknown GTIDSet flavor ""`,
1088+
}, {
1089+
name: "unknown replication lag",
1090+
tmc: &testutil.TabletManagerClient{
1091+
ReplicationStatusResults: map[string]struct {
1092+
Position *replicationdatapb.Status
1093+
Error error
1094+
}{
1095+
"zone1-0000000100": {
1096+
Position: &replicationdatapb.Status{
1097+
RelayLogPosition: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
1098+
ReplicationLagUnknown: true,
1099+
},
1100+
},
1101+
},
1102+
},
1103+
tablet: &topodatapb.Tablet{
1104+
Alias: &topodatapb.TabletAlias{
1105+
Cell: "zone1",
1106+
Uid: 100,
1107+
},
1108+
},
1109+
expectedLag: 0,
1110+
expectedPosition: "MySQL56/3e11fa47-71ca-11e1-9e33-c80aa9429562:1-5",
1111+
expectedUnknownReplLag: true,
9791112
},
9801113
}
9811114

9821115
for _, test := range tests {
9831116
t.Run(test.name, func(t *testing.T) {
984-
pos, lag, err := findPositionAndLagForTablet(ctx, test.tablet, logger, test.tmc, 10*time.Second)
1117+
pos, lag, replUnknown, err := findPositionAndLagForTablet(ctx, test.tablet, logger, test.tmc, 10*time.Second)
9851118
if test.expectedErr != "" {
9861119
require.EqualError(t, err, test.expectedErr)
9871120
return
@@ -990,6 +1123,7 @@ func TestFindPositionForTablet(t *testing.T) {
9901123
posString := replication.EncodePosition(pos)
9911124
require.Equal(t, test.expectedPosition, posString)
9921125
require.Equal(t, test.expectedLag, lag)
1126+
require.Equal(t, test.expectedUnknownReplLag, replUnknown)
9931127
})
9941128
}
9951129
}

0 commit comments

Comments
 (0)