Skip to content

Commit 2960a36

Browse files
authored
Merge pull request #112 from github/arthur/add-more-logs
Fix `RemoveTablet` during `TabletExternallyReparented` causing connection issues
2 parents 8515324 + 9894a65 commit 2960a36

File tree

2 files changed

+152
-2
lines changed

2 files changed

+152
-2
lines changed

go/vt/discovery/healthcheck.go

Lines changed: 33 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ import (
3737
"encoding/json"
3838
"fmt"
3939
"hash/crc32"
40+
"math"
4041
"net/http"
4142
"sort"
4243
"strings"
@@ -448,10 +449,16 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) {
448449
continue
449450
}
450451
delete(ths, tabletAlias)
451-
// delete from healthy list
452+
452453
healthy, ok := hc.healthy[key]
453454
if ok && len(healthy) > 0 {
454-
hc.recomputeHealthy(key)
455+
if tabletType == topodata.TabletType_PRIMARY {
456+
// If tablet type is primary, we should only have one tablet in the healthy list.
457+
hc.recomputeHealthyPrimary(key)
458+
} else {
459+
// Simply recompute the list of healthy tablets for all other tablet types.
460+
hc.recomputeHealthy(key)
461+
}
455462
}
456463
}
457464
}()
@@ -579,6 +586,25 @@ func (hc *HealthCheckImpl) recomputeHealthy(key KeyspaceShardTabletType) {
579586
hc.healthy[key] = FilterStatsByReplicationLag(allArray)
580587
}
581588

589+
// Recompute the healthy primary tablet for the given key.
590+
func (hc *HealthCheckImpl) recomputeHealthyPrimary(key KeyspaceShardTabletType) {
591+
highestPrimaryTermStartTime := int64(math.MinInt64)
592+
var newestPrimary *TabletHealth
593+
594+
for _, s := range hc.healthData[key] {
595+
if s.PrimaryTermStartTime >= highestPrimaryTermStartTime {
596+
highestPrimaryTermStartTime = s.PrimaryTermStartTime
597+
newestPrimary = s
598+
}
599+
}
600+
601+
if newestPrimary != nil {
602+
hc.healthy[key] = []*TabletHealth{newestPrimary}
603+
} else {
604+
hc.healthy[key] = []*TabletHealth{}
605+
}
606+
}
607+
582608
// Subscribe adds a listener. Used by vtgate buffer to learn about primary changes.
583609
func (hc *HealthCheckImpl) Subscribe() chan *TabletHealth {
584610
hc.subMu.Lock()
@@ -680,6 +706,11 @@ func (hc *HealthCheckImpl) GetHealthyTabletStats(target *query.Target) []*Tablet
680706
var result []*TabletHealth
681707
hc.mu.Lock()
682708
defer hc.mu.Unlock()
709+
710+
if (target.TabletType == topodata.TabletType_PRIMARY) && len(hc.healthy[KeyFromTarget(target)]) > 1 {
711+
log.Warningf("[BUG] GetHealthyTabletStats called for primary tablet type, but returning more than one primary tablet")
712+
}
713+
683714
return append(result, hc.healthy[KeyFromTarget(target)]...)
684715
}
685716

go/vt/discovery/healthcheck_test.go

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -777,6 +777,125 @@ func TestRemoveTablet(t *testing.T) {
777777
assert.Empty(t, a, "wrong result, expected empty list")
778778
}
779779

780+
// When an external primary failover is performed,
781+
// the demoted primary will advertise itself as a `PRIMARY`
782+
// tablet until it recognizes that it was demoted,
783+
// and until all in-flight operations have either finished
784+
// (successfully or unsuccessfully, see `--shutdown_grace_period` flag).
785+
//
786+
// During this time, operations like `RemoveTablet` should not lead
787+
// to multiple tablets becoming valid targets for `PRIMARY`.
788+
func TestRemoveTabletDuringExternalReparenting(t *testing.T) {
789+
// reset error counters
790+
hcErrorCounters.ResetAll()
791+
ts := memorytopo.NewServer("cell")
792+
defer ts.Close()
793+
hc := createTestHc(ts)
794+
// close healthcheck
795+
defer hc.Close()
796+
797+
firstTablet := createTestTablet(0, "cell", "a")
798+
firstTablet.Type = topodatapb.TabletType_PRIMARY
799+
800+
secondTablet := createTestTablet(1, "cell", "b")
801+
secondTablet.Type = topodatapb.TabletType_REPLICA
802+
803+
thirdTablet := createTestTablet(2, "cell", "c")
804+
thirdTablet.Type = topodatapb.TabletType_REPLICA
805+
806+
firstTabletHealthStream := make(chan *querypb.StreamHealthResponse)
807+
firstTabletConn := createFakeConn(firstTablet, firstTabletHealthStream)
808+
firstTabletConn.errCh = make(chan error)
809+
810+
secondTabletHealthStream := make(chan *querypb.StreamHealthResponse)
811+
secondTabletConn := createFakeConn(secondTablet, secondTabletHealthStream)
812+
secondTabletConn.errCh = make(chan error)
813+
814+
thirdTabletHealthStream := make(chan *querypb.StreamHealthResponse)
815+
thirdTabletConn := createFakeConn(thirdTablet, thirdTabletHealthStream)
816+
thirdTabletConn.errCh = make(chan error)
817+
818+
resultChan := hc.Subscribe()
819+
820+
hc.AddTablet(firstTablet)
821+
hc.AddTablet(secondTablet)
822+
hc.AddTablet(thirdTablet)
823+
824+
<-resultChan
825+
<-resultChan
826+
827+
firstTabletPrimaryTermStartTimestamp := time.Now().Unix() - 10
828+
829+
firstTabletHealthStream <- &querypb.StreamHealthResponse{
830+
TabletAlias: firstTablet.Alias,
831+
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
832+
Serving: true,
833+
834+
TabletExternallyReparentedTimestamp: firstTabletPrimaryTermStartTimestamp,
835+
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5},
836+
}
837+
838+
secondTabletHealthStream <- &querypb.StreamHealthResponse{
839+
TabletAlias: secondTablet.Alias,
840+
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
841+
Serving: true,
842+
843+
TabletExternallyReparentedTimestamp: 0,
844+
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5},
845+
}
846+
847+
thirdTabletHealthStream <- &querypb.StreamHealthResponse{
848+
TabletAlias: thirdTablet.Alias,
849+
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_REPLICA},
850+
Serving: true,
851+
852+
TabletExternallyReparentedTimestamp: 0,
853+
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 1, CpuUsage: 0.5},
854+
}
855+
856+
<-resultChan
857+
<-resultChan
858+
<-resultChan
859+
860+
secondTabletPrimaryTermStartTimestamp := time.Now().Unix()
861+
862+
// Simulate a failover
863+
firstTabletHealthStream <- &querypb.StreamHealthResponse{
864+
TabletAlias: firstTablet.Alias,
865+
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
866+
Serving: true,
867+
868+
TabletExternallyReparentedTimestamp: firstTabletPrimaryTermStartTimestamp,
869+
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5},
870+
}
871+
872+
secondTabletHealthStream <- &querypb.StreamHealthResponse{
873+
TabletAlias: secondTablet.Alias,
874+
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
875+
Serving: true,
876+
877+
TabletExternallyReparentedTimestamp: secondTabletPrimaryTermStartTimestamp,
878+
RealtimeStats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5},
879+
}
880+
881+
<-resultChan
882+
<-resultChan
883+
884+
hc.RemoveTablet(thirdTablet)
885+
886+
// `secondTablet` should be the primary now
887+
expectedTabletStats := []*TabletHealth{{
888+
Tablet: secondTablet,
889+
Target: &querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY},
890+
Serving: true,
891+
Stats: &querypb.RealtimeStats{ReplicationLagSeconds: 0, CpuUsage: 0.5},
892+
PrimaryTermStartTime: secondTabletPrimaryTermStartTimestamp,
893+
}}
894+
895+
actualTabletStats := hc.GetHealthyTabletStats(&querypb.Target{Keyspace: "k", Shard: "s", TabletType: topodatapb.TabletType_PRIMARY})
896+
mustMatch(t, expectedTabletStats, actualTabletStats, "unexpected result")
897+
}
898+
780899
// TestGetHealthyTablets tests the functionality of GetHealthyTabletStats.
781900
func TestGetHealthyTablets(t *testing.T) {
782901
ts := memorytopo.NewServer("cell")

0 commit comments

Comments
 (0)