Skip to content

Commit d429368

Browse files
vtorc: rename isClusterWideRecovery -> isShardWideRecovery (vitessio#18351)
Signed-off-by: Tim Vaillancourt <[email protected]>
1 parent 887e4c4 commit d429368

File tree

2 files changed

+27
-27
lines changed

2 files changed

+27
-27
lines changed

go/vt/vtorc/inst/analysis_dao.go

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,10 @@ func initializeAnalysisDaoPostConfiguration() {
5151
}
5252

5353
type clusterAnalysis struct {
54-
hasClusterwideAction bool
55-
totalTablets int
56-
primaryAlias string
57-
durability policy.Durabler
54+
hasShardWideAction bool
55+
totalTablets int
56+
primaryAlias string
57+
durability policy.Durabler
5858
}
5959

6060
// GetReplicationAnalysis will check for replication problems (dead primary; unreachable primary; etc)
@@ -394,8 +394,8 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
394394
ca := clusters[keyspaceShard]
395395
// Increment the total number of tablets.
396396
ca.totalTablets += 1
397-
if ca.hasClusterwideAction {
398-
// We can only take one cluster level action at a time.
397+
if ca.hasShardWideAction {
398+
// We can only take one shard level action at a time.
399399
return nil
400400
}
401401
if ca.durability == nil {
@@ -412,31 +412,31 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
412412
} else if a.IsClusterPrimary && !a.LastCheckValid && a.IsDiskStalled {
413413
a.Analysis = PrimaryDiskStalled
414414
a.Description = "Primary has a stalled disk"
415-
ca.hasClusterwideAction = true
415+
ca.hasShardWideAction = true
416416
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0 {
417417
a.Analysis = DeadPrimaryWithoutReplicas
418418
a.Description = "Primary cannot be reached by vtorc and has no replica"
419-
ca.hasClusterwideAction = true
419+
ca.hasShardWideAction = true
420420
//
421421
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 {
422422
a.Analysis = DeadPrimary
423423
a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
424-
ca.hasClusterwideAction = true
424+
ca.hasShardWideAction = true
425425
//
426426
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && a.CountValidReplicatingReplicas == 0 {
427427
a.Analysis = DeadPrimaryAndReplicas
428428
a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
429-
ca.hasClusterwideAction = true
429+
ca.hasShardWideAction = true
430430
//
431431
} else if a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 {
432432
a.Analysis = DeadPrimaryAndSomeReplicas
433433
a.Description = "Primary cannot be reached by vtorc; some of its replicas are unreachable and none of its reachable replicas is replicating"
434-
ca.hasClusterwideAction = true
434+
ca.hasShardWideAction = true
435435
//
436436
} else if a.IsClusterPrimary && !a.IsPrimary {
437437
a.Analysis = PrimaryHasPrimary
438438
a.Description = "Primary is replicating from somewhere else"
439-
ca.hasClusterwideAction = true
439+
ca.hasShardWideAction = true
440440
//
441441
} else if a.IsClusterPrimary && a.IsReadOnly {
442442
a.Analysis = PrimaryIsReadOnly
@@ -460,20 +460,20 @@ func GetReplicationAnalysis(keyspace string, shard string, hints *ReplicationAna
460460
// ClusterHasNoPrimary should only be detected when the shard record doesn't have any primary term start time specified either.
461461
a.Analysis = ClusterHasNoPrimary
462462
a.Description = "Cluster has no primary"
463-
ca.hasClusterwideAction = true
463+
ca.hasShardWideAction = true
464464
} else if topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && a.ShardPrimaryTermTimestamp != "" {
465465
// If there are no primary tablets, but the shard primary start time isn't empty, then we know
466466
// the primary tablet was deleted.
467467
a.Analysis = PrimaryTabletDeleted
468468
a.Description = "Primary tablet has been deleted"
469-
ca.hasClusterwideAction = true
469+
ca.hasShardWideAction = true
470470
} else if a.IsPrimary && a.SemiSyncBlocked && a.CountSemiSyncReplicasEnabled >= a.SemiSyncPrimaryWaitForReplicaCount {
471471
// The primary is reporting that semi-sync monitor is blocked on writes.
472472
// There are enough replicas configured to send semi-sync ACKs such that the primary shouldn't be blocked.
473473
// There is some network disruption in progress. We should run an ERS.
474474
a.Analysis = PrimarySemiSyncBlocked
475475
a.Description = "Writes seem to be blocked on semi-sync acks on the primary, even though sufficient replicas are configured to send ACKs"
476-
ca.hasClusterwideAction = true
476+
ca.hasShardWideAction = true
477477
} else if topo.IsReplicaType(a.TabletType) && !a.IsReadOnly {
478478
a.Analysis = ReplicaIsWritable
479479
a.Description = "Replica is writable"

go/vt/vtorc/logic/topology_recovery.go

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -480,8 +480,8 @@ func getRecoverFunctionName(recoveryFunctionCode recoveryFunction) string {
480480
}
481481
}
482482

483-
// isClusterWideRecovery returns whether the given recovery is a cluster-wide recovery or not
484-
func isClusterWideRecovery(recoveryFunctionCode recoveryFunction) bool {
483+
// isShardWideRecovery returns whether the given recovery is a recovery that affects all tablets in a shard
484+
func isShardWideRecovery(recoveryFunctionCode recoveryFunction) bool {
485485
switch recoveryFunctionCode {
486486
case recoverDeadPrimaryFunc, electNewPrimaryFunc, recoverPrimaryTabletDeletedFunc:
487487
return true
@@ -548,13 +548,13 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
548548

549549
// Prioritise primary recovery.
550550
// If we are performing some other action, first ensure that it is not because of primary issues.
551-
// This step is only meant to improve the time taken to detect and fix cluster wide recoveries, it does not impact correctness.
551+
// This step is only meant to improve the time taken to detect and fix shard-wide recoveries; it does not impact correctness.
552552
// If a VTOrc detects an issue on a replica like ReplicationStopped, the underlying cause could be a dead primary instead.
553553
// So, we try to reload that primary's information before proceeding with the replication stopped fix. We do this before acquiring the shard lock
554554
// to allow another VTOrc instance to proceed with the dead primary recovery if it is indeed the case and it detects it before us. If however, the primary
555555
// is not dead, then we will proceed with the fix for the replica. Essentially, we are trading off speed in replica recoveries (by doing an additional primary tablet reload)
556-
// for speed in cluster-wide recoveries (by not holding the shard lock before reloading the primary tablet information).
557-
if !isClusterWideRecovery(checkAndRecoverFunctionCode) {
556+
// for speed in shard-wide recoveries (by not holding the shard lock before reloading the primary tablet information).
557+
if !isShardWideRecovery(checkAndRecoverFunctionCode) {
558558
if err = recheckPrimaryHealth(analysisEntry, DiscoverInstance); err != nil {
559559
return err
560560
}
@@ -586,10 +586,10 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
586586
logger.Errorf("Failed to refresh keyspace and shard, aborting recovery: %v", err)
587587
return err
588588
}
589-
// If we are about to run a cluster-wide recovery, it is imperative to first refresh all the tablets
590-
// of a shard because a new tablet could have been promoted, and we need to have this visibility before we
591-
// run a cluster operation of our own.
592-
if isClusterWideRecovery(checkAndRecoverFunctionCode) {
589+
// If we are about to run a shard-wide recovery, it is imperative to first refresh all the tablets
590+
// of a shard because a new tablet could have been promoted, and we need to have this visibility
591+
// before we run a shard-wide operation of our own.
592+
if isShardWideRecovery(checkAndRecoverFunctionCode) {
593593
var tabletsToIgnore []string
594594
if checkAndRecoverFunctionCode == recoverDeadPrimaryFunc {
595595
tabletsToIgnore = append(tabletsToIgnore, analysisEntry.AnalyzedInstanceAlias)
@@ -599,7 +599,7 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
599599
logger.Info("Force refreshing all shard tablets")
600600
forceRefreshAllTabletsInShard(ctx, analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard, tabletsToIgnore)
601601
} else {
602-
// If we are not running a cluster-wide recovery, then it is only concerned with the specific tablet
602+
// If we are not running a shard-wide recovery, then it is only concerned with the specific tablet
603603
// on which the failure occurred and the primary instance of the shard.
604604
// For example, ConnectedToWrongPrimary analysis only cares for whom the current primary tablet is
605605
// and the host-port set on the tablet in question.
@@ -664,11 +664,11 @@ func executeCheckAndRecoverFunction(analysisEntry *inst.ReplicationAnalysis) (er
664664
} else {
665665
logger.Infof("Topology recovery: %+v", topologyRecovery)
666666
}
667-
// If we ran a cluster wide recovery and actually attempted it, then we know that the replication state for all the tablets in this cluster
667+
// If we ran a shard-wide recovery and actually attempted it, then we know that the replication state for all the tablets in this shard
668668
// would have changed. So we can go ahead and pre-emptively refresh them.
669669
// For this refresh we don't use the same context that we used for the recovery, since that context might have expired or could expire soon
670670
// Instead we pass the background context. The call forceRefreshAllTabletsInShard handles adding a timeout to it for us.
671-
if isClusterWideRecovery(checkAndRecoverFunctionCode) {
671+
if isShardWideRecovery(checkAndRecoverFunctionCode) {
672672
logger.Info("Forcing refresh of all tablets post recovery")
673673
forceRefreshAllTabletsInShard(context.Background(), analysisEntry.AnalyzedKeyspace, analysisEntry.AnalyzedShard, nil)
674674
} else {

0 commit comments

Comments
 (0)