Skip to content

Commit aa9d1fc

Browse files
Resolve cherry-pick conflicts for #19427 backport to release-23.0
Resolve merge conflicts from cherry-picking the analysis ordering and semi-sync rollout PR onto release-23.0. Adapt the new problem-matching system to the release-23.0 codebase by removing IncapacitatedPrimary (not present on this branch) and keeping fmt.Sprintf style consistent with existing code. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent b7064d4 commit aa9d1fc

File tree

6 files changed

+2
-468
lines changed

6 files changed

+2
-468
lines changed

go/test/endtoend/vtorc/general/vtorc_test.go

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,8 @@ import (
2020
"context"
2121
"encoding/json"
2222
"fmt"
23-
<<<<<<< HEAD
24-
=======
2523
"strconv"
2624
"strings"
27-
>>>>>>> e7888dfa83 (`vtorc`: support analysis ordering, improve semi-sync rollout (#19427))
2825
"testing"
2926
"time"
3027

go/vt/vtorc/inst/analysis_dao.go

Lines changed: 0 additions & 128 deletions
Original file line numberDiff line numberDiff line change
@@ -422,141 +422,13 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi
422422
return nil
423423
}
424424
isInvalid := m.GetBool("is_invalid")
425-
<<<<<<< HEAD
426-
switch {
427-
case a.IsClusterPrimary && isInvalid:
428-
a.Analysis = InvalidPrimary
429-
a.Description = "VTOrc hasn't been able to reach the primary even once since restart/shutdown"
430-
case isInvalid:
431-
a.Analysis = InvalidReplica
432-
a.Description = "VTOrc hasn't been able to reach the replica even once since restart/shutdown"
433-
case a.IsClusterPrimary && !a.LastCheckValid && a.IsDiskStalled:
434-
a.Analysis = PrimaryDiskStalled
435-
a.Description = "Primary has a stalled disk"
436-
ca.hasShardWideAction = true
437-
case a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas == 0:
438-
a.Analysis = DeadPrimaryWithoutReplicas
439-
a.Description = "Primary cannot be reached by vtorc and has no replica"
440-
ca.hasShardWideAction = true
441-
//
442-
case a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0:
443-
a.Analysis = DeadPrimary
444-
a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
445-
ca.hasShardWideAction = true
446-
//
447-
case a.IsClusterPrimary && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && a.CountValidReplicatingReplicas == 0:
448-
a.Analysis = DeadPrimaryAndReplicas
449-
a.Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
450-
ca.hasShardWideAction = true
451-
//
452-
case a.IsClusterPrimary && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0:
453-
a.Analysis = DeadPrimaryAndSomeReplicas
454-
a.Description = "Primary cannot be reached by vtorc; some of its replicas are unreachable and none of its reachable replicas is replicating"
455-
ca.hasShardWideAction = true
456-
//
457-
case a.IsClusterPrimary && !a.IsPrimary:
458-
a.Analysis = PrimaryHasPrimary
459-
a.Description = "Primary is replicating from somewhere else"
460-
ca.hasShardWideAction = true
461-
//
462-
case a.IsClusterPrimary && a.IsReadOnly:
463-
a.Analysis = PrimaryIsReadOnly
464-
a.Description = "Primary is read-only"
465-
//
466-
case a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) != 0 && !a.SemiSyncPrimaryEnabled:
467-
a.Analysis = PrimarySemiSyncMustBeSet
468-
a.Description = "Primary semi-sync must be set"
469-
//
470-
case a.IsClusterPrimary && policy.SemiSyncAckers(ca.durability, tablet) == 0 && a.SemiSyncPrimaryEnabled:
471-
a.Analysis = PrimarySemiSyncMustNotBeSet
472-
a.Description = "Primary semi-sync must not be set"
473-
//
474-
case a.IsClusterPrimary && a.CurrentTabletType != topodatapb.TabletType_UNKNOWN && a.CurrentTabletType != topodatapb.TabletType_PRIMARY:
475-
a.Analysis = PrimaryCurrentTypeMismatch
476-
a.Description = "Primary tablet's current type is not PRIMARY"
477-
case isStaleTopoPrimary(a, ca):
478-
a.Analysis = StaleTopoPrimary
479-
a.Description = "Primary tablet is stale, older than current primary"
480-
case topo.IsReplicaType(a.TabletType) && a.ErrantGTID != "":
481-
a.Analysis = ErrantGTIDDetected
482-
a.Description = "Tablet has errant GTIDs"
483-
case topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && a.ShardPrimaryTermTimestamp.IsZero():
484-
// ClusterHasNoPrimary should only be detected when the shard record doesn't have any primary term start time specified either.
485-
a.Analysis = ClusterHasNoPrimary
486-
a.Description = "Cluster has no primary"
487-
ca.hasShardWideAction = true
488-
case topo.IsReplicaType(a.TabletType) && ca.primaryAlias == "" && !a.ShardPrimaryTermTimestamp.IsZero():
489-
// If there are no primary tablets, but the shard primary start time isn't empty, then we know
490-
// the primary tablet was deleted.
491-
a.Analysis = PrimaryTabletDeleted
492-
a.Description = "Primary tablet has been deleted"
493-
ca.hasShardWideAction = true
494-
case a.IsPrimary && a.SemiSyncBlocked && a.CountSemiSyncReplicasEnabled >= a.SemiSyncPrimaryWaitForReplicaCount:
495-
// The primary is reporting that semi-sync monitor is blocked on writes.
496-
// There are enough replicas configured to send semi-sync ACKs such that the primary shouldn't be blocked.
497-
// There is some network disruption in progress. We should run an ERS.
498-
a.Analysis = PrimarySemiSyncBlocked
499-
a.Description = "Writes seem to be blocked on semi-sync acks on the primary, even though sufficient replicas are configured to send ACKs"
500-
ca.hasShardWideAction = true
501-
case topo.IsReplicaType(a.TabletType) && !a.IsReadOnly:
502-
a.Analysis = ReplicaIsWritable
503-
a.Description = "Replica is writable"
504-
//
505-
case topo.IsReplicaType(a.TabletType) && a.IsPrimary:
506-
a.Analysis = NotConnectedToPrimary
507-
a.Description = "Not connected to the primary"
508-
//
509-
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && math.Round(a.HeartbeatInterval*2) != float64(a.ReplicaNetTimeout):
510-
a.Analysis = ReplicaMisconfigured
511-
a.Description = "Replica has been misconfigured"
512-
//
513-
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && ca.primaryAlias != "" && a.AnalyzedInstancePrimaryAlias != ca.primaryAlias:
514-
a.Analysis = ConnectedToWrongPrimary
515-
a.Description = "Connected to wrong primary"
516-
//
517-
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && a.ReplicationStopped:
518-
a.Analysis = ReplicationStopped
519-
a.Description = "Replication is stopped"
520-
//
521-
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && policy.IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && !a.SemiSyncReplicaEnabled:
522-
a.Analysis = ReplicaSemiSyncMustBeSet
523-
a.Description = "Replica semi-sync must be set"
524-
//
525-
case topo.IsReplicaType(a.TabletType) && !a.IsPrimary && !policy.IsReplicaSemiSync(ca.durability, primaryTablet, tablet) && a.SemiSyncReplicaEnabled:
526-
a.Analysis = ReplicaSemiSyncMustNotBeSet
527-
a.Description = "Replica semi-sync must not be set"
528-
//
529-
// TODO(sougou): Events below here are either ignored or not possible.
530-
case a.IsPrimary && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0:
531-
a.Analysis = UnreachablePrimaryWithLaggingReplicas
532-
a.Description = "Primary cannot be reached by vtorc and all of its replicas are lagging"
533-
//
534-
case a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == a.CountValidReplicas:
535-
// partial success is here to reduce noise
536-
a.Analysis = UnreachablePrimary
537-
a.Description = "Primary cannot be reached by vtorc but all of its replicas seem to be replicating; possibly a network/host issue"
538-
//
539-
case a.IsPrimary && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 && a.CountValidReplicatingReplicas < a.CountValidReplicas:
540-
// partial success is here to reduce noise
541-
a.Analysis = UnreachablePrimaryWithBrokenReplicas
542-
a.Description = "Primary cannot be reached by vtorc but it has (some, but not all) replicating replicas; possibly a network/host issue"
543-
//
544-
case a.IsPrimary && a.SemiSyncPrimaryEnabled && a.SemiSyncPrimaryStatus && a.SemiSyncPrimaryWaitForReplicaCount > 0 && a.SemiSyncPrimaryClients < a.SemiSyncPrimaryWaitForReplicaCount:
545-
if isStaleBinlogCoordinates {
546-
a.Analysis = LockedSemiSyncPrimary
547-
a.Description = "Semi sync primary is locked since it doesn't get enough replica acknowledgements"
548-
} else {
549-
a.Analysis = LockedSemiSyncPrimaryHypothesis
550-
a.Description = "Semi sync primary seems to be locked, more samplings needed to validate"
551-
=======
552425
var matchedProblems []*DetectionAnalysisProblem
553426
for _, problem := range detectionAnalysisProblems {
554427
// When isInvalid is true, instance data is unreliable (never been reached).
555428
// Only InvalidPrimary/InvalidReplica should match; postProcessAnalyses
556429
// handles upgrading InvalidPrimary to DeadPrimary if needed.
557430
if isInvalid && problem.Meta.Analysis != InvalidPrimary && problem.Meta.Analysis != InvalidReplica {
558431
continue
559-
>>>>>>> e7888dfa83 (`vtorc`: support analysis ordering, improve semi-sync rollout (#19427))
560432
}
561433
if problem.HasMatch(a, ca, primaryTablet, tablet, isInvalid, isStaleBinlogCoordinates) {
562434
matchedProblems = append(matchedProblems, problem)

go/vt/vtorc/inst/analysis_problem.go

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -173,18 +173,6 @@ var detectionAnalysisProblems = []*DetectionAnalysisProblem{
173173
},
174174
},
175175

176-
// IncapacitatedPrimary
177-
{
178-
Meta: &DetectionAnalysisProblemMeta{
179-
Analysis: IncapacitatedPrimary,
180-
Description: "Primary is consistently timing out on health checks and may be incapacitated",
181-
Priority: detectionAnalysisPriorityShardWideAction,
182-
},
183-
MatchFunc: func(a *DetectionAnalysis, ca *clusterAnalysis, primary, tablet *topodatapb.Tablet, isInvalid, isStaleBinlogCoordinates bool) bool {
184-
return a.IsClusterPrimary && !a.LastCheckValid && a.PrimaryHealthUnhealthy
185-
},
186-
},
187-
188176
// PrimaryHasPrimary
189177
{
190178
Meta: &DetectionAnalysisProblemMeta{

go/vt/vtorc/logic/topology_recovery.go

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -21,18 +21,10 @@ import (
2121
"encoding/json"
2222
"errors"
2323
"fmt"
24-
<<<<<<< HEAD
25-
"math/rand/v2"
26-
=======
27-
"log/slog"
2824
"maps"
2925
"math/rand/v2"
30-
"net"
31-
"net/http"
3226
"slices"
33-
"strconv"
3427
"sync"
35-
>>>>>>> e7888dfa83 (`vtorc`: support analysis ordering, improve semi-sync rollout (#19427))
3628
"sync/atomic"
3729
"time"
3830

@@ -1055,13 +1047,7 @@ func CheckAndRecover() {
10551047
})
10561048
for _, key := range shardKeys {
10571049
go func() {
1058-
<<<<<<< HEAD
1059-
if err := executeCheckAndRecoverFunction(analysisEntry); err != nil {
1060-
log.Error(err)
1061-
}
1062-
=======
10631050
recoverShardAnalyses(analysisByShard[key], executeCheckAndRecoverFunction)
1064-
>>>>>>> e7888dfa83 (`vtorc`: support analysis ordering, improve semi-sync rollout (#19427))
10651051
}()
10661052
}
10671053
}

0 commit comments

Comments
 (0)