@@ -422,141 +422,13 @@ func GetDetectionAnalysis(keyspace string, shard string, hints *DetectionAnalysi
422422 return nil
423423 }
424424 isInvalid := m .GetBool ("is_invalid" )
425- << << << < HEAD
426- switch {
427- case a .IsClusterPrimary && isInvalid :
428- a .Analysis = InvalidPrimary
429- a .Description = "VTOrc hasn't been able to reach the primary even once since restart/shutdown"
430- case isInvalid :
431- a .Analysis = InvalidReplica
432- a .Description = "VTOrc hasn't been able to reach the replica even once since restart/shutdown"
433- case a .IsClusterPrimary && ! a .LastCheckValid && a .IsDiskStalled :
434- a .Analysis = PrimaryDiskStalled
435- a .Description = "Primary has a stalled disk"
436- ca .hasShardWideAction = true
437- case a .IsClusterPrimary && ! a .LastCheckValid && a .CountReplicas == 0 :
438- a .Analysis = DeadPrimaryWithoutReplicas
439- a .Description = "Primary cannot be reached by vtorc and has no replica"
440- ca .hasShardWideAction = true
441- //
442- case a .IsClusterPrimary && ! a .LastCheckValid && a .CountValidReplicas == a .CountReplicas && a .CountValidReplicatingReplicas == 0 :
443- a .Analysis = DeadPrimary
444- a .Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
445- ca .hasShardWideAction = true
446- //
447- case a .IsClusterPrimary && ! a .LastCheckValid && a .CountReplicas > 0 && a .CountValidReplicas == 0 && a .CountValidReplicatingReplicas == 0 :
448- a .Analysis = DeadPrimaryAndReplicas
449- a .Description = "Primary cannot be reached by vtorc and none of its replicas is replicating"
450- ca .hasShardWideAction = true
451- //
452- case a .IsClusterPrimary && ! a .LastCheckValid && a .CountValidReplicas < a .CountReplicas && a .CountValidReplicas > 0 && a .CountValidReplicatingReplicas == 0 :
453- a .Analysis = DeadPrimaryAndSomeReplicas
454- a .Description = "Primary cannot be reached by vtorc; some of its replicas are unreachable and none of its reachable replicas is replicating"
455- ca .hasShardWideAction = true
456- //
457- case a .IsClusterPrimary && ! a .IsPrimary :
458- a .Analysis = PrimaryHasPrimary
459- a .Description = "Primary is replicating from somewhere else"
460- ca .hasShardWideAction = true
461- //
462- case a .IsClusterPrimary && a .IsReadOnly :
463- a .Analysis = PrimaryIsReadOnly
464- a .Description = "Primary is read-only"
465- //
466- case a .IsClusterPrimary && policy .SemiSyncAckers (ca .durability , tablet ) != 0 && ! a .SemiSyncPrimaryEnabled :
467- a .Analysis = PrimarySemiSyncMustBeSet
468- a .Description = "Primary semi-sync must be set"
469- //
470- case a .IsClusterPrimary && policy .SemiSyncAckers (ca .durability , tablet ) == 0 && a .SemiSyncPrimaryEnabled :
471- a .Analysis = PrimarySemiSyncMustNotBeSet
472- a .Description = "Primary semi-sync must not be set"
473- //
474- case a .IsClusterPrimary && a .CurrentTabletType != topodatapb .TabletType_UNKNOWN && a .CurrentTabletType != topodatapb .TabletType_PRIMARY :
475- a .Analysis = PrimaryCurrentTypeMismatch
476- a .Description = "Primary tablet's current type is not PRIMARY"
477- case isStaleTopoPrimary (a , ca ):
478- a .Analysis = StaleTopoPrimary
479- a .Description = "Primary tablet is stale, older than current primary"
480- case topo .IsReplicaType (a .TabletType ) && a .ErrantGTID != "" :
481- a .Analysis = ErrantGTIDDetected
482- a .Description = "Tablet has errant GTIDs"
483- case topo .IsReplicaType (a .TabletType ) && ca .primaryAlias == "" && a .ShardPrimaryTermTimestamp .IsZero ():
484- // ClusterHasNoPrimary should only be detected when the shard record doesn't have any primary term start time specified either.
485- a .Analysis = ClusterHasNoPrimary
486- a .Description = "Cluster has no primary"
487- ca .hasShardWideAction = true
488- case topo .IsReplicaType (a .TabletType ) && ca .primaryAlias == "" && ! a .ShardPrimaryTermTimestamp .IsZero ():
489- // If there are no primary tablets, but the shard primary start time isn't empty, then we know
490- // the primary tablet was deleted.
491- a .Analysis = PrimaryTabletDeleted
492- a .Description = "Primary tablet has been deleted"
493- ca .hasShardWideAction = true
494- case a .IsPrimary && a .SemiSyncBlocked && a .CountSemiSyncReplicasEnabled >= a .SemiSyncPrimaryWaitForReplicaCount :
495- // The primary is reporting that semi-sync monitor is blocked on writes.
496- // There are enough replicas configured to send semi-sync ACKs such that the primary shouldn't be blocked.
497- // There is some network disruption in progress. We should run an ERS.
498- a .Analysis = PrimarySemiSyncBlocked
499- a .Description = "Writes seem to be blocked on semi-sync acks on the primary, even though sufficient replicas are configured to send ACKs"
500- ca .hasShardWideAction = true
501- case topo .IsReplicaType (a .TabletType ) && ! a .IsReadOnly :
502- a .Analysis = ReplicaIsWritable
503- a .Description = "Replica is writable"
504- //
505- case topo .IsReplicaType (a .TabletType ) && a .IsPrimary :
506- a .Analysis = NotConnectedToPrimary
507- a .Description = "Not connected to the primary"
508- //
509- case topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && math .Round (a .HeartbeatInterval * 2 ) != float64 (a .ReplicaNetTimeout ):
510- a .Analysis = ReplicaMisconfigured
511- a .Description = "Replica has been misconfigured"
512- //
513- case topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && ca .primaryAlias != "" && a .AnalyzedInstancePrimaryAlias != ca .primaryAlias :
514- a .Analysis = ConnectedToWrongPrimary
515- a .Description = "Connected to wrong primary"
516- //
517- case topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && a .ReplicationStopped :
518- a .Analysis = ReplicationStopped
519- a .Description = "Replication is stopped"
520- //
521- case topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && policy .IsReplicaSemiSync (ca .durability , primaryTablet , tablet ) && ! a .SemiSyncReplicaEnabled :
522- a .Analysis = ReplicaSemiSyncMustBeSet
523- a .Description = "Replica semi-sync must be set"
524- //
525- case topo .IsReplicaType (a .TabletType ) && ! a .IsPrimary && ! policy .IsReplicaSemiSync (ca .durability , primaryTablet , tablet ) && a .SemiSyncReplicaEnabled :
526- a .Analysis = ReplicaSemiSyncMustNotBeSet
527- a .Description = "Replica semi-sync must not be set"
528- //
529- // TODO(sougou): Events below here are either ignored or not possible.
530- case a .IsPrimary && ! a .LastCheckValid && a .CountLaggingReplicas == a .CountReplicas && a .CountDelayedReplicas < a .CountReplicas && a .CountValidReplicatingReplicas > 0 :
531- a .Analysis = UnreachablePrimaryWithLaggingReplicas
532- a .Description = "Primary cannot be reached by vtorc and all of its replicas are lagging"
533- //
534- case a .IsPrimary && ! a .LastCheckValid && ! a .LastCheckPartialSuccess && a .CountValidReplicas > 0 && a .CountValidReplicatingReplicas == a .CountValidReplicas :
535- // partial success is here to reduce noise
536- a .Analysis = UnreachablePrimary
537- a .Description = "Primary cannot be reached by vtorc but all of its replicas seem to be replicating; possibly a network/host issue"
538- //
539- case a .IsPrimary && ! a .LastCheckValid && ! a .LastCheckPartialSuccess && a .CountValidReplicas > 0 && a .CountValidReplicatingReplicas > 0 && a .CountValidReplicatingReplicas < a .CountValidReplicas :
540- // partial success is here to reduce noise
541- a .Analysis = UnreachablePrimaryWithBrokenReplicas
542- a .Description = "Primary cannot be reached by vtorc but it has (some, but not all) replicating replicas; possibly a network/host issue"
543- //
544- case a .IsPrimary && a .SemiSyncPrimaryEnabled && a .SemiSyncPrimaryStatus && a .SemiSyncPrimaryWaitForReplicaCount > 0 && a .SemiSyncPrimaryClients < a .SemiSyncPrimaryWaitForReplicaCount :
545- if isStaleBinlogCoordinates {
546- a .Analysis = LockedSemiSyncPrimary
547- a .Description = "Semi sync primary is locked since it doesn't get enough replica acknowledgements"
548- } else {
549- a .Analysis = LockedSemiSyncPrimaryHypothesis
550- a .Description = "Semi sync primary seems to be locked, more samplings needed to validate"
551- == == == =
552425 var matchedProblems []* DetectionAnalysisProblem
553426 for _ , problem := range detectionAnalysisProblems {
554427 // When isInvalid is true, instance data is unreliable (never been reached).
555428 // Only InvalidPrimary/InvalidReplica should match; postProcessAnalyses
556429 // handles upgrading InvalidPrimary to DeadPrimary if needed.
557430 if isInvalid && problem .Meta .Analysis != InvalidPrimary && problem .Meta .Analysis != InvalidReplica {
558431 continue
559- >> >> >> > e7888dfa83 (`vtorc` : support analysis ordering , improve semi - sync rollout (#19427 ))
560432 }
561433 if problem .HasMatch (a , ca , primaryTablet , tablet , isInvalid , isStaleBinlogCoordinates ) {
562434 matchedProblems = append (matchedProblems , problem )
0 commit comments