Skip to content

Commit 2dc657f

Browse files
authored
Merge pull request #64 from fly-apps/diagnosis-cleanup
Abstract standby evaluation logic so it can be reused elsewhere
2 parents 42b3712 + 8d13fe2 commit 2dc657f

File tree

3 files changed

+234
-178
lines changed

3 files changed

+234
-178
lines changed

internal/flypg/node.go

Lines changed: 8 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -342,49 +342,18 @@ func (n *Node) PostInit(ctx context.Context) error {
342342
}
343343
}
344344

345-
totalMembers := len(standbys) + 1
346-
totalActive := 1
347-
totalInactive := 0
348-
totalConflicts := 0
349-
350-
conflictMap := map[string]int{}
351-
352-
for _, standby := range standbys {
353-
// Check for connectivity
354-
mConn, err := repmgr.NewRemoteConnection(ctx, standby.Hostname)
355-
if err != nil {
356-
fmt.Printf("failed to connect to %s", standby.Hostname)
357-
totalInactive++
358-
continue
359-
}
360-
defer mConn.Close(ctx)
361-
362-
// Verify the primary
363-
primary, err := repmgr.PrimaryMember(ctx, mConn)
364-
if err != nil {
365-
fmt.Printf("failed to resolve primary from standby %s", standby.Hostname)
366-
totalInactive++
367-
continue
368-
}
369-
370-
totalActive++
371-
372-
// Record conflict when primary doesn't match.
373-
if primary.Hostname != n.PrivateIP {
374-
totalConflicts++
375-
conflictMap[primary.Hostname]++
376-
}
345+
// Collect sample data from registered standbys
346+
sample, err := TakeDNASample(ctx, n, standbys)
347+
if err != nil {
348+
return fmt.Errorf("failed to resolve cluster metrics: %s", err)
377349
}
378350

379-
primary, err := ZombieDiagnosis(n.PrivateIP, totalMembers, totalInactive, totalActive, conflictMap)
351+
printDNASample(sample)
352+
353+
// Evaluate whether we are a zombie or not.
354+
primary, err := ZombieDiagnosis(sample)
380355
if errors.Is(err, ErrZombieDiagnosisUndecided) {
381356
fmt.Println("Unable to confirm that we are the true primary!")
382-
fmt.Printf("Registered members: %d, Active member(s): %d, Inactive member(s): %d, Conflicts detected: %d\n",
383-
totalMembers,
384-
totalActive,
385-
totalInactive,
386-
totalConflicts,
387-
)
388357

389358
fmt.Println("Writing zombie.lock file.")
390359
if err := writeZombieLock(""); err != nil {

internal/flypg/zombie.go

Lines changed: 69 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
package flypg
22

33
import (
4+
"context"
45
"errors"
6+
"fmt"
57
"os"
68
)
79

@@ -50,41 +52,87 @@ func readZombieLock() (string, error) {
5052
return string(body), nil
5153
}
5254

53-
func ZombieDiagnosis(myHostname string, total int, inactive int, active int, conflictMap map[string]int) (string, error) {
55+
type DNASample struct {
56+
hostname string
57+
totalMembers int
58+
totalActive int
59+
totalInactive int
60+
totalConflicts int
61+
conflictMap map[string]int
62+
}
63+
64+
func TakeDNASample(ctx context.Context, node *Node, standbys []Member) (*DNASample, error) {
65+
sample := &DNASample{
66+
hostname: node.PrivateIP,
67+
totalMembers: len(standbys) + 1,
68+
totalActive: 1,
69+
totalInactive: 0,
70+
totalConflicts: 0,
71+
conflictMap: map[string]int{},
72+
}
73+
74+
for _, standby := range standbys {
75+
// Check for connectivity
76+
mConn, err := node.RepMgr.NewRemoteConnection(ctx, standby.Hostname)
77+
if err != nil {
78+
fmt.Printf("failed to connect to %s", standby.Hostname)
79+
sample.totalInactive++
80+
continue
81+
}
82+
defer mConn.Close(ctx)
83+
84+
// Verify the primary
85+
primary, err := node.RepMgr.PrimaryMember(ctx, mConn)
86+
if err != nil {
87+
fmt.Printf("failed to resolve primary from standby %s", standby.Hostname)
88+
sample.totalInactive++
89+
continue
90+
}
91+
92+
sample.totalActive++
93+
94+
// Record conflict when primary doesn't match.
95+
if primary.Hostname != node.PrivateIP {
96+
sample.totalConflicts++
97+
sample.conflictMap[primary.Hostname]++
98+
}
99+
}
100+
101+
return sample, nil
102+
}
103+
104+
func ZombieDiagnosis(s *DNASample) (string, error) {
54105
// We can short-circuit a single node cluster.
55-
if total == 1 {
56-
return myHostname, nil
106+
if s.totalMembers == 1 {
107+
return s.hostname, nil
57108
}
58109

59-
quorum := total/2 + 1
110+
quorum := s.totalMembers/2 + 1
60111

61-
if active < quorum {
112+
if s.totalActive < quorum {
62113
return "", ErrZombieDiagnosisUndecided
63114
}
64115

65116
topCandidate := ""
66117
highestTotal := 0
67-
totalConflicts := 0
68118

69119
// Evaluate conflicts and calculate top referenced primary
70-
for hostname, total := range conflictMap {
71-
totalConflicts += total
72-
120+
for hostname, total := range s.conflictMap {
73121
if total > highestTotal {
74122
highestTotal = total
75123
topCandidate = hostname
76124
}
77125
}
78126

79127
// Calculate our references
80-
myCount := total - inactive - totalConflicts
128+
myCount := s.totalMembers - s.totalInactive - s.totalConflicts
81129

82130
// We have to fence the primary in case the active cluster is in the middle of a failover.
83131
if myCount >= quorum {
84-
if totalConflicts > 0 {
132+
if s.totalConflicts > 0 {
85133
return "", ErrZombieDiagnosisUndecided
86134
}
87-
return myHostname, nil
135+
return s.hostname, nil
88136
}
89137

90138
if highestTotal >= quorum {
@@ -93,3 +141,12 @@ func ZombieDiagnosis(myHostname string, total int, inactive int, active int, con
93141

94142
return "", ErrZombieDiagnosisUndecided
95143
}
144+
145+
func printDNASample(s *DNASample) {
146+
fmt.Printf("Registered members: %d, Active member(s): %d, Inactive member(s): %d, Conflicts detected: %d\n",
147+
s.totalMembers,
148+
s.totalActive,
149+
s.totalInactive,
150+
s.totalConflicts,
151+
)
152+
}

0 commit comments

Comments
 (0)