1
1
package flypg
2
2
3
3
import (
4
+ "context"
4
5
"errors"
6
+ "fmt"
5
7
"os"
6
8
)
7
9
@@ -50,41 +52,87 @@ func readZombieLock() (string, error) {
50
52
return string (body ), nil
51
53
}
52
54
53
- func ZombieDiagnosis (myHostname string , total int , inactive int , active int , conflictMap map [string ]int ) (string , error ) {
55
// DNASample is a snapshot of how the cluster looks from the local node's
// point of view. It is populated by TakeDNASample and evaluated by
// ZombieDiagnosis.
type DNASample struct {
	// hostname identifies the local node (TakeDNASample fills it from
	// node.PrivateIP).
	hostname string

	// totalMembers counts the local node plus every standby that was sampled.
	// totalActive counts members that answered and could name a primary; the
	// local node is included.
	// totalInactive counts standbys that could not be reached or could not
	// resolve a primary.
	// totalConflicts counts active standbys that named a primary other than
	// the local node.
	totalMembers, totalActive, totalInactive, totalConflicts int

	// conflictMap tallies, per reported primary hostname, how many standbys
	// disagreed with the local node in favour of that hostname.
	conflictMap map[string]int
}
63
+
64
+ func TakeDNASample (ctx context.Context , node * Node , standbys []Member ) (* DNASample , error ) {
65
+ sample := & DNASample {
66
+ hostname : node .PrivateIP ,
67
+ totalMembers : len (standbys ) + 1 ,
68
+ totalActive : 1 ,
69
+ totalInactive : 0 ,
70
+ totalConflicts : 0 ,
71
+ conflictMap : map [string ]int {},
72
+ }
73
+
74
+ for _ , standby := range standbys {
75
+ // Check for connectivity
76
+ mConn , err := node .RepMgr .NewRemoteConnection (ctx , standby .Hostname )
77
+ if err != nil {
78
+ fmt .Printf ("failed to connect to %s" , standby .Hostname )
79
+ sample .totalInactive ++
80
+ continue
81
+ }
82
+ defer mConn .Close (ctx )
83
+
84
+ // Verify the primary
85
+ primary , err := node .RepMgr .PrimaryMember (ctx , mConn )
86
+ if err != nil {
87
+ fmt .Printf ("failed to resolve primary from standby %s" , standby .Hostname )
88
+ sample .totalInactive ++
89
+ continue
90
+ }
91
+
92
+ sample .totalActive ++
93
+
94
+ // Record conflict when primary doesn't match.
95
+ if primary .Hostname != node .PrivateIP {
96
+ sample .totalConflicts ++
97
+ sample .conflictMap [primary .Hostname ]++
98
+ }
99
+ }
100
+
101
+ return sample , nil
102
+ }
103
+
104
+ func ZombieDiagnosis (s * DNASample ) (string , error ) {
54
105
// We can short-circuit a single node cluster.
55
- if total == 1 {
56
- return myHostname , nil
106
+ if s . totalMembers == 1 {
107
+ return s . hostname , nil
57
108
}
58
109
59
- quorum := total / 2 + 1
110
+ quorum := s . totalMembers / 2 + 1
60
111
61
- if active < quorum {
112
+ if s . totalActive < quorum {
62
113
return "" , ErrZombieDiagnosisUndecided
63
114
}
64
115
65
116
topCandidate := ""
66
117
highestTotal := 0
67
- totalConflicts := 0
68
118
69
119
// Evaluate conflicts and calculate top referenced primary
70
- for hostname , total := range conflictMap {
71
- totalConflicts += total
72
-
120
+ for hostname , total := range s .conflictMap {
73
121
if total > highestTotal {
74
122
highestTotal = total
75
123
topCandidate = hostname
76
124
}
77
125
}
78
126
79
127
// Calculate our references
80
- myCount := total - inactive - totalConflicts
128
+ myCount := s . totalMembers - s . totalInactive - s . totalConflicts
81
129
82
130
// We have to fence the primary in case the active cluster is in the middle of a failover.
83
131
if myCount >= quorum {
84
- if totalConflicts > 0 {
132
+ if s . totalConflicts > 0 {
85
133
return "" , ErrZombieDiagnosisUndecided
86
134
}
87
- return myHostname , nil
135
+ return s . hostname , nil
88
136
}
89
137
90
138
if highestTotal >= quorum {
@@ -93,3 +141,12 @@ func ZombieDiagnosis(myHostname string, total int, inactive int, active int, con
93
141
94
142
return "" , ErrZombieDiagnosisUndecided
95
143
}
144
+
145
+ func printDNASample (s * DNASample ) {
146
+ fmt .Printf ("Registered members: %d, Active member(s): %d, Inactive member(s): %d, Conflicts detected: %d\n " ,
147
+ s .totalMembers ,
148
+ s .totalActive ,
149
+ s .totalInactive ,
150
+ s .totalConflicts ,
151
+ )
152
+ }
0 commit comments