Skip to content

Commit 4d211f4

Browse files
authored
Merge pull request #67 from fly-apps/runtime-zombie-analysis-2
Evaluate quorum at runtime
2 parents cbe1103 + c158f2f commit 4d211f4

File tree

7 files changed

+122
-38
lines changed

7 files changed

+122
-38
lines changed

cmd/event_handler/main.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package main
22

33
import (
44
"context"
5+
"errors"
56
"flag"
67
"fmt"
78
"log"
@@ -10,6 +11,7 @@ import (
1011
"time"
1112

1213
"github.com/fly-apps/postgres-flex/internal/flypg"
14+
"github.com/jackc/pgx/v5"
1315
)
1416

1517
const eventLogFile = "/data/event.log"
@@ -26,6 +28,8 @@ func main() {
2628
details := flag.String("details", "", "details")
2729
flag.Parse()
2830

31+
ctx := context.Background()
32+
2933
logFile, err := os.OpenFile(eventLogFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0644)
3034
if err != nil {
3135
fmt.Printf("failed to open event log: %s", err)
@@ -91,6 +95,38 @@ func main() {
9195
os.Exit(1)
9296
}
9397

98+
case "child_node_disconnect", "child_node_reconnect", "child_node_new_connect":
99+
node, err := flypg.NewNode()
100+
if err != nil {
101+
log.Printf("failed to initialize node: %s", err)
102+
os.Exit(1)
103+
}
104+
105+
conn, err := node.RepMgr.NewLocalConnection(ctx)
106+
if err != nil {
107+
log.Printf("failed to open local connection: %s", err)
108+
os.Exit(1)
109+
}
110+
defer conn.Close(ctx)
111+
112+
member, err := node.RepMgr.Member(ctx, conn)
113+
if err != nil {
114+
log.Printf("failed to resolve member: %s", err)
115+
os.Exit(1)
116+
}
117+
118+
if member.Role != flypg.PrimaryRoleName {
119+
// We should never get here.
120+
log.Println("skipping since we are not the primary")
121+
os.Exit(0)
122+
}
123+
124+
if err := evaluateClusterState(ctx, conn, node); err != nil {
125+
log.Printf("failed to evaluate cluster state: %s", err)
126+
os.Exit(0)
127+
}
128+
129+
os.Exit(0)
94130
default:
95131
// noop
96132
}
@@ -118,3 +154,50 @@ func reconfigurePGBouncer(id int) error {
118154

119155
return nil
120156
}
157+
158+
func evaluateClusterState(ctx context.Context, conn *pgx.Conn, node *flypg.Node) error {
159+
standbys, err := node.RepMgr.StandbyMembers(ctx, conn)
160+
if err != nil {
161+
if !errors.Is(err, pgx.ErrNoRows) {
162+
return fmt.Errorf("failed to query standbys")
163+
}
164+
}
165+
166+
sample, err := flypg.TakeDNASample(ctx, node, standbys)
167+
if err != nil {
168+
return fmt.Errorf("failed to evaluate cluster data: %s", err)
169+
}
170+
171+
log.Println(flypg.DNASampleString(sample))
172+
173+
primary, err := flypg.ZombieDiagnosis(sample)
174+
if errors.Is(err, flypg.ErrZombieDiagnosisUndecided) || errors.Is(err, flypg.ErrZombieDiscovered) {
175+
// Quarantine primary
176+
if err := flypg.Quarantine(ctx, conn, node, primary); err != nil {
177+
return fmt.Errorf("failed to quarantine failed primary: %s", err)
178+
}
179+
180+
return fmt.Errorf("primary has been quarantined: %s", err)
181+
} else if err != nil {
182+
return fmt.Errorf("failed to run zombie diagnosis: %s", err)
183+
}
184+
185+
// Clear zombie lock if it exists
186+
if flypg.ZombieLockExists() {
187+
log.Println("Clearing zombie lock and enabling read/write")
188+
if err := flypg.RemoveZombieLock(); err != nil {
189+
return fmt.Errorf("failed to remove zombie lock: %s", err)
190+
}
191+
192+
log.Println("Broadcasting readonly state change")
193+
if err := flypg.BroadcastReadonlyChange(ctx, node, false); err != nil {
194+
log.Printf("errors while disabling readonly: %s", err)
195+
}
196+
}
197+
198+
if err := node.PGBouncer.ConfigurePrimary(ctx, primary, true); err != nil {
199+
return fmt.Errorf("failed to reconfigure pgbouncer primary %s", err)
200+
}
201+
202+
return nil
203+
}

internal/flypg/node.go

Lines changed: 10 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ func (n *Node) Init(ctx context.Context) error {
145145

146146
if ZombieLockExists() {
147147
fmt.Println("Zombie lock detected!")
148-
primaryStr, err := readZombieLock()
148+
primaryStr, err := ReadZombieLock()
149149
if err != nil {
150150
return fmt.Errorf("failed to read zombie lock: %s", primaryStr)
151151
}
@@ -172,7 +172,7 @@ func (n *Node) Init(ctx context.Context) error {
172172
// Confirm that our rejoin target still identifies itself as the primary.
173173
if primary.Hostname != ip.String() {
174174
// Clear the zombie.lock file so we can attempt to re-resolve the correct primary.
175-
if err := removeZombieLock(); err != nil {
175+
if err := RemoveZombieLock(); err != nil {
176176
return fmt.Errorf("failed to remove zombie lock: %s", err)
177177
}
178178

@@ -191,7 +191,7 @@ func (n *Node) Init(ctx context.Context) error {
191191

192192
// TODO - Wait for target cluster to register self as a standby.
193193

194-
if err := removeZombieLock(); err != nil {
194+
if err := RemoveZombieLock(); err != nil {
195195
return fmt.Errorf("failed to remove zombie lock: %s", err)
196196
}
197197

@@ -348,45 +348,25 @@ func (n *Node) PostInit(ctx context.Context) error {
348348
return fmt.Errorf("failed to resolve cluster metrics: %s", err)
349349
}
350350

351-
printDNASample(sample)
351+
fmt.Println(DNASampleString(sample))
352352

353353
// Evaluate whether we are a zombie or not.
354354
primary, err := ZombieDiagnosis(sample)
355355
if errors.Is(err, ErrZombieDiagnosisUndecided) {
356356
fmt.Println("Unable to confirm that we are the true primary!")
357357

358-
fmt.Println("Writing zombie.lock file.")
359-
if err := writeZombieLock(""); err != nil {
360-
return fmt.Errorf("failed to set zombie lock: %s", err)
358+
if err := Quarantine(ctx, conn, n, primary); err != nil {
359+
return fmt.Errorf("failed to quarantine failed primary: %s", err)
361360
}
362361

363-
fmt.Println("Turning all user-created databases readonly.")
364-
if err := BroadcastReadonlyChange(ctx, n, true); err != nil {
365-
return fmt.Errorf("failed to set read-only: %s", err)
366-
}
367-
368-
// TODO - Add link to docs
369-
fmt.Println("Please refer to following documentation for more information: <insert-doc-link-here>.")
370-
371362
} else if errors.Is(err, ErrZombieDiscovered) {
372-
fmt.Println("Zombie primary discovered!")
373363
fmt.Printf("The majority of registered members agree that '%s' is the real primary.\n", primary)
374364

375-
fmt.Printf("Reconfiguring PGBouncer to point to '%s'\n", primary)
376-
if err := n.PGBouncer.ConfigurePrimary(ctx, primary, true); err != nil {
377-
return fmt.Errorf("failed to reconfigure pgbouncer: %s", err)
378-
}
379-
380-
fmt.Println("Writing zombie.lock file")
381-
if err := writeZombieLock(primary); err != nil {
382-
return fmt.Errorf("failed to set zombie lock: %s", err)
383-
}
384-
385-
fmt.Println("Turning user-created databases read-only")
386-
if err := BroadcastReadonlyChange(ctx, n, true); err != nil {
387-
return fmt.Errorf("failed to set read-only: %s", err)
365+
if err := Quarantine(ctx, conn, n, primary); err != nil {
366+
return fmt.Errorf("failed to quarantine failed primary: %s", err)
388367
}
389-
368+
// Issue panic to force a process restart so we can attempt to rejoin
369+
// the cluster we've diverged from.
390370
panic(err)
391371
} else if err != nil {
392372
return fmt.Errorf("failed to run zombie diagnosis: %s", err)

internal/flypg/pgbouncer.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ func (p *PGBouncer) ConfigurePrimary(ctx context.Context, primary string, reload
6262
if reload {
6363
err = p.reloadConfig(ctx)
6464
if err != nil {
65-
fmt.Printf("failed to reconfigure pgbouncer primary %s\n", err)
65+
return fmt.Errorf("failed to reconfigure pgbouncer primary: %s", err)
6666
}
6767
}
6868
return nil

internal/flypg/readonly.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ func changeReadOnlyState(ctx context.Context, n *Node, enable bool) error {
124124

125125
databases, err := admin.ListDatabases(ctx, conn)
126126
if err != nil {
127-
return err
127+
return fmt.Errorf("failed to list database: %s", err)
128128
}
129129

130130
var dbNames []string

internal/flypg/repmgr.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@ func (r *RepMgr) setDefaults() {
123123
"promote_command": fmt.Sprintf("'repmgr standby promote -f %s --log-to-file'", r.ConfigPath),
124124
"follow_command": fmt.Sprintf("'repmgr standby follow -f %s --log-to-file --upstream-node-id=%%n'", r.ConfigPath),
125125
"event_notification_command": fmt.Sprintf("'/usr/local/bin/event_handler -node-id %%n -event %%e -success %%s -details \"%%d\" -new-node-id \\'%%p\\''"),
126-
"event_notifications": "'repmgrd_failover_promote,standby_promote,standby_follow'",
126+
"event_notifications": "'repmgrd_failover_promote,standby_promote,standby_follow,child_node_disconnect,child_node_reconnect,child_node_new_connect'",
127127
"location": r.Region,
128128
"primary_visibility_consensus": true,
129129
"failover_validation_command": fmt.Sprintf("'/usr/local/bin/failover_validation -visible-nodes %%v -total-nodes %%t'"),

internal/flypg/restore.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ func clearLocks() error {
210210
}
211211
}
212212

213-
if err := removeZombieLock(); err != nil {
213+
if err := RemoveZombieLock(); err != nil {
214214
if !os.IsNotExist(err) {
215215
return fmt.Errorf("failed to remove zombie lock pre-restore: %s", err)
216216
}

internal/flypg/zombie.go

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ import (
55
"errors"
66
"fmt"
77
"os"
8+
9+
"github.com/jackc/pgx/v5"
810
)
911

1012
var (
@@ -35,15 +37,15 @@ func writeZombieLock(hostname string) error {
3537
return nil
3638
}
3739

38-
func removeZombieLock() error {
40+
// RemoveZombieLock deletes the zombie lock file at /data/zombie.lock.
// It returns the error from os.Remove unchanged, so callers can test it with
// os.IsNotExist when a missing lock is acceptable.
func RemoveZombieLock() error {
	return os.Remove("/data/zombie.lock")
}
4547

46-
func readZombieLock() (string, error) {
48+
func ReadZombieLock() (string, error) {
4749
body, err := os.ReadFile("/data/zombie.lock")
4850
if err != nil {
4951
return "", err
@@ -142,8 +144,27 @@ func ZombieDiagnosis(s *DNASample) (string, error) {
142144
return "", ErrZombieDiagnosisUndecided
143145
}
144146

145-
func printDNASample(s *DNASample) {
146-
fmt.Printf("Registered members: %d, Active member(s): %d, Inactive member(s): %d, Conflicts detected: %d\n",
147+
func Quarantine(ctx context.Context, conn *pgx.Conn, n *Node, primary string) error {
148+
if primary != "" {
149+
if err := n.PGBouncer.ConfigurePrimary(ctx, primary, true); err != nil {
150+
return fmt.Errorf("failed to reconfigure pgbouncer: %s", err)
151+
}
152+
}
153+
154+
fmt.Println("Writing zombie.lock file.")
155+
if err := writeZombieLock(""); err != nil {
156+
return fmt.Errorf("failed to set zombie lock: %s", err)
157+
}
158+
159+
if err := BroadcastReadonlyChange(ctx, n, true); err != nil {
160+
return fmt.Errorf("failed to set read-only: %s", err)
161+
}
162+
163+
return nil
164+
}
165+
166+
func DNASampleString(s *DNASample) string {
167+
return fmt.Sprintf("Registered members: %d, Active member(s): %d, Inactive member(s): %d, Conflicts detected: %d",
147168
s.totalMembers,
148169
s.totalActive,
149170
s.totalInactive,

0 commit comments

Comments
 (0)