Skip to content

Commit 043469e

Browse files
authored
Merge pull request #109 from fly-apps/clusterStateMonitor
Replace readOnlyStateMonitor with clusterStateMonitor
2 parents 5d7c1eb + e12f788 commit 043469e

File tree

6 files changed

+91
-122
lines changed

6 files changed

+91
-122
lines changed

cmd/event_handler/main.go

Lines changed: 3 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -71,42 +71,26 @@ func main() {
7171
}
7272

7373
func evaluateClusterState(ctx context.Context, conn *pgx.Conn, node *flypg.Node) error {
74-
standbys, err := node.RepMgr.StandbyMembers(ctx, conn)
75-
if err != nil {
76-
if !errors.Is(err, pgx.ErrNoRows) {
77-
return fmt.Errorf("failed to query standbys")
78-
}
79-
}
80-
81-
sample, err := flypg.TakeDNASample(ctx, node, standbys)
82-
if err != nil {
83-
return fmt.Errorf("failed to evaluate cluster data: %s", err)
84-
}
85-
86-
log.Println(flypg.DNASampleString(sample))
87-
88-
primary, err := flypg.ZombieDiagnosis(sample)
74+
primary, err := node.EvaluateClusterState(ctx, conn)
8975
if errors.Is(err, flypg.ErrZombieDiagnosisUndecided) || errors.Is(err, flypg.ErrZombieDiscovered) {
90-
// Quarantine primary
9176
if err := flypg.Quarantine(ctx, conn, node, primary); err != nil {
9277
return fmt.Errorf("failed to quarantine failed primary: %s", err)
9378
}
94-
9579
return fmt.Errorf("primary has been quarantined: %s", err)
9680
} else if err != nil {
9781
return fmt.Errorf("failed to run zombie diagnosis: %s", err)
9882
}
9983

10084
// Clear zombie lock if it exists
10185
if flypg.ZombieLockExists() {
102-
log.Println("Clearing zombie lock and enabling read/write")
86+
log.Println("Clearing zombie lock and re-enabling read/write")
10387
if err := flypg.RemoveZombieLock(); err != nil {
10488
return fmt.Errorf("failed to remove zombie lock: %s", err)
10589
}
10690

10791
log.Println("Broadcasting readonly state change")
10892
if err := flypg.BroadcastReadonlyChange(ctx, node, false); err != nil {
109-
log.Printf("errors while disabling readonly: %s", err)
93+
log.Printf("failed to disable readonly: %s", err)
11094
}
11195
}
11296

cmd/monitor/main.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import (
1212
var (
1313
deadMemberMonitorFrequency = time.Hour * 1
1414
replicationStateMonitorFrequency = time.Hour * 1
15-
readonlyStateMonitorFrequency = time.Minute * 1
15+
clusterStateMonitorFrequency = time.Minute * 15
1616

1717
defaultDeadMemberRemovalThreshold = time.Hour * 12
1818
defaultInactiveSlotRemovalThreshold = time.Hour * 12
@@ -35,8 +35,8 @@ func main() {
3535
}()
3636

3737
// Cluster state monitor
38-
log.Println("Monitoring readonly state")
39-
go monitorReadOnly(ctx, node)
38+
log.Println("Monitoring cluster state")
39+
go monitorClusterState(ctx, node)
4040

4141
// Replication slot monitor
4242
log.Println("Monitoring replication slots")
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
package main
2+
3+
import (
4+
"context"
5+
"errors"
6+
"fmt"
7+
"log"
8+
"time"
9+
10+
"github.com/fly-apps/postgres-flex/internal/flypg"
11+
)
12+
13+
func monitorClusterState(ctx context.Context, node *flypg.Node) {
14+
ticker := time.NewTicker(clusterStateMonitorFrequency)
15+
defer ticker.Stop()
16+
for range ticker.C {
17+
if err := clusterStateMonitorTick(ctx, node); err != nil {
18+
log.Printf("clusterStateMonitorTick failed with: %s", err)
19+
}
20+
}
21+
}
22+
23+
func clusterStateMonitorTick(ctx context.Context, node *flypg.Node) error {
24+
conn, err := node.RepMgr.NewLocalConnection(ctx)
25+
if err != nil {
26+
return fmt.Errorf("failed to open local connection: %s", err)
27+
}
28+
defer conn.Close(ctx)
29+
30+
member, err := node.RepMgr.Member(ctx, conn)
31+
if err != nil {
32+
return fmt.Errorf("failed to query local member: %s", err)
33+
}
34+
35+
// We only need to monitor the primary
36+
if member.Role != flypg.PrimaryRoleName {
37+
return nil
38+
}
39+
40+
primary, err := node.EvaluateClusterState(ctx, conn)
41+
if errors.Is(err, flypg.ErrZombieDiagnosisUndecided) || errors.Is(err, flypg.ErrZombieDiscovered) {
42+
if err := flypg.Quarantine(ctx, conn, node, primary); err != nil {
43+
return fmt.Errorf("failed to quarantine failed primary: %s", err)
44+
}
45+
return fmt.Errorf("primary has been quarantined: %s", err)
46+
} else if err != nil {
47+
return fmt.Errorf("failed to run zombie diagnosis: %s", err)
48+
}
49+
50+
// Clear zombie lock if it exists
51+
if flypg.ZombieLockExists() {
52+
log.Println("Clearing zombie lock and enabling read/write")
53+
if err := flypg.RemoveZombieLock(); err != nil {
54+
return fmt.Errorf("failed to remove zombie lock: %s", err)
55+
}
56+
57+
log.Println("Broadcasting readonly state change")
58+
if err := flypg.BroadcastReadonlyChange(ctx, node, false); err != nil {
59+
log.Printf("errors while disabling readonly: %s", err)
60+
}
61+
}
62+
63+
return nil
64+
}

cmd/monitor/monitor_readonly.go

Lines changed: 0 additions & 78 deletions
This file was deleted.

internal/flypg/node.go

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -326,30 +326,12 @@ func (n *Node) PostInit(ctx context.Context) error {
326326

327327
switch role {
328328
case PrimaryRoleName:
329-
standbys, err := repmgr.StandbyMembers(ctx, conn)
330-
if err != nil {
331-
if !errors.Is(err, pgx.ErrNoRows) {
332-
return fmt.Errorf("failed to query standbys")
333-
}
334-
}
335-
336-
// Collect sample data from registered standbys
337-
sample, err := TakeDNASample(ctx, n, standbys)
338-
if err != nil {
339-
return fmt.Errorf("failed to resolve cluster metrics: %s", err)
340-
}
341-
342-
fmt.Println(DNASampleString(sample))
343-
344-
// Evaluate whether we are a zombie or not.
345-
primary, err := ZombieDiagnosis(sample)
329+
primary, err := n.EvaluateClusterState(ctx, conn)
346330
if errors.Is(err, ErrZombieDiagnosisUndecided) {
347331
fmt.Println("Unable to confirm that we are the true primary!")
348-
349332
if err := Quarantine(ctx, conn, n, primary); err != nil {
350333
return fmt.Errorf("failed to quarantine failed primary: %s", err)
351334
}
352-
353335
} else if errors.Is(err, ErrZombieDiscovered) {
354336
fmt.Printf("The majority of registered members agree that '%s' is the real primary.\n", primary)
355337

@@ -678,3 +660,21 @@ func setDirOwnership() error {
678660
_, err = cmd.Output()
679661
return err
680662
}
663+
664+
func (n *Node) EvaluateClusterState(ctx context.Context, conn *pgx.Conn) (string, error) {
665+
standbys, err := n.RepMgr.StandbyMembers(ctx, conn)
666+
if err != nil {
667+
if !errors.Is(err, pgx.ErrNoRows) {
668+
return "", fmt.Errorf("failed to query standbys")
669+
}
670+
}
671+
672+
sample, err := TakeDNASample(ctx, n, standbys)
673+
if err != nil {
674+
return "", fmt.Errorf("failed to evaluate cluster data: %s", err)
675+
}
676+
677+
fmt.Println(DNASampleString(sample))
678+
679+
return ZombieDiagnosis(sample)
680+
}

internal/flypg/zombie.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ func TakeDNASample(ctx context.Context, node *Node, standbys []Member) (*DNASamp
8989
// Check for connectivity
9090
mConn, err := node.RepMgr.NewRemoteConnection(ctx, standby.Hostname)
9191
if err != nil {
92-
fmt.Printf("failed to connect to %s", standby.Hostname)
92+
fmt.Printf("failed to connect to %s\n", standby.Hostname)
9393
sample.totalInactive++
9494
continue
9595
}
@@ -98,7 +98,7 @@ func TakeDNASample(ctx context.Context, node *Node, standbys []Member) (*DNASamp
9898
// Verify the primary
9999
primary, err := node.RepMgr.PrimaryMember(ctx, mConn)
100100
if err != nil {
101-
fmt.Printf("failed to resolve primary from standby %s", standby.Hostname)
101+
fmt.Printf("failed to resolve primary from standby %s\n", standby.Hostname)
102102
sample.totalInactive++
103103
continue
104104
}
@@ -157,7 +157,6 @@ func ZombieDiagnosis(s *DNASample) (string, error) {
157157
}
158158

159159
func Quarantine(ctx context.Context, conn *pgx.Conn, n *Node, primary string) error {
160-
fmt.Println("Writing zombie.lock file.")
161160
if err := writeZombieLock(primary); err != nil {
162161
return fmt.Errorf("failed to set zombie lock: %s", err)
163162
}

0 commit comments

Comments
 (0)