@@ -7,16 +7,14 @@ import (
7
7
"time"
8
8
9
9
"github.com/fly-apps/postgres-flex/pkg/flypg"
10
+ "github.com/fly-apps/postgres-flex/pkg/flypg/admin"
11
+ "github.com/jackc/pgx/v4"
10
12
)
11
13
12
- // We need to adjust this to look at replication slots.
13
- // Pull ids from repmgr.show_nodes
14
- // Pull replication_slot ids that are inactive.
15
-
16
- // Remove replication slot if the slot_name id is inactive and is not
17
- // present as a repmgr node.
18
-
19
- var Minute int64 = 60
14
// Timing settings for the monitor loop in main.
var (
	// monitorFrequency is the interval handed to the ticker that drives
	// each monitoring pass.
	monitorFrequency = 5 * time.Minute

	// deadMemberRemovalThreshold is how long a standby may go without a
	// successful connection before main attempts to unregister it.
	deadMemberRemovalThreshold = 24 * time.Hour
)
20
18
21
19
func main () {
22
20
ctx := context .Background ()
@@ -26,44 +24,91 @@ func main() {
26
24
os .Exit (1 )
27
25
}
28
26
27
+ // TODO - We should connect using the flypgadmin user so we can differentiate between
28
+ // internal admin connection usage and the actual repmgr process.
29
29
conn , err := flypgNode .RepMgr .NewLocalConnection (ctx )
30
30
if err != nil {
31
31
fmt .Printf ("failed to open local connection: %s\n " , err )
32
32
os .Exit (1 )
33
33
}
34
34
35
- ticker := time .NewTicker (5 * time .Second )
36
- defer ticker .Stop ()
35
+ seenAt := map [int ]time.Time {}
37
36
38
- seenAt := map [int ]int64 {}
37
+ ticker := time .NewTicker (monitorFrequency )
38
+ defer ticker .Stop ()
39
39
40
- for _ = range ticker .C {
40
+ for range ticker .C {
41
41
role , err := flypgNode .RepMgr .CurrentRole (ctx , conn )
42
42
if err != nil {
43
- fmt .Printf ("Failed to check role: %s" , err )
43
+ fmt .Printf ("Failed to check role: %s\n " , err )
44
44
continue
45
45
}
46
+
46
47
if role != flypg .PrimaryRoleName {
47
48
continue
48
49
}
50
+
49
51
standbys , err := flypgNode .RepMgr .Standbys (ctx , conn )
50
52
if err != nil {
51
- fmt .Printf ("Failed to get standbys: %s" , err )
53
+ fmt .Printf ("Failed to query standbys: %s\n " , err )
52
54
continue
53
55
}
56
+
54
57
for _ , standby := range standbys {
55
58
newConn , err := flypgNode .RepMgr .NewRemoteConnection (ctx , standby .Ip )
59
+ defer newConn .Close (ctx )
56
60
if err != nil {
57
- if time .Now ().Unix ()- seenAt [standby .Id ] >= 10 * Minute {
61
+ // TODO - Verify the exception that's getting thrown.
62
+ if time .Now ().Sub (seenAt [standby .Id ]) >= deadMemberRemovalThreshold {
58
63
if err := flypg .UnregisterMemberByID (ctx , int32 (standby .Id )); err != nil {
59
- fmt .Println (err .Error ())
64
+ fmt .Printf ("failed to unregister member %d: %v\n " , standby .Id , err .Error ())
65
+ continue
60
66
}
61
67
62
68
delete (seenAt , standby .Id )
63
69
}
64
- } else {
65
- seenAt [standby .Id ] = time .Now ().Unix ()
66
- newConn .Close (ctx )
70
+
71
+ continue
72
+ }
73
+
74
+ seenAt [standby .Id ] = time .Now ()
75
+ }
76
+
77
+ removeOrphanedReplicationSlots (ctx , conn , standbys )
78
+ }
79
+ }
80
+
81
+ func removeOrphanedReplicationSlots (ctx context.Context , conn * pgx.Conn , standbys []flypg.Standby ) {
82
+ var orphanedSlots []admin.ReplicationSlot
83
+
84
+ slots , err := admin .ListReplicationSlots (ctx , conn )
85
+ if err != nil {
86
+ fmt .Printf ("failed to list replication slots: %s" , err )
87
+ }
88
+
89
+ // Identify orphaned replication slots. active replication slots with the listed standbys
90
+ for _ , slot := range slots {
91
+ matchFound := false
92
+ for _ , standby := range standbys {
93
+ if slot .MemberID == int32 (standby .Id ) {
94
+ matchFound = true
95
+ }
96
+ }
97
+
98
+ if ! matchFound && ! slot .Active {
99
+ orphanedSlots = append (orphanedSlots , slot )
100
+ }
101
+ }
102
+
103
+ if len (orphanedSlots ) > 0 {
104
+ fmt .Printf ("%d orphaned replication slots detected" , len (orphanedSlots ))
105
+
106
+ for _ , slot := range orphanedSlots {
107
+ fmt .Printf ("dropping replication slot: %s" , slot .Name )
108
+
109
+ if err := admin .DropReplicationSlot (ctx , conn , slot .Name ); err != nil {
110
+ fmt .Printf ("failed to drop replication slot %s: %v" , slot .Name , err )
111
+ continue
67
112
}
68
113
}
69
114
}
0 commit comments