Skip to content

Commit d8c88b2

Browse files
authored
Bug fix: Resolve member by id rather than by ip (#186)
* Resolve member by id rather than by ip * Reregister ip * Bug fix * Fix lint
1 parent 46c7d02 commit d8c88b2

File tree

3 files changed

+54
-12
lines changed

3 files changed

+54
-12
lines changed

bin/restart-repmgrd

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
#!/bin/bash
2+
3+
# Send kill's default signal (SIGTERM) to the process whose PID is recorded in
# /tmp/repmgrd.pid. This script only stops the process; presumably a process
# supervisor brings repmgrd back up afterwards -- TODO confirm.
kill $(cat /tmp/repmgrd.pid)

internal/flypg/node.go

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -260,6 +260,10 @@ func (n *Node) PostInit(ctx context.Context) error {
260260
return fmt.Errorf("failed to resolve member role: %s", err)
261261
}
262262

263+
// Restart repmgrd in the event the IP changes for an already registered node.
264+
// This can happen if the underlying volume is moved to a different node.
265+
daemonRestartRequired := n.RepMgr.daemonRestartRequired(member)
266+
263267
switch member.Role {
264268
case PrimaryRoleName:
265269
// Verify cluster state to ensure we are the actual primary and not a zombie.
@@ -290,15 +294,20 @@ func (n *Node) PostInit(ctx context.Context) error {
290294
)
291295
}
292296

297+
// Re-register primary to apply any configuration changes.
298+
if err := n.RepMgr.registerPrimary(daemonRestartRequired); err != nil {
299+
return fmt.Errorf("failed to re-register existing primary: %s", err)
300+
}
301+
293302
// Readonly lock is set when disk capacity is dangerously high.
294303
if !ReadOnlyLockExists() {
295304
if err := BroadcastReadonlyChange(ctx, n, false); err != nil {
296305
return fmt.Errorf("failed to unset read-only: %s", err)
297306
}
298307
}
299308
case StandbyRoleName:
300-
// Register existing standby to take-on any configuration changes.
301-
if err := n.RepMgr.registerStandby(); err != nil {
309+
// Register existing standby to apply any configuration changes.
310+
if err := n.RepMgr.registerStandby(daemonRestartRequired); err != nil {
302311
return fmt.Errorf("failed to register existing standby: %s", err)
303312
}
304313
case WitnessRoleName:
@@ -307,7 +316,7 @@ func (n *Node) PostInit(ctx context.Context) error {
307316
return fmt.Errorf("failed to resolve primary member when updating witness: %s", err)
308317
}
309318

310-
// Register existing witness to take-on any configuration changes.
319+
// Register existing witness to apply any configuration changes.
311320
if err := n.RepMgr.registerWitness(primary.Hostname); err != nil {
312321
return fmt.Errorf("failed to register existing witness: %s", err)
313322
}
@@ -357,7 +366,7 @@ func (n *Node) PostInit(ctx context.Context) error {
357366
}
358367

359368
// Register ourselves as the primary
360-
if err := n.RepMgr.registerPrimary(); err != nil {
369+
if err := n.RepMgr.registerPrimary(false); err != nil {
361370
return fmt.Errorf("failed to register repmgr primary: %s", err)
362371
}
363372

@@ -395,7 +404,7 @@ func (n *Node) PostInit(ctx context.Context) error {
395404
}
396405
} else {
397406
log.Println("Registering standby")
398-
if err := n.RepMgr.registerStandby(); err != nil {
407+
if err := n.RepMgr.registerStandby(false); err != nil {
399408
return fmt.Errorf("failed to register new standby: %s", err)
400409
}
401410
}

internal/flypg/repmgr.go

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -217,18 +217,34 @@ func (r *RepMgr) resolveNodeID() (string, error) {
217217
return nodeID, nil
218218
}
219219

220-
func (r *RepMgr) registerPrimary() error {
220+
func (r *RepMgr) registerPrimary(restartDaemon bool) error {
221221
cmdStr := fmt.Sprintf("repmgr primary register -f %s -F", r.ConfigPath)
222-
_, err := utils.RunCommand(cmdStr, "postgres")
222+
if _, err := utils.RunCommand(cmdStr, "postgres"); err != nil {
223+
return fmt.Errorf("failed to register primary: %s", err)
224+
}
223225

224-
return err
226+
if restartDaemon {
227+
if err := r.restartDaemon(); err != nil {
228+
return fmt.Errorf("failed to restart repmgr daemon: %s", err)
229+
}
230+
}
231+
232+
return nil
225233
}
226234

227-
func (r *RepMgr) registerStandby() error {
235+
func (r *RepMgr) registerStandby(restartDaemon bool) error {
228236
cmdStr := fmt.Sprintf("repmgr standby register -f %s -F", r.ConfigPath)
229-
_, err := utils.RunCommand(cmdStr, "postgres")
237+
if _, err := utils.RunCommand(cmdStr, "postgres"); err != nil {
238+
return fmt.Errorf("failed to register standby: %s", err)
239+
}
230240

231-
return err
241+
if restartDaemon {
242+
if err := r.restartDaemon(); err != nil {
243+
return fmt.Errorf("failed to restart repmgr daemon: %s", err)
244+
}
245+
}
246+
247+
return nil
232248
}
233249

234250
func (r *RepMgr) registerWitness(primaryHostname string) error {
@@ -252,6 +268,15 @@ func (r *RepMgr) unregisterStandby(id int) error {
252268
return err
253269
}
254270

271+
func (*RepMgr) restartDaemon() error {
272+
_, err := utils.RunCommand("restart-repmgrd", "postgres")
273+
return err
274+
}
275+
276+
func (r *RepMgr) daemonRestartRequired(m *Member) bool {
277+
return m.Hostname != r.PrivateIP
278+
}
279+
255280
func (r *RepMgr) unregisterWitness(id int) error {
256281
cmdStr := fmt.Sprintf("repmgr witness unregister -f %s --node-id=%d", r.ConfigPath, id)
257282
_, err := utils.RunCommand(cmdStr, "postgres")
@@ -325,13 +350,18 @@ func (*RepMgr) Members(ctx context.Context, pg *pgx.Conn) ([]Member, error) {
325350
}
326351

327352
func (r *RepMgr) Member(ctx context.Context, conn *pgx.Conn) (*Member, error) {
353+
myID, err := r.resolveNodeID()
354+
if err != nil {
355+
return nil, fmt.Errorf("failed to resolve node id: %s", err)
356+
}
357+
328358
members, err := r.Members(ctx, conn)
329359
if err != nil {
330360
return nil, err
331361
}
332362

333363
for _, member := range members {
334-
if member.Hostname == r.PrivateIP {
364+
if fmt.Sprint(member.ID) == myID {
335365
return &member, nil
336366
}
337367
}

0 commit comments

Comments
 (0)