Skip to content

Commit e6ad546

Browse files
committed
Adjusting how Zombie state is evaluated on boot.
1 parent ce2f772 commit e6ad546

File tree

2 files changed

+13
-12
lines changed

2 files changed

+13
-12
lines changed

internal/flypg/node.go

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -231,14 +231,6 @@ func (n *Node) Init(ctx context.Context) error {
231231

232232
// PostInit are operations that need to be executed against a running Postgres on boot.
233233
func (n *Node) PostInit(ctx context.Context) error {
234-
if ZombieLockExists() {
235-
log.Println("[ERROR] Manual intervention required.")
236-
log.Println("[ERROR] If a new primary has been established, consider adding a new replica with `fly machines clone <primary-machine-id>` and then remove this member.")
237-
log.Println("[ERROR] Sleeping for 5 minutes.")
238-
time.Sleep(5 * time.Minute)
239-
return fmt.Errorf("unrecoverable zombie")
240-
}
241-
242234
// Use the Postgres user on boot, since our internal user may not have been created yet.
243235
conn, err := n.NewLocalConnection(ctx, "postgres", n.OperatorCredentials)
244236
if err != nil {
@@ -297,14 +289,22 @@ func (n *Node) PostInit(ctx context.Context) error {
297289
return fmt.Errorf("failed to run zombie diagnosis: %s", err)
298290
}
299291

300-
// This should never happen
301-
if primary != n.PrivateIP {
292+
// This should never happen, but check anyway for correctness.
293+
if primary != n.Hostname() {
302294
return fmt.Errorf("resolved primary '%s' does not match ourself '%s'. this should not happen",
303295
primary,
304-
n.PrivateIP,
296+
n.Hostname(),
305297
)
306298
}
307299

300+
// Clear the zombie lock if it exists.
301+
if ZombieLockExists() {
302+
log.Println("[INFO] Clearing zombie lock and re-enabling read/write")
303+
if err := RemoveZombieLock(); err != nil {
304+
return fmt.Errorf("failed to remove zombie lock: %s", err)
305+
}
306+
}
307+
308308
// Re-register primary to apply any configuration changes.
309309
if err := n.RepMgr.registerPrimary(daemonRestartRequired); err != nil {
310310
return fmt.Errorf("failed to re-register existing primary: %s", err)

internal/flypg/zombie.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,7 @@ func TakeDNASample(ctx context.Context, node *Node, standbys []Member) (*DNASamp
117117
sample.totalActive++
118118

119119
// Record conflict when primary name matches neither our machine ID nor our private IP
120-
if primary.Hostname != node.Hostname() {
120+
if primary.Hostname != node.Hostname() && primary.Hostname != node.PrivateIP {
121121
sample.totalConflicts++
122122
sample.conflictMap[primary.Hostname]++
123123
}
@@ -244,6 +244,7 @@ func handleZombieLock(ctx context.Context, n *Node) error {
244244
// TODO - Provide link to documentation on how to address this
245245
log.Println("[WARN] Zombie lock file does not contain a hostname.")
246246
log.Println("[WARN] This likely means that we were unable to determine who the real primary is.")
247+
log.Println("[WARN] If a new primary has been established, consider adding a new replica with `fly machines clone <primary-machine-id>` and then remove this member.")
247248
}
248249

249250
return nil

0 commit comments

Comments
 (0)