Skip to content

Commit c0da3a0

Browse files
authored
PBM-1426: Sporadic failures on physical restore with PITR (#1061)
Ensure that the oplog is applied on the Primary node during Physical Restore. When applying oplog chunks on the Primary node, it might happen that the Primary node has not yet reached the Primary state, simply because after its restart it is still in transition towards Primary: STARTUP -> STARTUP2 -> RECOVERING -> SECONDARY -> PRIMARY. To fix such a situation, PBM checks whether the Primary state has been reached. After the Primary state is reached, the oplog is applied on that Primary.
1 parent cb26f75 commit c0da3a0

File tree

1 file changed

+38
-0
lines changed

1 file changed

+38
-0
lines changed

pbm/restore/physical.go

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,6 +324,39 @@ func waitMgoShutdown(dbpath string) error {
324324
return nil
325325
}
326326

327+
// waitToBecomePrimary pauses execution until the RS member becomes the primary node; it polls topo.GetNodeInfo once per second and returns nil as soon as IsPrimary is set.
328+
// An error is returned on timeout (2 minutes) or when ctx is done (in which case ctx.Err() is returned).
329+
// An unexpected error while getting node info is only logged at debug level and polling continues.
330+
func (r *PhysRestore) waitToBecomePrimary(ctx context.Context, m *mongo.Client) error {
331+
tk := time.NewTicker(time.Second) // polling interval; the first check happens after the first tick
332+
defer tk.Stop()
333+
334+
tout := time.NewTimer(2 * time.Minute) // overall deadline for the whole wait
335+
defer tout.Stop()
336+
337+
for {
338+
select {
339+
case <-tk.C:
340+
inf, err := topo.GetNodeInfo(ctx, m)
341+
if err != nil {
342+
r.log.Debug("get node info error while waiting to become primary: %v", err)
343+
continue // the error is not returned; try again on the next tick
344+
}
345+
346+
if inf.IsPrimary {
347+
return nil // primary state reached, the caller may proceed
348+
}
349+
r.log.Debug("node: %s is still not primary, waiting for another cycle", inf.Me)
350+
351+
case <-tout.C:
352+
return errors.New("timeout while waiting the node to become primary")
353+
354+
case <-ctx.Done():
355+
return ctx.Err() // caller cancelled or its deadline expired
356+
}
357+
}
358+
}
359+
327360
// toState moves cluster to the given restore state.
328361
// All communication happens via files in the restore dir on storage.
329362
//
@@ -1347,6 +1380,11 @@ func (r *PhysRestore) replayOplog(
13471380
return errors.Wrap(err, "define mongo version")
13481381
}
13491382

1383+
err = r.waitToBecomePrimary(ctx, nodeConn)
1384+
if err != nil {
1385+
return errors.Wrap(err, "wait to become primary before applying oplog")
1386+
}
1387+
13501388
oplogOption := applyOplogOption{
13511389
start: &from,
13521390
end: &to,

0 commit comments

Comments
 (0)