Skip to content

Commit 97431be

Browse files
authored
PBM-1553: Improve mongod restart procedure during physical restore (#1132)
* Add timeout for waiting for mongod.lock file * Add check that mongod port is released: an additional check is performed to be sure that mongod frees its port. During subsequent restarts of mongod, it can happen that mongod start fails due to the used port. * Add tests and doc for waitMgoFreePort
1 parent 51f3c7d commit 97431be

File tree

2 files changed

+93
-14
lines changed

2 files changed

+93
-14
lines changed

pbm/restore/physical.go

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,10 @@ const (
5555

5656
defaultPort = 27017
5757

58-
tryConnCount = 5
59-
tryConnTimeout = 5 * time.Minute
58+
tryConnCount = 5
59+
tryConnTimeout = 5 * time.Minute
60+
mongodLockTimeout = 30 * time.Minute
61+
mongodPortTimeout = 5 * time.Minute
6062

6163
internalMongodLog = "pbm.restore.log"
6264
)
@@ -452,22 +454,53 @@ func nodeShutdown(ctx context.Context, m *mongo.Client) error {
452454
return err
453455
}
454456

457+
// waitMgoShutdown waits until mongod releases mongod.lock file within dbpath dir.
458+
// In case of timeout or unexpected error it'll return error.
455459
func waitMgoShutdown(dbpath string) error {
456460
tk := time.NewTicker(time.Second)
457461
defer tk.Stop()
458462

459-
for range tk.C {
460-
f, err := os.Stat(path.Join(dbpath, mongofslock))
461-
if err != nil {
462-
return errors.Wrapf(err, "check for lock file %s", path.Join(dbpath, mongofslock))
463-
}
463+
to := time.After(mongodLockTimeout)
464464

465-
if f.Size() == 0 {
466-
return nil
465+
for {
466+
select {
467+
case <-tk.C:
468+
f, err := os.Stat(path.Join(dbpath, mongofslock))
469+
if err != nil {
470+
return errors.Wrapf(err, "check for lock file %s", path.Join(dbpath, mongofslock))
471+
}
472+
473+
if f.Size() == 0 {
474+
return nil
475+
}
476+
477+
case <-to:
478+
return errors.Errorf("timeout during waiting for lock file %s", path.Join(dbpath, mongofslock))
467479
}
468480
}
481+
}
469482

470-
return nil
483+
// waitMgoFreePort waits for port p to be free.
484+
// It case of timeout it'll return error.
485+
func waitMgoFreePort(p int) error {
486+
tk := time.NewTicker(time.Second)
487+
defer tk.Stop()
488+
489+
to := time.After(mongodPortTimeout)
490+
491+
for {
492+
select {
493+
case <-tk.C:
494+
ln, err := net.Listen("tcp", ":"+strconv.Itoa(p))
495+
if err == nil {
496+
ln.Close()
497+
return nil
498+
}
499+
500+
case <-to:
501+
return errors.Errorf("timeout during waiting for mongod free port %d", p)
502+
}
503+
}
471504
}
472505

473506
// waitToBecomePrimary pause execution until RS member becomes primary node.
@@ -1450,11 +1483,11 @@ func (r *PhysRestore) prepareData() error {
14501483
}
14511484

14521485
func (r *PhysRestore) shutdown(c *mongo.Client) error {
1453-
err := shutdownImpl(c, r.dbpath, false)
1486+
err := shutdownImpl(c, r.dbpath, false, r.tmpPort)
14541487
if err != nil {
14551488
if strings.Contains(err.Error(), "ConflictingOperationInProgress") {
14561489
r.log.Warning("try force shutdown. reason: %v", err)
1457-
err = shutdownImpl(c, r.dbpath, true)
1490+
err = shutdownImpl(c, r.dbpath, true, r.tmpPort)
14581491
return errors.Wrap(err, "force shutdown mongo")
14591492
}
14601493

@@ -1464,7 +1497,7 @@ func (r *PhysRestore) shutdown(c *mongo.Client) error {
14641497
return nil
14651498
}
14661499

1467-
func shutdownImpl(c *mongo.Client, dbpath string, force bool) error {
1500+
func shutdownImpl(c *mongo.Client, dbpath string, force bool, port int) error {
14681501
res := c.Database("admin").RunCommand(context.TODO(),
14691502
bson.D{{"shutdown", 1}, {"force", force}})
14701503
err := res.Err()
@@ -1474,7 +1507,11 @@ func shutdownImpl(c *mongo.Client, dbpath string, force bool) error {
14741507

14751508
err = waitMgoShutdown(dbpath)
14761509
if err != nil {
1477-
return errors.Wrap(err, "wait")
1510+
return errors.Wrap(err, "wait mongod shutdown")
1511+
}
1512+
err = waitMgoFreePort(port)
1513+
if err != nil {
1514+
return errors.Wrap(err, "wait mongod free port")
14781515
}
14791516

14801517
return nil

pbm/restore/physical_test.go

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,12 @@ package restore
22

33
import (
44
"fmt"
5+
"net"
56
"os"
67
"path/filepath"
78
"strings"
89
"testing"
10+
"time"
911

1012
"github.com/percona/percona-backup-mongodb/pbm/log"
1113
)
@@ -139,3 +141,43 @@ func TestMoveAll(t *testing.T) {
139141
}
140142
})
141143
}
144+
145+
func TestWaitMgoFreePort(t *testing.T) {
146+
t.Run("wait for the port a bit", func(t *testing.T) {
147+
ln, err := net.Listen("tcp", ":0")
148+
if err != nil {
149+
t.Fatalf("failed to bind port: %v", err)
150+
}
151+
port := ln.Addr().(*net.TCPAddr).Port
152+
153+
portUsed := 5 * time.Second
154+
go func() {
155+
time.Sleep(portUsed)
156+
ln.Close()
157+
}()
158+
159+
start := time.Now()
160+
err = waitMgoFreePort(port)
161+
duration := time.Since(start)
162+
163+
if err != nil {
164+
t.Fatalf("error while waiting for the free port: %v", err)
165+
}
166+
if duration < portUsed-time.Second ||
167+
duration > portUsed+time.Second {
168+
t.Fatalf("wrong duration time, want~=%v, got=%v", portUsed, duration)
169+
}
170+
})
171+
172+
t.Run("sequential check", func(t *testing.T) {
173+
var err error
174+
for i := range 10 {
175+
start := time.Now()
176+
err = waitMgoFreePort(8088)
177+
if err != nil {
178+
t.Fatalf("free port err: %v", err)
179+
}
180+
t.Logf("finish %d round, with duration %s", i+1, time.Since(start))
181+
}
182+
})
183+
}

0 commit comments

Comments
 (0)