
Commit 081faf4

Merge pull request #1137 from percona/PBM-1550-phys-restore-started-without-conditions
PBM-1550: Physical restore started without conditions
2 parents: cbfe2b9 + fa8015f

2 files changed (+72, −30 lines)


pbm/restore/physical.go

Lines changed: 33 additions & 29 deletions
@@ -210,8 +210,9 @@ func peekTmpPort(current int) (int, error) {
 	return -1, errors.Errorf("can't find unused port in range [%d, %d]", current, current+rng)
 }
 
-// Close releases object resources.
-// Should be run to avoid leaks.
+// close cleans up all temp restore files.
+// Based on the error status it cleans up the node's db path or
+// applies the fallback sync recovery strategy.
 func (r *PhysRestore) close(noerr, cleanup bool) {
 	if r.tmpConf != nil {
 		r.log.Debug("rm tmp conf")
@@ -221,7 +222,8 @@ func (r *PhysRestore) close(noerr, cleanup bool) {
 		}
 	}
 
-	// if there is no error clean-up internal restore files, internal log file(s) should stay
+	// if there is no error, clean up internal restore files;
+	// internal log file(s) should stay in any case
 	if noerr {
 		extMeta := filepath.Join(r.dbpath,
 			fmt.Sprintf(defs.ExternalRsMetaFile, util.MakeReverseRSMapFunc(r.rsMap)(r.nodeInfo.SetName)))
@@ -240,24 +242,24 @@ func (r *PhysRestore) close(noerr, cleanup bool) {
 	if err != nil {
 		r.log.Warning("waiting for cluster status during cleanup: %v", err)
 	}
-	if cStatus == defs.StatusError {
+	if cStatus == defs.StatusError && cleanup {
 		r.log.Warning("apply db data from %s", fallbackDir)
 		err := r.migrateFromFallbackDirToDBDir()
 		if err != nil {
 			r.log.Error("migrate from fallback dir: %v", err)
 		}
-	} else if cleanup { // clean-up dbpath on err if needed (cluster is done or partlyDone)
+	} else if (cStatus == defs.StatusDone || cStatus == defs.StatusPartlyDone) && cleanup {
 		r.log.Debug("clean-up dbpath")
 		err := removeAll(r.dbpath, r.log, getInternalLogFileSkipRule())
 		if err != nil {
 			r.log.Error("flush dbpath %s: %v", r.dbpath, err)
 		}
-	} else { // free space by just deleting fallback dir in any other case
-		r.log.Debug("remove fallback dir")
-		err := r.removeFallback()
-		if err != nil {
-			r.log.Error("flush fallback: %v", err)
-		}
+	}
+
+	// free space by just deleting fallback dir in any other case
+	err = r.removeFallback()
+	if err != nil {
+		r.log.Error("flush fallback: %v", err)
 	}
 
 	if r.stopHB != nil {
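
The rewritten branch logic reads most easily as a decision table, derived directly from the hunk above (cStatus comes from the cluster-status wait, cleanup from the caller):

  cStatus                    cleanup   action on dbpath
  StatusError                true      migrate db data back from the fallback dir
  StatusDone / PartlyDone    true      flush dbpath (internal log files are kept)
  any                        false     leave dbpath untouched
  afterwards, always         —         remove the fallback dir to free space
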
@@ -440,6 +442,7 @@ func (r *PhysRestore) moveToFallback() error {
 
 // removeFallback removes fallback dir
 func (r *PhysRestore) removeFallback() error {
+	r.log.Debug("remove fallback dir")
 	dbpath := filepath.Clean(r.dbpath)
 	fallbackPath := filepath.Join(dbpath, fallbackDir)
 	err := os.RemoveAll(fallbackPath)
@@ -853,7 +856,20 @@ const (
 	restoreDone
 )
 
-func (n nodeStatus) is(s nodeStatus) bool { return n&s != 0 }
+// isForCleanup returns true when the internal node state calls for
+// any sort of clean-up: full cleanup or fallback sync.
+// The status indicates that the content of the db path is in a
+// state which is not correct, and therefore it needs to be
+// discarded.
+func (n nodeStatus) isForCleanup() bool {
+	return n&restoreStared != 0 && n&restoreDone == 0
+}
+
+// isFailed returns true when the internal node state didn't reach
+// the done state, no matter whether it was started or not.
+func (n nodeStatus) isFailed() bool {
+	return n&restoreDone == 0
+}
 
 // log buffer that will dump content to the storage on restore
 // finish (whether it's successful or not). It also dumps content
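
A self-contained sketch of the bitmask semantics follows; the method bodies are copied from the diff, while the 1 << iota encoding of the flags is an assumption inferred from their bitwise use (and the spelling restoreStared matches the source):

package main

import "fmt"

type nodeStatus int

const (
	restoreStared nodeStatus = 1 << iota // restore passed the point of no return
	restoreDone                          // local restore finished successfully
)

func (n nodeStatus) isForCleanup() bool { return n&restoreStared != 0 && n&restoreDone == 0 }
func (n nodeStatus) isFailed() bool     { return n&restoreDone == 0 }

func main() {
	var progress nodeStatus // initial phase: nothing written to dbpath yet
	fmt.Println(progress.isForCleanup(), progress.isFailed()) // false true

	progress |= restoreStared // started: dbpath may hold partial, invalid state
	fmt.Println(progress.isForCleanup(), progress.isFailed()) // true true

	progress |= restoreDone // done: nothing to discard, node did not fail
	fmt.Println(progress.isForCleanup(), progress.isFailed()) // false false
}
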
@@ -958,11 +974,12 @@ func (r *PhysRestore) Snapshot(
 	defer func() {
 		// set failed status of node on error, but
 		// don't mark node as failed after the local restore succeed
-		if err != nil && !progress.is(restoreDone) && !errors.Is(err, ErrNoDataForShard) {
-			r.MarkFailed(meta, err)
+		restoreFailed := progress.isFailed()
+		if err != nil && !errors.Is(err, ErrNoDataForShard) && restoreFailed {
+			r.MarkFailed(err)
 		}
 
-		r.close(err == nil, progress.is(restoreStared) && !progress.is(restoreDone))
+		r.close(err == nil, progress.isForCleanup())
 	}()
 
 	err = r.init(ctx, cmd.Name, opid, l)
@@ -2525,20 +2542,7 @@ func (r *PhysRestore) checkMongod(needVersion string) (version string, err error
 }
 
 // MarkFailed sets the restore and rs state as failed with the given message
-func (r *PhysRestore) MarkFailed(meta *RestoreMeta, e error) {
-	var nerr nodeError
-	if errors.As(e, &nerr) {
-		e = nerr
-		meta.Replsets = []RestoreReplset{{
-			Name:   nerr.node,
-			Status: defs.StatusError,
-			Error:  nerr.msg,
-		}}
-	} else if len(meta.Replsets) > 0 {
-		meta.Replsets[0].Status = defs.StatusError
-		meta.Replsets[0].Error = e.Error()
-	}
-
+func (r *PhysRestore) MarkFailed(e error) {
 	err := util.RetryableWrite(r.stg,
 		r.syncPathNode+"."+string(defs.StatusError), errStatus(e))
 	if err != nil {
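
With the replset-metadata handling removed, MarkFailed's only remaining job is the retryable write of errStatus(e) to the node's sync path, which is why the Snapshot hunk above drops the meta argument at the call site:

	r.MarkFailed(meta, err) // before: also rewrote meta.Replsets locally
	r.MarkFailed(err)       // after: only persists the node-level error status
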

pbm/restore/physical_test.go

Lines changed: 39 additions & 1 deletion
@@ -13,6 +13,44 @@ import (
 	"github.com/percona/percona-backup-mongodb/pbm/log"
 )
 
+func TestNodeStatus(t *testing.T) {
+	t.Run("isForCleanup", func(t *testing.T) {
+		var progress nodeStatus
+
+		if progress.isForCleanup() {
+			t.Errorf("in initial progress phase, node is not for the cleanup")
+		}
+
+		progress |= restoreStared
+		if !progress.isForCleanup() {
+			t.Errorf("in point of no return phase cleanup should be done")
+		}
+
+		progress |= restoreDone
+		if progress.isForCleanup() {
+			t.Errorf("in done phase, node is not for the cleanup")
+		}
+	})
+
+	t.Run("isFailed", func(t *testing.T) {
+		var progress nodeStatus
+
+		if !progress.isFailed() {
+			t.Errorf("node is in initial progress phase, so it should be marked as failed")
+		}
+
+		progress |= restoreStared
+		if !progress.isFailed() {
+			t.Errorf("node is in started phase, so it should be marked as failed")
+		}
+
+		progress |= restoreDone
+		if progress.isFailed() {
+			t.Errorf("in done phase, node shouldn't be marked as failed")
+		}
+	})
+}
+
 func TestMoveAll(t *testing.T) {
 	t.Run("move all files and dir", func(t *testing.T) {
 		tempSrc, _ := os.MkdirTemp("", "src")
@@ -165,7 +203,7 @@ func TestWaitMgoFreePort(t *testing.T) {
 			t.Fatalf("error while waiting for the free port: %v", err)
 		}
 		if duration < portUsed-time.Second ||
-			duration > portUsed+time.Second {
+			duration > portUsed+2*time.Second {
 			t.Fatalf("wrong duration time, want~=%v, got=%v", portUsed, duration)
 		}
 	})
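
Assuming the standard repository layout (both files live under pbm/restore/), the new subtests can be run in isolation with the usual Go tooling:

	go test ./pbm/restore -run TestNodeStatus -v
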
