4
4
"context"
5
5
"time"
6
6
7
+ "golang.org/x/sync/errgroup"
8
+
7
9
"github.com/percona/percona-backup-mongodb/internal/backup"
8
10
"github.com/percona/percona-backup-mongodb/internal/config"
9
11
"github.com/percona/percona-backup-mongodb/internal/ctrl"
@@ -17,36 +19,27 @@ import (
17
19
)
18
20
19
21
type currentBackup struct {
20
- header * ctrl.BackupCmd
21
22
cancel context.CancelFunc
22
23
}
23
24
24
- func (a * Agent ) setBcp (b * currentBackup ) bool {
25
+ func (a * Agent ) setBcp (b * currentBackup ) {
25
26
a .mx .Lock ()
26
27
defer a .mx .Unlock ()
27
- if a .bcp != nil {
28
- return false
29
- }
30
28
31
29
a .bcp = b
32
- return true
33
- }
34
-
35
- func (a * Agent ) unsetBcp () {
36
- a .mx .Lock ()
37
- a .bcp = nil
38
- a .mx .Unlock ()
39
30
}
40
31
41
32
// CancelBackup cancels current backup
42
33
func (a * Agent ) CancelBackup () {
43
34
a .mx .Lock ()
44
35
defer a .mx .Unlock ()
36
+
45
37
if a .bcp == nil {
46
38
return
47
39
}
48
40
49
41
a .bcp .cancel ()
42
+ a .bcp = nil
50
43
}
51
44
52
45
// Backup starts backup
@@ -67,6 +60,7 @@ func (a *Agent) Backup(ctx context.Context, cmd *ctrl.BackupCmd, opid ctrl.OPID,
67
60
l .Error ("get node info: %v" , err )
68
61
return
69
62
}
63
+ // TODO: do the check on the agent start only
70
64
if nodeInfo .IsStandalone () {
71
65
l .Error ("mongod node can not be used to fetch a consistent backup because it has no oplog. " +
72
66
"Please restart it as a primary in a single-node replicaset " +
@@ -78,7 +72,7 @@ func (a *Agent) Backup(ctx context.Context, cmd *ctrl.BackupCmd, opid ctrl.OPID,
78
72
canRunBackup , err := topo .NodeSuitsExt (ctx , a .nodeConn , nodeInfo , cmd .Type )
79
73
if err != nil {
80
74
l .Error ("node check: %v" , err )
81
- if ! isClusterLeader {
75
+ if errors . Is ( err , context . Canceled ) || ! isClusterLeader {
82
76
return
83
77
}
84
78
}
@@ -89,10 +83,8 @@ func (a *Agent) Backup(ctx context.Context, cmd *ctrl.BackupCmd, opid ctrl.OPID,
89
83
}
90
84
}
91
85
92
- // wakeup the slicer not to wait for the tick
93
- if p := a .getPitr (); p != nil {
94
- p .w <- & opid
95
- }
86
+ // wakeup the slicer to not wait for the tick
87
+ go a .sliceNow (opid )
96
88
97
89
var bcp * backup.Backup
98
90
switch cmd .Type {
@@ -117,6 +109,8 @@ func (a *Agent) Backup(ctx context.Context, cmd *ctrl.BackupCmd, opid ctrl.OPID,
117
109
l .Error ("backups cannot be saved because PBM storage configuration hasn't been set yet" )
118
110
return
119
111
}
112
+
113
+ bcp .SetSlicerInterval (cfg .BackupSlicerInterval ())
120
114
bcp .SetTimeouts (cfg .Backup .Timeouts )
121
115
122
116
if isClusterLeader {
@@ -187,21 +181,28 @@ func (a *Agent) Backup(ctx context.Context, cmd *ctrl.BackupCmd, opid ctrl.OPID,
187
181
l .Error ("get cluster members: %v" , err )
188
182
return
189
183
}
190
- for _ , sh := range shards {
191
- go func (rs string ) {
192
- err := a .nominateRS (ctx , cmd .Name , rs , nodes .RS (rs ), l )
193
- if err != nil {
194
- l .Error ("nodes nomination for %s: %v" , rs , err )
195
- }
196
- }(sh .RS )
184
+
185
+ errGrp , grpCtx := errgroup .WithContext (ctx )
186
+ for i := range shards {
187
+ rs := shards [i ].RS
188
+
189
+ errGrp .Go (func () error {
190
+ err := a .nominateRS (grpCtx , cmd .Name , rs , nodes .RS (rs ))
191
+ return errors .Wrapf (err , "nodes nomination for %s" , rs )
192
+ })
193
+ }
194
+
195
+ err = errGrp .Wait ()
196
+ if err != nil {
197
+ l .Error (err .Error ())
198
+ return
197
199
}
198
200
}
199
201
200
- nominated , err := a .waitNomination (ctx , cmd .Name , nodeInfo .SetName , nodeInfo .Me , l )
202
+ nominated , err := a .waitNomination (ctx , cmd .Name , nodeInfo .SetName , nodeInfo .Me )
201
203
if err != nil {
202
204
l .Error ("wait for nomination: %v" , err )
203
205
}
204
-
205
206
if ! nominated {
206
207
l .Debug ("skip after nomination, probably started by another node" )
207
208
return
@@ -216,13 +217,7 @@ func (a *Agent) Backup(ctx context.Context, cmd *ctrl.BackupCmd, opid ctrl.OPID,
216
217
Epoch : & epoch ,
217
218
})
218
219
219
- // install a backup lock despite having PITR one
220
- got , err := a .acquireLock (ctx , lck , l , func (ctx context.Context ) (bool , error ) {
221
- return lck .Rewrite (ctx , & lock.LockHeader {
222
- Replset : a .brief .SetName ,
223
- Type : ctrl .CmdPITR ,
224
- })
225
- })
220
+ got , err := a .acquireLock (ctx , lck , l )
226
221
if err != nil {
227
222
l .Error ("acquiring lock: %v" , err )
228
223
return
@@ -231,20 +226,27 @@ func (a *Agent) Backup(ctx context.Context, cmd *ctrl.BackupCmd, opid ctrl.OPID,
231
226
l .Debug ("skip: lock not acquired" )
232
227
return
233
228
}
229
+ defer func () {
230
+ l .Debug ("releasing lock" )
231
+ err = lck .Release ()
232
+ if err != nil {
233
+ l .Error ("unable to release backup lock %v: %v" , lck , err )
234
+ }
235
+ }()
234
236
235
237
err = backup .SetRSNomineeACK (ctx , a .leadConn , cmd .Name , nodeInfo .SetName , nodeInfo .Me )
236
238
if err != nil {
237
239
l .Warning ("set nominee ack: %v" , err )
238
240
}
239
241
240
242
bcpCtx , cancel := context .WithCancel (ctx )
241
- a .setBcp (& currentBackup {
242
- header : cmd ,
243
- cancel : cancel ,
244
- })
243
+ defer cancel ()
244
+
245
+ a .setBcp (& currentBackup {cancel : cancel })
246
+ defer a .setBcp (nil )
247
+
245
248
l .Info ("backup started" )
246
249
err = bcp .Run (bcpCtx , cmd , opid , l )
247
- a .unsetBcp ()
248
250
if err != nil {
249
251
if errors .Is (err , storage .ErrCancelled ) || errors .Is (err , context .Canceled ) {
250
252
l .Info ("backup was canceled" )
@@ -254,18 +256,14 @@ func (a *Agent) Backup(ctx context.Context, cmd *ctrl.BackupCmd, opid ctrl.OPID,
254
256
} else {
255
257
l .Info ("backup finished" )
256
258
}
257
-
258
- l .Debug ("releasing lock" )
259
- err = lck .Release ()
260
- if err != nil {
261
- l .Error ("unable to release backup lock %v: %v" , lck , err )
262
- }
263
259
}
264
260
265
261
const renominationFrame = 5 * time .Second
266
262
267
- func (a * Agent ) nominateRS (ctx context.Context , bcp , rs string , nodes [][]string , l log.LogEvent ) error {
263
+ func (a * Agent ) nominateRS (ctx context.Context , bcp , rs string , nodes [][]string ) error {
264
+ l := log .LogEventFromContext (ctx )
268
265
l .Debug ("nomination list for %s: %v" , rs , nodes )
266
+
269
267
err := backup .SetRSNomination (ctx , a .leadConn , bcp , rs )
270
268
if err != nil {
271
269
return errors .Wrap (err , "set nomination meta" )
@@ -298,7 +296,9 @@ func (a *Agent) nominateRS(ctx context.Context, bcp, rs string, nodes [][]string
298
296
return nil
299
297
}
300
298
301
- func (a * Agent ) waitNomination (ctx context.Context , bcp , rs , node string , l log.LogEvent ) (bool , error ) {
299
+ func (a * Agent ) waitNomination (ctx context.Context , bcp , rs , node string ) (bool , error ) {
300
+ l := log .LogEventFromContext (ctx )
301
+
302
302
tk := time .NewTicker (time .Millisecond * 500 )
303
303
defer tk .Stop ()
304
304
0 commit comments