@@ -34,7 +34,7 @@ type Agent struct {
34
34
35
35
brief topo.NodeBrief
36
36
37
- dumpConns int
37
+ numParallelColls int
38
38
39
39
closeCMD chan struct {}
40
40
pauseHB int32
@@ -44,7 +44,12 @@ type Agent struct {
44
44
monStopSig chan struct {}
45
45
}
46
46
47
- func newAgent (ctx context.Context , leadConn connect.Client , uri string , dumpConns int ) (* Agent , error ) {
47
+ func newAgent (
48
+ ctx context.Context ,
49
+ leadConn connect.Client ,
50
+ uri string ,
51
+ numParallelColls int ,
52
+ ) (* Agent , error ) {
48
53
nodeConn , err := connect .MongoConnect (ctx , uri , connect .Direct (true ))
49
54
if err != nil {
50
55
return nil , err
@@ -72,7 +77,7 @@ func newAgent(ctx context.Context, leadConn connect.Client, uri string, dumpConn
72
77
ConfigSvr : info .IsConfigSrv (),
73
78
Version : mongoVersion ,
74
79
},
75
- dumpConns : dumpConns ,
80
+ numParallelColls : numParallelColls ,
76
81
}
77
82
return a , nil
78
83
}
@@ -104,16 +109,16 @@ func (a *Agent) CanStart(ctx context.Context) error {
104
109
return ErrDelayedNode
105
110
}
106
111
107
- ver , err := version . GetMongoVersion ( ctx , a . leadConn . MongoClient ())
108
- if err != nil {
109
- return errors . Wrap ( err , "get mongo version" )
110
- }
111
- if err := version .FeatureSupport (ver ).PBMSupport (); err != nil {
112
+ return nil
113
+ }
114
+
115
+ func ( a * Agent ) showIncompatibilityWarning ( ctx context. Context ) {
116
+ if err := version .FeatureSupport (a . brief . Version ).PBMSupport (); err != nil {
112
117
log .FromContext (ctx ).
113
118
Warning ("" , "" , "" , primitive.Timestamp {}, "WARNING: %v" , err )
114
119
}
115
120
116
- if ver .IsShardedTimeseriesSupported () {
121
+ if a . brief . Sharded && a . brief . Version .IsShardedTimeseriesSupported () {
117
122
tss , err := topo .ListShardedTimeseries (ctx , a .leadConn )
118
123
if err != nil {
119
124
log .FromContext (ctx ).
@@ -127,7 +132,18 @@ func (a *Agent) CanStart(ctx context.Context) error {
127
132
}
128
133
}
129
134
130
- return nil
135
+ if a .brief .Sharded && a .brief .Version .IsConfigShardSupported () {
136
+ hasConfigShard , err := topo .HasConfigShard (ctx , a .leadConn )
137
+ if err != nil {
138
+ log .FromContext (ctx ).
139
+ Error ("" , "" , "" , primitive.Timestamp {},
140
+ "failed to check for Config Shard: %v" , err )
141
+ } else if hasConfigShard {
142
+ log .FromContext (ctx ).
143
+ Warning ("" , "" , "" , primitive.Timestamp {},
144
+ "WARNING: selective backup and restore is not supported with Config Shard" )
145
+ }
146
+ }
131
147
}
132
148
133
149
// Start starts listening to the commands stream.
@@ -151,7 +167,7 @@ func (a *Agent) Start(ctx context.Context) error {
151
167
return nil
152
168
}
153
169
154
- logger .Printf ("got command %s" , cmd )
170
+ logger .Printf ("got command %s, opid: %s " , cmd , cmd . OPID )
155
171
156
172
ep , err := config .GetEpoch (ctx , a .leadConn )
157
173
if err != nil {
@@ -266,8 +282,17 @@ func (a *Agent) HbStatus(ctx context.Context) {
266
282
MongoVer : nodeVersion .VersionString ,
267
283
PerconaVer : nodeVersion .PSMDBVersion ,
268
284
}
285
+
286
+ updateAgentStat (ctx , a , l , true , & hb )
287
+ err = topo .SetAgentStatus (ctx , a .leadConn , & hb )
288
+ if err != nil {
289
+ l .Error ("set status: %v" , err )
290
+ }
291
+
269
292
defer func () {
270
- if err := topo .RemoveAgentStatus (ctx , a .leadConn , hb ); err != nil {
293
+ l .Debug ("deleting agent status" )
294
+ err := topo .RemoveAgentStatus (context .Background (), a .leadConn , hb )
295
+ if err != nil {
271
296
logger := logger .NewEvent ("agentCheckup" , "" , "" , primitive.Timestamp {})
272
297
logger .Error ("remove agent heartbeat: %v" , err )
273
298
}
@@ -276,74 +301,128 @@ func (a *Agent) HbStatus(ctx context.Context) {
276
301
tk := time .NewTicker (defs .AgentsStatCheckRange )
277
302
defer tk .Stop ()
278
303
304
+ storageCheckTime := time .Now ()
305
+ parallelAgentCheckTime := time .Now ()
306
+
279
307
// check storage once in a while if all is ok (see https://jira.percona.com/browse/PBM-647)
280
- const checkStoreIn = int (60 / (defs .AgentsStatCheckRange / time .Second ))
281
- cc := 0
282
- for range tk .C {
283
- // don't check if on pause (e.g. physical restore)
284
- if ! a .HbIsRun () {
285
- continue
286
- }
308
+ const storageCheckInterval = 15 * time .Second
309
+ const parallelAgentCheckInternval = time .Minute
287
310
288
- hb .PBMStatus = a .pbmStatus (ctx )
289
- logHbStatus ("PBM connection" , hb .PBMStatus , l )
311
+ for {
312
+ select {
313
+ case <- ctx .Done ():
314
+ return
315
+ case <- tk .C :
316
+ // don't check if on pause (e.g. physical restore)
317
+ if ! a .HbIsRun () {
318
+ continue
319
+ }
290
320
291
- hb .NodeStatus = a .nodeStatus (ctx )
292
- logHbStatus ("node connection" , hb .NodeStatus , l )
321
+ now := time .Now ()
322
+ if now .Sub (parallelAgentCheckTime ) >= parallelAgentCheckInternval {
323
+ a .warnIfParallelAgentDetected (ctx , l , hb .Heartbeat )
324
+ parallelAgentCheckTime = now
325
+ }
293
326
294
- cc ++
295
- hb .StorageStatus = a .storStatus (ctx , l , cc == checkStoreIn )
296
- logHbStatus ("storage connection" , hb .StorageStatus , l )
297
- if cc == checkStoreIn {
298
- cc = 0
327
+ if now .Sub (storageCheckTime ) >= storageCheckInterval {
328
+ updateAgentStat (ctx , a , l , true , & hb )
329
+ err = topo .SetAgentStatus (ctx , a .leadConn , & hb )
330
+ if err == nil {
331
+ storageCheckTime = now
332
+ }
333
+ } else {
334
+ updateAgentStat (ctx , a , l , false , & hb )
335
+ err = topo .SetAgentStatus (ctx , a .leadConn , & hb )
336
+ }
337
+ if err != nil {
338
+ l .Error ("set status: %v" , err )
339
+ }
299
340
}
341
+ }
342
+ }
300
343
301
- hb .Err = ""
302
- hb .Hidden = false
303
- hb .Passive = false
344
// updateAgentStat refreshes the heartbeat record hb in place for the given agent.
//
// It stores the results of agent.pbmStatus, agent.nodeStatus and agent.storStatus
// in hb (logging each through logHbStatus), clears hb.Err, hb.Hidden and hb.Passive,
// and then fills in node details from topo.GetNodeInfo, the cluster time from
// topo.ClusterTimeFromNodeInfo, and the member state and replication lag from
// topo.GetNodeStatus / topo.ReplicationLag.
//
// checkStore is forwarded to storStatus as its forceCheckStorage argument.
// Nothing is returned: failures are appended to hb.Err (and, except for the
// cluster-time error, also logged through l). Writing hb to the database is
// left to the caller.
+ func updateAgentStat (
345
+ ctx context.Context ,
346
+ agent * Agent ,
347
+ l log.LogEvent ,
348
+ checkStore bool ,
349
+ hb * topo.AgentStat ,
350
+ ) {
351
+ hb .PBMStatus = agent .pbmStatus (ctx )
352
+ logHbStatus ("PBM connection" , hb .PBMStatus , l )
304
353
305
- inf , err := topo .GetNodeInfo (ctx , a .nodeConn )
306
- if err != nil {
307
- l .Error ("get NodeInfo: %v" , err )
308
- hb .Err += fmt .Sprintf ("get NodeInfo: %v" , err )
354
+ hb .NodeStatus = agent .nodeStatus (ctx )
355
+ logHbStatus ("node connection" , hb .NodeStatus , l )
356
+
// hb itself is handed to storStatus, which consults the previously recorded
// hb.StorageStatus.OK; the assignment overwrites it only after the call returns.
357
+ hb .StorageStatus = agent .storStatus (ctx , l , checkStore , hb )
358
+ logHbStatus ("storage connection" , hb .StorageStatus , l )
359
+
// Start each refresh with a clean error string and default flags; they are
// repopulated below only if GetNodeInfo succeeds.
360
+ hb .Err = ""
361
+ hb .Hidden = false
362
+ hb .Passive = false
363
+
364
+ inf , err := topo .GetNodeInfo (ctx , agent .nodeConn )
365
+ if err != nil {
366
+ l .Error ("get NodeInfo: %v" , err )
367
+ hb .Err += fmt .Sprintf ("get NodeInfo: %v" , err )
368
+ } else {
369
+ hb .Hidden = inf .Hidden
370
+ hb .Passive = inf .Passive
371
+ hb .Arbiter = inf .ArbiterOnly
// A non-zero SecondaryDelayOld takes precedence over SecondaryDelaySecs.
// NOTE(review): presumably the "Old" field is a legacy name for the same
// setting — confirm against topo.NodeInfo.
372
+ if inf .SecondaryDelayOld != 0 {
373
+ hb .DelaySecs = inf .SecondaryDelayOld
309
374
} else {
310
- hb .Hidden = inf .Hidden
311
- hb .Passive = inf .Passive
312
- hb .Arbiter = inf .ArbiterOnly
313
- if inf .SecondaryDelayOld != 0 {
314
- hb .DelaySecs = inf .SecondaryDelayOld
315
- } else {
316
- hb .DelaySecs = inf .SecondaryDelaySecs
317
- }
375
+ hb .DelaySecs = inf .SecondaryDelaySecs
318
376
}
319
377
320
- if inf != nil && inf .ArbiterOnly {
321
- hb .State = defs .NodeStateArbiter
322
- hb .StateStr = "ARBITER"
// The heartbeat value is derived from the node info just fetched; a failure
// here is only recorded in hb.Err, it is not logged.
378
+ hb .Heartbeat , err = topo .ClusterTimeFromNodeInfo (inf )
379
+ if err != nil {
380
+ hb .Err += fmt .Sprintf ("get cluster time: %v" , err )
381
+ }
382
+ }
383
+
// Arbiters get a fixed state and skip the GetNodeStatus / ReplicationLag
// queries entirely. The nil check guards against inf being nil.
384
+ if inf != nil && inf .ArbiterOnly {
385
+ hb .State = defs .NodeStateArbiter
386
+ hb .StateStr = "ARBITER"
387
+ } else {
388
+ n , err := topo .GetNodeStatus (ctx , agent .nodeConn , agent .brief .Me )
389
+ if err != nil {
390
+ l .Error ("get replSetGetStatus: %v" , err )
391
+ hb .Err += fmt .Sprintf ("get replSetGetStatus: %v" , err )
392
+ hb .State = defs .NodeStateUnknown
393
+ hb .StateStr = "UNKNOWN"
323
394
} else {
324
- n , err := topo .GetNodeStatus (ctx , a .nodeConn , a .brief .Me )
325
- if err != nil {
326
- l .Error ("get replSetGetStatus: %v" , err )
327
- hb .Err += fmt .Sprintf ("get replSetGetStatus: %v" , err )
328
- hb .State = defs .NodeStateUnknown
329
- hb .StateStr = "UNKNOWN"
330
- } else {
331
- hb .State = n .State
332
- hb .StateStr = n .StateStr
395
+ hb .State = n .State
396
+ hb .StateStr = n .StateStr
333
397
334
- rLag , err := topo .ReplicationLag (ctx , a .nodeConn , a .brief .Me )
335
- if err != nil {
336
- l .Error ("get replication lag: %v" , err )
337
- hb .Err += fmt .Sprintf ("get replication lag: %v" , err )
338
- }
339
- hb .ReplicationLag = rLag
398
+ rLag , err := topo .ReplicationLag (ctx , agent .nodeConn , agent .brief .Me )
399
+ if err != nil {
400
+ l .Error ("get replication lag: %v" , err )
401
+ hb .Err += fmt .Sprintf ("get replication lag: %v" , err )
340
402
}
// rLag is stored whether or not ReplicationLag reported an error.
403
+ hb .ReplicationLag = rLag
341
404
}
405
+ }
406
+ }
342
407
343
- err = topo .SetAgentStatus (ctx , a .leadConn , hb )
344
- if err != nil {
345
- l .Error ("set status: %v" , err )
408
// warnIfParallelAgentDetected reads this node's agent status back through
// topo.GetAgentStatus (keyed by a.brief.SetName and a.brief.Me on a.leadConn)
// and compares its Heartbeat with lastHeartbeat. The caller in HbStatus passes
// hb.Heartbeat for lastHeartbeat.
//
// A mismatch is reported through l as a warning about a possible parallel agent
// for the same node. The function returns nothing and never fails the caller:
// read errors are logged, or silently ignored for mongo.ErrNoDocuments.
+ func (a * Agent ) warnIfParallelAgentDetected (
409
+ ctx context.Context ,
410
+ l log.LogEvent ,
411
+ lastHeartbeat primitive.Timestamp ,
412
+ ) {
413
+ s , err := topo .GetAgentStatus (ctx , a .leadConn , a .brief .SetName , a .brief .Me )
414
+ if err != nil {
// mongo.ErrNoDocuments means there is nothing to compare against, so return
// without logging anything.
415
+ if errors .Is (err , mongo .ErrNoDocuments ) {
416
+ return
346
417
}
418
+ l .Error ("detecting parallel agent: get status: %v" , err )
419
+ return
420
+ }
// The stored heartbeat differs from the one passed in, so warn and print both
// timestamps as T.I pairs.
421
+ if ! s .Heartbeat .Equal (lastHeartbeat ) {
422
+ l .Warning ("detected possible parallel agent for the node: " +
423
+ "expected last heartbeat to be %d.%d, actual is %d.%d" ,
424
+ lastHeartbeat .T , lastHeartbeat .I , s .Heartbeat .T , s .Heartbeat .I )
425
+ return
347
426
}
348
427
}
349
428
@@ -365,13 +444,14 @@ func (a *Agent) nodeStatus(ctx context.Context) topo.SubsysStatus {
365
444
return topo.SubsysStatus {OK : true }
366
445
}
367
446
368
- func (a * Agent ) storStatus (ctx context.Context , log log.LogEvent , forceCheckStorage bool ) topo.SubsysStatus {
447
+ func (a * Agent ) storStatus (
448
+ ctx context.Context ,
449
+ log log.LogEvent ,
450
+ forceCheckStorage bool ,
451
+ stat * topo.AgentStat ,
452
+ ) topo.SubsysStatus {
369
453
// check storage once in a while if all is ok (see https://jira.percona.com/browse/PBM-647)
370
454
// but if storage was(is) failed, check it always
371
- stat , err := topo .GetAgentStatus (ctx , a .leadConn , a .brief .SetName , a .brief .Me )
372
- if err != nil {
373
- log .Warning ("get current storage status: %v" , err )
374
- }
375
455
if ! forceCheckStorage && stat .StorageStatus .OK {
376
456
return topo.SubsysStatus {OK : true }
377
457
}
0 commit comments