
Commit 94ad374

Merge pull request #1031 from percona/release-2.7.0
Release 2.7.0

2 parents: f2cf881 + 957ac50

55 files changed: +1070, -601 lines

cmd/pbm-agent/agent.go

Lines changed: 149 additions & 69 deletions
```diff
@@ -34,7 +34,7 @@ type Agent struct {
 
 	brief topo.NodeBrief
 
-	dumpConns int
+	numParallelColls int
 
 	closeCMD chan struct{}
 	pauseHB  int32
@@ -44,7 +44,12 @@ type Agent struct {
 	monStopSig chan struct{}
 }
 
-func newAgent(ctx context.Context, leadConn connect.Client, uri string, dumpConns int) (*Agent, error) {
+func newAgent(
+	ctx context.Context,
+	leadConn connect.Client,
+	uri string,
+	numParallelColls int,
+) (*Agent, error) {
 	nodeConn, err := connect.MongoConnect(ctx, uri, connect.Direct(true))
 	if err != nil {
 		return nil, err
@@ -72,7 +77,7 @@ func newAgent(ctx context.Context, leadConn connect.Client, uri string, dumpConn
 			ConfigSvr: info.IsConfigSrv(),
 			Version:   mongoVersion,
 		},
-		dumpConns: dumpConns,
+		numParallelColls: numParallelColls,
 	}
 	return a, nil
 }
@@ -104,16 +109,16 @@ func (a *Agent) CanStart(ctx context.Context) error {
 		return ErrDelayedNode
 	}
 
-	ver, err := version.GetMongoVersion(ctx, a.leadConn.MongoClient())
-	if err != nil {
-		return errors.Wrap(err, "get mongo version")
-	}
-	if err := version.FeatureSupport(ver).PBMSupport(); err != nil {
+	return nil
+}
+
+func (a *Agent) showIncompatibilityWarning(ctx context.Context) {
+	if err := version.FeatureSupport(a.brief.Version).PBMSupport(); err != nil {
 		log.FromContext(ctx).
 			Warning("", "", "", primitive.Timestamp{}, "WARNING: %v", err)
 	}
 
-	if ver.IsShardedTimeseriesSupported() {
+	if a.brief.Sharded && a.brief.Version.IsShardedTimeseriesSupported() {
 		tss, err := topo.ListShardedTimeseries(ctx, a.leadConn)
 		if err != nil {
 			log.FromContext(ctx).
@@ -127,7 +132,18 @@ func (a *Agent) CanStart(ctx context.Context) error {
 		}
 	}
 
-	return nil
+	if a.brief.Sharded && a.brief.Version.IsConfigShardSupported() {
+		hasConfigShard, err := topo.HasConfigShard(ctx, a.leadConn)
+		if err != nil {
+			log.FromContext(ctx).
+				Error("", "", "", primitive.Timestamp{},
+					"failed to check for Config Shard: %v", err)
+		} else if hasConfigShard {
+			log.FromContext(ctx).
+				Warning("", "", "", primitive.Timestamp{},
+					"WARNING: selective backup and restore is not supported with Config Shard")
+		}
+	}
 }
 
 // Start starts listening the commands stream.
@@ -151,7 +167,7 @@ func (a *Agent) Start(ctx context.Context) error {
 			return nil
 		}
 
-		logger.Printf("got command %s", cmd)
+		logger.Printf("got command %s, opid: %s", cmd, cmd.OPID)
 
 		ep, err := config.GetEpoch(ctx, a.leadConn)
 		if err != nil {
@@ -266,8 +282,17 @@ func (a *Agent) HbStatus(ctx context.Context) {
 		MongoVer:   nodeVersion.VersionString,
 		PerconaVer: nodeVersion.PSMDBVersion,
 	}
+
+	updateAgentStat(ctx, a, l, true, &hb)
+	err = topo.SetAgentStatus(ctx, a.leadConn, &hb)
+	if err != nil {
+		l.Error("set status: %v", err)
+	}
+
 	defer func() {
-		if err := topo.RemoveAgentStatus(ctx, a.leadConn, hb); err != nil {
+		l.Debug("deleting agent status")
+		err := topo.RemoveAgentStatus(context.Background(), a.leadConn, hb)
+		if err != nil {
 			logger := logger.NewEvent("agentCheckup", "", "", primitive.Timestamp{})
 			logger.Error("remove agent heartbeat: %v", err)
 		}
@@ -276,74 +301,128 @@ func (a *Agent) HbStatus(ctx context.Context) {
 	tk := time.NewTicker(defs.AgentsStatCheckRange)
 	defer tk.Stop()
 
+	storageCheckTime := time.Now()
+	parallelAgentCheckTime := time.Now()
+
 	// check storage once in a while if all is ok (see https://jira.percona.com/browse/PBM-647)
-	const checkStoreIn = int(60 / (defs.AgentsStatCheckRange / time.Second))
-	cc := 0
-	for range tk.C {
-		// don't check if on pause (e.g. physical restore)
-		if !a.HbIsRun() {
-			continue
-		}
+	const storageCheckInterval = 15 * time.Second
+	const parallelAgentCheckInternval = time.Minute
 
-		hb.PBMStatus = a.pbmStatus(ctx)
-		logHbStatus("PBM connection", hb.PBMStatus, l)
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-tk.C:
+			// don't check if on pause (e.g. physical restore)
+			if !a.HbIsRun() {
+				continue
+			}
 
-		hb.NodeStatus = a.nodeStatus(ctx)
-		logHbStatus("node connection", hb.NodeStatus, l)
+			now := time.Now()
+			if now.Sub(parallelAgentCheckTime) >= parallelAgentCheckInternval {
+				a.warnIfParallelAgentDetected(ctx, l, hb.Heartbeat)
+				parallelAgentCheckTime = now
+			}
 
-		cc++
-		hb.StorageStatus = a.storStatus(ctx, l, cc == checkStoreIn)
-		logHbStatus("storage connection", hb.StorageStatus, l)
-		if cc == checkStoreIn {
-			cc = 0
+			if now.Sub(storageCheckTime) >= storageCheckInterval {
+				updateAgentStat(ctx, a, l, true, &hb)
+				err = topo.SetAgentStatus(ctx, a.leadConn, &hb)
+				if err == nil {
+					storageCheckTime = now
+				}
+			} else {
+				updateAgentStat(ctx, a, l, false, &hb)
+				err = topo.SetAgentStatus(ctx, a.leadConn, &hb)
+			}
+			if err != nil {
+				l.Error("set status: %v", err)
+			}
 		}
+	}
+}
 
-		hb.Err = ""
-		hb.Hidden = false
-		hb.Passive = false
+func updateAgentStat(
+	ctx context.Context,
+	agent *Agent,
+	l log.LogEvent,
+	checkStore bool,
+	hb *topo.AgentStat,
+) {
+	hb.PBMStatus = agent.pbmStatus(ctx)
+	logHbStatus("PBM connection", hb.PBMStatus, l)
 
-		inf, err := topo.GetNodeInfo(ctx, a.nodeConn)
-		if err != nil {
-			l.Error("get NodeInfo: %v", err)
-			hb.Err += fmt.Sprintf("get NodeInfo: %v", err)
+	hb.NodeStatus = agent.nodeStatus(ctx)
+	logHbStatus("node connection", hb.NodeStatus, l)
+
+	hb.StorageStatus = agent.storStatus(ctx, l, checkStore, hb)
+	logHbStatus("storage connection", hb.StorageStatus, l)
+
+	hb.Err = ""
+	hb.Hidden = false
+	hb.Passive = false
+
+	inf, err := topo.GetNodeInfo(ctx, agent.nodeConn)
+	if err != nil {
+		l.Error("get NodeInfo: %v", err)
+		hb.Err += fmt.Sprintf("get NodeInfo: %v", err)
+	} else {
+		hb.Hidden = inf.Hidden
+		hb.Passive = inf.Passive
+		hb.Arbiter = inf.ArbiterOnly
+		if inf.SecondaryDelayOld != 0 {
+			hb.DelaySecs = inf.SecondaryDelayOld
		} else {
-			hb.Hidden = inf.Hidden
-			hb.Passive = inf.Passive
-			hb.Arbiter = inf.ArbiterOnly
-			if inf.SecondaryDelayOld != 0 {
-				hb.DelaySecs = inf.SecondaryDelayOld
-			} else {
-				hb.DelaySecs = inf.SecondaryDelaySecs
-			}
+			hb.DelaySecs = inf.SecondaryDelaySecs
 		}
 
-		if inf != nil && inf.ArbiterOnly {
-			hb.State = defs.NodeStateArbiter
-			hb.StateStr = "ARBITER"
+		hb.Heartbeat, err = topo.ClusterTimeFromNodeInfo(inf)
+		if err != nil {
+			hb.Err += fmt.Sprintf("get cluster time: %v", err)
+		}
+	}
+
+	if inf != nil && inf.ArbiterOnly {
+		hb.State = defs.NodeStateArbiter
+		hb.StateStr = "ARBITER"
+	} else {
+		n, err := topo.GetNodeStatus(ctx, agent.nodeConn, agent.brief.Me)
+		if err != nil {
+			l.Error("get replSetGetStatus: %v", err)
+			hb.Err += fmt.Sprintf("get replSetGetStatus: %v", err)
+			hb.State = defs.NodeStateUnknown
+			hb.StateStr = "UNKNOWN"
		} else {
-			n, err := topo.GetNodeStatus(ctx, a.nodeConn, a.brief.Me)
-			if err != nil {
-				l.Error("get replSetGetStatus: %v", err)
-				hb.Err += fmt.Sprintf("get replSetGetStatus: %v", err)
-				hb.State = defs.NodeStateUnknown
-				hb.StateStr = "UNKNOWN"
-			} else {
-				hb.State = n.State
-				hb.StateStr = n.StateStr
+			hb.State = n.State
+			hb.StateStr = n.StateStr
 
-				rLag, err := topo.ReplicationLag(ctx, a.nodeConn, a.brief.Me)
-				if err != nil {
-					l.Error("get replication lag: %v", err)
-					hb.Err += fmt.Sprintf("get replication lag: %v", err)
-				}
-				hb.ReplicationLag = rLag
+			rLag, err := topo.ReplicationLag(ctx, agent.nodeConn, agent.brief.Me)
+			if err != nil {
+				l.Error("get replication lag: %v", err)
+				hb.Err += fmt.Sprintf("get replication lag: %v", err)
			}
+			hb.ReplicationLag = rLag
 		}
+	}
+}
 
-		err = topo.SetAgentStatus(ctx, a.leadConn, hb)
-		if err != nil {
-			l.Error("set status: %v", err)
+func (a *Agent) warnIfParallelAgentDetected(
+	ctx context.Context,
+	l log.LogEvent,
+	lastHeartbeat primitive.Timestamp,
+) {
+	s, err := topo.GetAgentStatus(ctx, a.leadConn, a.brief.SetName, a.brief.Me)
+	if err != nil {
+		if errors.Is(err, mongo.ErrNoDocuments) {
+			return
 		}
+		l.Error("detecting parallel agent: get status: %v", err)
+		return
+	}
+	if !s.Heartbeat.Equal(lastHeartbeat) {
+		l.Warning("detected possible parallel agent for the node: "+
+			"expected last heartbeat to be %d.%d, actual is %d.%d",
+			lastHeartbeat.T, lastHeartbeat.I, s.Heartbeat.T, s.Heartbeat.I)
+		return
 	}
 }
 
@@ -365,13 +444,14 @@ func (a *Agent) nodeStatus(ctx context.Context) topo.SubsysStatus {
 	return topo.SubsysStatus{OK: true}
 }
 
-func (a *Agent) storStatus(ctx context.Context, log log.LogEvent, forceCheckStorage bool) topo.SubsysStatus {
+func (a *Agent) storStatus(
+	ctx context.Context,
+	log log.LogEvent,
+	forceCheckStorage bool,
+	stat *topo.AgentStat,
+) topo.SubsysStatus {
 	// check storage once in a while if all is ok (see https://jira.percona.com/browse/PBM-647)
 	// but if storage was(is) failed, check it always
-	stat, err := topo.GetAgentStatus(ctx, a.leadConn, a.brief.SetName, a.brief.Me)
-	if err != nil {
-		log.Warning("get current storage status: %v", err)
-	}
 	if !forceCheckStorage && stat.StorageStatus.OK {
 		return topo.SubsysStatus{OK: true}
 	}
```
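
The heartbeat loop above replaces the old tick counter with wall-clock gating: an agent status is written on every tick, but the storage probe is forced only every 15 seconds and the parallel-agent check runs only once per minute, and a failed forced probe is retried on the next tick. Below is a minimal, self-contained sketch of that gating pattern, assuming hypothetical stand-ins (`sendHeartbeat`, `warnIfParallelAgent`) for the real agent methods.

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// heartbeatLoop demonstrates interval gating inside a single ticker loop:
// a status is sent on every tick, a forced storage probe only when
// storageCheckInterval has elapsed, and the parallel-agent warning only
// once per parallelAgentCheckInterval.
func heartbeatLoop(ctx context.Context) {
	const tickEvery = 5 * time.Second
	const storageCheckInterval = 15 * time.Second
	const parallelAgentCheckInterval = time.Minute

	tk := time.NewTicker(tickEvery)
	defer tk.Stop()

	storageCheckTime := time.Now()
	parallelAgentCheckTime := time.Now()

	for {
		select {
		case <-ctx.Done():
			return
		case <-tk.C:
			now := time.Now()

			if now.Sub(parallelAgentCheckTime) >= parallelAgentCheckInterval {
				warnIfParallelAgent() // hypothetical stand-in
				parallelAgentCheckTime = now
			}

			// Force the storage probe only when its interval has elapsed;
			// advance the timestamp only if the status write succeeded,
			// so a failed probe is retried on the next tick.
			force := now.Sub(storageCheckTime) >= storageCheckInterval
			if err := sendHeartbeat(force); err == nil && force {
				storageCheckTime = now
			}
		}
	}
}

func warnIfParallelAgent() { fmt.Println("checking for a parallel agent") }

func sendHeartbeat(withStorageProbe bool) error {
	fmt.Println("heartbeat sent, storage probed:", withStorageProbe)
	return nil
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
	defer cancel()
	heartbeatLoop(ctx)
}
```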

cmd/pbm-agent/backup.go

Lines changed: 5 additions & 1 deletion
```diff
@@ -114,7 +114,11 @@ func (a *Agent) Backup(ctx context.Context, cmd *ctrl.BackupCmd, opid ctrl.OPID,
 	case defs.LogicalBackup:
 		fallthrough
 	default:
-		bcp = backup.New(a.leadConn, a.nodeConn, a.brief, a.dumpConns)
+		numParallelColls := a.numParallelColls
+		if cfg.Backup != nil && cfg.Backup.NumParallelCollections > 0 {
+			numParallelColls = cfg.Backup.NumParallelCollections
+		}
+		bcp = backup.New(a.leadConn, a.nodeConn, a.brief, numParallelColls)
 	}
 
 	bcp.SetConfig(cfg)
```
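
For logical backups, the number of collections dumped in parallel is now taken from the stored PBM config when set, falling back to the agent-level value that comes from `--dump-parallel-collections` (NumCPU/2 by default). A small sketch of that precedence, using simplified stand-in config types rather than the real PBM structs:

```go
package main

import (
	"fmt"
	"runtime"
)

// Simplified stand-ins for the relevant parts of the PBM config.
type BackupConf struct {
	NumParallelCollections int
}

type Config struct {
	Backup *BackupConf
}

// resolveNumParallelColls applies the precedence introduced in backup.go:
// a positive value from the stored config wins, otherwise the agent-level
// default (from --dump-parallel-collections) is used.
func resolveNumParallelColls(agentDefault int, cfg *Config) int {
	if cfg != nil && cfg.Backup != nil && cfg.Backup.NumParallelCollections > 0 {
		return cfg.Backup.NumParallelCollections
	}
	return agentDefault
}

func main() {
	agentDefault := runtime.NumCPU() / 2 // matches the flag's default

	fmt.Println(resolveNumParallelColls(agentDefault, &Config{}))                       // agent default
	fmt.Println(resolveNumParallelColls(agentDefault, &Config{Backup: &BackupConf{4}})) // 4, from config
}
```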

cmd/pbm-agent/delete.go

Lines changed: 1 addition & 1 deletion
```diff
@@ -290,7 +290,7 @@ func (a *Agent) Cleanup(ctx context.Context, d *ctrl.CleanupCmd, opid ctrl.OPID,
 		bcp := &cr.Backups[i]
 
 		eg.Go(func() error {
-			err := backup.DeleteBackupFiles(bcp, stg)
+			err := backup.DeleteBackupFiles(stg, bcp.Name)
 			return errors.Wrapf(err, "delete backup files %q", bcp.Name)
 		})
 	}
```

cmd/pbm-agent/main.go

Lines changed: 6 additions & 2 deletions
```diff
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	stdlog "log"
 	"os"
+	"os/signal"
 	"runtime"
 	"strconv"
 	"strings"
@@ -32,7 +33,8 @@ func main() {
 		Envar("PBM_MONGODB_URI").
 		Required().
 		String()
-	dumpConns = pbmAgentCmd.Flag("dump-parallel-collections", "Number of collections to dump in parallel").
+	dumpConns = pbmAgentCmd.
+		Flag("dump-parallel-collections", "Number of collections to dump in parallel").
 		Envar("PBM_DUMP_PARALLEL_COLLECTIONS").
 		Default(strconv.Itoa(runtime.NumCPU() / 2)).
 		Int()
@@ -85,7 +87,7 @@ func runAgent(mongoURI string, dumpConns int) error {
 	mtLog.SetDateFormat(log.LogTimeFormat)
 	mtLog.SetVerbosity(&options.Verbosity{VLevel: mtLog.DebugLow})
 
-	ctx, cancel := context.WithCancel(context.Background())
+	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, os.Kill)
 	defer cancel()
 
 	leadConn, err := connect.Connect(ctx, mongoURI, "pbm-agent")
@@ -117,6 +119,8 @@ func runAgent(mongoURI string, dumpConns int) error {
 		return errors.Wrap(err, "setup pbm collections")
 	}
 
+	agent.showIncompatibilityWarning(ctx)
+
 	if canRunSlicer {
 		go agent.PITR(ctx)
 	}
```
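
The root context in `runAgent` is now derived from `signal.NotifyContext`, so an interrupt cancels the context and lets the long-running loops (heartbeat, PITR, the command stream) exit through their `<-ctx.Done()` branches instead of being cut off abruptly. A minimal sketch of the pattern; `os.Kill` is listed only to mirror the diff, since SIGKILL itself cannot actually be intercepted:

```go
package main

import (
	"context"
	"fmt"
	"os"
	"os/signal"
	"time"
)

func main() {
	// The root context is cancelled when an interrupt signal arrives.
	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, os.Kill)
	defer cancel()

	tk := time.NewTicker(time.Second)
	defer tk.Stop()

	for {
		select {
		case <-ctx.Done():
			fmt.Println("interrupt received, shutting down")
			return
		case <-tk.C:
			fmt.Println("working...")
		}
	}
}
```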

cmd/pbm-agent/oplog.go

Lines changed: 9 additions & 1 deletion
```diff
@@ -70,8 +70,16 @@ func (a *Agent) OplogReplay(ctx context.Context, r *ctrl.ReplayCmd, opID ctrl.OP
 		}
 	}()
 
+	cfg, err := config.GetConfig(ctx, a.leadConn)
+	if err != nil {
+		l.Error("get PBM config: %v", err)
+		return
+	}
+
 	l.Info("oplog replay started")
-	if err := restore.New(a.leadConn, a.nodeConn, a.brief, r.RSMap).ReplayOplog(ctx, r, opID, l); err != nil {
+	rr := restore.New(a.leadConn, a.nodeConn, a.brief, cfg, r.RSMap, 0)
+	err = rr.ReplayOplog(ctx, r, opID, l)
+	if err != nil {
 		if errors.Is(err, restore.ErrNoDataForShard) {
 			l.Info("no oplog for the shard, skipping")
 		} else {
```
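
`OplogReplay` now loads the PBM config up front and aborts early if that fails, then passes the config into the restorer; a missing oplog for the shard is still treated as an informational skip rather than an error. A rough sketch of that control flow, with `getConfig`, `newReplayer`, and `errNoDataForShard` as hypothetical stand-ins for the real PBM APIs:

```go
package main

import (
	"errors"
	"fmt"
	"log"
)

// Hypothetical stand-ins for the real PBM config and restore APIs.
var errNoDataForShard = errors.New("no data for shard")

type config struct{} // storage, restore options, ...

func getConfig() (*config, error) { return &config{}, nil }

type replayer struct{ cfg *config }

func newReplayer(cfg *config) *replayer { return &replayer{cfg: cfg} }

func (r *replayer) ReplayOplog() error { return errNoDataForShard }

func main() {
	// Load the config first and fail fast, mirroring the new OplogReplay flow.
	cfg, err := getConfig()
	if err != nil {
		log.Fatalf("get PBM config: %v", err)
	}

	log.Println("oplog replay started")
	if err := newReplayer(cfg).ReplayOplog(); err != nil {
		if errors.Is(err, errNoDataForShard) {
			// Not an error: this shard simply has no oplog to replay.
			fmt.Println("no oplog for the shard, skipping")
		} else {
			log.Printf("oplog replay failed: %v", err)
		}
	}
}
```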
