Skip to content

Commit 489ac3d

Browse files
authored
PBM-1391: Enabling PITR after physical restore causes PSMDB to crash (#1019)
* Fix inconsistent data after physical restore with PITR.
  After a PITR physical restore, the data was inconsistent between the primary and the secondary nodes. The reason was that the PITR oplog apply and the dropping of collections happened in opposite orders:
  - on the primary: [PITR oplog apply] -> [dropping PBM databases]
  - on each secondary: [dropping PBM databases] -> [catch up from the primary, including oplog apply]
  Not using DDL operations (drop, in this case) on PBM's system collections fixes the problem.
* Expand setupNewDB logic to cover all of PBM's collections.
  This ensures that the collections are not created during PITR, which eliminates the possible problem of the same collection having different UUIDs on different nodes.
1 parent 00f395b commit 489ac3d

File tree

2 files changed

+32
-5
lines changed

2 files changed

+32
-5
lines changed

cmd/pbm-agent/setup.go

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,14 @@ func setupNewDB(ctx context.Context, conn connect.Client) error {
110110
return errors.Wrap(err, "ensure pitr chunks index")
111111
}
112112

113+
err = conn.AdminCommand(
114+
ctx,
115+
bson.D{{"create", defs.PITRCollection}},
116+
).Err()
117+
if err != nil && !strings.Contains(err.Error(), "already exists") {
118+
return errors.Wrap(err, "ensure pitr collection")
119+
}
120+
113121
_, err = conn.BcpCollection().Indexes().CreateMany(
114122
ctx,
115123
[]mongo.IndexModel{
@@ -124,6 +132,25 @@ func setupNewDB(ctx context.Context, conn connect.Client) error {
124132
},
125133
},
126134
)
135+
if err != nil && !strings.Contains(err.Error(), "already exists") {
136+
return errors.Wrap(err, "ensure backup collection index")
137+
}
138+
139+
err = conn.AdminCommand(
140+
ctx,
141+
bson.D{{"create", defs.RestoresCollection}},
142+
).Err()
143+
if err != nil && !strings.Contains(err.Error(), "already exists") {
144+
return errors.Wrap(err, "ensure restore collection")
145+
}
146+
147+
err = conn.AdminCommand(
148+
ctx,
149+
bson.D{{"create", defs.AgentsStatusCollection}},
150+
).Err()
151+
if err != nil && !strings.Contains(err.Error(), "already exists") {
152+
return errors.Wrap(err, "ensure agent status collection")
153+
}
127154

128-
return err
155+
return nil
129156
}

pbm/restore/physical.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1524,13 +1524,13 @@ func (r *PhysRestore) resetRS() error {
15241524
return errors.Wrap(err, "turn off pitr")
15251525
}
15261526

1527-
r.dropPBMCollections(ctx, c)
1527+
r.cleanUpPBMCollections(ctx, c)
15281528
}
15291529

15301530
return r.shutdown(c)
15311531
}
15321532

1533-
func (r *PhysRestore) dropPBMCollections(ctx context.Context, c *mongo.Client) {
1533+
func (r *PhysRestore) cleanUpPBMCollections(ctx context.Context, c *mongo.Client) {
15341534
pbmCollections := []string{
15351535
defs.LockCollection,
15361536
defs.LogCollection,
@@ -1554,9 +1554,9 @@ func (r *PhysRestore) dropPBMCollections(ctx context.Context, c *mongo.Client) {
15541554
defer wg.Done()
15551555

15561556
r.log.Debug("dropping 'admin.%s'", coll)
1557-
err := c.Database(defs.DB).Collection(coll).Drop(ctx)
1557+
_, err := c.Database(defs.DB).Collection(coll).DeleteMany(ctx, bson.D{})
15581558
if err != nil {
1559-
r.log.Warning("failed to drop 'admin.%s': %v", coll, err)
1559+
r.log.Warning("failed to delete all from 'admin.%s': %v", coll, err)
15601560
}
15611561
}()
15621562
}

0 commit comments

Comments
 (0)