Skip to content

Commit d1476d5

Browse files
authored
feat (server) : introduce no point in time replication (#5103)
* introduce no point in time replication algorithm, under flag point_in_time_snapshot Signed-off-by: adi_holden <[email protected]>
1 parent 327d3c6 commit d1476d5

File tree

8 files changed

+235
-45
lines changed

8 files changed

+235
-45
lines changed

src/core/dash.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,9 @@ class DashTable : public detail::DashTableBase {
268268
const_bucket_iterator CursorToBucketIt(Cursor c) const {
269269
return const_bucket_iterator{this, c.segment_id(global_depth_), c.bucket_id(), 0};
270270
}
271+
bucket_iterator CursorToBucketIt(Cursor c) {
272+
return bucket_iterator{this, c.segment_id(global_depth_), c.bucket_id(), 0};
273+
}
271274

272275
// Capture Version Change. Runs cb(it) on every bucket! (not entry) in the table whose version
273276
// would potentially change upon insertion of 'k'.
@@ -932,8 +935,8 @@ void DashTable<_Key, _Value, Policy>::Split(uint32_t seg_id, EvictionPolicy& ev)
932935
std::move(hash_fn), target,
933936
[&](uint32_t segment_from, detail::PhysicalBid from, uint32_t segment_to,
934937
detail::PhysicalBid to) {
935-
// OnMove is used to notify eviction policy about the moves across buckets/segments
936-
// during the split.
938+
// OnMove is used to notify eviction policy about the moves across
939+
// buckets/segments during the split.
937940
ev.OnMove(Cursor{global_depth_, segment_from, from}, Cursor{global_depth_, segment_to, to});
938941
});
939942

src/server/db_slice.cc

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ class PrimeEvictionPolicy {
9999
DVLOG(2) << "split: " << segment->SlowSize() << "/" << segment->capacity();
100100
}
101101
void OnMove(PrimeTable::Cursor source, PrimeTable::Cursor dest) {
102+
moved_items_.push_back(std::make_pair(source, dest));
102103
}
103104

104105
bool CanGrow(const PrimeTable& tbl) const;
@@ -113,8 +114,12 @@ class PrimeEvictionPolicy {
113114
unsigned checked() const {
114115
return checked_;
115116
}
117+
const DbSlice::MovedItemsVec& moved_items() {
118+
return moved_items_;
119+
}
116120

117121
private:
122+
DbSlice::MovedItemsVec moved_items_;
118123
DbSlice* db_slice_;
119124
ssize_t mem_offset_;
120125
ssize_t soft_limit_ = 0;
@@ -364,7 +369,15 @@ class DbSlice::PrimeBumpPolicy {
364369
return !obj.IsSticky();
365370
}
366371
void OnMove(PrimeTable::Cursor source, PrimeTable::Cursor dest) {
372+
moved_items_.push_back(std::make_pair(source, dest));
373+
}
374+
375+
const DbSlice::MovedItemsVec& moved_items() {
376+
return moved_items_;
367377
}
378+
379+
private:
380+
DbSlice::MovedItemsVec moved_items_;
368381
};
369382

370383
DbSlice::DbSlice(uint32_t index, bool cache_mode, EngineShard* owner)
@@ -708,6 +721,7 @@ OpResult<DbSlice::ItAndUpdater> DbSlice::AddOrFindInternal(const Context& cntx,
708721
events_.insertion_rejections++;
709722
return OpStatus::OUT_OF_MEMORY;
710723
}
724+
CallMovedCallbacks(cntx.db_index, evp.moved_items());
711725

712726
events_.mutations++;
713727
ssize_t table_increase = db.prime.mem_usage() - table_before;
@@ -1252,6 +1266,12 @@ uint64_t DbSlice::RegisterOnChange(ChangeCallback cb) {
12521266
return change_cb_.emplace_back(NextVersion(), std::move(cb)).first;
12531267
}
12541268

1269+
uint64_t DbSlice::RegisterOnMove(MovedCallback cb) {
1270+
++next_moved_id_;
1271+
moved_cb_.emplace_back(next_moved_id_, cb);
1272+
return next_moved_id_;
1273+
}
1274+
12551275
void DbSlice::FlushChangeToEarlierCallbacks(DbIndex db_ind, Iterator it, uint64_t upper_bound) {
12561276
unique_lock<LocalLatch> lk(serialization_latch_);
12571277

@@ -1284,6 +1304,14 @@ void DbSlice::UnregisterOnChange(uint64_t id) {
12841304
change_cb_.erase(it);
12851305
}
12861306

1307+
void DbSlice::UnregisterOnMoved(uint64_t id) {
1308+
serialization_latch_.Wait();
1309+
auto it =
1310+
find_if(moved_cb_.begin(), moved_cb_.end(), [id](const auto& cb) { return cb.first == id; });
1311+
CHECK(it != moved_cb_.end());
1312+
moved_cb_.erase(it);
1313+
}
1314+
12871315
auto DbSlice::DeleteExpiredStep(const Context& cntx, unsigned count) -> DeleteExpiredStats {
12881316
auto& db = *db_arr_[cntx.db_index];
12891317
DeleteExpiredStats result;
@@ -1754,6 +1782,7 @@ void DbSlice::OnCbFinishBlocking() {
17541782
if (bump_it != it) { // the item was bumped
17551783
++events_.bumpups;
17561784
}
1785+
CallMovedCallbacks(db_index, policy.moved_items());
17571786
}
17581787
}
17591788

@@ -1777,4 +1806,21 @@ void DbSlice::CallChangeCallbacks(DbIndex id, const ChangeReq& cr) const {
17771806
}
17781807
}
17791808

1809+
void DbSlice::CallMovedCallbacks(
1810+
DbIndex id, const std::vector<std::pair<PrimeTable::Cursor, PrimeTable::Cursor>>& moved_items) {
1811+
if (moved_cb_.empty())
1812+
return;
1813+
1814+
// does not preempt, just increments the counter.
1815+
unique_lock<LocalLatch> lk(serialization_latch_);
1816+
1817+
const size_t limit = moved_cb_.size();
1818+
auto ccb = moved_cb_.begin();
1819+
for (size_t i = 0; i < limit; ++i) {
1820+
CHECK(ccb->second);
1821+
ccb->second(id, moved_items);
1822+
++ccb;
1823+
}
1824+
}
1825+
17801826
} // namespace dfly

src/server/db_slice.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -434,12 +434,20 @@ class DbSlice {
434434
}
435435

436436
using ChangeCallback = std::function<void(DbIndex, const ChangeReq&)>;
437+
// Holds pairs of source and destination cursors for items moved in the dash table
438+
using MovedItemsVec = std::vector<std::pair<PrimeTable::Cursor, PrimeTable::Cursor>>;
439+
using MovedCallback = std::function<void(DbIndex, const MovedItemsVec&)>;
437440

438441
//! Registers the callback to be called for each change.
439442
//! Returns the registration id which is also the unique version of the dbslice
440443
//! at a time of the call.
441444
uint64_t RegisterOnChange(ChangeCallback cb);
442445

446+
//! Registers the callback to be called after items are moved in table.
447+
//! Returns the registration id which is also the unique version of the dbslice
448+
//! at a time of the call.
449+
uint64_t RegisterOnMove(MovedCallback cb);
450+
443451
bool HasRegisteredCallbacks() const {
444452
return !change_cb_.empty();
445453
}
@@ -450,6 +458,8 @@ class DbSlice {
450458
//! Unregisters the callback.
451459
void UnregisterOnChange(uint64_t id);
452460

461+
void UnregisterOnMoved(uint64_t id);
462+
453463
struct DeleteExpiredStats {
454464
uint32_t deleted = 0; // number of deleted items due to expiry (less than traversed).
455465
uint32_t deleted_bytes = 0; // total bytes of deleted items.
@@ -605,6 +615,7 @@ class DbSlice {
605615
}
606616

607617
void CallChangeCallbacks(DbIndex id, const ChangeReq& cr) const;
618+
void CallMovedCallbacks(DbIndex id, const MovedItemsVec& moved_items);
608619

609620
// We need this because registered callbacks might yield and when they do so we want
610621
// to avoid Heartbeat or Flushing the db.
@@ -620,6 +631,7 @@ class DbSlice {
620631
bool expire_allowed_ = true;
621632

622633
uint64_t version_ = 1; // Used to version entries in the PrimeTable.
634+
uint64_t next_moved_id_ = 1;
623635
ssize_t memory_budget_ = SSIZE_MAX / 2;
624636
size_t bytes_per_object_ = 0;
625637
size_t soft_budget_limit_ = 0;
@@ -649,6 +661,8 @@ class DbSlice {
649661
// ordered from the smallest to largest version.
650662
std::list<std::pair<uint64_t, ChangeCallback>> change_cb_;
651663

664+
std::list<std::pair<uint32_t, MovedCallback>> moved_cb_;
665+
652666
// Used in temporary computations in Find item and CbFinish
653667
// This set is used to hold fingerprints of key accessed during the run of
654668
// a transaction callback (not the whole transaction).
@@ -676,7 +690,8 @@ class DbSlice {
676690
// and polymorphic allocator (new C++ features)
677691
// the declarations below meant to say:
678692
// absl::flat_hash_map<std::string,
679-
// absl::flat_hash_set<facade::Connection::WeakRef, Hash>> client_tracking_map_
693+
// absl::flat_hash_set<facade::Connection::WeakRef, Hash>>
694+
// client_tracking_map_
680695
using HashSetAllocator = PMR_NS::polymorphic_allocator<facade::Connection::WeakRef>;
681696

682697
using ConnectionHashSet =

src/server/replica.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -847,6 +847,13 @@ void DflyShardReplica::FullSyncDflyFb(std::string eof_token, BlockingCounter bc,
847847
}
848848
});
849849

850+
// In the no point-in-time replication flow, it's possible to serialize a journal change
851+
// before serializing the bucket that the key was updated in on the master side. As a result,
852+
// when loading the serialized bucket data on the replica, it may overwrite the earlier entry
853+
// added by the journal change. This is an expected and valid scenario, so to avoid unnecessary
854+
// warnings, we enable SetOverrideExistingKeys(true).
855+
rdb_loader_->SetOverrideExistingKeys(true);
856+
850857
// Load incoming rdb stream.
851858
if (std::error_code ec = rdb_loader_->Load(&ps); ec) {
852859
cntx->ReportError(ec, "Error loading rdb format");

src/server/server_family.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -885,6 +885,7 @@ void ServerFamily::Init(util::AcceptServer* acceptor, std::vector<facade::Listen
885885
config_registry.RegisterMutable("tls_ca_cert_dir");
886886
config_registry.RegisterMutable("replica_priority");
887887
config_registry.RegisterMutable("lua_undeclared_keys_shas");
888+
config_registry.RegisterMutable("point_in_time_snapshot");
888889

889890
pb_task_ = shard_set->pool()->GetNextProactor();
890891
if (pb_task_->GetKind() == ProactorBase::EPOLL) {

0 commit comments

Comments
 (0)