Skip to content

Commit c72a6ad

Browse files
authored
Merge pull request ceph#62214 from Matan-B/wip-matanb-revert-60753
Revert "os/bluestore: Fix problem with deferred writes replay" Reviewed-by: Adam Kupczyk <[email protected]>
2 parents f9522f5 + bef277a commit c72a6ad

File tree

3 files changed

+96
-352
lines changed

3 files changed

+96
-352
lines changed

src/os/bluestore/BlueStore.cc

Lines changed: 55 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -7361,7 +7361,7 @@ int BlueStore::_init_alloc()
73617361
<< ", free 0x" << alloc->get_free()
73627362
<< ", fragmentation " << alloc->get_fragmentation()
73637363
<< std::dec << dendl;
7364-
if (tracepoint_debug_init_alloc_done) tracepoint_debug_init_alloc_done();
7364+
73657365
return 0;
73667366
}
73677367

@@ -7769,19 +7769,9 @@ int BlueStore::_is_bluefs(bool create, bool* ret)
77697769
* opens both DB and dependant super_meta, FreelistManager and allocator
77707770
* in the proper order
77717771
*/
7772-
int BlueStore::_open_db_and_around(
7773-
bool read_only,
7774-
bool to_repair,
7775-
bool apply_deferred,
7776-
bool remove_deferred)
7777-
{
7778-
dout(5) << __func__ << "read_only=" << read_only
7779-
<< ", to_repair=" << to_repair
7780-
<< ", deferred=" << (apply_deferred?"apply;":"noapply;")
7781-
<< (remove_deferred?"remove":"noremove") << dendl;
7782-
ceph_assert(remove_deferred == false || apply_deferred == true);
7783-
ceph_assert(read_only == false || remove_deferred == false);
7784-
std::vector<std::string> keys_to_remove;
7772+
int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
7773+
{
7774+
dout(5) << __func__ << "::NCB::read_only=" << read_only << ", to_repair=" << to_repair << dendl;
77857775
{
77867776
string type;
77877777
int r = read_meta("type", &type);
@@ -7841,11 +7831,6 @@ int BlueStore::_open_db_and_around(
78417831
if (bdev_label_multi) {
78427832
_main_bdev_label_try_reserve();
78437833
}
7844-
// This is the place where we can apply deferred writes
7845-
// without risk of some interaction with RocksDB allocating.
7846-
if (apply_deferred) {
7847-
_deferred_replay(remove_deferred ? &keys_to_remove : nullptr);
7848-
}
78497834

78507835
// Re-open in the proper mode(s).
78517836

@@ -7863,14 +7848,6 @@ int BlueStore::_open_db_and_around(
78637848
_post_init_alloc();
78647849
}
78657850

7866-
if (remove_deferred && !keys_to_remove.empty()) {
7867-
KeyValueDB::Transaction deferred_keys_remove_txn = db->get_transaction();
7868-
for (auto& s : keys_to_remove) {
7869-
deferred_keys_remove_txn->rmkey(PREFIX_DEFERRED, s);
7870-
}
7871-
db->submit_transaction_sync(deferred_keys_remove_txn);
7872-
}
7873-
78747851
// when function is called in repair mode (to_repair=true) we skip db->open()/create()
78757852
// we can't change bluestore allocation so no need to invlidate allocation-file
78767853
if (fm->is_null_manager() && !read_only && !to_repair) {
@@ -9252,7 +9229,7 @@ int BlueStore::mount_readonly()
92529229
}
92539230
});
92549231

9255-
r = _deferred_replay(nullptr);
9232+
r = _deferred_replay();
92569233
if (r < 0) {
92579234
return r;
92589235
}
@@ -9388,7 +9365,8 @@ int BlueStore::_mount()
93889365
return -EINVAL;
93899366
}
93909367

9391-
int r = _open_db_and_around(false, false, true, true);
9368+
dout(5) << __func__ << "::NCB::calling open_db_and_around(read/write)" << dendl;
9369+
int r = _open_db_and_around(false);
93929370
if (r < 0) {
93939371
return r;
93949372
}
@@ -9426,6 +9404,11 @@ int BlueStore::_mount()
94269404
}
94279405
});
94289406

9407+
r = _deferred_replay();
9408+
if (r < 0) {
9409+
return r;
9410+
}
9411+
94299412
mempool_thread.init();
94309413

94319414
if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
@@ -10781,7 +10764,7 @@ int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
1078110764

1078210765
// in deep mode we need R/W write access to be able to replay deferred ops
1078310766
const bool read_only = !(repair || depth == FSCK_DEEP);
10784-
int r = _open_db_and_around(read_only, false, !read_only, !read_only);
10767+
int r = _open_db_and_around(read_only);
1078510768
if (r < 0) {
1078610769
return r;
1078710770
}
@@ -10807,6 +10790,17 @@ int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
1080710790
mempool_thread.shutdown();
1080810791
_shutdown_cache();
1080910792
});
10793+
// we need finisher and kv_{sync,finalize}_thread *just* for replay
10794+
// enable in repair or deep mode modes only
10795+
if (!read_only) {
10796+
_kv_start();
10797+
r = _deferred_replay();
10798+
_kv_stop();
10799+
}
10800+
10801+
if (r < 0) {
10802+
return r;
10803+
}
1081010804
return _fsck_on_open(depth, repair);
1081110805
}
1081210806

@@ -14959,7 +14953,6 @@ void BlueStore::_kv_sync_thread()
1495914953
deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
1496014954
std::unique_lock l{kv_lock};
1496114955
ceph_assert(!kv_sync_started);
14962-
ceph_assert(!db_was_opened_read_only);
1496314956
kv_sync_started = true;
1496414957
kv_cond.notify_all();
1496514958

@@ -15126,7 +15119,7 @@ void BlueStore::_kv_sync_thread()
1512615119
auto sync_start = mono_clock::now();
1512715120
#endif
1512815121
// submit synct synchronously (block and wait for it to commit)
15129-
int r = cct->_conf->bluestore_debug_omit_kv_commit ?
15122+
int r = db_was_opened_read_only || cct->_conf->bluestore_debug_omit_kv_commit ?
1513015123
0 : db->submit_transaction_sync(synct);
1513115124
ceph_assert(r == 0);
1513215125

@@ -15511,7 +15504,7 @@ void BlueStore::_deferred_aio_finish(OpSequencer *osr)
1551115504
}
1551215505
}
1551315506

15514-
int BlueStore::_deferred_replay(std::vector<std::string>* keys_to_remove)
15507+
int BlueStore::_deferred_replay()
1551515508
{
1551615509
dout(10) << __func__ << " start" << dendl;
1551715510
int count = 0;
@@ -15525,49 +15518,48 @@ int BlueStore::_deferred_replay(std::vector<std::string>* keys_to_remove)
1552515518
}
1552615519
);
1552715520
}
15528-
if (tracepoint_debug_deferred_replay_start) tracepoint_debug_deferred_replay_start();
15529-
IOContext ioctx(cct, nullptr);
15521+
CollectionRef ch = _get_collection(coll_t::meta());
15522+
bool fake_ch = false;
15523+
if (!ch) {
15524+
// hmm, replaying initial mkfs?
15525+
ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
15526+
fake_ch = true;
15527+
}
15528+
OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
1553015529
KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
15531-
for (it->lower_bound(string()); it->valid(); /*iterator update outside*/) {
15530+
for (it->lower_bound(string()); it->valid(); it->next(), ++count) {
1553215531
dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
1553315532
<< dendl;
15534-
if (keys_to_remove) {
15535-
keys_to_remove->push_back(it->key());
15536-
}
15537-
bluestore_deferred_transaction_t deferred_txn;
15533+
bluestore_deferred_transaction_t *deferred_txn =
15534+
new bluestore_deferred_transaction_t;
1553815535
bufferlist bl = it->value();
1553915536
auto p = bl.cbegin();
1554015537
try {
15541-
decode(deferred_txn, p);
15542-
15543-
bool has_some = _eliminate_outdated_deferred(&deferred_txn, bluefs_extents);
15544-
if (has_some) {
15545-
if (tracepoint_debug_deferred_replay_track) tracepoint_debug_deferred_replay_track(deferred_txn);
15546-
for (auto& op: deferred_txn.ops) {
15547-
for (auto& e : op.extents) {
15548-
bufferlist t;
15549-
op.data.splice(0, e.length, &t);
15550-
bdev->aio_write(e.offset, t, &ioctx, false);
15551-
}
15552-
}
15553-
}
15538+
decode(*deferred_txn, p);
1555415539
} catch (ceph::buffer::error& e) {
1555515540
derr << __func__ << " failed to decode deferred txn "
1555615541
<< pretty_binary_string(it->key()) << dendl;
15542+
delete deferred_txn;
1555715543
r = -EIO;
15544+
goto out;
1555815545
}
15559-
// update loop iteration here, so we can inject action
15560-
++count;
15561-
it->next();
15562-
if (ioctx.num_pending.load() > 100 || !it->valid()) {
15563-
dout(20) << __func__ << "submitting IO batch" << dendl;
15564-
bdev->aio_submit(&ioctx);
15565-
ioctx.aio_wait();
15566-
dout(20) << __func__ << "wait done" << dendl;
15567-
ioctx.release_running_aios();
15546+
bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents);
15547+
if (has_some) {
15548+
TransContext *txc = _txc_create(ch.get(), osr, nullptr);
15549+
txc->deferred_txn = deferred_txn;
15550+
txc->set_state(TransContext::STATE_KV_DONE);
15551+
_txc_state_proc(txc);
15552+
} else {
15553+
delete deferred_txn;
1556815554
}
1556915555
}
15570-
if (tracepoint_debug_deferred_replay_end) tracepoint_debug_deferred_replay_end();
15556+
out:
15557+
dout(20) << __func__ << " draining osr" << dendl;
15558+
_osr_register_zombie(osr);
15559+
_osr_drain_all();
15560+
if (fake_ch) {
15561+
new_coll_map.clear();
15562+
}
1557115563
dout(10) << __func__ << " completed " << count << " events" << dendl;
1557215564
return r;
1557315565
}

src/os/bluestore/BlueStore.h

Lines changed: 2 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2833,11 +2833,7 @@ class BlueStore : public ObjectStore,
28332833
* opens both DB and dependant super_meta, FreelistManager and allocator
28342834
* in the proper order
28352835
*/
2836-
int _open_db_and_around(
2837-
bool read_only,
2838-
bool to_repair = false,
2839-
bool apply_deferred = false,
2840-
bool remove_deferred = false);
2836+
int _open_db_and_around(bool read_only, bool to_repair = false);
28412837
void _close_db_and_around();
28422838
void _close_around_db();
28432839

@@ -2961,7 +2957,7 @@ class BlueStore : public ObjectStore,
29612957
private:
29622958
void _deferred_submit_unlock(OpSequencer *osr);
29632959
void _deferred_aio_finish(OpSequencer *osr);
2964-
int _deferred_replay(std::vector<std::string>* keys_to_remove);
2960+
int _deferred_replay();
29652961
bool _eliminate_outdated_deferred(bluestore_deferred_transaction_t* deferred_txn,
29662962
interval_set<uint64_t>& bluefs_extents);
29672963

@@ -3608,19 +3604,6 @@ class BlueStore : public ObjectStore,
36083604
void debug_set_prefer_deferred_size(uint64_t s) {
36093605
prefer_deferred_size = s;
36103606
}
3611-
void set_tracepoint_debug_deferred_replay_start(std::function<void()> action) {
3612-
tracepoint_debug_deferred_replay_start = action;
3613-
}
3614-
void set_tracepoint_debug_deferred_replay_end(std::function<void()> action) {
3615-
tracepoint_debug_deferred_replay_end = action;
3616-
}
3617-
void set_tracepoint_debug_deferred_replay_track(
3618-
std::function<void(const bluestore_deferred_transaction_t&)> action) {
3619-
tracepoint_debug_deferred_replay_track = action;
3620-
}
3621-
void set_tracepoint_debug_init_alloc_done(std::function<void()> action) {
3622-
tracepoint_debug_init_alloc_done = action;
3623-
}
36243607
inline void log_latency(const char* name,
36253608
int idx,
36263609
const ceph::timespan& lat,
@@ -3636,11 +3619,6 @@ class BlueStore : public ObjectStore,
36363619
int idx2 = l_bluestore_first);
36373620

36383621
private:
3639-
std::function<void()> tracepoint_debug_deferred_replay_start = nullptr;
3640-
std::function<void()> tracepoint_debug_deferred_replay_end = nullptr;
3641-
std::function<void(const bluestore_deferred_transaction_t&)>
3642-
tracepoint_debug_deferred_replay_track = nullptr;
3643-
std::function<void()> tracepoint_debug_init_alloc_done = nullptr;
36443622
bool _debug_data_eio(const ghobject_t& o) {
36453623
if (!cct->_conf->bluestore_debug_inject_read_err) {
36463624
return false;

0 commit comments

Comments
 (0)