@@ -7361,7 +7361,7 @@ int BlueStore::_init_alloc()
73617361 << ", free 0x" << alloc->get_free()
73627362 << ", fragmentation " << alloc->get_fragmentation()
73637363 << std::dec << dendl;
7364- if (tracepoint_debug_init_alloc_done) tracepoint_debug_init_alloc_done();
7364+
73657365 return 0;
73667366}
73677367
@@ -7769,19 +7769,9 @@ int BlueStore::_is_bluefs(bool create, bool* ret)
77697769* opens both DB and dependant super_meta, FreelistManager and allocator
77707770* in the proper order
77717771*/
7772- int BlueStore::_open_db_and_around(
7773- bool read_only,
7774- bool to_repair,
7775- bool apply_deferred,
7776- bool remove_deferred)
7777- {
7778- dout(5) << __func__ << "read_only=" << read_only
7779- << ", to_repair=" << to_repair
7780- << ", deferred=" << (apply_deferred?"apply;":"noapply;")
7781- << (remove_deferred?"remove":"noremove") << dendl;
7782- ceph_assert(remove_deferred == false || apply_deferred == true);
7783- ceph_assert(read_only == false || remove_deferred == false);
7784- std::vector<std::string> keys_to_remove;
7772+ int BlueStore::_open_db_and_around(bool read_only, bool to_repair)
7773+ {
7774+ dout(5) << __func__ << "::NCB::read_only=" << read_only << ", to_repair=" << to_repair << dendl;
77857775 {
77867776 string type;
77877777 int r = read_meta("type", &type);
@@ -7841,11 +7831,6 @@ int BlueStore::_open_db_and_around(
78417831 if (bdev_label_multi) {
78427832 _main_bdev_label_try_reserve();
78437833 }
7844- // This is the place where we can apply deferred writes
7845- // without risk of some interaction with RocksDB allocating.
7846- if (apply_deferred) {
7847- _deferred_replay(remove_deferred ? &keys_to_remove : nullptr);
7848- }
78497834
78507835 // Re-open in the proper mode(s).
78517836
@@ -7863,14 +7848,6 @@ int BlueStore::_open_db_and_around(
78637848 _post_init_alloc();
78647849 }
78657850
7866- if (remove_deferred && !keys_to_remove.empty()) {
7867- KeyValueDB::Transaction deferred_keys_remove_txn = db->get_transaction();
7868- for (auto& s : keys_to_remove) {
7869- deferred_keys_remove_txn->rmkey(PREFIX_DEFERRED, s);
7870- }
7871- db->submit_transaction_sync(deferred_keys_remove_txn);
7872- }
7873-
78747851 // when function is called in repair mode (to_repair=true) we skip db->open()/create()
78757852 // we can't change bluestore allocation so no need to invlidate allocation-file
78767853 if (fm->is_null_manager() && !read_only && !to_repair) {
@@ -9252,7 +9229,7 @@ int BlueStore::mount_readonly()
92529229 }
92539230 });
92549231
9255- r = _deferred_replay(nullptr );
9232+ r = _deferred_replay();
92569233 if (r < 0) {
92579234 return r;
92589235 }
@@ -9388,7 +9365,8 @@ int BlueStore::_mount()
93889365 return -EINVAL;
93899366 }
93909367
9391- int r = _open_db_and_around(false, false, true, true);
9368+ dout(5) << __func__ << "::NCB::calling open_db_and_around(read/write)" << dendl;
9369+ int r = _open_db_and_around(false);
93929370 if (r < 0) {
93939371 return r;
93949372 }
@@ -9426,6 +9404,11 @@ int BlueStore::_mount()
94269404 }
94279405 });
94289406
9407+ r = _deferred_replay();
9408+ if (r < 0) {
9409+ return r;
9410+ }
9411+
94299412 mempool_thread.init();
94309413
94319414 if ((!per_pool_stat_collection || per_pool_omap != OMAP_PER_PG) &&
@@ -10781,7 +10764,7 @@ int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
1078110764
1078210765 // in deep mode we need R/W write access to be able to replay deferred ops
1078310766 const bool read_only = !(repair || depth == FSCK_DEEP);
10784- int r = _open_db_and_around(read_only, false, !read_only, !read_only );
10767+ int r = _open_db_and_around(read_only);
1078510768 if (r < 0) {
1078610769 return r;
1078710770 }
@@ -10807,6 +10790,17 @@ int BlueStore::_fsck(BlueStore::FSCKDepth depth, bool repair)
1080710790 mempool_thread.shutdown();
1080810791 _shutdown_cache();
1080910792 });
10793+ // we need finisher and kv_{sync,finalize}_thread *just* for replay
10794+ // enable in repair or deep mode modes only
10795+ if (!read_only) {
10796+ _kv_start();
10797+ r = _deferred_replay();
10798+ _kv_stop();
10799+ }
10800+
10801+ if (r < 0) {
10802+ return r;
10803+ }
1081010804 return _fsck_on_open(depth, repair);
1081110805}
1081210806
@@ -14959,7 +14953,6 @@ void BlueStore::_kv_sync_thread()
1495914953 deque<DeferredBatch*> deferred_stable_queue; ///< deferred ios done + stable
1496014954 std::unique_lock l{kv_lock};
1496114955 ceph_assert(!kv_sync_started);
14962- ceph_assert(!db_was_opened_read_only);
1496314956 kv_sync_started = true;
1496414957 kv_cond.notify_all();
1496514958
@@ -15126,7 +15119,7 @@ void BlueStore::_kv_sync_thread()
1512615119 auto sync_start = mono_clock::now();
1512715120#endif
1512815121 // submit synct synchronously (block and wait for it to commit)
15129- int r = cct->_conf->bluestore_debug_omit_kv_commit ?
15122+ int r = db_was_opened_read_only || cct->_conf->bluestore_debug_omit_kv_commit ?
1513015123 0 : db->submit_transaction_sync(synct);
1513115124 ceph_assert(r == 0);
1513215125
@@ -15511,7 +15504,7 @@ void BlueStore::_deferred_aio_finish(OpSequencer *osr)
1551115504 }
1551215505}
1551315506
15514- int BlueStore::_deferred_replay(std::vector<std::string>* keys_to_remove )
15507+ int BlueStore::_deferred_replay()
1551515508{
1551615509 dout(10) << __func__ << " start" << dendl;
1551715510 int count = 0;
@@ -15525,49 +15518,48 @@ int BlueStore::_deferred_replay(std::vector<std::string>* keys_to_remove)
1552515518 }
1552615519 );
1552715520 }
15528- if (tracepoint_debug_deferred_replay_start) tracepoint_debug_deferred_replay_start();
15529- IOContext ioctx(cct, nullptr);
15521+ CollectionRef ch = _get_collection(coll_t::meta());
15522+ bool fake_ch = false;
15523+ if (!ch) {
15524+ // hmm, replaying initial mkfs?
15525+ ch = static_cast<Collection*>(create_new_collection(coll_t::meta()).get());
15526+ fake_ch = true;
15527+ }
15528+ OpSequencer *osr = static_cast<OpSequencer*>(ch->osr.get());
1553015529 KeyValueDB::Iterator it = db->get_iterator(PREFIX_DEFERRED);
15531- for (it->lower_bound(string()); it->valid(); /*iterator update outside*/ ) {
15530+ for (it->lower_bound(string()); it->valid(); it->next(), ++count ) {
1553215531 dout(20) << __func__ << " replay " << pretty_binary_string(it->key())
1553315532 << dendl;
15534- if (keys_to_remove) {
15535- keys_to_remove->push_back(it->key());
15536- }
15537- bluestore_deferred_transaction_t deferred_txn;
15533+ bluestore_deferred_transaction_t *deferred_txn =
15534+ new bluestore_deferred_transaction_t;
1553815535 bufferlist bl = it->value();
1553915536 auto p = bl.cbegin();
1554015537 try {
15541- decode(deferred_txn, p);
15542-
15543- bool has_some = _eliminate_outdated_deferred(&deferred_txn, bluefs_extents);
15544- if (has_some) {
15545- if (tracepoint_debug_deferred_replay_track) tracepoint_debug_deferred_replay_track(deferred_txn);
15546- for (auto& op: deferred_txn.ops) {
15547- for (auto& e : op.extents) {
15548- bufferlist t;
15549- op.data.splice(0, e.length, &t);
15550- bdev->aio_write(e.offset, t, &ioctx, false);
15551- }
15552- }
15553- }
15538+ decode(*deferred_txn, p);
1555415539 } catch (ceph::buffer::error& e) {
1555515540 derr << __func__ << " failed to decode deferred txn "
1555615541 << pretty_binary_string(it->key()) << dendl;
15542+ delete deferred_txn;
1555715543 r = -EIO;
15544+ goto out;
1555815545 }
15559- // update loop iteration here, so we can inject action
15560- ++count;
15561- it->next();
15562- if (ioctx.num_pending.load() > 100 || !it->valid()) {
15563- dout(20) << __func__ << "submitting IO batch" << dendl;
15564- bdev->aio_submit(&ioctx);
15565- ioctx.aio_wait();
15566- dout(20) << __func__ << "wait done" << dendl;
15567- ioctx.release_running_aios();
15546+ bool has_some = _eliminate_outdated_deferred(deferred_txn, bluefs_extents);
15547+ if (has_some) {
15548+ TransContext *txc = _txc_create(ch.get(), osr, nullptr);
15549+ txc->deferred_txn = deferred_txn;
15550+ txc->set_state(TransContext::STATE_KV_DONE);
15551+ _txc_state_proc(txc);
15552+ } else {
15553+ delete deferred_txn;
1556815554 }
1556915555 }
15570- if (tracepoint_debug_deferred_replay_end) tracepoint_debug_deferred_replay_end();
15556+ out:
15557+ dout(20) << __func__ << " draining osr" << dendl;
15558+ _osr_register_zombie(osr);
15559+ _osr_drain_all();
15560+ if (fake_ch) {
15561+ new_coll_map.clear();
15562+ }
1557115563 dout(10) << __func__ << " completed " << count << " events" << dendl;
1557215564 return r;
1557315565}
0 commit comments