Skip to content

Commit c6da0c5

Browse files
authored
Merge pull request ceph#44913 from benhanokh/safe_shutdown_v2
OSD::Modify OSD Fast-Shutdown to work safely i.e. quiesce all activit… Reviewed-by: Josh Durgin <[email protected]> Reviewed-by: Adam Kupczyk <[email protected]>
2 parents 36b0dc0 + 8d05255 commit c6da0c5

File tree

7 files changed

+151
-36
lines changed

7 files changed

+151
-36
lines changed

src/common/options/global.yaml.in

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3266,6 +3266,13 @@ options:
32663266
slow shutdown is primarily useful for doing memory leak checking with valgrind.
32673267
default: true
32683268
with_legacy: true
3269+
- name: osd_fast_shutdown_timeout
3270+
type: int
3271+
level: advanced
3272+
desc: timeout in seconds for osd fast-shutdown (0 is unlimited)
3273+
default: 15
3274+
with_legacy: true
3275+
min: 0
32693276
- name: osd_fast_shutdown_notify_mon
32703277
type: bool
32713278
level: advanced
@@ -4937,6 +4944,12 @@ options:
49374944
This setting is used only when OSD is doing ``--mkfs``.
49384945
Next runs of OSD retrieve sharding from disk.
49394946
default: m(3) p(3,0-12) O(3,0-13)=block_cache={type=binned_lru} L P
4947+
- name: bluestore_qfsck_on_mount
4948+
type: bool
4949+
level: dev
4950+
desc: Run quick-fsck at mount comparing allocation-file to RocksDB allocation state
4951+
default: true
4952+
with_legacy: true
49404953
- name: bluestore_fsck_on_mount
49414954
type: bool
49424955
level: dev

src/os/ObjectStore.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,8 @@ class ObjectStore {
288288
virtual bool needs_journal() = 0; //< requires a journal
289289
virtual bool wants_journal() = 0; //< prefers a journal
290290
virtual bool allows_journal() = 0; //< allows a journal
291-
291+
virtual void prepare_for_fast_shutdown() {}
292+
virtual bool has_null_manager() { return false; }
292293
// return store min allocation size, if applicable
293294
virtual uint64_t get_min_alloc_size() const {
294295
return 0;

src/os/bluestore/BlueFS.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2507,6 +2507,9 @@ void BlueFS::_rewrite_log_and_layout_sync_LNF_LD(bool allocate_with_fallback,
25072507
}
25082508
#endif
25092509
_flush_bdev();
2510+
++log.seq_live;
2511+
dirty.seq_live = log.seq_live;
2512+
log.t.seq = log.seq_live;
25102513

25112514
super.memorized_layout = layout;
25122515
super.log_fnode = log_file->fnode;

src/os/bluestore/BlueStore.cc

Lines changed: 32 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -7565,9 +7565,16 @@ void BlueStore::set_cache_shards(unsigned num)
75657565
}
75667566
}
75677567

7568+
//---------------------------------------------
7569+
bool BlueStore::has_null_manager()
7570+
{
7571+
return (fm && fm->is_null_manager());
7572+
}
7573+
75687574
int BlueStore::_mount()
75697575
{
75707576
dout(5) << __func__ << "NCB:: path " << path << dendl;
7577+
75717578
_kv_only = false;
75727579
if (cct->_conf->bluestore_fsck_on_mount) {
75737580
dout(5) << __func__ << "::NCB::calling fsck()" << dendl;
@@ -7681,12 +7688,15 @@ int BlueStore::umount()
76817688
#endif
76827689
dout(20) << __func__ << " stopping kv thread" << dendl;
76837690
_kv_stop();
7684-
_shutdown_cache();
7691+
// skip cache cleanup step on fast shutdown
7692+
if (likely(!m_fast_shutdown)) {
7693+
_shutdown_cache();
7694+
}
76857695
dout(20) << __func__ << " closing" << dendl;
76867696
}
7687-
76887697
_close_db_and_around();
7689-
if (cct->_conf->bluestore_fsck_on_umount) {
7698+
// disable fsck on fast-shutdown
7699+
if (cct->_conf->bluestore_fsck_on_umount && !m_fast_shutdown) {
76907700
int rc = fsck(cct->_conf->bluestore_fsck_on_umount_deep);
76917701
if (rc < 0)
76927702
return rc;
@@ -10305,6 +10315,11 @@ int BlueStore::get_numa_node(
1030510315
return 0;
1030610316
}
1030710317

10318+
void BlueStore::prepare_for_fast_shutdown()
10319+
{
10320+
m_fast_shutdown = true;
10321+
}
10322+
1030810323
int BlueStore::get_devices(set<string> *ls)
1030910324
{
1031010325
if (bdev) {
@@ -10432,7 +10447,8 @@ int BlueStore::pool_statfs(uint64_t pool_id, struct store_statfs_t *buf,
1043210447
string key_prefix;
1043310448
_key_encode_u64(pool_id, &key_prefix);
1043410449
*out_per_pool_omap = per_pool_omap != OMAP_BULK;
10435-
if (*out_per_pool_omap) {
10450+
// stop calls after db was closed
10451+
if (*out_per_pool_omap && db) {
1043610452
auto prefix = per_pool_omap == OMAP_PER_POOL ?
1043710453
PREFIX_PERPOOL_OMAP :
1043810454
PREFIX_PERPG_OMAP;
@@ -18344,11 +18360,10 @@ int BlueStore::store_allocator(Allocator* src_allocator)
1834418360
return -1;
1834518361
}
1834618362
}
18347-
18363+
bluefs->compact_log();
1834818364
// reuse previous file-allocation if exists
1834918365
ret = bluefs->stat(allocator_dir, allocator_file, nullptr, nullptr);
1835018366
bool overwrite_file = (ret == 0);
18351-
//derr << __func__ << "bluefs->open_for_write(" << overwrite_file << ")" << dendl;
1835218367
BlueFS::FileWriter *p_handle = nullptr;
1835318368
ret = bluefs->open_for_write(allocator_dir, allocator_file, &p_handle, overwrite_file);
1835418369
if (ret != 0) {
@@ -18358,8 +18373,9 @@ int BlueStore::store_allocator(Allocator* src_allocator)
1835818373

1835918374
uint64_t file_size = p_handle->file->fnode.size;
1836018375
uint64_t allocated = p_handle->file->fnode.get_allocated();
18361-
dout(5) << "file_size=" << file_size << ", allocated=" << allocated << dendl;
18376+
dout(10) << "file_size=" << file_size << ", allocated=" << allocated << dendl;
1836218377

18378+
bluefs->sync_metadata(false);
1836318379
unique_ptr<Allocator> allocator(clone_allocator_without_bluefs(src_allocator));
1836418380
if (!allocator) {
1836518381
bluefs->close_writer(p_handle);
@@ -18431,12 +18447,11 @@ int BlueStore::store_allocator(Allocator* src_allocator)
1843118447
bluefs->fsync(p_handle);
1843218448

1843318449
utime_t duration = ceph_clock_now() - start_time;
18434-
dout(5) <<"WRITE-extent_count=" << extent_count << ", file_size=" << p_handle->file->fnode.size << dendl;
18450+
dout(5) <<"WRITE-extent_count=" << extent_count << ", allocation_size=" << allocation_size << ", serial=" << s_serial << dendl;
1843518451
dout(5) <<"p_handle->pos=" << p_handle->pos << " WRITE-duration=" << duration << " seconds" << dendl;
1843618452

1843718453
bluefs->close_writer(p_handle);
1843818454
need_to_destage_allocation_file = false;
18439-
dout(10) << "need_to_destage_allocation_file was clear" << dendl;
1844018455
return 0;
1844118456
}
1844218457

@@ -18628,7 +18643,7 @@ int BlueStore::__restore_allocator(Allocator* allocator, uint64_t *num, uint64_t
1862818643
utime_t duration = ceph_clock_now() - start_time;
1862918644
dout(5) << "READ--extent_count=" << extent_count << ", read_alloc_size= "
1863018645
<< read_alloc_size << ", file_size=" << file_size << dendl;
18631-
dout(5) << "READ duration=" << duration << " seconds, s_serial=" << s_serial << dendl;
18646+
dout(5) << "READ duration=" << duration << " seconds, s_serial=" << header.serial << dendl;
1863218647
*num = extent_count;
1863318648
*bytes = read_alloc_size;
1863418649
return 0;
@@ -18923,7 +18938,7 @@ int BlueStore::read_allocation_from_drive_on_startup()
1892318938

1892418939
utime_t start = ceph_clock_now();
1892518940
read_alloc_stats_t stats = {};
18926-
SimpleBitmap sbmap(cct, div_round_up(bdev->get_size(), min_alloc_size));
18941+
SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size));
1892718942
ret = reconstruct_allocations(&sbmap, stats);
1892818943
if (ret != 0) {
1892918944
return ret;
@@ -19025,15 +19040,6 @@ int BlueStore::compare_allocators(Allocator* alloc1, Allocator* alloc2, uint64_t
1902519040
return 0;
1902619041
} else {
1902719042
derr << "mismatch:: idx1=" << idx1 << " idx2=" << idx2 << dendl;
19028-
std::cout << "===================================================================" << std::endl;
19029-
for (uint64_t i = 0; i < idx1; i++) {
19030-
std::cout << "arr1[" << i << "]<" << arr1[i].offset << "," << arr1[i].length << "> " << std::endl;
19031-
}
19032-
19033-
std::cout << "===================================================================" << std::endl;
19034-
for (uint64_t i = 0; i < idx2; i++) {
19035-
std::cout << "arr2[" << i << "]<" << arr2[i].offset << "," << arr2[i].length << "> " << std::endl;
19036-
}
1903719043
return -1;
1903819044
}
1903919045
}
@@ -19081,9 +19087,9 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool()
1908119087
utime_t start = ceph_clock_now();
1908219088

1908319089
auto shutdown_cache = make_scope_guard([&] {
19084-
std::cout << "Allocation Recovery was completed in " << duration
19085-
<< " seconds; insert_count=" << stats.insert_count
19086-
<< "; extent_count=" << stats.extent_count << std::endl;
19090+
dout(1) << "Allocation Recovery was completed in " << duration
19091+
<< " seconds; insert_count=" << stats.insert_count
19092+
<< "; extent_count=" << stats.extent_count << dendl;
1908719093
_shutdown_cache();
1908819094
_close_db_and_around();
1908919095
});
@@ -19092,7 +19098,7 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool()
1909219098
auto allocator = unique_ptr<Allocator>(create_bitmap_allocator(bdev->get_size()));
1909319099
//reconstruct allocations into a temp simple-bitmap and copy into allocator
1909419100
{
19095-
SimpleBitmap sbmap(cct, div_round_up(bdev->get_size(), min_alloc_size));
19101+
SimpleBitmap sbmap(cct, (bdev->get_size()/ min_alloc_size));
1909619102
ret = reconstruct_allocations(&sbmap, stats);
1909719103
if (ret != 0) {
1909819104
return ret;
@@ -19113,14 +19119,14 @@ int BlueStore::read_allocation_from_drive_for_bluestore_tool()
1911319119
};
1911419120
allocator->dump(count_entries);
1911519121
ret = compare_allocators(allocator.get(), alloc, stats.insert_count, memory_target);
19116-
if (ret != 0) {
19122+
if (ret == 0) {
1911719123
dout(5) << "Allocator drive - file integrity check OK" << dendl;
1911819124
} else {
1911919125
derr << "FAILURE. Allocator from file and allocator from metadata differ::ret=" << ret << dendl;
1912019126
}
1912119127
}
1912219128

19123-
std::cout << stats << std::endl;
19129+
dout(1) << stats << dendl;
1912419130
return ret;
1912519131
}
1912619132

src/os/bluestore/BlueStore.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2764,7 +2764,7 @@ class BlueStore : public ObjectStore,
27642764

27652765
private:
27662766
int32_t ondisk_format = 0; ///< value detected on mount
2767-
2767+
bool m_fast_shutdown = false;
27682768
int _upgrade_super(); ///< upgrade (called during open_super)
27692769
uint64_t _get_ondisk_reserved() const;
27702770
void _prepare_ondisk_format_super(KeyValueDB::Transaction& t);
@@ -2783,6 +2783,9 @@ class BlueStore : public ObjectStore,
27832783
bool wants_journal() override { return false; };
27842784
bool allows_journal() override { return false; };
27852785

2786+
void prepare_for_fast_shutdown() override;
2787+
virtual bool has_null_manager();
2788+
27862789
uint64_t get_min_alloc_size() const override {
27872790
return min_alloc_size;
27882791
}

src/osd/OSD.cc

Lines changed: 94 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4258,27 +4258,44 @@ PerfCounters* OSD::create_recoverystate_perf()
42584258

42594259
int OSD::shutdown()
42604260
{
4261+
// vstart overwrites osd_fast_shutdown value in the conf file -> force the value here!
4262+
//cct->_conf->osd_fast_shutdown = true;
4263+
4264+
dout(0) << "Fast Shutdown: - cct->_conf->osd_fast_shutdown = "
4265+
<< cct->_conf->osd_fast_shutdown
4266+
<< ", null-fm = " << store->has_null_manager() << dendl;
4267+
4268+
utime_t start_time_func = ceph_clock_now();
4269+
42614270
if (cct->_conf->osd_fast_shutdown) {
42624271
derr << "*** Immediate shutdown (osd_fast_shutdown=true) ***" << dendl;
42634272
if (cct->_conf->osd_fast_shutdown_notify_mon)
42644273
service.prepare_to_stop();
4265-
cct->_log->flush();
4266-
_exit(0);
4267-
}
42684274

4269-
if (!service.prepare_to_stop())
4275+
// There is no state we need to keep when running in NULL-FM mode
4276+
if (!store->has_null_manager()) {
4277+
cct->_log->flush();
4278+
_exit(0);
4279+
}
4280+
} else if (!service.prepare_to_stop()) {
42704281
return 0; // already shutting down
4282+
}
4283+
42714284
osd_lock.lock();
42724285
if (is_stopping()) {
42734286
osd_lock.unlock();
42744287
return 0;
42754288
}
4276-
dout(0) << "shutdown" << dendl;
42774289

4290+
if (!cct->_conf->osd_fast_shutdown) {
4291+
dout(0) << "shutdown" << dendl;
4292+
}
4293+
4294+
// don't accept new task for this OSD
42784295
set_state(STATE_STOPPING);
42794296

4280-
// Debugging
4281-
if (cct->_conf.get_val<bool>("osd_debug_shutdown")) {
4297+
// Disabled debugging during fast-shutdown
4298+
if (!cct->_conf->osd_fast_shutdown && cct->_conf.get_val<bool>("osd_debug_shutdown")) {
42824299
cct->_conf.set_val("debug_osd", "100");
42834300
cct->_conf.set_val("debug_journal", "100");
42844301
cct->_conf.set_val("debug_filestore", "100");
@@ -4287,6 +4304,45 @@ int OSD::shutdown()
42874304
cct->_conf.apply_changes(nullptr);
42884305
}
42894306

4307+
if (cct->_conf->osd_fast_shutdown) {
4308+
// first, stop new task from being taken from op_shardedwq
4309+
// and clear all pending tasks
4310+
op_shardedwq.stop_for_fast_shutdown();
4311+
4312+
utime_t start_time_timer = ceph_clock_now();
4313+
tick_timer.shutdown();
4314+
{
4315+
std::lock_guard l(tick_timer_lock);
4316+
tick_timer_without_osd_lock.shutdown();
4317+
}
4318+
4319+
osd_lock.unlock();
4320+
utime_t start_time_osd_drain = ceph_clock_now();
4321+
4322+
// then, wait on osd_op_tp to drain (TBD: should probably add a timeout)
4323+
osd_op_tp.drain();
4324+
osd_op_tp.stop();
4325+
4326+
utime_t start_time_umount = ceph_clock_now();
4327+
store->prepare_for_fast_shutdown();
4328+
std::lock_guard lock(osd_lock);
4329+
// TBD: assert in allocator that nothing is being added
4330+
store->umount();
4331+
4332+
utime_t end_time = ceph_clock_now();
4333+
if (cct->_conf->osd_fast_shutdown_timeout) {
4334+
ceph_assert(end_time - start_time_func < cct->_conf->osd_fast_shutdown_timeout);
4335+
}
4336+
dout(0) <<"Fast Shutdown duration total :" << end_time - start_time_func << " seconds" << dendl;
4337+
dout(0) <<"Fast Shutdown duration osd_drain :" << start_time_umount - start_time_osd_drain << " seconds" << dendl;
4338+
dout(0) <<"Fast Shutdown duration umount :" << end_time - start_time_umount << " seconds" << dendl;
4339+
dout(0) <<"Fast Shutdown duration timer :" << start_time_osd_drain - start_time_timer << " seconds" << dendl;
4340+
cct->_log->flush();
4341+
4342+
// now it is safe to exit
4343+
_exit(0);
4344+
}
4345+
42904346
// stop MgrClient earlier as it's more like an internal consumer of OSD
42914347
mgrc.shutdown();
42924348

@@ -4448,6 +4504,9 @@ int OSD::shutdown()
44484504
hb_front_server_messenger->shutdown();
44494505
hb_back_server_messenger->shutdown();
44504506

4507+
utime_t duration = ceph_clock_now() - start_time_func;
4508+
dout(0) <<"Slow Shutdown duration:" << duration << " seconds" << dendl;
4509+
44514510
tracing::osd::tracer.shutdown();
44524511

44534512
return r;
@@ -11072,6 +11131,11 @@ void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
1107211131
}
1107311132

1107411133
void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
11134+
if (unlikely(m_fast_shutdown) ) {
11135+
// stop enqueuing when we are in the middle of a fast shutdown
11136+
return;
11137+
}
11138+
1107511139
uint32_t shard_index =
1107611140
item.get_ordering_token().hash_to_shard(osd->shards.size());
1107711141

@@ -11102,6 +11166,11 @@ void OSD::ShardedOpWQ::_enqueue(OpSchedulerItem&& item) {
1110211166

1110311167
void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
1110411168
{
11169+
if (unlikely(m_fast_shutdown) ) {
11170+
// stop enqueuing when we are in the middle of a fast shutdown
11171+
return;
11172+
}
11173+
1110511174
auto shard_index = item.get_ordering_token().hash_to_shard(osd->shards.size());
1110611175
auto& sdata = osd->shards[shard_index];
1110711176
ceph_assert(sdata);
@@ -11128,6 +11197,24 @@ void OSD::ShardedOpWQ::_enqueue_front(OpSchedulerItem&& item)
1112811197
sdata->sdata_cond.notify_one();
1112911198
}
1113011199

11200+
void OSD::ShardedOpWQ::stop_for_fast_shutdown()
11201+
{
11202+
uint32_t shard_index = 0;
11203+
m_fast_shutdown = true;
11204+
11205+
for (; shard_index < osd->num_shards; shard_index++) {
11206+
auto& sdata = osd->shards[shard_index];
11207+
ceph_assert(sdata);
11208+
sdata->shard_lock.lock();
11209+
int work_count = 0;
11210+
while(! sdata->scheduler->empty() ) {
11211+
auto work_item = sdata->scheduler->dequeue();
11212+
work_count++;
11213+
}
11214+
sdata->shard_lock.unlock();
11215+
}
11216+
}
11217+
1113111218
namespace ceph::osd_cmds {
1113211219

1113311220
int heap(CephContext& cct, const cmdmap_t& cmdmap, Formatter& f,

0 commit comments

Comments
 (0)