Skip to content

Commit effaa68

Browse files
committed
os/bluestore: use dev's block size as a minimal BlueFS allocation unit.
Additionally, this locks the tail of DB/WAL volumes that is unaligned to the configured (not minimal!) BlueFS allocation unit. Effectively replaces the changes from ceph#57015. Fixes: https://tracker.ceph.com/issues/68772 Signed-off-by: Igor Fedotov <[email protected]>
1 parent 33b43af commit effaa68

File tree

4 files changed

+91
-78
lines changed

4 files changed

+91
-78
lines changed

src/os/bluestore/BlueFS.cc

Lines changed: 83 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -201,9 +201,9 @@ BlueFS::BlueFS(CephContext* cct)
201201
: cct(cct),
202202
bdev(MAX_BDEV),
203203
ioc(MAX_BDEV),
204-
block_reserved(MAX_BDEV),
205204
alloc(MAX_BDEV),
206-
alloc_size(MAX_BDEV, 0)
205+
alloc_size(MAX_BDEV, 0),
206+
locked_alloc(MAX_BDEV)
207207
{
208208
dirty.pending_release.resize(MAX_BDEV);
209209
discard_cb[BDEV_WAL] = wal_discard_cb;
@@ -496,33 +496,28 @@ void BlueFS::_update_logger_stats()
496496
int BlueFS::add_block_device(unsigned id, const string& path, bool trim,
497497
bluefs_shared_alloc_context_t* _shared_alloc)
498498
{
499-
uint64_t reserved;
500499
string dev_name;
501500
switch(id) {
502501
case BDEV_WAL:
503502
case BDEV_NEWWAL:
504-
reserved = BDEV_LABEL_BLOCK_SIZE;
505503
dev_name = "wal";
506504
break;
507505
case BDEV_DB:
508506
case BDEV_NEWDB:
509-
reserved = SUPER_RESERVED;
510507
dev_name = "db";
511508
break;
512509
case BDEV_SLOW:
513-
reserved = 0;
514510
dev_name = "slow";
515511
break;
516512
default:
517513
ceph_assert(false);
518514
}
519515
dout(10) << __func__ << " bdev " << id << " path " << path << " "
520-
<< " reserved " << reserved << dendl;
516+
<< dendl;
521517
ceph_assert(id < bdev.size());
522518
ceph_assert(bdev[id] == NULL);
523519
BlockDevice *b = BlockDevice::create(cct, path, NULL, NULL,
524520
discard_cb[id], static_cast<void*>(this), dev_name.c_str());
525-
block_reserved[id] = reserved;
526521
if (_shared_alloc) {
527522
b->set_no_exclusive_lock();
528523
}
@@ -628,6 +623,35 @@ uint64_t BlueFS::get_free(unsigned id)
628623
return alloc[id]->get_free();
629624
}
630625

626+
uint64_t BlueFS::_get_minimal_reserved(unsigned id) const
627+
{
628+
uint64_t reserved = 0;
629+
switch(id) {
630+
case BDEV_WAL:
631+
case BDEV_NEWWAL:
632+
reserved = BDEV_LABEL_BLOCK_SIZE;
633+
break;
634+
case BDEV_DB:
635+
case BDEV_NEWDB:
636+
reserved = SUPER_RESERVED;
637+
break;
638+
case BDEV_SLOW:
639+
reserved = 0;
640+
break;
641+
default:
642+
ceph_assert(false);
643+
}
644+
return reserved;
645+
}
646+
647+
uint64_t BlueFS::get_full_reserved(unsigned id)
648+
{
649+
if (!is_shared_alloc(id)) {
650+
return locked_alloc[id].length + _get_minimal_reserved(id);
651+
}
652+
return 0;
653+
}
654+
631655
void BlueFS::dump_perf_counters(Formatter *f)
632656
{
633657
f->open_object_section("bluefs_perf_counters");
@@ -684,13 +708,13 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
684708
}
685709

686710
_init_logger();
687-
_init_alloc();
688711

689712
super.version = 0;
690713
super.block_size = bdev[BDEV_DB]->get_block_size();
691714
super.osd_uuid = osd_uuid;
692715
super.uuid.generate_random();
693-
dout(1) << __func__ << " uuid " << super.uuid << dendl;
716+
717+
_init_alloc();
694718

695719
// init log
696720
FileRef log_file = ceph::make_ref<File>();
@@ -715,6 +739,7 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
715739
super.log_fnode = log_file->fnode;
716740
super.memorized_layout = layout;
717741
_write_super(BDEV_DB);
742+
dout(1) << __func__ << " super " << super << dendl;
718743
_flush_bdev();
719744

720745
// clean up
@@ -737,27 +762,10 @@ void BlueFS::_init_alloc()
737762
{
738763
dout(20) << __func__ << dendl;
739764

740-
// 'changed' should keep its previous value if no actual modification occurred
741-
auto change_alloc_size = [this](uint64_t& max_alloc_size,
742-
uint64_t new_alloc, bool& changed) {
743-
if (max_alloc_size == 0 ||
744-
(max_alloc_size > new_alloc && ((new_alloc & (new_alloc -1)) == 0))) {
745-
max_alloc_size = new_alloc;
746-
changed = true;
747-
dout(5) << " changed alloc_size to 0x" << std::hex << new_alloc << dendl;
748-
} else if (max_alloc_size != new_alloc) {
749-
derr << " can not change current alloc_size 0x" << std::hex
750-
<< max_alloc_size << " to new alloc_size 0x" << new_alloc << dendl;
751-
}
752-
};
753-
754-
bool alloc_size_changed = false;
755765
size_t wal_alloc_size = 0;
756766
if (bdev[BDEV_WAL]) {
757767
wal_alloc_size = cct->_conf->bluefs_alloc_size;
758768
alloc_size[BDEV_WAL] = wal_alloc_size;
759-
change_alloc_size(super.bluefs_max_alloc_size[BDEV_WAL],
760-
wal_alloc_size, alloc_size_changed);
761769
}
762770
logger->set(l_bluefs_wal_alloc_unit, wal_alloc_size);
763771

@@ -773,46 +781,27 @@ void BlueFS::_init_alloc()
773781
if (bdev[BDEV_SLOW]) {
774782
alloc_size[BDEV_DB] = cct->_conf->bluefs_alloc_size;
775783
alloc_size[BDEV_SLOW] = shared_alloc_size;
776-
change_alloc_size(super.bluefs_max_alloc_size[BDEV_DB],
777-
cct->_conf->bluefs_alloc_size, alloc_size_changed);
778-
change_alloc_size(super.bluefs_max_alloc_size[BDEV_SLOW],
779-
shared_alloc_size, alloc_size_changed);
780784
} else {
781785
alloc_size[BDEV_DB] = shared_alloc_size;
782786
alloc_size[BDEV_SLOW] = 0;
783-
change_alloc_size(super.bluefs_max_alloc_size[BDEV_DB],
784-
shared_alloc_size, alloc_size_changed);
785787
}
786788
logger->set(l_bluefs_db_alloc_unit, alloc_size[BDEV_DB]);
787789
logger->set(l_bluefs_slow_alloc_unit, alloc_size[BDEV_SLOW]);
788790
// new wal and db devices are never shared
789791
if (bdev[BDEV_NEWWAL]) {
790792
alloc_size[BDEV_NEWWAL] = cct->_conf->bluefs_alloc_size;
791-
change_alloc_size(super.bluefs_max_alloc_size[BDEV_NEWWAL],
792-
cct->_conf->bluefs_alloc_size, alloc_size_changed);
793-
}
794-
if (alloc_size_changed) {
795-
dout(1) << __func__ << " alloc_size changed, the new super is:" << super << dendl;
796-
_write_super(BDEV_DB);
797793
}
798-
799-
alloc_size_changed = false;
800794
if (bdev[BDEV_NEWDB]) {
801795
alloc_size[BDEV_NEWDB] = cct->_conf->bluefs_alloc_size;
802-
change_alloc_size(super.bluefs_max_alloc_size[BDEV_NEWDB],
803-
cct->_conf->bluefs_alloc_size, alloc_size_changed);
804-
}
805-
if (alloc_size_changed) {
806-
dout(1) << __func__ << " alloc_size changed, the new super is:" << super << dendl;
807-
_write_super(BDEV_NEWDB);
808796
}
809797

810798
for (unsigned id = 0; id < bdev.size(); ++id) {
811799
if (!bdev[id]) {
812800
continue;
813801
}
814802
ceph_assert(bdev[id]->get_size());
815-
ceph_assert(super.bluefs_max_alloc_size[id]);
803+
locked_alloc[id] = bluefs_extent_t();
804+
816805
if (is_shared_alloc(id)) {
817806
dout(1) << __func__ << " shared, id " << id << std::hex
818807
<< ", capacity 0x" << bdev[id]->get_size()
@@ -826,22 +815,39 @@ void BlueFS::_init_alloc()
826815
name += devnames[id];
827816
else
828817
name += to_string(uintptr_t(this));
829-
string alloc_type = cct->_conf->bluefs_allocator;
830818

819+
auto reserved = _get_minimal_reserved(id);
820+
uint64_t locked_offs = 0;
821+
{
822+
// Try to lock trailing space at device if allocator controlled space
823+
// isn't aligned with recommended alloc unit.
824+
// Final decision whether the locked tail is to be maintained is made after
825+
// BlueFS replay depending on existing allocations.
826+
uint64_t size0 = _get_total(id);
827+
uint64_t size = size0 - reserved;
828+
size = p2align(size, alloc_size[id]) + reserved;
829+
if (size < size0) {
830+
locked_offs = size;
831+
locked_alloc[id] = bluefs_extent_t(id, locked_offs, uint32_t(size0 - size));
832+
}
833+
}
834+
string alloc_type = cct->_conf->bluefs_allocator;
831835
dout(1) << __func__ << " new, id " << id << std::hex
832836
<< ", allocator name " << name
833837
<< ", allocator type " << alloc_type
834838
<< ", capacity 0x" << bdev[id]->get_size()
835-
<< ", reserved 0x" << block_reserved[id]
836-
<< ", block size 0x" << alloc_size[id]
837-
<< ", max alloc size 0x" << super.bluefs_max_alloc_size[id]
839+
<< ", reserved 0x" << reserved
840+
<< ", locked 0x" << locked_alloc[id].offset
841+
<< "~" << locked_alloc[id].length
842+
<< ", block size 0x" << bdev[id]->get_block_size()
843+
<< ", alloc unit 0x" << alloc_size[id]
838844
<< std::dec << dendl;
839845
alloc[id] = Allocator::create(cct, alloc_type,
840846
bdev[id]->get_size(),
841-
super.bluefs_max_alloc_size[id],
847+
bdev[id]->get_block_size(),
842848
name);
843-
auto reserved = block_reserved[id];
844-
alloc[id]->init_add_free(reserved, _get_total(id) - reserved);
849+
uint64_t free_len = locked_offs ? locked_offs : _get_total(id) - reserved;
850+
alloc[id]->init_add_free(reserved, free_len);
845851
}
846852
}
847853
}
@@ -1045,6 +1051,7 @@ int BlueFS::mount()
10451051
derr << __func__ << " failed to open super: " << cpp_strerror(r) << dendl;
10461052
goto out;
10471053
}
1054+
dout(5) << __func__ << " super: " << super << dendl;
10481055

10491056
// set volume selector if not provided before/outside
10501057
if (vselector == nullptr) {
@@ -1057,7 +1064,6 @@ int BlueFS::mount()
10571064

10581065
_init_alloc();
10591066

1060-
dout(5) << __func__ << " super: " << super << dendl;
10611067
r = _replay(false, false);
10621068
if (r < 0) {
10631069
derr << __func__ << " failed to replay log: " << cpp_strerror(r) << dendl;
@@ -1075,6 +1081,20 @@ int BlueFS::mount()
10751081
shared_alloc->bluefs_used += q.length;
10761082
alloc[q.bdev]->init_rm_free(q.offset, q.length);
10771083
} else if (!is_shared) {
1084+
if (locked_alloc[q.bdev].length) {
1085+
auto locked_offs = locked_alloc[q.bdev].offset;
1086+
if (q.offset + q.length > locked_offs) {
1087+
// we already have allocated extents in locked range,
1088+
// do not enforce this lock then.
1089+
bluefs_extent_t dummy;
1090+
std::swap(locked_alloc[q.bdev], dummy);
1091+
alloc[q.bdev]->init_add_free(dummy.offset, dummy.length);
1092+
dout(1) << __func__ << std::hex
1093+
<< " unlocked at " << q.bdev
1094+
<< " 0x" << dummy.offset << "~" << dummy.length
1095+
<< std::dec << dendl;
1096+
}
1097+
}
10781098
alloc[q.bdev]->init_rm_free(q.offset, q.length);
10791099
}
10801100
}
@@ -1337,9 +1357,10 @@ int BlueFS::_replay(bool noop, bool to_stdout)
13371357
bool seen_recs = false;
13381358

13391359
boost::dynamic_bitset<uint64_t> used_blocks[MAX_BDEV];
1360+
bool check_allocations = cct->_conf->bluefs_log_replay_check_allocations;
13401361

13411362
if (!noop) {
1342-
if (cct->_conf->bluefs_log_replay_check_allocations) {
1363+
if (check_allocations) {
13431364
for (size_t i = 0; i < MAX_BDEV; ++i) {
13441365
if (bdev[i] != nullptr) {
13451366
// let's use minimal allocation unit we can have
@@ -1671,7 +1692,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
16711692
}
16721693
if (!noop) {
16731694
FileRef f = _get_file(fnode.ino);
1674-
if (cct->_conf->bluefs_log_replay_check_allocations) {
1695+
if (check_allocations) {
16751696
int r = _check_allocations(f->fnode,
16761697
used_blocks, false, "OP_FILE_UPDATE");
16771698
if (r < 0) {
@@ -1687,7 +1708,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
16871708
if (fnode.ino > ino_last) {
16881709
ino_last = fnode.ino;
16891710
}
1690-
if (cct->_conf->bluefs_log_replay_check_allocations) {
1711+
if (check_allocations) {
16911712
int r = _check_allocations(f->fnode,
16921713
used_blocks, true, "OP_FILE_UPDATE");
16931714
if (r < 0) {
@@ -1721,7 +1742,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
17211742
// be lenient, if there are no extents just produce error message
17221743
ceph_assert(delta.offset == fnode.allocated || delta.extents.empty());
17231744
}
1724-
if (cct->_conf->bluefs_log_replay_check_allocations) {
1745+
if (check_allocations) {
17251746
int r = _check_allocations(fnode,
17261747
used_blocks, false, "OP_FILE_UPDATE_INC");
17271748
if (r < 0) {
@@ -1746,7 +1767,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
17461767
if (fnode.ino > ino_last) {
17471768
ino_last = fnode.ino;
17481769
}
1749-
if (cct->_conf->bluefs_log_replay_check_allocations) {
1770+
if (check_allocations) {
17501771
int r = _check_allocations(f->fnode,
17511772
used_blocks, true, "OP_FILE_UPDATE_INC");
17521773
if (r < 0) {
@@ -1780,7 +1801,7 @@ int BlueFS::_replay(bool noop, bool to_stdout)
17801801
auto p = nodes.file_map.find(ino);
17811802
ceph_assert(p != nodes.file_map.end());
17821803
vselector->sub_usage(p->second->vselector_hint, p->second->fnode);
1783-
if (cct->_conf->bluefs_log_replay_check_allocations) {
1804+
if (check_allocations) {
17841805
int r = _check_allocations(p->second->fnode,
17851806
used_blocks, false, "OP_FILE_REMOVE");
17861807
if (r < 0) {

src/os/bluestore/BlueFS.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -520,9 +520,12 @@ class BlueFS {
520520
*/
521521
std::vector<BlockDevice*> bdev; ///< block devices we can use
522522
std::vector<IOContext*> ioc; ///< IOContexts for bdevs
523-
std::vector<uint64_t> block_reserved; ///< starting reserve extent per device
524523
std::vector<Allocator*> alloc; ///< allocators for bdevs
525524
std::vector<uint64_t> alloc_size; ///< alloc size for each device
525+
std::vector<bluefs_extent_t> locked_alloc; ///< candidate extents for locked allocations,
526+
///< no alloc/release reqs matching these space
527+
///< to be issued to allocator.
528+
526529

527530
//std::vector<interval_set<uint64_t>> block_unused_too_granular;
528531

@@ -554,7 +557,7 @@ class BlueFS {
554557

555558
uint64_t _get_used(unsigned id) const;
556559
uint64_t _get_total(unsigned id) const;
557-
560+
uint64_t _get_minimal_reserved(unsigned id) const;
558561

559562
FileRef _get_file(uint64_t ino);
560563
void _drop_link_D(FileRef f);
@@ -707,6 +710,7 @@ class BlueFS {
707710
uint64_t get_total(unsigned id);
708711
uint64_t get_free(unsigned id);
709712
uint64_t get_used(unsigned id);
713+
uint64_t get_full_reserved(unsigned id);
710714
void dump_perf_counters(ceph::Formatter *f);
711715

712716
void dump_block_extents(std::ostream& out);

0 commit comments

Comments
 (0)