Skip to content

Commit eb461b4

Browse files
committed
os/bluestore: introduce locking for the very first DB/WAL alloc unit.
Signed-off-by: Igor Fedotov <[email protected]>
1 parent 4130c43 commit eb461b4

File tree

6 files changed

+476
-41
lines changed

6 files changed

+476
-41
lines changed

src/os/bluestore/BlueFS.cc

Lines changed: 52 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -640,7 +640,8 @@ uint64_t BlueFS::_get_minimal_reserved(unsigned id) const
640640
uint64_t BlueFS::get_full_reserved(unsigned id)
641641
{
642642
if (!is_shared_alloc(id)) {
643-
return locked_alloc[id].length + _get_minimal_reserved(id);
643+
return locked_alloc[id].head_length + locked_alloc[id].tail_length +
644+
_get_minimal_reserved(id);
644645
}
645646
return 0;
646647
}
@@ -709,6 +710,18 @@ int BlueFS::mkfs(uuid_d osd_uuid, const bluefs_layout_t& layout)
709710

710711
_init_alloc();
711712

713+
// temporarily lock candidate regions to forbid their use during mkfs
714+
for (uint8_t i = 0; i < MAX_BDEV; i++) {
715+
if (!alloc[i]) continue;
716+
bluefs_locked_extents_t res_la = locked_alloc[i].get_merged();
717+
if (res_la.head_length) {
718+
alloc[i]->init_rm_free(res_la.head_offset, res_la.head_length);
719+
}
720+
if (res_la.tail_length) {
721+
alloc[i]->init_rm_free(res_la.tail_offset, res_la.tail_length);
722+
}
723+
}
724+
712725
// init log
713726
FileRef log_file = ceph::make_ref<File>();
714727
log_file->fnode.ino = 1;
@@ -793,7 +806,7 @@ void BlueFS::_init_alloc()
793806
continue;
794807
}
795808
ceph_assert(bdev[id]->get_size());
796-
locked_alloc[id] = bluefs_extent_t();
809+
locked_alloc[id].reset();
797810

798811
if (is_shared_alloc(id)) {
799812
dout(1) << __func__ << " shared, id " << id << std::hex
@@ -810,37 +823,37 @@ void BlueFS::_init_alloc()
810823
name += to_string(uintptr_t(this));
811824

812825
auto reserved = _get_minimal_reserved(id);
813-
uint64_t locked_offs = 0;
814-
{
815-
// Try to lock tailing space at device if allocator controlled space
816-
// isn't aligned with recommended alloc unit.
817-
// Final decision whether locked tail to be maintained is made after
818-
// BlueFS replay depending on existing allocations.
819-
uint64_t size0 = _get_block_device_size(id);
820-
uint64_t size = size0 - reserved;
821-
size = p2align(size, alloc_size[id]) + reserved;
822-
if (size < size0) {
823-
locked_offs = size;
824-
locked_alloc[id] = bluefs_extent_t(id, locked_offs, uint32_t(size0 - size));
825-
}
826+
uint64_t full_size = _get_block_device_size(id);
827+
uint64_t free_end = p2align(full_size, alloc_size[id]);
828+
829+
// Trying to lock the following extents:
830+
// [reserved, alloc_size] and [p2align(dev_size, alloc_size), dev_size]
831+
// to make all the allocations aligned to alloc_size if possible.
832+
// The final decision on whether the locked head/tail is maintained is made after
833+
// BlueFS replay depending on existing allocations.
834+
auto &locked = locked_alloc[id];
835+
locked.head_offset = reserved;
836+
locked.head_length = p2nphase(reserved, alloc_size[id]);
837+
if (free_end < full_size) {
838+
locked.tail_offset = free_end;
839+
locked.tail_length = full_size - free_end;
826840
}
827841
string alloc_type = cct->_conf->bluefs_allocator;
828842
dout(1) << __func__ << " new, id " << id << std::hex
829843
<< ", allocator name " << name
830844
<< ", allocator type " << alloc_type
831-
<< ", capacity 0x" << bdev[id]->get_size()
845+
<< ", capacity 0x" << full_size
832846
<< ", reserved 0x" << reserved
833-
<< ", locked 0x" << locked_alloc[id].offset
834-
<< "~" << locked_alloc[id].length
847+
<< ", maybe locked " << locked
835848
<< ", block size 0x" << bdev[id]->get_block_size()
836849
<< ", alloc unit 0x" << alloc_size[id]
837850
<< std::dec << dendl;
838-
alloc[id] = Allocator::create(cct, alloc_type,
839-
bdev[id]->get_size(),
851+
alloc[id] = Allocator::create(cct,
852+
alloc_type,
853+
full_size,
840854
bdev[id]->get_block_size(),
841855
name);
842-
uint64_t free_len = locked_offs ? locked_offs : _get_block_device_size(id) - reserved;
843-
alloc[id]->init_add_free(reserved, free_len);
856+
alloc[id]->init_add_free(reserved, full_size - reserved);
844857
}
845858
}
846859
}
@@ -1066,32 +1079,34 @@ int BlueFS::mount()
10661079

10671080
// init freelist
10681081
for (auto& p : nodes.file_map) {
1069-
dout(30) << __func__ << " noting alloc for " << p.second->fnode << dendl;
1082+
dout(20) << __func__ << " noting alloc for " << p.second->fnode << dendl;
10701083
for (auto& q : p.second->fnode.extents) {
10711084
bool is_shared = is_shared_alloc(q.bdev);
10721085
ceph_assert(!is_shared || (is_shared && shared_alloc));
10731086
if (is_shared && shared_alloc->need_init && shared_alloc->a) {
10741087
shared_alloc->bluefs_used += q.length;
10751088
alloc[q.bdev]->init_rm_free(q.offset, q.length);
10761089
} else if (!is_shared) {
1077-
if (locked_alloc[q.bdev].length) {
1078-
auto locked_offs = locked_alloc[q.bdev].offset;
1079-
if (q.offset + q.length > locked_offs) {
1080-
// we already have allocated extents in locked range,
1081-
// do not enforce this lock then.
1082-
bluefs_extent_t dummy;
1083-
std::swap(locked_alloc[q.bdev], dummy);
1084-
alloc[q.bdev]->init_add_free(dummy.offset, dummy.length);
1085-
dout(1) << __func__ << std::hex
1086-
<< " unlocked at " << q.bdev
1087-
<< " 0x" << dummy.offset << "~" << dummy.length
1088-
<< std::dec << dendl;
1089-
}
1090-
}
1090+
locked_alloc[q.bdev].reset_intersected(q);
10911091
alloc[q.bdev]->init_rm_free(q.offset, q.length);
10921092
}
10931093
}
10941094
}
1095+
// finalize and apply locked allocation regions
1096+
for (uint8_t i = 0; i < MAX_BDEV; i++) {
1097+
bluefs_locked_extents_t res_la = locked_alloc[i].finalize();
1098+
dout(1) << __func__ << std::hex
1099+
<< " final locked allocations " << (int)i
1100+
<< " " << locked_alloc[i] << " => " << res_la
1101+
<< dendl;
1102+
if (res_la.head_length) {
1103+
alloc[i]->init_rm_free(res_la.head_offset, res_la.head_length);
1104+
}
1105+
if (res_la.tail_length) {
1106+
alloc[i]->init_rm_free(res_la.tail_offset, res_la.tail_length);
1107+
}
1108+
}
1109+
10951110
if (shared_alloc) {
10961111
shared_alloc->need_init = false;
10971112
dout(1) << __func__ << " shared_bdev_used = "

src/os/bluestore/BlueFS.h

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -522,9 +522,11 @@ class BlueFS {
522522
std::vector<IOContext*> ioc; ///< IOContexts for bdevs
523523
std::vector<Allocator*> alloc; ///< allocators for bdevs
524524
std::vector<uint64_t> alloc_size; ///< alloc size for each device
525-
std::vector<bluefs_extent_t> locked_alloc; ///< candidate extents for locked alocations,
526-
///< no alloc/release reqs matching these space
527-
///< to be issued to allocator.
525+
std::vector<bluefs_locked_extents_t> locked_alloc; ///< candidate extents
526+
///< at both dev's head and tail
527+
///< locked for allocations,
528+
///< no alloc/release reqs matching
529+
///< this space are to be issued to the allocator.
528530

529531

530532
//std::vector<interval_set<uint64_t>> block_unused_too_granular;

src/os/bluestore/bluefs_types.cc

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,102 @@ ostream& operator<<(ostream& out, const bluefs_extent_t& e)
3838
<< std::dec;
3939
}
4040

41+
bluefs_locked_extents_t::bluefs_locked_extents_t(uint64_t head_reserved,
42+
uint64_t full_size, uint64_t alloc_size)
43+
{
44+
// Calculating three extents which are potential candidates for locking:
45+
// [start, end]
46+
// - head: [reserved, p2nphase(reserved, alloc_size)]
47+
// - gray_tail: an area which should be locked if head becomes void
48+
// - tail: [p2align(full_size, alloc_size), full_size]
49+
// The final decision on whether the locked extents are maintained is made after
50+
// BlueFS replay depending on existing allocations.
51+
// This class performs that recalculation on reset_intersected() calls
52+
// which indicate existing allocations to it.
53+
//
54+
55+
head_offset = head_reserved;
56+
head_length = p2nphase(head_reserved, alloc_size);
57+
if (head_reserved) {
58+
ceph_assert(full_size > head_reserved);
59+
uint64_t gray_free_end = p2align(full_size - head_reserved, alloc_size);
60+
gray_free_end += head_reserved;
61+
if (gray_free_end < full_size) {
62+
gray_tail_offset = gray_free_end;
63+
gray_tail_length = full_size - gray_free_end;
64+
}
65+
}
66+
uint64_t free_end = p2align(full_size, alloc_size);
67+
if (free_end < full_size) {
68+
tail_offset = free_end;
69+
tail_length = full_size - free_end;
70+
}
71+
}
72+
73+
void bluefs_locked_extents_t::reset_intersected(const bluefs_extent_t& e)
74+
{
75+
if (e.offset < head_end() && e.end() > head_offset) {
76+
head_offset = 0;
77+
head_length = 0;
78+
}
79+
if (e.offset < gray_tail_end() && e.end() > gray_tail_offset) {
80+
gray_tail_offset = 0;
81+
gray_tail_length = 0;
82+
}
83+
if (e.offset < tail_end() && e.end() > tail_offset) {
84+
tail_offset = 0;
85+
tail_length = 0;
86+
}
87+
}
88+
89+
bluefs_locked_extents_t bluefs_locked_extents_t::get_merged() const
90+
{
91+
bluefs_locked_extents_t res;
92+
res.head_offset = head_offset;
93+
res.head_length = head_length;
94+
if (gray_tail_length) {
95+
if (tail_length) {
96+
ceph_assert(gray_tail_offset > 0);
97+
ceph_assert(tail_offset > 0);
98+
res.tail_offset = std::min(tail_offset, gray_tail_offset);
99+
res.tail_length = std::max(tail_end(), gray_tail_end()) - res.tail_offset;
100+
} else {
101+
res.tail_offset = gray_tail_offset;
102+
res.tail_length = gray_tail_length;
103+
}
104+
} else {
105+
res.tail_offset = tail_offset;
106+
res.tail_length = tail_length;
107+
}
108+
return res;
109+
}
110+
111+
bluefs_locked_extents_t bluefs_locked_extents_t::finalize() const
112+
{
113+
bluefs_locked_extents_t res;
114+
if (head_length) {
115+
res.head_offset = head_offset;
116+
res.head_length = head_length;
117+
if (tail_length) {
118+
res.tail_offset = tail_offset;
119+
res.tail_length = tail_length;
120+
}
121+
} else {
122+
res.tail_offset = gray_tail_offset;
123+
res.tail_length = gray_tail_length;
124+
}
125+
return res;
126+
}
127+
128+
ostream& operator<<(ostream& out, const bluefs_locked_extents_t& e)
129+
{
130+
return out << std::hex
131+
<< "<0x" << e.head_offset << "~" << e.head_length
132+
<< ", [0x" << e.gray_tail_offset << "~" << e.gray_tail_length
133+
<< "], 0x" << e.tail_offset << "~" << e.tail_length << ">"
134+
<< std::dec;
135+
}
136+
41137
// bluefs_layout_t
42138

43139
void bluefs_layout_t::encode(bufferlist& bl) const

src/os/bluestore/bluefs_types.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,37 @@ WRITE_CLASS_DENC(bluefs_extent_t)
3535

3636
std::ostream& operator<<(std::ostream& out, const bluefs_extent_t& e);
3737

38+
struct bluefs_locked_extents_t {
39+
uint64_t head_offset = 0;
40+
uint32_t head_length = 0;
41+
42+
uint64_t gray_tail_offset = 0;
43+
uint32_t gray_tail_length = 0;
44+
45+
uint64_t tail_offset = 0;
46+
uint32_t tail_length = 0;
47+
48+
bluefs_locked_extents_t() {}
49+
bluefs_locked_extents_t(uint64_t head_reserved, uint64_t full_size, uint64_t alloc_size);
50+
51+
void reset() {
52+
*this = bluefs_locked_extents_t();
53+
}
54+
uint64_t head_end() const { return head_offset + head_length; }
55+
uint64_t gray_tail_end() const { return gray_tail_offset + gray_tail_length; }
56+
uint64_t tail_end() const { return tail_offset + tail_length; }
57+
58+
void reset_intersected(const bluefs_extent_t& e);
59+
60+
// returns extents in a form where tails are merged
61+
bluefs_locked_extents_t get_merged() const;
62+
63+
// returns final locked extents where only head/tail are present
64+
bluefs_locked_extents_t finalize() const;
65+
};
66+
67+
std::ostream& operator<<(std::ostream& out, const bluefs_locked_extents_t& e);
68+
3869
struct bluefs_fnode_delta_t {
3970
uint64_t ino;
4071
uint64_t size;

src/test/objectstore/store_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11993,7 +11993,7 @@ TEST_P(StoreTestSpecificAUSize, BlueFSReservedTest) {
1199311993
g_conf()->bluefs_alloc_size);
1199411994

1199511995
ASSERT_EQ(fs->get_full_reserved(BlueFS::BDEV_WAL),
11996-
wal_extra);
11996+
g_conf()->bluefs_alloc_size + wal_extra);
1199711997
}
1199811998

1199911999
#endif // WITH_BLUESTORE

0 commit comments

Comments
 (0)