Skip to content

Commit 4eec7e5

Browse files
authored
Merge pull request ceph#62174 from ifed01/wip-ifed-fix-bluefs-reserved2
os/bluestore: use block size (4K) as minimal allocation unit for dedicated DB/WAL volumes Reviewed-by: Adam Kupczyk <[email protected]>
2 parents adfd7e8 + eb461b4 commit 4eec7e5

File tree

8 files changed

+643
-128
lines changed

8 files changed

+643
-128
lines changed

src/os/bluestore/BlueFS.cc

Lines changed: 151 additions & 98 deletions
Large diffs are not rendered by default.

src/os/bluestore/BlueFS.h

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -520,9 +520,14 @@ class BlueFS {
520520
*/
521521
std::vector<BlockDevice*> bdev; ///< block devices we can use
522522
std::vector<IOContext*> ioc; ///< IOContexts for bdevs
523-
std::vector<uint64_t> block_reserved; ///< starting reserve extent per device
524523
std::vector<Allocator*> alloc; ///< allocators for bdevs
525524
std::vector<uint64_t> alloc_size; ///< alloc size for each device
525+
std::vector<bluefs_locked_extents_t> locked_alloc; ///< candidate extents
526+
///< at both dev's head and tail
527+
///< locked for allocations,
528+
///< no alloc/release reqs matching
529+
///< this space are to be issued to the allocator.
530+
526531

527532
//std::vector<interval_set<uint64_t>> block_unused_too_granular;
528533

@@ -553,8 +558,8 @@ class BlueFS {
553558
void _pad_bl(ceph::buffer::list& bl, uint64_t pad_size = 0);
554559

555560
uint64_t _get_used(unsigned id) const;
556-
uint64_t _get_total(unsigned id) const;
557-
561+
uint64_t _get_block_device_size(unsigned id) const;
562+
uint64_t _get_minimal_reserved(unsigned id) const;
558563

559564
FileRef _get_file(uint64_t ino);
560565
void _drop_link_D(FileRef f);
@@ -684,6 +689,7 @@ class BlueFS {
684689
int prepare_new_device(int id, const bluefs_layout_t& layout);
685690

686691
int log_dump();
692+
int super_dump();
687693

688694
void collect_metadata(std::map<std::string,std::string> *pm, unsigned skip_bdev_id);
689695
void get_devices(std::set<std::string> *ls);
@@ -704,9 +710,10 @@ class BlueFS {
704710
const bluefs_layout_t& layout);
705711

706712
uint64_t get_used();
707-
uint64_t get_total(unsigned id);
713+
uint64_t get_block_device_size(unsigned id);
708714
uint64_t get_free(unsigned id);
709715
uint64_t get_used(unsigned id);
716+
uint64_t get_full_reserved(unsigned id);
710717
void dump_perf_counters(ceph::Formatter *f);
711718

712719
void dump_block_extents(std::ostream& out);
@@ -768,7 +775,6 @@ class BlueFS {
768775
int add_block_device(unsigned bdev, const std::string& path, bool trim,
769776
bluefs_shared_alloc_context_t* _shared_alloc = nullptr);
770777
bool bdev_support_label(unsigned id);
771-
uint64_t get_block_device_size(unsigned bdev) const;
772778
BlockDevice* get_block_device(unsigned bdev) const;
773779

774780
// handler for discard event

src/os/bluestore/BlueStore.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12041,6 +12041,7 @@ void BlueStore::collect_metadata(map<string,string> *pm)
1204112041
}
1204212042
(*pm)["bluestore_min_alloc_size"] = stringify(min_alloc_size);
1204312043
(*pm)["bluestore_allocation_from_file"] = stringify(fm && fm->is_null_manager());
12044+
(*pm)["bluestore_allocator"] = alloc ? alloc->get_type() : "null";
1204412045
}
1204512046

1204612047
int BlueStore::get_numa_node(
@@ -12151,7 +12152,7 @@ void BlueStore::_get_statfs_overall(struct store_statfs_t *buf)
1215112152
buf->internally_reserved = 0;
1215212153
// include dedicated db, too, if that isn't the shared device.
1215312154
if (bluefs_layout.shared_bdev != BlueFS::BDEV_DB) {
12154-
buf->total += bluefs->get_total(BlueFS::BDEV_DB);
12155+
buf->total += bluefs->get_block_device_size(BlueFS::BDEV_DB);
1215512156
}
1215612157
// call any non-omap bluefs space "internal metadata"
1215712158
buf->internal_metadata =
@@ -19174,7 +19175,7 @@ void BlueStore::_log_alerts(osd_alert_list_t& alerts)
1917419175
bluefs->get_used(BlueFS::BDEV_SLOW) : 0;
1917519176
if (used > 0) {
1917619177
auto db_used = bluefs->get_used(BlueFS::BDEV_DB);
19177-
auto db_total = bluefs->get_total(BlueFS::BDEV_DB);
19178+
auto db_total = bluefs->get_block_device_size(BlueFS::BDEV_DB);
1917819179
ostringstream ss;
1917919180
ss << "spilled over " << byte_u_t(used)
1918019181
<< " metadata from 'db' device (" << byte_u_t(db_used)

src/os/bluestore/bluefs_types.cc

Lines changed: 104 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,102 @@ ostream& operator<<(ostream& out, const bluefs_extent_t& e)
3838
<< std::dec;
3939
}
4040

41+
bluefs_locked_extents_t::bluefs_locked_extents_t(uint64_t head_reserved,
42+
uint64_t full_size, uint64_t alloc_size)
43+
{
44+
// Calculating three extents which are potential candidates for locking:
45+
// [start, end]
46+
// - head: [reserved, reserved + p2nphase(reserved, alloc_size)]
47+
// - gray_tail: an area which should be locked if head becomes void
48+
// - tail: [p2align(full_size, alloc_size), full_size]
49+
// Final decision whether locked extents to be maintained is made after
50+
// BlueFS replay depending on existing allocations.
51+
// This class performs that recalculation on reset_intersected() calls
52+
// which indicate existing allocations to it.
53+
//
54+
55+
head_offset = head_reserved;
56+
head_length = p2nphase(head_reserved, alloc_size);
57+
if (head_reserved) {
58+
ceph_assert(full_size > head_reserved);
59+
uint64_t gray_free_end = p2align(full_size - head_reserved, alloc_size);
60+
gray_free_end += head_reserved;
61+
if (gray_free_end < full_size) {
62+
gray_tail_offset = gray_free_end;
63+
gray_tail_length = full_size - gray_free_end;
64+
}
65+
}
66+
uint64_t free_end = p2align(full_size, alloc_size);
67+
if (free_end < full_size) {
68+
tail_offset = free_end;
69+
tail_length = full_size - free_end;
70+
}
71+
}
72+
73+
void bluefs_locked_extents_t::reset_intersected(const bluefs_extent_t& e)
74+
{
75+
if (e.offset < head_end() && e.end() > head_offset) {
76+
head_offset = 0;
77+
head_length = 0;
78+
}
79+
if (e.offset < gray_tail_end() && e.end() > gray_tail_offset) {
80+
gray_tail_offset = 0;
81+
gray_tail_length = 0;
82+
}
83+
if (e.offset < tail_end() && e.end() > tail_offset) {
84+
tail_offset = 0;
85+
tail_length = 0;
86+
}
87+
}
88+
89+
bluefs_locked_extents_t bluefs_locked_extents_t::get_merged() const
90+
{
91+
bluefs_locked_extents_t res;
92+
res.head_offset = head_offset;
93+
res.head_length = head_length;
94+
if (gray_tail_length) {
95+
if (tail_length) {
96+
ceph_assert(gray_tail_offset > 0);
97+
ceph_assert(tail_offset > 0);
98+
res.tail_offset = std::min(tail_offset, gray_tail_offset);
99+
res.tail_length = std::max(tail_end(), gray_tail_end()) - res.tail_offset;
100+
} else {
101+
res.tail_offset = gray_tail_offset;
102+
res.tail_length = gray_tail_length;
103+
}
104+
} else {
105+
res.tail_offset = tail_offset;
106+
res.tail_length = tail_length;
107+
}
108+
return res;
109+
}
110+
111+
bluefs_locked_extents_t bluefs_locked_extents_t::finalize() const
112+
{
113+
bluefs_locked_extents_t res;
114+
if (head_length) {
115+
res.head_offset = head_offset;
116+
res.head_length = head_length;
117+
if (tail_length) {
118+
res.tail_offset = tail_offset;
119+
res.tail_length = tail_length;
120+
}
121+
} else {
122+
res.tail_offset = gray_tail_offset;
123+
res.tail_length = gray_tail_length;
124+
}
125+
return res;
126+
}
127+
128+
ostream& operator<<(ostream& out, const bluefs_locked_extents_t& e)
129+
{
130+
return out << std::hex
131+
<< "<0x" << e.head_offset << "~" << e.head_length
132+
<< ", [0x" << e.gray_tail_offset << "~" << e.gray_tail_length
133+
<< "], 0x" << e.tail_offset << "~" << e.tail_length << ">"
134+
<< std::dec;
135+
}
136+
41137
// bluefs_layout_t
42138

43139
void bluefs_layout_t::encode(bufferlist& bl) const
@@ -75,69 +171,59 @@ void bluefs_layout_t::generate_test_instances(list<bluefs_layout_t*>& ls)
75171
}
76172

77173
// bluefs_super_t
78-
bluefs_super_t::bluefs_super_t() : version(0), block_size(4096) {
79-
bluefs_max_alloc_size.resize(BlueFS::MAX_BDEV, 0);
174+
bluefs_super_t::bluefs_super_t() : seq(0), block_size(4096) {
80175
}
81176

82177
void bluefs_super_t::encode(bufferlist& bl) const
83178
{
84-
ENCODE_START(3, 1, bl);
179+
ENCODE_START(2, 1, bl);
85180
encode(uuid, bl);
86181
encode(osd_uuid, bl);
87-
encode(version, bl);
182+
encode(seq, bl);
88183
encode(block_size, bl);
89184
encode(log_fnode, bl);
90185
encode(memorized_layout, bl);
91-
encode(bluefs_max_alloc_size, bl);
92186
ENCODE_FINISH(bl);
93187
}
94188

95189
void bluefs_super_t::decode(bufferlist::const_iterator& p)
96190
{
97-
DECODE_START(3, p);
191+
DECODE_START(2, p);
98192
decode(uuid, p);
99193
decode(osd_uuid, p);
100-
decode(version, p);
194+
decode(seq, p);
101195
decode(block_size, p);
102196
decode(log_fnode, p);
103197
if (struct_v >= 2) {
104198
decode(memorized_layout, p);
105199
}
106-
if (struct_v >= 3) {
107-
decode(bluefs_max_alloc_size, p);
108-
} else {
109-
std::fill(bluefs_max_alloc_size.begin(), bluefs_max_alloc_size.end(), 0);
110-
}
111200
DECODE_FINISH(p);
112201
}
113202

114203
void bluefs_super_t::dump(Formatter *f) const
115204
{
116205
f->dump_stream("uuid") << uuid;
117206
f->dump_stream("osd_uuid") << osd_uuid;
118-
f->dump_unsigned("version", version);
207+
f->dump_unsigned("seq", seq);
119208
f->dump_unsigned("block_size", block_size);
120209
f->dump_object("log_fnode", log_fnode);
121-
for (auto& p : bluefs_max_alloc_size)
122-
f->dump_unsigned("max_alloc_size", p);
123210
}
124211

125212
void bluefs_super_t::generate_test_instances(list<bluefs_super_t*>& ls)
126213
{
127214
ls.push_back(new bluefs_super_t);
128215
ls.push_back(new bluefs_super_t);
129-
ls.back()->version = 1;
216+
ls.back()->seq = 1;
130217
ls.back()->block_size = 4096;
131218
}
132219

133220
ostream& operator<<(ostream& out, const bluefs_super_t& s)
134221
{
135222
return out << "super(uuid " << s.uuid
136223
<< " osd " << s.osd_uuid
137-
<< " v " << s.version
224+
<< " seq " << s.seq
138225
<< " block_size 0x" << std::hex << s.block_size
139226
<< " log_fnode 0x" << s.log_fnode
140-
<< " max_alloc_size " << s.bluefs_max_alloc_size
141227
<< std::dec << ")";
142228
}
143229

src/os/bluestore/bluefs_types.h

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,37 @@ WRITE_CLASS_DENC(bluefs_extent_t)
3535

3636
std::ostream& operator<<(std::ostream& out, const bluefs_extent_t& e);
3737

38+
struct bluefs_locked_extents_t {
39+
uint64_t head_offset = 0;
40+
uint32_t head_length = 0;
41+
42+
uint64_t gray_tail_offset = 0;
43+
uint32_t gray_tail_length = 0;
44+
45+
uint64_t tail_offset = 0;
46+
uint32_t tail_length = 0;
47+
48+
bluefs_locked_extents_t() {}
49+
bluefs_locked_extents_t(uint64_t head_reserved, uint64_t full_size, uint64_t alloc_size);
50+
51+
void reset() {
52+
*this = bluefs_locked_extents_t();
53+
}
54+
uint64_t head_end() const { return head_offset + head_length; }
55+
uint64_t gray_tail_end() const { return gray_tail_offset + gray_tail_length; }
56+
uint64_t tail_end() const { return tail_offset + tail_length; }
57+
58+
void reset_intersected(const bluefs_extent_t& e);
59+
60+
// returns extents in a form where tails are merged
61+
bluefs_locked_extents_t get_merged() const;
62+
63+
// returns final locked extents where head/tail are present only
64+
bluefs_locked_extents_t finalize() const;
65+
};
66+
67+
std::ostream& operator<<(std::ostream& out, const bluefs_locked_extents_t& e);
68+
3869
struct bluefs_fnode_delta_t {
3970
uint64_t ino;
4071
uint64_t size;
@@ -213,15 +244,13 @@ WRITE_CLASS_ENCODER(bluefs_layout_t)
213244
struct bluefs_super_t {
214245
uuid_d uuid; ///< unique to this bluefs instance
215246
uuid_d osd_uuid; ///< matches the osd that owns us
216-
uint64_t version;
247+
uint64_t seq; ///< sequence counter
217248
uint32_t block_size;
218249

219250
bluefs_fnode_t log_fnode;
220251

221252
std::optional<bluefs_layout_t> memorized_layout;
222253

223-
std::vector<uint64_t> bluefs_max_alloc_size;
224-
225254
bluefs_super_t();
226255

227256
uint64_t block_mask() const {

src/os/bluestore/bluestore_tool.cc

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,25 @@ void log_dump(
214214
delete fs;
215215
}
216216

217+
void super_dump(
218+
CephContext *cct,
219+
const string& path,
220+
const vector<string>& devs)
221+
{
222+
validate_path(cct, path, true);
223+
BlueFS *fs = new BlueFS(cct);
224+
225+
add_devices(fs, cct, devs);
226+
int r = fs->super_dump();
227+
if (r < 0) {
228+
cerr << "super_dump failed" << ": "
229+
<< cpp_strerror(r) << std::endl;
230+
exit(EXIT_FAILURE);
231+
}
232+
233+
delete fs;
234+
}
235+
217236
void inferring_bluefs_devices(vector<string>& devs, std::string& path)
218237
{
219238
cout << "inferring bluefs devices from bluestore path" << std::endl;
@@ -340,6 +359,7 @@ int main(int argc, char **argv)
340359
"set-label-key, "
341360
"rm-label-key, "
342361
"prime-osd-dir, "
362+
"bluefs-super-dump, "
343363
"bluefs-log-dump, "
344364
"free-dump, "
345365
"free-score, "
@@ -513,6 +533,7 @@ int main(int argc, char **argv)
513533
}
514534
if (action == "bluefs-export" ||
515535
action == "bluefs-import" ||
536+
action == "bluefs-super-dump" ||
516537
action == "bluefs-log-dump") {
517538
if (path.empty()) {
518539
cerr << "must specify bluestore path" << std::endl;
@@ -1001,6 +1022,8 @@ int main(int argc, char **argv)
10011022
delete fs;
10021023
} else if (action == "bluefs-log-dump") {
10031024
log_dump(cct.get(), path, devs);
1025+
} else if (action == "bluefs-super-dump") {
1026+
super_dump(cct.get(), path, devs);
10041027
} else if (action == "bluefs-bdev-new-db" || action == "bluefs-bdev-new-wal") {
10051028
map<string, int> cur_devs_map;
10061029
bool need_db = action == "bluefs-bdev-new-db";

0 commit comments

Comments
 (0)