Skip to content

Commit 0fac673

Browse files
Merge pull request ceph#61843 from ifed01/wip-ifed-fix-expand
os/bluestore: fix bdev expansion and more Reviewed-by: Adam Kupczyk <[email protected]>
2 parents 2ab735f + ac77891 commit 0fac673

File tree

6 files changed

+225
-98
lines changed

6 files changed

+225
-98
lines changed

doc/man/8/ceph-bluestore-tool.rst

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ Synopsis
2121
| **ceph-bluestore-tool** allocmap --path *osd path*
2222
| **ceph-bluestore-tool** restore_cfb --path *osd path*
2323
| **ceph-bluestore-tool** show-label --dev *device* ...
24+
| **ceph-bluestore-tool** show-label-at --dev *device* --offset *lba*...
2425
| **ceph-bluestore-tool** prime-osd-dir --dev *device* --path *osd path*
2526
| **ceph-bluestore-tool** bluefs-export --path *osd path* --out-dir *dir*
2627
| **ceph-bluestore-tool** bluefs-bdev-new-wal --path *osd path* --dev-target *new-device*
@@ -114,6 +115,13 @@ Commands
114115
Show device label(s).
115116
The label may be printed while an OSD is running.
116117

118+
:command:`show-label-at` --dev *device* --offset *lba*[...]
119+
120+
Show device label at specific disk location. Dedicated DB/WAL volumes have a single label at offset 0.
121+
The main device can have valid labels at multiple locations: 0, 1GiB, 10GiB, 100GiB and 1000GiB.
122+
The labels at some locations might not exist though.
123+
The label may be printed while an OSD is running.
124+
117125
:command:`free-dump` --path *osd path* [ --allocator block/bluefs-wal/bluefs-db/bluefs-slow ]
118126

119127
Dump all free regions in allocator.
@@ -219,6 +227,8 @@ Device labels
219227
=============
220228

221229
Every BlueStore block device has a block label at the beginning of the device.
230+
The main device might optionally have additional labels at different locations
231+
for the sake of OSD robustness.
222232
You can dump the contents of the label with::
223233

224234
ceph-bluestore-tool show-label --dev *device*
@@ -227,7 +237,7 @@ The main device will have a lot of metadata, including information
227237
that used to be stored in small files in the OSD data directory. The
228238
auxiliary devices (db and wal) will only have the minimum required
229239
fields (OSD UUID, size, device type, birth time).
230-
The main device contains additional label copies at offsets: 1G, 10G, 100G and 1000G.
240+
The main device contains additional label copies at offsets: 1GiB, 10GiB, 100GiB and 1000GiB.
231241
Corrupted labels are fixed as part of repair::
232242

233243
ceph-bluestore-tool repair --dev *device*

src/os/bluestore/BlueFS.cc

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -639,16 +639,17 @@ void BlueFS::dump_perf_counters(Formatter *f)
639639
void BlueFS::dump_block_extents(ostream& out)
640640
{
641641
for (unsigned i = 0; i < MAX_BDEV; ++i) {
642-
if (!bdev[i]) {
642+
if (!bdev[i] || !alloc[i]) {
643643
continue;
644644
}
645-
auto total = get_total(i);
645+
auto total = get_total(i) + block_reserved[i];
646646
auto free = get_free(i);
647647

648648
out << i << " : device size 0x" << std::hex << total
649+
<< "(" << byte_u_t(total) << ")"
649650
<< " : using 0x" << total - free
650-
<< std::dec << "(" << byte_u_t(total - free) << ")";
651-
out << "\n";
651+
<< "(" << byte_u_t(total - free) << ")"
652+
<< std::dec << std::endl;
652653
}
653654
}
654655

src/os/bluestore/BlueStore.cc

Lines changed: 150 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -6702,13 +6702,18 @@ int BlueStore::_read_bdev_label(
67026702
decode(expected_crc, p);
67036703
}
67046704
catch (ceph::buffer::error& e) {
6705-
derr << __func__ << " " << path.c_str() << " data at " << std::hex << disk_position
6706-
<< std::dec << ", " << "unable to decode label " << dendl;
6705+
// We can still get here in non-erroneous scenarios,
6706+
// hence do not log that as an error
6707+
dout(0) << __func__ << " " << path.c_str() << " data at 0x" << std::hex << disk_position
6708+
<< std::dec << ", " << "unable to decode label "
6709+
<< dendl;
67076710
return -ENOENT;
67086711
}
67096712
if (crc != expected_crc) {
6710-
derr << __func__ << " bad crc on label, expected " << expected_crc
6711-
<< " != actual " << crc << dendl;
6713+
// We can still get here in non-erroneous scenarios,
6714+
// hence do not log that as an error
6715+
dout(0) << __func__ << " bad crc on label, expected " << expected_crc
6716+
<< " != actual " << crc << dendl;
67126717
return -EIO;
67136718
}
67146719
dout(10) << __func__ << " got " << *label << dendl;
@@ -6813,7 +6818,9 @@ int BlueStore::_read_multi_bdev_label(
68136818
return -ENOENT;
68146819
}
68156820
done:
6816-
dout(10) << __func__ << " got " << *out_label << dendl;
6821+
dout(10) << __func__ << " got " << *out_label
6822+
<< (all_labels_valid ? " all labels valid " : " some labels missing ")
6823+
<< dendl;
68176824
return all_labels_valid ? 0 : 1;
68186825
}
68196826

@@ -6828,6 +6835,7 @@ void BlueStore::_main_bdev_label_try_reserve()
68286835
ceph_assert(bdev_label_multi == true);
68296836
vector<uint64_t> candidate_positions;
68306837
vector<uint64_t> accepted_positions;
6838+
dout(20) << __func__ << " input " << bdev_label_valid_locations << dendl;
68316839
uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size);
68326840
for (uint64_t location : bdev_label_valid_locations) {
68336841
if (location != BDEV_FIRST_LABEL_POSITION) {
@@ -6861,6 +6869,7 @@ void BlueStore::_main_bdev_label_try_reserve()
68616869
<< " occupied by BlueStore object or BlueFS file, disabling" << dendl;
68626870
std::erase(bdev_label_valid_locations, candidate_positions[i]);
68636871
}
6872+
dout(20) << __func__ << " result " << bdev_label_valid_locations << dendl;
68646873
}
68656874

68666875
void BlueStore::_main_bdev_label_remove(Allocator* an_alloc)
@@ -6955,14 +6964,36 @@ int BlueStore::_check_main_bdev_label()
69556964
++valid_locations;
69566965
}
69576966
}
6958-
if (valid_locations != bdev_label_valid_locations.size()) {
6959-
derr << __func__ << " not all labels read properly" << dendl;
6967+
if (valid_locations > bdev_label_valid_locations.size()) {
6968+
derr << __func__ << " not all labels read properly, "
6969+
<< valid_locations << "!=" << bdev_label_valid_locations.size()
6970+
<< dendl;
69606971
return -EIO;
69616972
}
69626973
}
69636974
return 0;
69646975
}
69656976

6977+
int BlueStore::read_bdev_label_at_pos(
6978+
CephContext* cct,
6979+
const std::string &bdev_path,
6980+
uint64_t disk_position,
6981+
bluestore_bdev_label_t *label)
6982+
{
6983+
unique_ptr<BlockDevice> bdev(BlockDevice::create(
6984+
cct, bdev_path, nullptr, nullptr, nullptr, nullptr));
6985+
if (!bdev) {
6986+
return -EIO;
6987+
}
6988+
bdev->set_no_exclusive_lock();
6989+
int r = bdev->open(bdev_path);
6990+
if (r < 0)
6991+
return r;
6992+
r = _read_bdev_label(cct, bdev.get(), bdev_path, label, disk_position);
6993+
bdev->close();
6994+
return r;
6995+
}
6996+
69666997
int BlueStore::read_bdev_label(
69676998
CephContext* cct,
69686999
const std::string &path,
@@ -7296,7 +7327,16 @@ int BlueStore::_init_alloc()
72967327
return -ENOTRECOVERABLE;
72977328
}
72987329
}
7330+
if (before_expansion_bdev_size > 0 &&
7331+
before_expansion_bdev_size < bdev_label.size) {
7332+
// we grow the allocation range, must reflect it in the allocation file
7333+
alloc->init_add_free(before_expansion_bdev_size,
7334+
bdev_label.size - before_expansion_bdev_size);
7335+
need_to_destage_allocation_file = true;
7336+
}
72997337
}
7338+
before_expansion_bdev_size = 0;
7339+
73007340
dout(1) << __func__
73017341
<< " loaded " << byte_u_t(bytes) << " in " << num << " extents"
73027342
<< std::hex
@@ -8926,99 +8966,132 @@ string BlueStore::get_device_path(unsigned id)
89268966
return res;
89278967
}
89288968

8929-
int BlueStore::_set_bdev_label_size(const string& path, uint64_t size)
8930-
{
8931-
bluestore_bdev_label_t label;
8932-
int r = _read_bdev_label(cct, bdev, path, &label);
8933-
if (r < 0) {
8934-
derr << "unable to read label for " << path << ": "
8935-
<< cpp_strerror(r) << dendl;
8936-
} else {
8937-
label.size = size;
8938-
r = _write_bdev_label(cct, bdev, path, label);
8939-
if (r < 0) {
8940-
derr << "unable to write label for " << path << ": "
8941-
<< cpp_strerror(r) << dendl;
8942-
}
8943-
}
8944-
return r;
8945-
}
8946-
89478969
int BlueStore::expand_devices(ostream& out)
89488970
{
8971+
// let's open in read-only mode first to be able to recover
8972+
// from the out-of-space state at DB/shared volume(s)
8973+
// Opening in R/W mode might cause extra space allocation
8974+
// which is effectively a show stopper for volume expansion.
89498975
int r = _open_db_and_around(true);
89508976
ceph_assert(r == 0);
89518977
bluefs->dump_block_extents(out);
89528978
out << "Expanding DB/WAL..." << std::endl;
8979+
// updating dedicated devices first
89538980
for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
8954-
if (devid == bluefs_layout.shared_bdev ) {
8981+
if (devid == bluefs_layout.shared_bdev) {
89558982
continue;
89568983
}
8957-
uint64_t size = bluefs->get_block_device_size(devid);
8984+
auto my_bdev = bluefs->get_block_device(devid);
8985+
uint64_t size = my_bdev ? my_bdev->get_size() : 0;
89588986
if (size == 0) {
89598987
// no bdev
89608988
continue;
89618989
}
8962-
8963-
out << devid
8964-
<<" : expanding " << " to 0x" << size << std::dec << std::endl;
8965-
string p = get_device_path(devid);
8966-
const char* path = p.c_str();
8967-
if (path == nullptr) {
8968-
derr << devid
8969-
<<": can't find device path " << dendl;
8970-
continue;
8971-
}
8972-
if (bluefs->bdev_support_label(devid)) {
8973-
if (_set_bdev_label_size(p, size) >= 0) {
8974-
out << devid
8975-
<< " : size label updated to " << size
8976-
<< std::endl;
8990+
if (my_bdev->supported_bdev_label()) {
8991+
string my_path = get_device_path(devid);
8992+
bluestore_bdev_label_t my_label;
8993+
int r = _read_bdev_label(cct, my_bdev, my_path, &my_label);
8994+
if (r < 0) {
8995+
derr << "unable to read label for " << my_path << ": "
8996+
<< cpp_strerror(r) << dendl;
8997+
continue;
8998+
} else {
8999+
if (size == my_label.size) {
9000+
// no need to expand
9001+
out << devid
9002+
<< " : nothing to do, skipped"
9003+
<< std::endl;
9004+
continue;
9005+
} else if (size < my_label.size) {
9006+
// something weird in bdev label
9007+
out << devid
9008+
<<" : ERROR: bdev label is above device size, skipped"
9009+
<< std::endl;
9010+
continue;
9011+
} else {
9012+
my_label.size = size;
9013+
out << devid
9014+
<< " : Expanding to 0x" << std::hex << size
9015+
<< std::dec << "(" << byte_u_t(size) << ")"
9016+
<< std::endl;
9017+
r = _write_bdev_label(cct, my_bdev, my_path, my_label);
9018+
if (r < 0) {
9019+
derr << "unable to write label for " << my_path << ": "
9020+
<< cpp_strerror(r) << dendl;
9021+
} else {
9022+
out << devid
9023+
<< " : size updated to 0x" << std::hex << size
9024+
<< std::dec << "(" << byte_u_t(size) << ")"
9025+
<< std::endl;
9026+
}
9027+
}
89779028
}
89789029
}
89799030
}
9031+
// now proceed with a shared device
89809032
uint64_t size0 = fm->get_size();
89819033
uint64_t size = bdev->get_size();
8982-
if (size0 < size) {
8983-
out << bluefs_layout.shared_bdev
8984-
<< " : expanding " << " from 0x" << std::hex
8985-
<< size0 << " to 0x" << size << std::dec << std::endl;
8986-
_write_out_fm_meta(size);
8987-
if (bdev->supported_bdev_label()) {
8988-
if (_set_bdev_label_size(path, size) >= 0) {
8989-
out << bluefs_layout.shared_bdev
8990-
<< " : size label updated to " << size
8991-
<< std::endl;
8992-
}
8993-
if (bdev_label_multi) {
8994-
uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size);
8995-
for (uint64_t loc : bdev_label_positions) {
8996-
if ((loc >= size0) && (loc + lsize <= size)) {
8997-
bdev_label_valid_locations.push_back(loc);
9034+
auto devid = bluefs_layout.shared_bdev;
9035+
auto aligned_size = p2align(size, min_alloc_size);
9036+
if (aligned_size == size0) {
9037+
// no need to expand
9038+
out << devid
9039+
<< " : nothing to do, skipped"
9040+
<< std::endl;
9041+
} else if (aligned_size < size0) {
9042+
// something weird in bdev label
9043+
out << devid
9044+
<< " : ERROR: previous device size is above the current one, skipped"
9045+
<< std::endl;
9046+
} else {
9047+
auto my_path = get_device_path(devid);
9048+
out << devid
9049+
<<" : Expanding to 0x" << std::hex << size
9050+
<< std::dec << "(" << byte_u_t(size) << ")"
9051+
<< std::endl;
9052+
r = _write_out_fm_meta(size);
9053+
if (r != 0) {
9054+
derr << "unable to write out fm meta for " << my_path << ": "
9055+
<< cpp_strerror(r) << dendl;
9056+
} else if (bdev->supported_bdev_label()) {
9057+
bdev_label.size = size;
9058+
uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size);
9059+
for (uint64_t loc : bdev_label_positions) {
9060+
if ((loc >= size0) && (loc + lsize <= size)) {
9061+
bdev_label_valid_locations.push_back(loc);
9062+
if (!bdev_label_multi) {
9063+
break;
89989064
}
89999065
}
9000-
_write_bdev_label(cct, bdev, path + "/block", bdev_label, bdev_label_valid_locations);
9066+
}
9067+
r = _write_bdev_label(cct, bdev, my_path,
9068+
bdev_label, bdev_label_valid_locations);
9069+
if (r != 0) {
9070+
derr << "unable to write label(s) for " << my_path << ": "
9071+
<< cpp_strerror(r) << dendl;
90019072
}
90029073
}
9003-
_close_db_and_around();
9074+
if (r == 0) {
9075+
out << devid
9076+
<< " : size updated to 0x" << std::hex << size
9077+
<< std::dec << "(" << byte_u_t(size) << ")"
9078+
<< std::endl;
9079+
_close_db_and_around();
90049080

9005-
// mount in read/write to sync expansion changes
9006-
if (bdev_label_multi) {
9007-
// We need not do fsck, because we can be broken - size is increased,
9008-
// but we might not have labels set.
9009-
cct->_conf.set_val_or_die("bluestore_fsck_on_mount", "false");
9010-
}
9011-
r = _mount();
9012-
ceph_assert(r == 0);
9013-
if (fm && fm->is_null_manager()) {
9014-
// we grow the allocation range, must reflect it in the allocation file
9015-
alloc->init_add_free(size0, size - size0);
9016-
need_to_destage_allocation_file = true;
9081+
//
9082+
// Mount in read/write to sync expansion changes
9083+
// and make sure everything is all right.
9084+
//
9085+
before_expansion_bdev_size = size0; // preserve original size to permit
9086+
// following _open_db_and_around()
9087+
// do some post-init stuff on opened
9088+
// allocator.
9089+
9090+
r = _open_db_and_around(false);
9091+
ceph_assert(r == 0);
90179092
}
9018-
umount();
9019-
} else {
9020-
_close_db_and_around();
90219093
}
9094+
_close_db_and_around();
90229095
return r;
90239096
}
90249097

@@ -10782,7 +10855,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
1078210855
if (bdev->supported_bdev_label() && bdev_label_multi) {
1078310856
for (size_t i = 0; i < bdev_label_positions.size(); i++) {
1078410857
uint64_t location = bdev_label_positions[i];
10785-
if (location + BDEV_LABEL_BLOCK_SIZE > bdev->get_size()) {
10858+
if (location + BDEV_LABEL_BLOCK_SIZE > bdev_label.size) {
1078610859
continue;
1078710860
}
1078810861
if (std::find(
@@ -10805,7 +10878,6 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
1080510878

1080610879
if (bluefs) {
1080710880
interval_set<uint64_t> bluefs_extents;
10808-
1080910881
bluefs->foreach_block_extents(
1081010882
bluefs_layout.shared_bdev,
1081110883
[&](uint64_t start, uint32_t len) {
@@ -10849,7 +10921,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
1084910921
for (size_t i = 0; i < bdev_label_positions.size(); i++) {
1085010922
uint64_t position = bdev_label_positions[i];
1085110923
uint64_t length = std::max<uint64_t>(BDEV_LABEL_BLOCK_SIZE, alloc_size);
10852-
if (position + length <= bdev->get_size()) {
10924+
if (position + length <= bdev_label.size) {
1085310925
apply_for_bitset_range(position, length, alloc_size, used_blocks,
1085410926
[&](uint64_t pos, mempool_dynamic_bitset& bs) {
1085510927
bs.set(pos);

0 commit comments

Comments
 (0)