@@ -6702,13 +6702,18 @@ int BlueStore::_read_bdev_label(
67026702 decode(expected_crc, p);
67036703 }
67046704 catch (ceph::buffer::error& e) {
6705- derr << __func__ << " " << path.c_str() << " data at " << std::hex << disk_position
6706- << std::dec << ", " << "unable to decode label " << dendl;
6705+ // We can still get here in non-erroneous scenarios,
6706+ // hence do not log that as an error
6707+ dout(0) << __func__ << " " << path.c_str() << " data at 0x" << std::hex << disk_position
6708+ << std::dec << ", " << "unable to decode label "
6709+ << dendl;
67076710 return -ENOENT;
67086711 }
67096712 if (crc != expected_crc) {
6710- derr << __func__ << " bad crc on label, expected " << expected_crc
6711- << " != actual " << crc << dendl;
6713+ // We can still get here in non-erroneous scenarios,
6714+ // hence do not log that as an error
6715+ dout(0) << __func__ << " bad crc on label, expected " << expected_crc
6716+ << " != actual " << crc << dendl;
67126717 return -EIO;
67136718 }
67146719 dout(10) << __func__ << " got " << *label << dendl;
@@ -6813,7 +6818,9 @@ int BlueStore::_read_multi_bdev_label(
68136818 return -ENOENT;
68146819 }
68156820 done:
6816- dout(10) << __func__ << " got " << *out_label << dendl;
6821+ dout(10) << __func__ << " got " << *out_label
6822+ << (all_labels_valid ? " all labels valid " : " some labels missing ")
6823+ << dendl;
68176824 return all_labels_valid ? 0 : 1;
68186825}
68196826
@@ -6828,6 +6835,7 @@ void BlueStore::_main_bdev_label_try_reserve()
68286835 ceph_assert(bdev_label_multi == true);
68296836 vector<uint64_t> candidate_positions;
68306837 vector<uint64_t> accepted_positions;
6838+ dout(20) << __func__ << " input " << bdev_label_valid_locations << dendl;
68316839 uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size);
68326840 for (uint64_t location : bdev_label_valid_locations) {
68336841 if (location != BDEV_FIRST_LABEL_POSITION) {
@@ -6861,6 +6869,7 @@ void BlueStore::_main_bdev_label_try_reserve()
68616869 << " occupied by BlueStore object or BlueFS file, disabling" << dendl;
68626870 std::erase(bdev_label_valid_locations, candidate_positions[i]);
68636871 }
6872+ dout(20) << __func__ << " result " << bdev_label_valid_locations << dendl;
68646873}
68656874
68666875void BlueStore::_main_bdev_label_remove(Allocator* an_alloc)
@@ -6955,14 +6964,36 @@ int BlueStore::_check_main_bdev_label()
69556964 ++valid_locations;
69566965 }
69576966 }
6958- if (valid_locations != bdev_label_valid_locations.size()) {
6959- derr << __func__ << " not all labels read properly" << dendl;
6967+ if (valid_locations > bdev_label_valid_locations.size()) {
6968+ derr << __func__ << " not all labels read properly, "
6969+ << valid_locations << "!=" << bdev_label_valid_locations.size()
6970+ << dendl;
69606971 return -EIO;
69616972 }
69626973 }
69636974 return 0;
69646975}
69656976
6977+ int BlueStore::read_bdev_label_at_pos( // Opens bdev_path on a temporary BlockDevice, reads the label at disk_position into *label, then closes the device.
6978+ CephContext* cct,
6979+ const std::string &bdev_path,
6980+ uint64_t disk_position,
6981+ bluestore_bdev_label_t *label)
6982+ { // Returns -EIO if the device object cannot be created, the open() error if opening fails, otherwise the _read_bdev_label() result.
6983+ unique_ptr<BlockDevice> bdev(BlockDevice::create(
6984+ cct, bdev_path, nullptr, nullptr, nullptr, nullptr));
6985+ if (!bdev) {
6986+ return -EIO;
6987+ }
6988+ bdev->set_no_exclusive_lock(); // NOTE(review): presumably lets the label be read while the device is in use elsewhere - confirm
6989+ int r = bdev->open(bdev_path);
6990+ if (r < 0)
6991+ return r; // open failed, so there is nothing to close
6992+ r = _read_bdev_label(cct, bdev.get(), bdev_path, label, disk_position);
6993+ bdev->close(); // closed whether or not the read succeeded
6994+ return r;
6995+ }
6996+
69666997int BlueStore::read_bdev_label(
69676998 CephContext* cct,
69686999 const std::string &path,
@@ -7296,7 +7327,16 @@ int BlueStore::_init_alloc()
72967327 return -ENOTRECOVERABLE;
72977328 }
72987329 }
7330+ if (before_expansion_bdev_size > 0 &&
7331+ before_expansion_bdev_size < bdev_label.size) {
7332+ // we grow the allocation range, must reflect it in the allocation file
7333+ alloc->init_add_free(before_expansion_bdev_size,
7334+ bdev_label.size - before_expansion_bdev_size);
7335+ need_to_destage_allocation_file = true;
7336+ }
72997337 }
7338+ before_expansion_bdev_size = 0;
7339+
73007340 dout(1) << __func__
73017341 << " loaded " << byte_u_t(bytes) << " in " << num << " extents"
73027342 << std::hex
@@ -8926,99 +8966,132 @@ string BlueStore::get_device_path(unsigned id)
89268966 return res;
89278967}
89288968
8929- int BlueStore::_set_bdev_label_size(const string& path, uint64_t size) // Reads the label for path via the member bdev, sets its size field and writes it back.
8930- {
8931- bluestore_bdev_label_t label;
8932- int r = _read_bdev_label(cct, bdev, path, &label);
8933- if (r < 0) {
8934- derr << "unable to read label for " << path << ": "
8935- << cpp_strerror(r) << dendl;
8936- } else {
8937- label.size = size;
8938- r = _write_bdev_label(cct, bdev, path, label);
8939- if (r < 0) {
8940- derr << "unable to write label for " << path << ": "
8941- << cpp_strerror(r) << dendl;
8942- }
8943- }
8944- return r; // result of the read if it failed, otherwise result of the write
8945- }
8946-
89478969int BlueStore::expand_devices(ostream& out) // Updates stored size metadata (bdev labels, fm meta) to match the current device sizes; progress is written to out.
89488970{
8971+ // let's open in read-only mode first to be able to recover
8972+ // from the out-of-space state at DB/shared volume(s)
8973+ // Opening in R/W mode might cause extra space allocation
8974+ // which is effectively a show stopper for volume expansion.
89498975 int r = _open_db_and_around(true);
89508976 ceph_assert(r == 0);
89518977 bluefs->dump_block_extents(out);
89528978 out << "Expanding DB/WAL..." << std::endl;
8979+ // updating dedicated devices first
89538980 for (auto devid : { BlueFS::BDEV_WAL, BlueFS::BDEV_DB}) {
8954- if (devid == bluefs_layout.shared_bdev ) {
8981+ if (devid == bluefs_layout.shared_bdev) {
89558982 continue;
89568983 }
8957- uint64_t size = bluefs->get_block_device_size(devid);
8984+ auto my_bdev = bluefs->get_block_device(devid);
8985+ uint64_t size = my_bdev ? my_bdev->get_size() : 0;
89588986 if (size == 0) {
89598987 // no bdev
89608988 continue;
89618989 }
8962-
8963- out << devid
8964- <<" : expanding " << " to 0x" << size << std::dec << std::endl;
8965- string p = get_device_path(devid);
8966- const char* path = p.c_str();
8967- if (path == nullptr) {
8968- derr << devid
8969- <<": can't find device path " << dendl;
8970- continue;
8971- }
8972- if (bluefs->bdev_support_label(devid)) {
8973- if (_set_bdev_label_size(p, size) >= 0) {
8974- out << devid
8975- << " : size label updated to " << size
8976- << std::endl;
8990+ if (my_bdev->supported_bdev_label()) {
8991+ string my_path = get_device_path(devid);
8992+ bluestore_bdev_label_t my_label;
8993+ int r = _read_bdev_label(cct, my_bdev, my_path, &my_label); // NOTE(review): this r shadows the function-level r, so failures in this loop never reach the return value
8994+ if (r < 0) {
8995+ derr << "unable to read label for " << my_path << ": "
8996+ << cpp_strerror(r) << dendl;
8997+ continue;
8998+ } else {
8999+ if (size == my_label.size) {
9000+ // no need to expand
9001+ out << devid
9002+ << " : nothing to do, skipped"
9003+ << std::endl;
9004+ continue;
9005+ } else if (size < my_label.size) {
9006+ // something weird in bdev label
9007+ out << devid
9008+ <<" : ERROR: bdev label is above device size, skipped"
9009+ << std::endl;
9010+ continue;
9011+ } else {
9012+ my_label.size = size;
9013+ out << devid
9014+ << " : Expanding to 0x" << std::hex << size
9015+ << std::dec << "(" << byte_u_t(size) << ")"
9016+ << std::endl;
9017+ r = _write_bdev_label(cct, my_bdev, my_path, my_label);
9018+ if (r < 0) {
9019+ derr << "unable to write label for " << my_path << ": "
9020+ << cpp_strerror(r) << dendl;
9021+ } else {
9022+ out << devid
9023+ << " : size updated to 0x" << std::hex << size
9024+ << std::dec << "(" << byte_u_t(size) << ")"
9025+ << std::endl;
9026+ }
9027+ }
89779028 }
89789029 }
89799030 }
9031+ // now proceed with a shared device
89809032 uint64_t size0 = fm->get_size();
89819033 uint64_t size = bdev->get_size();
8982- if (size0 < size) {
8983- out << bluefs_layout.shared_bdev
8984- << " : expanding " << " from 0x" << std::hex
8985- << size0 << " to 0x" << size << std::dec << std::endl;
8986- _write_out_fm_meta(size);
8987- if (bdev->supported_bdev_label()) {
8988- if (_set_bdev_label_size(path, size) >= 0) {
8989- out << bluefs_layout.shared_bdev
8990- << " : size label updated to " << size
8991- << std::endl;
8992- }
8993- if (bdev_label_multi) {
8994- uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size);
8995- for (uint64_t loc : bdev_label_positions) {
8996- if ((loc >= size0) && (loc + lsize <= size)) {
8997- bdev_label_valid_locations.push_back(loc);
9034+ auto devid = bluefs_layout.shared_bdev;
9035+ auto aligned_size = p2align(size, min_alloc_size);
9036+ if (aligned_size == size0) { // NOTE(review): the comparison uses aligned_size, but the unaligned size is what gets written below - confirm intended
9037+ // no need to expand
9038+ out << devid
9039+ << " : nothing to do, skipped"
9040+ << std::endl;
9041+ } else if (aligned_size < size0) {
9042+ // size reported by fm->get_size() exceeds the current (aligned) device size
9043+ out << devid
9044+ << " : ERROR: previous device size is above the current one, skipped"
9045+ << std::endl;
9046+ } else {
9047+ auto my_path = get_device_path(devid);
9048+ out << devid
9049+ <<" : Expanding to 0x" << std::hex << size
9050+ << std::dec << "(" << byte_u_t(size) << ")"
9051+ << std::endl;
9052+ r = _write_out_fm_meta(size);
9053+ if (r != 0) {
9054+ derr << "unable to write out fm meta for " << my_path << ": "
9055+ << cpp_strerror(r) << dendl;
9056+ } else if (bdev->supported_bdev_label()) {
9057+ bdev_label.size = size;
9058+ uint64_t lsize = std::max(BDEV_LABEL_BLOCK_SIZE, min_alloc_size);
9059+ for (uint64_t loc : bdev_label_positions) {
9060+ if ((loc >= size0) && (loc + lsize <= size)) { // label slot lies entirely within the newly added range [size0, size)
9061+ bdev_label_valid_locations.push_back(loc);
9062+ if (!bdev_label_multi) { // when bdev_label_multi is false, stop after the first position added
9063+ break;
89989064 }
89999065 }
9000- _write_bdev_label(cct, bdev, path + "/block", bdev_label, bdev_label_valid_locations);
9066+ }
9067+ r = _write_bdev_label(cct, bdev, my_path,
9068+ bdev_label, bdev_label_valid_locations);
9069+ if (r != 0) {
9070+ derr << "unable to write label(s) for " << my_path << ": "
9071+ << cpp_strerror(r) << dendl;
90019072 }
90029073 }
9003- _close_db_and_around();
9074+ if (r == 0) {
9075+ out << devid
9076+ << " : size updated to 0x" << std::hex << size
9077+ << std::dec << "(" << byte_u_t(size) << ")"
9078+ << std::endl;
9079+ _close_db_and_around();
90049080
9005- // mount in read/write to sync expansion changes
9006- if (bdev_label_multi) {
9007- // We need not do fsck, because we can be broken - size is increased,
9008- // but we might not have labels set.
9009- cct->_conf.set_val_or_die("bluestore_fsck_on_mount", "false");
9010- }
9011- r = _mount();
9012- ceph_assert(r == 0);
9013- if (fm && fm->is_null_manager()) {
9014- // we grow the allocation range, must reflect it in the allocation file
9015- alloc->init_add_free(size0, size - size0);
9016- need_to_destage_allocation_file = true;
9081+ //
9082+ // Mount in read/write to sync expansion changes
9083+ // and make sure everything is all right.
9084+ //
9085+ before_expansion_bdev_size = size0; // preserve original size to permit
9086+ // following _open_db_and_around()
9087+ // do some post-init stuff on opened
9088+ // allocator.
9089+
9090+ r = _open_db_and_around(false);
9091+ ceph_assert(r == 0);
90179092 }
9018- umount();
9019- } else {
9020- _close_db_and_around();
90219093 }
9094+ _close_db_and_around();
90229095 return r;
90239096}
90249097
@@ -10782,7 +10855,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
1078210855 if (bdev->supported_bdev_label() && bdev_label_multi) {
1078310856 for (size_t i = 0; i < bdev_label_positions.size(); i++) {
1078410857 uint64_t location = bdev_label_positions[i];
10785- if (location + BDEV_LABEL_BLOCK_SIZE > bdev->get_size() ) {
10858+ if (location + BDEV_LABEL_BLOCK_SIZE > bdev_label.size ) {
1078610859 continue;
1078710860 }
1078810861 if (std::find(
@@ -10805,7 +10878,6 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
1080510878
1080610879 if (bluefs) {
1080710880 interval_set<uint64_t> bluefs_extents;
10808-
1080910881 bluefs->foreach_block_extents(
1081010882 bluefs_layout.shared_bdev,
1081110883 [&](uint64_t start, uint32_t len) {
@@ -10849,7 +10921,7 @@ int BlueStore::_fsck_on_open(BlueStore::FSCKDepth depth, bool repair)
1084910921 for (size_t i = 0; i < bdev_label_positions.size(); i++) {
1085010922 uint64_t position = bdev_label_positions[i];
1085110923 uint64_t length = std::max<uint64_t>(BDEV_LABEL_BLOCK_SIZE, alloc_size);
10852- if (position + length <= bdev->get_size() ) {
10924+ if (position + length <= bdev_label.size ) {
1085310925 apply_for_bitset_range(position, length, alloc_size, used_blocks,
1085410926 [&](uint64_t pos, mempool_dynamic_bitset& bs) {
1085510927 bs.set(pos);
0 commit comments