Skip to content

Commit dbb4daf

Browse files
author
Prasanna Kumar Kalever
committed
rbd-nbd: fix stuck with disable request
Problem: ------- Trying to disable any feature on an rbd image mapped with nbd leads to stuck in rbd-nbd. The rbd-nbd registers a watcher callback to detect image resize in NBDWatchCtx::handle_notify(). The handle_notify calls image info method, which calls refresh_if_required and it got stuck there. It is getting stuck in ImageState::refresh_if_required() because DisableFeaturesRequest issues update notifications while still holding onto the exclusive lock with everything that has to do with it blocked. Solution: -------- Set only notify flag as part of NBDWatchCtx::handle_notify() and handle the resize detection part as part of a different thread. Fixes: https://tracker.ceph.com/issues/58740 Signed-off-by: Prasanna Kumar Kalever <[email protected]>
1 parent 81e092f commit dbb4daf

File tree

2 files changed

+91
-31
lines changed

2 files changed

+91
-31
lines changed

qa/workunits/rbd/rbd-nbd.sh

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,16 @@ DEV=
472472
rbd feature disable ${POOL}/${IMAGE} journaling
473473
rbd config image rm ${POOL}/${IMAGE} rbd_discard_granularity_bytes
474474

475+
# test that disabling a feature so that the op is proxied to rbd-nbd
476+
# (arranged here by blkdiscard before "rbd feature disable") doesn't hang
477+
DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}`
478+
get_pid ${POOL}
479+
rbd feature enable ${POOL}/${IMAGE} journaling
480+
_sudo blkdiscard --offset 0 --length 4096 ${DEV}
481+
rbd feature disable ${POOL}/${IMAGE} journaling
482+
unmap_device ${DEV} ${PID}
483+
DEV=
484+
475485
# test that rbd_op_threads setting takes effect
476486
EXPECTED=`ceph-conf --show-config-value librados_thread_count`
477487
DEV=`_sudo rbd device --device-type nbd map ${POOL}/${IMAGE}`

src/tools/rbd_nbd/rbd-nbd.cc

Lines changed: 81 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -738,7 +738,67 @@ class NBDWatchCtx : public librbd::UpdateWatchCtx
738738
bool use_netlink;
739739
librados::IoCtx &io_ctx;
740740
librbd::Image &image;
741-
unsigned long size;
741+
uint64_t size;
742+
std::thread handle_notify_thread;
743+
ceph::condition_variable cond;
744+
ceph::mutex lock = ceph::make_mutex("NBDWatchCtx::Locker");
745+
bool notify = false;
746+
bool terminated = false;
747+
748+
bool wait_notify() {
749+
dout(10) << __func__ << dendl;
750+
751+
std::unique_lock locker{lock};
752+
cond.wait(locker, [this] { return notify || terminated; });
753+
754+
if (terminated) {
755+
return false;
756+
}
757+
758+
dout(10) << __func__ << ": got notify request" << dendl;
759+
notify = false;
760+
return true;
761+
}
762+
763+
void handle_notify_entry() {
764+
dout(10) << __func__ << dendl;
765+
766+
while (wait_notify()) {
767+
uint64_t new_size;
768+
int ret = image.size(&new_size);
769+
if (ret < 0) {
770+
derr << "getting image size failed: " << cpp_strerror(ret) << dendl;
771+
continue;
772+
}
773+
if (new_size == size) {
774+
continue;
775+
}
776+
dout(5) << "resize detected" << dendl;
777+
if (ioctl(fd, BLKFLSBUF, NULL) < 0) {
778+
derr << "invalidate page cache failed: " << cpp_strerror(errno)
779+
<< dendl;
780+
}
781+
if (use_netlink) {
782+
ret = netlink_resize(nbd_index, new_size);
783+
} else {
784+
ret = ioctl(fd, NBD_SET_SIZE, new_size);
785+
if (ret < 0) {
786+
derr << "resize failed: " << cpp_strerror(errno) << dendl;
787+
}
788+
}
789+
if (!ret) {
790+
size = new_size;
791+
}
792+
if (ioctl(fd, BLKRRPART, NULL) < 0) {
793+
derr << "rescan of partition table failed: " << cpp_strerror(errno)
794+
<< dendl;
795+
}
796+
if (image.invalidate_cache() < 0) {
797+
derr << "invalidate rbd cache failed" << dendl;
798+
}
799+
}
800+
}
801+
742802
public:
743803
NBDWatchCtx(int _fd,
744804
int _nbd_index,
@@ -752,41 +812,31 @@ class NBDWatchCtx : public librbd::UpdateWatchCtx
752812
, io_ctx(_io_ctx)
753813
, image(_image)
754814
, size(_size)
755-
{ }
815+
{
816+
handle_notify_thread = make_named_thread("rbd_handle_notify",
817+
&NBDWatchCtx::handle_notify_entry,
818+
this);
819+
}
756820

757-
~NBDWatchCtx() override {}
821+
~NBDWatchCtx() override
822+
{
823+
dout(10) << __func__ << ": terminating" << dendl;
824+
std::unique_lock locker{lock};
825+
terminated = true;
826+
cond.notify_all();
827+
locker.unlock();
828+
829+
handle_notify_thread.join();
830+
dout(10) << __func__ << ": finish" << dendl;
831+
}
758832

759833
void handle_notify() override
760834
{
761-
librbd::image_info_t info;
762-
if (image.stat(info, sizeof(info)) == 0) {
763-
unsigned long new_size = info.size;
764-
int ret;
765-
766-
if (new_size != size) {
767-
dout(5) << "resize detected" << dendl;
768-
if (ioctl(fd, BLKFLSBUF, NULL) < 0)
769-
derr << "invalidate page cache failed: " << cpp_strerror(errno)
770-
<< dendl;
771-
if (use_netlink) {
772-
ret = netlink_resize(nbd_index, new_size);
773-
} else {
774-
ret = ioctl(fd, NBD_SET_SIZE, new_size);
775-
if (ret < 0)
776-
derr << "resize failed: " << cpp_strerror(errno) << dendl;
777-
}
778-
779-
if (!ret)
780-
size = new_size;
835+
dout(10) << __func__ << dendl;
781836

782-
if (ioctl(fd, BLKRRPART, NULL) < 0) {
783-
derr << "rescan of partition table failed: " << cpp_strerror(errno)
784-
<< dendl;
785-
}
786-
if (image.invalidate_cache() < 0)
787-
derr << "invalidate rbd cache failed" << dendl;
788-
}
789-
}
837+
std::unique_lock locker{lock};
838+
notify = true;
839+
cond.notify_all();
790840
}
791841
};
792842

0 commit comments

Comments
 (0)