Skip to content

Commit e6e2843

Browse files
committed
libceph: fix potential hang in ceph_osdc_notify()
If the cluster becomes unavailable, ceph_osdc_notify() may hang even with the osd_request_timeout option set, because linger_notify_finish_wait() waits for the MWatchNotify NOTIFY_COMPLETE message with no associated OSD request in flight -- it's completely asynchronous.

Introduce an additional timeout, derived from the specified notify timeout. While at it, switch both waits to killable, which is more correct.

Cc: [email protected]
Signed-off-by: Ilya Dryomov <[email protected]>
Reviewed-by: Dongsheng Yang <[email protected]>
Reviewed-by: Xiubo Li <[email protected]>
1 parent 9d01e07 commit e6e2843

File tree

1 file changed

+14
-6
lines changed

1 file changed

+14
-6
lines changed

net/ceph/osd_client.c

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3334,17 +3334,24 @@ static int linger_reg_commit_wait(struct ceph_osd_linger_request *lreq)
33343334
int ret;
33353335

33363336
dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
3337-
ret = wait_for_completion_interruptible(&lreq->reg_commit_wait);
3337+
ret = wait_for_completion_killable(&lreq->reg_commit_wait);
33383338
return ret ?: lreq->reg_commit_error;
33393339
}
33403340

3341-
static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq)
3341+
static int linger_notify_finish_wait(struct ceph_osd_linger_request *lreq,
3342+
unsigned long timeout)
33423343
{
3343-
int ret;
3344+
long left;
33443345

33453346
dout("%s lreq %p linger_id %llu\n", __func__, lreq, lreq->linger_id);
3346-
ret = wait_for_completion_interruptible(&lreq->notify_finish_wait);
3347-
return ret ?: lreq->notify_finish_error;
3347+
left = wait_for_completion_killable_timeout(&lreq->notify_finish_wait,
3348+
ceph_timeout_jiffies(timeout));
3349+
if (left <= 0)
3350+
left = left ?: -ETIMEDOUT;
3351+
else
3352+
left = lreq->notify_finish_error; /* completed */
3353+
3354+
return left;
33483355
}
33493356

33503357
/*
@@ -4896,7 +4903,8 @@ int ceph_osdc_notify(struct ceph_osd_client *osdc,
48964903
linger_submit(lreq);
48974904
ret = linger_reg_commit_wait(lreq);
48984905
if (!ret)
4899-
ret = linger_notify_finish_wait(lreq);
4906+
ret = linger_notify_finish_wait(lreq,
4907+
msecs_to_jiffies(2 * timeout * MSEC_PER_SEC));
49004908
else
49014909
dout("lreq %p failed to initiate notify %d\n", lreq, ret);
49024910

0 commit comments

Comments
 (0)