Skip to content

Commit 819f7b8

Browse files
ChaitanayaKulkarniaxboe
authored andcommitted
nvmet: fail outstanding host posted AEN req
In function nvmet_async_event_process() we only process AENs iff there is an open slot on the ctrl->async_event_cmds[] && aen event list posted by the target is not empty. This keeps host posted AEN outstanding if target generated AEN list is empty. We do cleanup the target generated entries from the aen list in nvmet_ctrl_free()-> nvmet_async_events_free() but we don't process AEN posted by the host. This leads to following problem :- When processing admin sq at the time of nvmet_sq_destroy() holds an extra percpu reference(atomic value = 1), so in the following code path after switching to atomic rcu, release function (nvmet_sq_free()) is not getting called which blocks the sq->free_done in nvmet_sq_destroy() :- nvmet_sq_destroy() percpu_ref_kill_and_confirm() - __percpu_ref_switch_mode() -- __percpu_ref_switch_to_atomic() --- call_rcu() -> percpu_ref_switch_to_atomic_rcu() ---- /* calls switch callback */ - percpu_ref_put() -- percpu_ref_put_many(ref, 1) --- else if (unlikely(atomic_long_sub_and_test(nr, &ref->count))) ---- ref->release(ref); <---- Not called. This results in indefinite hang:- void nvmet_sq_destroy(struct nvmet_sq *sq) ... if (ctrl && ctrl->sqs && ctrl->sqs[0] == sq) { nvmet_async_events_process(ctrl, status); percpu_ref_put(&sq->ref); } percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq); wait_for_completion(&sq->confirm_done); wait_for_completion(&sq->free_done); <-- Hang here Which breaks the further disconnect sequence. This problem seems to be introduced after commit 64f5e9c ("nvmet: fix memory leak when removing namespaces and controllers concurrently"). This patch processes ctrl->async_event_cmds[] in the admin sq destroy() context irrespetive of aen_list. Also we get rid of the controller's aen_list processing in the nvmet_sq_destroy() context and just ignore ctrl->aen_list. This results in nvmet_async_events_process() being called from workqueue context so we adjust the code accordingly. Fixes: 64f5e9c ("nvmet: fix memory leak when removing namespaces and controllers concurrently ") Signed-off-by: Chaitanya Kulkarni <[email protected]> Reviewed-by: Sagi Grimberg <[email protected]> Signed-off-by: Christoph Hellwig <[email protected]> Signed-off-by: Jens Axboe <[email protected]>
1 parent b97120b commit 819f7b8

File tree

1 file changed

+20
-7
lines changed

1 file changed

+20
-7
lines changed

drivers/nvme/target/core.c

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,22 @@ static u32 nvmet_async_event_result(struct nvmet_async_event *aen)
129129
return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16);
130130
}
131131

132-
static void nvmet_async_events_process(struct nvmet_ctrl *ctrl, u16 status)
132+
static void nvmet_async_events_failall(struct nvmet_ctrl *ctrl)
133+
{
134+
u16 status = NVME_SC_INTERNAL | NVME_SC_DNR;
135+
struct nvmet_req *req;
136+
137+
mutex_lock(&ctrl->lock);
138+
while (ctrl->nr_async_event_cmds) {
139+
req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
140+
mutex_unlock(&ctrl->lock);
141+
nvmet_req_complete(req, status);
142+
mutex_lock(&ctrl->lock);
143+
}
144+
mutex_unlock(&ctrl->lock);
145+
}
146+
147+
static void nvmet_async_events_process(struct nvmet_ctrl *ctrl)
133148
{
134149
struct nvmet_async_event *aen;
135150
struct nvmet_req *req;
@@ -139,15 +154,14 @@ static void nvmet_async_events_process(struct nvmet_ctrl *ctrl, u16 status)
139154
aen = list_first_entry(&ctrl->async_events,
140155
struct nvmet_async_event, entry);
141156
req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds];
142-
if (status == 0)
143-
nvmet_set_result(req, nvmet_async_event_result(aen));
157+
nvmet_set_result(req, nvmet_async_event_result(aen));
144158

145159
list_del(&aen->entry);
146160
kfree(aen);
147161

148162
mutex_unlock(&ctrl->lock);
149163
trace_nvmet_async_event(ctrl, req->cqe->result.u32);
150-
nvmet_req_complete(req, status);
164+
nvmet_req_complete(req, 0);
151165
mutex_lock(&ctrl->lock);
152166
}
153167
mutex_unlock(&ctrl->lock);
@@ -170,7 +184,7 @@ static void nvmet_async_event_work(struct work_struct *work)
170184
struct nvmet_ctrl *ctrl =
171185
container_of(work, struct nvmet_ctrl, async_event_work);
172186

173-
nvmet_async_events_process(ctrl, 0);
187+
nvmet_async_events_process(ctrl);
174188
}
175189

176190
void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
@@ -779,15 +793,14 @@ static void nvmet_confirm_sq(struct percpu_ref *ref)
779793

780794
void nvmet_sq_destroy(struct nvmet_sq *sq)
781795
{
782-
u16 status = NVME_SC_INTERNAL | NVME_SC_DNR;
783796
struct nvmet_ctrl *ctrl = sq->ctrl;
784797

785798
/*
786799
* If this is the admin queue, complete all AERs so that our
787800
* queue doesn't have outstanding requests on it.
788801
*/
789802
if (ctrl && ctrl->sqs && ctrl->sqs[0] == sq)
790-
nvmet_async_events_process(ctrl, status);
803+
nvmet_async_events_failall(ctrl);
791804
percpu_ref_kill_and_confirm(&sq->ref, nvmet_confirm_sq);
792805
wait_for_completion(&sq->confirm_done);
793806
wait_for_completion(&sq->free_done);

0 commit comments

Comments
 (0)