Skip to content

Commit 16fa29a

Browse files
committed
Merge branch 'smc-fixes'
Dust Li says: ==================== net/smc: fix kernel panic caused by race of smc_sock This patchset fixes the race between smc_release triggered by close(2) and cdc_handle triggered by underlaying RDMA device. The race is caused because the smc_connection may been released before the pending tx CDC messages got its CQEs. In order to fix this, I add a counter to track how many pending WRs we have posted through the smc_connection, and only release the smc_connection after there is no pending WRs on the connection. The first patch prevents posting WR on a QP that is not in RTS state. This patch is needed because if we post WR on a QP that is not in RTS state, ib_post_send() may success but no CQE will return, and that will confuse the counter tracking the pending WRs. The second patch add a counter to track how many WRs were posted through the smc_connection, and don't reset the QP on link destroying to prevent leak of the counter. ==================== Signed-off-by: David S. Miller <[email protected]>
2 parents 1b9dadb + 349d431 commit 16fa29a

File tree

10 files changed

+68
-81
lines changed

10 files changed

+68
-81
lines changed

net/smc/smc.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,11 @@ struct smc_connection {
180180
u16 tx_cdc_seq; /* sequence # for CDC send */
181181
u16 tx_cdc_seq_fin; /* sequence # - tx completed */
182182
spinlock_t send_lock; /* protect wr_sends */
183+
atomic_t cdc_pend_tx_wr; /* number of pending tx CDC wqe
184+
* - inc when post wqe,
185+
* - dec on polled tx cqe
186+
*/
187+
wait_queue_head_t cdc_pend_tx_wq; /* wakeup on no cdc_pend_tx_wr*/
183188
struct delayed_work tx_work; /* retry of smc_cdc_msg_send */
184189
u32 tx_off; /* base offset in peer rmb */
185190

net/smc/smc_cdc.c

Lines changed: 24 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,6 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
3131
struct smc_sock *smc;
3232
int diff;
3333

34-
if (!conn)
35-
/* already dismissed */
36-
return;
37-
3834
smc = container_of(conn, struct smc_sock, conn);
3935
bh_lock_sock(&smc->sk);
4036
if (!wc_status) {
@@ -51,6 +47,12 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
5147
conn);
5248
conn->tx_cdc_seq_fin = cdcpend->ctrl_seq;
5349
}
50+
51+
if (atomic_dec_and_test(&conn->cdc_pend_tx_wr) &&
52+
unlikely(wq_has_sleeper(&conn->cdc_pend_tx_wq)))
53+
wake_up(&conn->cdc_pend_tx_wq);
54+
WARN_ON(atomic_read(&conn->cdc_pend_tx_wr) < 0);
55+
5456
smc_tx_sndbuf_nonfull(smc);
5557
bh_unlock_sock(&smc->sk);
5658
}
@@ -107,13 +109,18 @@ int smc_cdc_msg_send(struct smc_connection *conn,
107109
conn->tx_cdc_seq++;
108110
conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
109111
smc_host_msg_to_cdc((struct smc_cdc_msg *)wr_buf, conn, &cfed);
112+
113+
atomic_inc(&conn->cdc_pend_tx_wr);
114+
smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
115+
110116
rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
111117
if (!rc) {
112118
smc_curs_copy(&conn->rx_curs_confirmed, &cfed, conn);
113119
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;
114120
} else {
115121
conn->tx_cdc_seq--;
116122
conn->local_tx_ctrl.seqno = conn->tx_cdc_seq;
123+
atomic_dec(&conn->cdc_pend_tx_wr);
117124
}
118125

119126
return rc;
@@ -136,7 +143,18 @@ int smcr_cdc_msg_send_validation(struct smc_connection *conn,
136143
peer->token = htonl(local->token);
137144
peer->prod_flags.failover_validation = 1;
138145

146+
/* We need to set pend->conn here to make sure smc_cdc_tx_handler()
147+
* can handle properly
148+
*/
149+
smc_cdc_add_pending_send(conn, pend);
150+
151+
atomic_inc(&conn->cdc_pend_tx_wr);
152+
smp_mb__after_atomic(); /* Make sure cdc_pend_tx_wr added before post */
153+
139154
rc = smc_wr_tx_send(link, (struct smc_wr_tx_pend_priv *)pend);
155+
if (unlikely(rc))
156+
atomic_dec(&conn->cdc_pend_tx_wr);
157+
140158
return rc;
141159
}
142160

@@ -193,31 +211,9 @@ int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn)
193211
return rc;
194212
}
195213

196-
static bool smc_cdc_tx_filter(struct smc_wr_tx_pend_priv *tx_pend,
197-
unsigned long data)
214+
void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn)
198215
{
199-
struct smc_connection *conn = (struct smc_connection *)data;
200-
struct smc_cdc_tx_pend *cdc_pend =
201-
(struct smc_cdc_tx_pend *)tx_pend;
202-
203-
return cdc_pend->conn == conn;
204-
}
205-
206-
static void smc_cdc_tx_dismisser(struct smc_wr_tx_pend_priv *tx_pend)
207-
{
208-
struct smc_cdc_tx_pend *cdc_pend =
209-
(struct smc_cdc_tx_pend *)tx_pend;
210-
211-
cdc_pend->conn = NULL;
212-
}
213-
214-
void smc_cdc_tx_dismiss_slots(struct smc_connection *conn)
215-
{
216-
struct smc_link *link = conn->lnk;
217-
218-
smc_wr_tx_dismiss_slots(link, SMC_CDC_MSG_TYPE,
219-
smc_cdc_tx_filter, smc_cdc_tx_dismisser,
220-
(unsigned long)conn);
216+
wait_event(conn->cdc_pend_tx_wq, !atomic_read(&conn->cdc_pend_tx_wr));
221217
}
222218

223219
/* Send a SMC-D CDC header.

net/smc/smc_cdc.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ int smc_cdc_get_free_slot(struct smc_connection *conn,
291291
struct smc_wr_buf **wr_buf,
292292
struct smc_rdma_wr **wr_rdma_buf,
293293
struct smc_cdc_tx_pend **pend);
294-
void smc_cdc_tx_dismiss_slots(struct smc_connection *conn);
294+
void smc_cdc_wait_pend_tx_wr(struct smc_connection *conn);
295295
int smc_cdc_msg_send(struct smc_connection *conn, struct smc_wr_buf *wr_buf,
296296
struct smc_cdc_tx_pend *pend);
297297
int smc_cdc_get_slot_and_msg_send(struct smc_connection *conn);

net/smc/smc_core.c

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -647,7 +647,7 @@ static void smcr_lgr_link_deactivate_all(struct smc_link_group *lgr)
647647
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
648648
struct smc_link *lnk = &lgr->lnk[i];
649649

650-
if (smc_link_usable(lnk))
650+
if (smc_link_sendable(lnk))
651651
lnk->state = SMC_LNK_INACTIVE;
652652
}
653653
wake_up_all(&lgr->llc_msg_waiter);
@@ -1127,7 +1127,7 @@ void smc_conn_free(struct smc_connection *conn)
11271127
smc_ism_unset_conn(conn);
11281128
tasklet_kill(&conn->rx_tsklet);
11291129
} else {
1130-
smc_cdc_tx_dismiss_slots(conn);
1130+
smc_cdc_wait_pend_tx_wr(conn);
11311131
if (current_work() != &conn->abort_work)
11321132
cancel_work_sync(&conn->abort_work);
11331133
}
@@ -1204,7 +1204,7 @@ void smcr_link_clear(struct smc_link *lnk, bool log)
12041204
smc_llc_link_clear(lnk, log);
12051205
smcr_buf_unmap_lgr(lnk);
12061206
smcr_rtoken_clear_link(lnk);
1207-
smc_ib_modify_qp_reset(lnk);
1207+
smc_ib_modify_qp_error(lnk);
12081208
smc_wr_free_link(lnk);
12091209
smc_ib_destroy_queue_pair(lnk);
12101210
smc_ib_dealloc_protection_domain(lnk);
@@ -1336,7 +1336,7 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft)
13361336
else
13371337
tasklet_unlock_wait(&conn->rx_tsklet);
13381338
} else {
1339-
smc_cdc_tx_dismiss_slots(conn);
1339+
smc_cdc_wait_pend_tx_wr(conn);
13401340
}
13411341
smc_lgr_unregister_conn(conn);
13421342
smc_close_active_abort(smc);
@@ -1459,11 +1459,16 @@ void smc_smcd_terminate_all(struct smcd_dev *smcd)
14591459
/* Called when an SMCR device is removed or the smc module is unloaded.
14601460
* If smcibdev is given, all SMCR link groups using this device are terminated.
14611461
* If smcibdev is NULL, all SMCR link groups are terminated.
1462+
*
1463+
* We must wait here for QPs been destroyed before we destroy the CQs,
1464+
* or we won't received any CQEs and cdc_pend_tx_wr cannot reach 0 thus
1465+
* smc_sock cannot be released.
14621466
*/
14631467
void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
14641468
{
14651469
struct smc_link_group *lgr, *lg;
14661470
LIST_HEAD(lgr_free_list);
1471+
LIST_HEAD(lgr_linkdown_list);
14671472
int i;
14681473

14691474
spin_lock_bh(&smc_lgr_list.lock);
@@ -1475,7 +1480,7 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
14751480
list_for_each_entry_safe(lgr, lg, &smc_lgr_list.list, list) {
14761481
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
14771482
if (lgr->lnk[i].smcibdev == smcibdev)
1478-
smcr_link_down_cond_sched(&lgr->lnk[i]);
1483+
list_move_tail(&lgr->list, &lgr_linkdown_list);
14791484
}
14801485
}
14811486
}
@@ -1487,6 +1492,16 @@ void smc_smcr_terminate_all(struct smc_ib_device *smcibdev)
14871492
__smc_lgr_terminate(lgr, false);
14881493
}
14891494

1495+
list_for_each_entry_safe(lgr, lg, &lgr_linkdown_list, list) {
1496+
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1497+
if (lgr->lnk[i].smcibdev == smcibdev) {
1498+
mutex_lock(&lgr->llc_conf_mutex);
1499+
smcr_link_down_cond(&lgr->lnk[i]);
1500+
mutex_unlock(&lgr->llc_conf_mutex);
1501+
}
1502+
}
1503+
}
1504+
14901505
if (smcibdev) {
14911506
if (atomic_read(&smcibdev->lnk_cnt))
14921507
wait_event(smcibdev->lnks_deleted,
@@ -1586,7 +1601,6 @@ static void smcr_link_down(struct smc_link *lnk)
15861601
if (!lgr || lnk->state == SMC_LNK_UNUSED || list_empty(&lgr->list))
15871602
return;
15881603

1589-
smc_ib_modify_qp_reset(lnk);
15901604
to_lnk = smc_switch_conns(lgr, lnk, true);
15911605
if (!to_lnk) { /* no backup link available */
15921606
smcr_link_clear(lnk, true);
@@ -1824,6 +1838,7 @@ int smc_conn_create(struct smc_sock *smc, struct smc_init_info *ini)
18241838
conn->local_tx_ctrl.common.type = SMC_CDC_MSG_TYPE;
18251839
conn->local_tx_ctrl.len = SMC_WR_TX_SIZE;
18261840
conn->urg_state = SMC_URG_READ;
1841+
init_waitqueue_head(&conn->cdc_pend_tx_wq);
18271842
INIT_WORK(&smc->conn.abort_work, smc_conn_abort_work);
18281843
if (ini->is_smcd) {
18291844
conn->rx_off = sizeof(struct smcd_cdc_msg);

net/smc/smc_core.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,12 @@ static inline bool smc_link_usable(struct smc_link *lnk)
415415
return true;
416416
}
417417

418+
static inline bool smc_link_sendable(struct smc_link *lnk)
419+
{
420+
return smc_link_usable(lnk) &&
421+
lnk->qp_attr.cur_qp_state == IB_QPS_RTS;
422+
}
423+
418424
static inline bool smc_link_active(struct smc_link *lnk)
419425
{
420426
return lnk->state == SMC_LNK_ACTIVE;

net/smc/smc_ib.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,12 +109,12 @@ int smc_ib_modify_qp_rts(struct smc_link *lnk)
109109
IB_QP_MAX_QP_RD_ATOMIC);
110110
}
111111

112-
int smc_ib_modify_qp_reset(struct smc_link *lnk)
112+
int smc_ib_modify_qp_error(struct smc_link *lnk)
113113
{
114114
struct ib_qp_attr qp_attr;
115115

116116
memset(&qp_attr, 0, sizeof(qp_attr));
117-
qp_attr.qp_state = IB_QPS_RESET;
117+
qp_attr.qp_state = IB_QPS_ERR;
118118
return ib_modify_qp(lnk->roce_qp, &qp_attr, IB_QP_STATE);
119119
}
120120

net/smc/smc_ib.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ int smc_ib_create_queue_pair(struct smc_link *lnk);
9090
int smc_ib_ready_link(struct smc_link *lnk);
9191
int smc_ib_modify_qp_rts(struct smc_link *lnk);
9292
int smc_ib_modify_qp_reset(struct smc_link *lnk);
93+
int smc_ib_modify_qp_error(struct smc_link *lnk);
9394
long smc_ib_setup_per_ibdev(struct smc_ib_device *smcibdev);
9495
int smc_ib_get_memory_region(struct ib_pd *pd, int access_flags,
9596
struct smc_buf_desc *buf_slot, u8 link_idx);

net/smc/smc_llc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1630,7 +1630,7 @@ void smc_llc_send_link_delete_all(struct smc_link_group *lgr, bool ord, u32 rsn)
16301630
delllc.reason = htonl(rsn);
16311631

16321632
for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1633-
if (!smc_link_usable(&lgr->lnk[i]))
1633+
if (!smc_link_sendable(&lgr->lnk[i]))
16341634
continue;
16351635
if (!smc_llc_send_message_wait(&lgr->lnk[i], &delllc))
16361636
break;

net/smc/smc_wr.c

Lines changed: 5 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -62,13 +62,9 @@ static inline bool smc_wr_is_tx_pend(struct smc_link *link)
6262
}
6363

6464
/* wait till all pending tx work requests on the given link are completed */
65-
int smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
65+
void smc_wr_tx_wait_no_pending_sends(struct smc_link *link)
6666
{
67-
if (wait_event_timeout(link->wr_tx_wait, !smc_wr_is_tx_pend(link),
68-
SMC_WR_TX_WAIT_PENDING_TIME))
69-
return 0;
70-
else /* timeout */
71-
return -EPIPE;
67+
wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link));
7268
}
7369

7470
static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id)
@@ -87,7 +83,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
8783
struct smc_wr_tx_pend pnd_snd;
8884
struct smc_link *link;
8985
u32 pnd_snd_idx;
90-
int i;
9186

9287
link = wc->qp->qp_context;
9388

@@ -128,14 +123,6 @@ static inline void smc_wr_tx_process_cqe(struct ib_wc *wc)
128123
}
129124

130125
if (wc->status) {
131-
for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
132-
/* clear full struct smc_wr_tx_pend including .priv */
133-
memset(&link->wr_tx_pends[i], 0,
134-
sizeof(link->wr_tx_pends[i]));
135-
memset(&link->wr_tx_bufs[i], 0,
136-
sizeof(link->wr_tx_bufs[i]));
137-
clear_bit(i, link->wr_tx_mask);
138-
}
139126
if (link->lgr->smc_version == SMC_V2) {
140127
memset(link->wr_tx_v2_pend, 0,
141128
sizeof(*link->wr_tx_v2_pend));
@@ -188,7 +175,7 @@ void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context)
188175
static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx)
189176
{
190177
*idx = link->wr_tx_cnt;
191-
if (!smc_link_usable(link))
178+
if (!smc_link_sendable(link))
192179
return -ENOLINK;
193180
for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) {
194181
if (!test_and_set_bit(*idx, link->wr_tx_mask))
@@ -231,7 +218,7 @@ int smc_wr_tx_get_free_slot(struct smc_link *link,
231218
} else {
232219
rc = wait_event_interruptible_timeout(
233220
link->wr_tx_wait,
234-
!smc_link_usable(link) ||
221+
!smc_link_sendable(link) ||
235222
lgr->terminating ||
236223
(smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY),
237224
SMC_WR_TX_WAIT_FREE_SLOT_TIME);
@@ -421,25 +408,6 @@ int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr)
421408
return rc;
422409
}
423410

424-
void smc_wr_tx_dismiss_slots(struct smc_link *link, u8 wr_tx_hdr_type,
425-
smc_wr_tx_filter filter,
426-
smc_wr_tx_dismisser dismisser,
427-
unsigned long data)
428-
{
429-
struct smc_wr_tx_pend_priv *tx_pend;
430-
struct smc_wr_rx_hdr *wr_tx;
431-
int i;
432-
433-
for_each_set_bit(i, link->wr_tx_mask, link->wr_tx_cnt) {
434-
wr_tx = (struct smc_wr_rx_hdr *)&link->wr_tx_bufs[i];
435-
if (wr_tx->type != wr_tx_hdr_type)
436-
continue;
437-
tx_pend = &link->wr_tx_pends[i].priv;
438-
if (filter(tx_pend, data))
439-
dismisser(tx_pend);
440-
}
441-
}
442-
443411
/****************************** receive queue ********************************/
444412

445413
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler)
@@ -675,10 +643,7 @@ void smc_wr_free_link(struct smc_link *lnk)
675643
smc_wr_wakeup_reg_wait(lnk);
676644
smc_wr_wakeup_tx_wait(lnk);
677645

678-
if (smc_wr_tx_wait_no_pending_sends(lnk))
679-
memset(lnk->wr_tx_mask, 0,
680-
BITS_TO_LONGS(SMC_WR_BUF_CNT) *
681-
sizeof(*lnk->wr_tx_mask));
646+
smc_wr_tx_wait_no_pending_sends(lnk);
682647
wait_event(lnk->wr_reg_wait, (!atomic_read(&lnk->wr_reg_refcnt)));
683648
wait_event(lnk->wr_tx_wait, (!atomic_read(&lnk->wr_tx_refcnt)));
684649

net/smc/smc_wr.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
2323

2424
#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
25-
#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ)
2625

2726
#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
2827

@@ -62,7 +61,7 @@ static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
6261

6362
static inline bool smc_wr_tx_link_hold(struct smc_link *link)
6463
{
65-
if (!smc_link_usable(link))
64+
if (!smc_link_sendable(link))
6665
return false;
6766
atomic_inc(&link->wr_tx_refcnt);
6867
return true;
@@ -130,7 +129,7 @@ void smc_wr_tx_dismiss_slots(struct smc_link *lnk, u8 wr_rx_hdr_type,
130129
smc_wr_tx_filter filter,
131130
smc_wr_tx_dismisser dismisser,
132131
unsigned long data);
133-
int smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
132+
void smc_wr_tx_wait_no_pending_sends(struct smc_link *link);
134133

135134
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
136135
int smc_wr_rx_post_init(struct smc_link *link);

0 commit comments

Comments
 (0)