Skip to content

Commit 7244b4a

Browse files
hz-cheng authored and rleon committed
RDMA/erdma: Refactor the storage structure of MTT entries
Currently our MTT only support inline mtt entries (0 level MTT) and indirect MTT entries (1 level mtt), which will limit the maximum length of MRs. In order to implement a multi-level MTT, we refactor the structure of MTT first. Signed-off-by: Cheng Xu <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Leon Romanovsky <[email protected]>
1 parent d7cfbba commit 7244b4a

File tree

4 files changed

+152
-94
lines changed

4 files changed

+152
-94
lines changed

drivers/infiniband/hw/erdma/erdma_hw.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,7 +228,7 @@ struct erdma_cmdq_ext_db_req {
228228

229229
/* create_cq cfg1 */
230230
#define ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK GENMASK(31, 16)
231-
#define ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK BIT(15)
231+
#define ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK BIT(15)
232232
#define ERDMA_CMD_CREATE_CQ_MTT_DB_CFG_MASK BIT(11)
233233
#define ERDMA_CMD_CREATE_CQ_EQN_MASK GENMASK(9, 0)
234234

@@ -258,7 +258,7 @@ struct erdma_cmdq_create_cq_req {
258258

259259
/* regmr cfg2 */
260260
#define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27)
261-
#define ERDMA_CMD_REGMR_MTT_TYPE_MASK GENMASK(21, 20)
261+
#define ERDMA_CMD_REGMR_MTT_LEVEL_MASK GENMASK(21, 20)
262262
#define ERDMA_CMD_REGMR_MTT_CNT_MASK GENMASK(19, 0)
263263

264264
struct erdma_cmdq_reg_mr_req {

drivers/infiniband/hw/erdma/erdma_qp.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -410,7 +410,7 @@ static int erdma_push_one_sqe(struct erdma_qp *qp, u16 *pi,
410410
/* Copy SGLs to SQE content to accelerate */
411411
memcpy(get_queue_entry(qp->kern_qp.sq_buf, idx + 1,
412412
qp->attrs.sq_size, SQEBB_SHIFT),
413-
mr->mem.mtt_buf, MTT_SIZE(mr->mem.mtt_nents));
413+
mr->mem.mtt->buf, MTT_SIZE(mr->mem.mtt_nents));
414414
wqe_size = sizeof(struct erdma_reg_mr_sqe) +
415415
MTT_SIZE(mr->mem.mtt_nents);
416416
} else {

drivers/infiniband/hw/erdma/erdma_verbs.c

Lines changed: 128 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,23 @@
1919
#include "erdma_cm.h"
2020
#include "erdma_verbs.h"
2121

22+
static void assemble_qbuf_mtt_for_cmd(struct erdma_mem *mem, u32 *cfg,
23+
u64 *addr0, u64 *addr1)
24+
{
25+
struct erdma_mtt *mtt = mem->mtt;
26+
27+
if (mem->mtt_nents > ERDMA_MAX_INLINE_MTT_ENTRIES) {
28+
*addr0 = mtt->buf_dma;
29+
*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
30+
ERDMA_MR_INDIRECT_MTT);
31+
} else {
32+
*addr0 = mtt->buf[0];
33+
memcpy(addr1, mtt->buf + 1, MTT_SIZE(mem->mtt_nents - 1));
34+
*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
35+
ERDMA_MR_INLINE_MTT);
36+
}
37+
}
38+
2239
static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)
2340
{
2441
struct erdma_dev *dev = to_edev(qp->ibqp.device);
@@ -79,18 +96,16 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)
7996

8097
req.sq_mtt_cfg = user_qp->sq_mem.page_offset;
8198
req.sq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK,
82-
user_qp->sq_mem.mtt_nents) |
83-
FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
84-
user_qp->sq_mem.mtt_type);
99+
user_qp->sq_mem.mtt_nents);
85100

86101
req.rq_mtt_cfg = user_qp->rq_mem.page_offset;
87102
req.rq_mtt_cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK,
88-
user_qp->rq_mem.mtt_nents) |
89-
FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
90-
user_qp->rq_mem.mtt_type);
103+
user_qp->rq_mem.mtt_nents);
91104

92-
req.sq_buf_addr = user_qp->sq_mem.mtt_entry[0];
93-
req.rq_buf_addr = user_qp->rq_mem.mtt_entry[0];
105+
assemble_qbuf_mtt_for_cmd(&user_qp->sq_mem, &req.sq_mtt_cfg,
106+
&req.sq_buf_addr, req.sq_mtt_entry);
107+
assemble_qbuf_mtt_for_cmd(&user_qp->rq_mem, &req.rq_mtt_cfg,
108+
&req.rq_buf_addr, req.rq_mtt_entry);
94109

95110
req.sq_db_info_dma_addr = user_qp->sq_db_info_dma_addr;
96111
req.rq_db_info_dma_addr = user_qp->rq_db_info_dma_addr;
@@ -117,13 +132,22 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)
117132

118133
static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
119134
{
120-
struct erdma_cmdq_reg_mr_req req;
121135
struct erdma_pd *pd = to_epd(mr->ibmr.pd);
122-
u64 *phy_addr;
123-
int i;
136+
struct erdma_cmdq_reg_mr_req req;
137+
u32 mtt_level;
124138

125139
erdma_cmdq_build_reqhdr(&req.hdr, CMDQ_SUBMOD_RDMA, CMDQ_OPCODE_REG_MR);
126140

141+
if (mr->type == ERDMA_MR_TYPE_FRMR ||
142+
mr->mem.page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES) {
143+
req.phy_addr[0] = mr->mem.mtt->buf_dma;
144+
mtt_level = ERDMA_MR_INDIRECT_MTT;
145+
} else {
146+
memcpy(req.phy_addr, mr->mem.mtt->buf,
147+
MTT_SIZE(mr->mem.page_cnt));
148+
mtt_level = ERDMA_MR_INLINE_MTT;
149+
}
150+
127151
req.cfg0 = FIELD_PREP(ERDMA_CMD_MR_VALID_MASK, mr->valid) |
128152
FIELD_PREP(ERDMA_CMD_MR_KEY_MASK, mr->ibmr.lkey & 0xFF) |
129153
FIELD_PREP(ERDMA_CMD_MR_MPT_IDX_MASK, mr->ibmr.lkey >> 8);
@@ -132,7 +156,7 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
132156
FIELD_PREP(ERDMA_CMD_REGMR_RIGHT_MASK, mr->access);
133157
req.cfg2 = FIELD_PREP(ERDMA_CMD_REGMR_PAGESIZE_MASK,
134158
ilog2(mr->mem.page_size)) |
135-
FIELD_PREP(ERDMA_CMD_REGMR_MTT_TYPE_MASK, mr->mem.mtt_type) |
159+
FIELD_PREP(ERDMA_CMD_REGMR_MTT_LEVEL_MASK, mtt_level) |
136160
FIELD_PREP(ERDMA_CMD_REGMR_MTT_CNT_MASK, mr->mem.page_cnt);
137161

138162
if (mr->type == ERDMA_MR_TYPE_DMA)
@@ -143,16 +167,6 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
143167
req.size = mr->mem.len;
144168
}
145169

146-
if (mr->type == ERDMA_MR_TYPE_FRMR ||
147-
mr->mem.mtt_type == ERDMA_MR_INDIRECT_MTT) {
148-
phy_addr = req.phy_addr;
149-
*phy_addr = mr->mem.mtt_entry[0];
150-
} else {
151-
phy_addr = req.phy_addr;
152-
for (i = 0; i < mr->mem.mtt_nents; i++)
153-
*phy_addr++ = mr->mem.mtt_entry[i];
154-
}
155-
156170
post_cmd:
157171
return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL);
158172
}
@@ -179,7 +193,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
179193
req.qbuf_addr_h = upper_32_bits(cq->kern_cq.qbuf_dma_addr);
180194

181195
req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 1) |
182-
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK,
196+
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
183197
ERDMA_MR_INLINE_MTT);
184198

185199
req.first_page_offset = 0;
@@ -191,16 +205,20 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
191205
FIELD_PREP(ERDMA_CMD_CREATE_CQ_PAGESIZE_MASK,
192206
ilog2(mem->page_size) - ERDMA_HW_PAGE_SHIFT);
193207
if (mem->mtt_nents == 1) {
194-
req.qbuf_addr_l = lower_32_bits(*(u64 *)mem->mtt_buf);
195-
req.qbuf_addr_h = upper_32_bits(*(u64 *)mem->mtt_buf);
208+
req.qbuf_addr_l = lower_32_bits(mem->mtt->buf[0]);
209+
req.qbuf_addr_h = upper_32_bits(mem->mtt->buf[0]);
210+
req.cfg1 |=
211+
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
212+
ERDMA_MR_INLINE_MTT);
196213
} else {
197-
req.qbuf_addr_l = lower_32_bits(mem->mtt_entry[0]);
198-
req.qbuf_addr_h = upper_32_bits(mem->mtt_entry[0]);
214+
req.qbuf_addr_l = lower_32_bits(mem->mtt->buf_dma);
215+
req.qbuf_addr_h = upper_32_bits(mem->mtt->buf_dma);
216+
req.cfg1 |=
217+
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
218+
ERDMA_MR_INDIRECT_MTT);
199219
}
200220
req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK,
201221
mem->mtt_nents);
202-
req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_TYPE_MASK,
203-
mem->mtt_type);
204222

205223
req.first_page_offset = mem->page_offset;
206224
req.cq_db_info_addr = cq->user_cq.db_info_dma_addr;
@@ -508,12 +526,77 @@ static int init_kernel_qp(struct erdma_dev *dev, struct erdma_qp *qp,
508526
return -ENOMEM;
509527
}
510528

529+
static void erdma_fill_bottom_mtt(struct erdma_dev *dev, struct erdma_mem *mem)
530+
{
531+
struct erdma_mtt *mtt = mem->mtt;
532+
struct ib_block_iter biter;
533+
u32 idx = 0;
534+
535+
while (mtt->low_level)
536+
mtt = mtt->low_level;
537+
538+
rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size)
539+
mtt->buf[idx++] = rdma_block_iter_dma_address(&biter);
540+
}
541+
542+
static struct erdma_mtt *erdma_create_cont_mtt(struct erdma_dev *dev,
543+
size_t size)
544+
{
545+
struct erdma_mtt *mtt;
546+
int ret = -ENOMEM;
547+
548+
mtt = kzalloc(sizeof(*mtt), GFP_KERNEL);
549+
if (!mtt)
550+
return ERR_PTR(-ENOMEM);
551+
552+
mtt->size = size;
553+
mtt->buf = kzalloc(mtt->size, GFP_KERNEL);
554+
if (!mtt->buf)
555+
goto err_free_mtt;
556+
557+
mtt->continuous = true;
558+
mtt->buf_dma = dma_map_single(&dev->pdev->dev, mtt->buf, mtt->size,
559+
DMA_TO_DEVICE);
560+
if (dma_mapping_error(&dev->pdev->dev, mtt->buf_dma))
561+
goto err_free_mtt_buf;
562+
563+
return mtt;
564+
565+
err_free_mtt_buf:
566+
kfree(mtt->buf);
567+
568+
err_free_mtt:
569+
kfree(mtt);
570+
571+
return ERR_PTR(ret);
572+
}
573+
574+
static struct erdma_mtt *erdma_create_mtt(struct erdma_dev *dev, size_t size,
575+
bool force_continuous)
576+
{
577+
ibdev_dbg(&dev->ibdev, "create_mtt, size:%lu, force cont:%d\n", size,
578+
force_continuous);
579+
580+
if (force_continuous)
581+
return erdma_create_cont_mtt(dev, size);
582+
583+
return ERR_PTR(-EOPNOTSUPP);
584+
}
585+
586+
static void erdma_destroy_mtt(struct erdma_dev *dev, struct erdma_mtt *mtt)
587+
{
588+
if (mtt->continuous) {
589+
dma_unmap_single(&dev->pdev->dev, mtt->buf_dma, mtt->size,
590+
DMA_TO_DEVICE);
591+
kfree(mtt->buf);
592+
kfree(mtt);
593+
}
594+
}
595+
511596
static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
512597
u64 start, u64 len, int access, u64 virt,
513598
unsigned long req_page_size, u8 force_indirect_mtt)
514599
{
515-
struct ib_block_iter biter;
516-
uint64_t *phy_addr = NULL;
517600
int ret = 0;
518601

519602
mem->umem = ib_umem_get(&dev->ibdev, start, len, access);
@@ -529,38 +612,13 @@ static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
529612
mem->page_offset = start & (mem->page_size - 1);
530613
mem->mtt_nents = ib_umem_num_dma_blocks(mem->umem, mem->page_size);
531614
mem->page_cnt = mem->mtt_nents;
532-
533-
if (mem->page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES ||
534-
force_indirect_mtt) {
535-
mem->mtt_type = ERDMA_MR_INDIRECT_MTT;
536-
mem->mtt_buf =
537-
alloc_pages_exact(MTT_SIZE(mem->page_cnt), GFP_KERNEL);
538-
if (!mem->mtt_buf) {
539-
ret = -ENOMEM;
540-
goto error_ret;
541-
}
542-
phy_addr = mem->mtt_buf;
543-
} else {
544-
mem->mtt_type = ERDMA_MR_INLINE_MTT;
545-
phy_addr = mem->mtt_entry;
546-
}
547-
548-
rdma_umem_for_each_dma_block(mem->umem, &biter, mem->page_size) {
549-
*phy_addr = rdma_block_iter_dma_address(&biter);
550-
phy_addr++;
615+
mem->mtt = erdma_create_mtt(dev, MTT_SIZE(mem->page_cnt), true);
616+
if (IS_ERR(mem->mtt)) {
617+
ret = PTR_ERR(mem->mtt);
618+
goto error_ret;
551619
}
552620

553-
if (mem->mtt_type == ERDMA_MR_INDIRECT_MTT) {
554-
mem->mtt_entry[0] =
555-
dma_map_single(&dev->pdev->dev, mem->mtt_buf,
556-
MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE);
557-
if (dma_mapping_error(&dev->pdev->dev, mem->mtt_entry[0])) {
558-
free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt));
559-
mem->mtt_buf = NULL;
560-
ret = -ENOMEM;
561-
goto error_ret;
562-
}
563-
}
621+
erdma_fill_bottom_mtt(dev, mem);
564622

565623
return 0;
566624

@@ -575,11 +633,8 @@ static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
575633

576634
static void put_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem)
577635
{
578-
if (mem->mtt_buf) {
579-
dma_unmap_single(&dev->pdev->dev, mem->mtt_entry[0],
580-
MTT_SIZE(mem->page_cnt), DMA_TO_DEVICE);
581-
free_pages_exact(mem->mtt_buf, MTT_SIZE(mem->page_cnt));
582-
}
636+
if (mem->mtt)
637+
erdma_destroy_mtt(dev, mem->mtt);
583638

584639
if (mem->umem) {
585640
ib_umem_release(mem->umem);
@@ -875,33 +930,20 @@ struct ib_mr *erdma_ib_alloc_mr(struct ib_pd *ibpd, enum ib_mr_type mr_type,
875930

876931
mr->mem.page_size = PAGE_SIZE; /* update it later. */
877932
mr->mem.page_cnt = max_num_sg;
878-
mr->mem.mtt_type = ERDMA_MR_INDIRECT_MTT;
879-
mr->mem.mtt_buf =
880-
alloc_pages_exact(MTT_SIZE(mr->mem.page_cnt), GFP_KERNEL);
881-
if (!mr->mem.mtt_buf) {
882-
ret = -ENOMEM;
933+
mr->mem.mtt = erdma_create_mtt(dev, MTT_SIZE(max_num_sg), true);
934+
if (IS_ERR(mr->mem.mtt)) {
935+
ret = PTR_ERR(mr->mem.mtt);
883936
goto out_remove_stag;
884937
}
885938

886-
mr->mem.mtt_entry[0] =
887-
dma_map_single(&dev->pdev->dev, mr->mem.mtt_buf,
888-
MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE);
889-
if (dma_mapping_error(&dev->pdev->dev, mr->mem.mtt_entry[0])) {
890-
ret = -ENOMEM;
891-
goto out_free_mtt;
892-
}
893-
894939
ret = regmr_cmd(dev, mr);
895940
if (ret)
896-
goto out_dma_unmap;
941+
goto out_destroy_mtt;
897942

898943
return &mr->ibmr;
899944

900-
out_dma_unmap:
901-
dma_unmap_single(&dev->pdev->dev, mr->mem.mtt_entry[0],
902-
MTT_SIZE(mr->mem.page_cnt), DMA_TO_DEVICE);
903-
out_free_mtt:
904-
free_pages_exact(mr->mem.mtt_buf, MTT_SIZE(mr->mem.page_cnt));
945+
out_destroy_mtt:
946+
erdma_destroy_mtt(dev, mr->mem.mtt);
905947

906948
out_remove_stag:
907949
erdma_free_idx(&dev->res_cb[ERDMA_RES_TYPE_STAG_IDX],
@@ -920,7 +962,7 @@ static int erdma_set_page(struct ib_mr *ibmr, u64 addr)
920962
if (mr->mem.mtt_nents >= mr->mem.page_cnt)
921963
return -1;
922964

923-
*((u64 *)mr->mem.mtt_buf + mr->mem.mtt_nents) = addr;
965+
mr->mem.mtt->buf[mr->mem.mtt_nents] = addr;
924966
mr->mem.mtt_nents++;
925967

926968
return 0;

drivers/infiniband/hw/erdma/erdma_verbs.h

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ struct erdma_pd {
6565
* MemoryRegion definition.
6666
*/
6767
#define ERDMA_MAX_INLINE_MTT_ENTRIES 4
68-
#define MTT_SIZE(mtt_cnt) (mtt_cnt << 3) /* per mtt entry takes 8 Bytes. */
68+
#define MTT_SIZE(mtt_cnt) ((mtt_cnt) << 3) /* per mtt entry takes 8 Bytes. */
6969
#define ERDMA_MR_MAX_MTT_CNT 524288
7070
#define ERDMA_MTT_ENTRY_SIZE 8
7171

@@ -90,19 +90,35 @@ static inline u8 to_erdma_access_flags(int access)
9090
(access & IB_ACCESS_REMOTE_ATOMIC ? ERDMA_MR_ACC_RA : 0);
9191
}
9292

93+
/* Hierarchical storage structure for MTT entries */
94+
struct erdma_mtt {
95+
u64 *buf;
96+
size_t size;
97+
98+
bool continuous;
99+
union {
100+
dma_addr_t buf_dma;
101+
struct {
102+
struct scatterlist *sglist;
103+
u32 nsg;
104+
u32 level;
105+
};
106+
};
107+
108+
struct erdma_mtt *low_level;
109+
};
110+
93111
struct erdma_mem {
94112
struct ib_umem *umem;
95-
void *mtt_buf;
96-
u32 mtt_type;
113+
struct erdma_mtt *mtt;
114+
97115
u32 page_size;
98116
u32 page_offset;
99117
u32 page_cnt;
100118
u32 mtt_nents;
101119

102120
u64 va;
103121
u64 len;
104-
105-
u64 mtt_entry[ERDMA_MAX_INLINE_MTT_ENTRIES];
106122
};
107123

108124
struct erdma_mr {

0 commit comments

Comments (0)