Skip to content

Commit ed10435

Browse files
hz-cheng authored and rleon committed
RDMA/erdma: Implement hierarchical MTT
Hierarchical MTT allows large MR registration without requiring a contiguous physical address range. This commit adds hierarchical MTT support to erdma. Signed-off-by: Cheng Xu <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Leon Romanovsky <[email protected]>
1 parent 7244b4a commit ed10435

File tree

3 files changed

+194
-24
lines changed

3 files changed

+194
-24
lines changed

drivers/infiniband/hw/erdma/erdma_hw.h

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ struct erdma_cmdq_create_cq_req {
248248

249249
/* regmr/deregmr cfg0 */
250250
#define ERDMA_CMD_MR_VALID_MASK BIT(31)
251+
#define ERDMA_CMD_MR_VERSION_MASK GENMASK(30, 28)
251252
#define ERDMA_CMD_MR_KEY_MASK GENMASK(27, 20)
252253
#define ERDMA_CMD_MR_MPT_IDX_MASK GENMASK(19, 0)
253254

@@ -258,6 +259,7 @@ struct erdma_cmdq_create_cq_req {
258259

259260
/* regmr cfg2 */
260261
#define ERDMA_CMD_REGMR_PAGESIZE_MASK GENMASK(31, 27)
262+
#define ERDMA_CMD_REGMR_MTT_PAGESIZE_MASK GENMASK(26, 24)
261263
#define ERDMA_CMD_REGMR_MTT_LEVEL_MASK GENMASK(21, 20)
262264
#define ERDMA_CMD_REGMR_MTT_CNT_MASK GENMASK(19, 0)
263265

@@ -268,7 +270,14 @@ struct erdma_cmdq_reg_mr_req {
268270
u64 start_va;
269271
u32 size;
270272
u32 cfg2;
271-
u64 phy_addr[4];
273+
union {
274+
u64 phy_addr[4];
275+
struct {
276+
u64 rsvd;
277+
u32 size_h;
278+
u32 mtt_cnt_h;
279+
};
280+
};
272281
};
273282

274283
struct erdma_cmdq_dereg_mr_req {
@@ -309,7 +318,7 @@ struct erdma_cmdq_modify_qp_req {
309318
/* create qp mtt_cfg */
310319
#define ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK GENMASK(31, 12)
311320
#define ERDMA_CMD_CREATE_QP_MTT_CNT_MASK GENMASK(11, 1)
312-
#define ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK BIT(0)
321+
#define ERDMA_CMD_CREATE_QP_MTT_LEVEL_MASK BIT(0)
313322

314323
/* create qp db cfg */
315324
#define ERDMA_CMD_CREATE_QP_SQDB_CFG_MASK GENMASK(31, 16)
@@ -364,6 +373,7 @@ struct erdma_cmdq_reflush_req {
364373

365374
enum {
366375
ERDMA_DEV_CAP_FLAGS_ATOMIC = 1 << 7,
376+
ERDMA_DEV_CAP_FLAGS_MTT_VA = 1 << 5,
367377
ERDMA_DEV_CAP_FLAGS_EXTEND_DB = 1 << 3,
368378
};
369379

drivers/infiniband/hw/erdma/erdma_verbs.c

Lines changed: 180 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,13 @@ static void assemble_qbuf_mtt_for_cmd(struct erdma_mem *mem, u32 *cfg,
2626

2727
if (mem->mtt_nents > ERDMA_MAX_INLINE_MTT_ENTRIES) {
2828
*addr0 = mtt->buf_dma;
29-
*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
30-
ERDMA_MR_INDIRECT_MTT);
29+
*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_LEVEL_MASK,
30+
ERDMA_MR_MTT_1LEVEL);
3131
} else {
3232
*addr0 = mtt->buf[0];
3333
memcpy(addr1, mtt->buf + 1, MTT_SIZE(mem->mtt_nents - 1));
34-
*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
35-
ERDMA_MR_INLINE_MTT);
34+
*cfg |= FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_LEVEL_MASK,
35+
ERDMA_MR_MTT_0LEVEL);
3636
}
3737
}
3838

@@ -70,8 +70,8 @@ static int create_qp_cmd(struct erdma_ucontext *uctx, struct erdma_qp *qp)
7070
req.sq_mtt_cfg =
7171
FIELD_PREP(ERDMA_CMD_CREATE_QP_PAGE_OFFSET_MASK, 0) |
7272
FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_CNT_MASK, 1) |
73-
FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_TYPE_MASK,
74-
ERDMA_MR_INLINE_MTT);
73+
FIELD_PREP(ERDMA_CMD_CREATE_QP_MTT_LEVEL_MASK,
74+
ERDMA_MR_MTT_0LEVEL);
7575
req.rq_mtt_cfg = req.sq_mtt_cfg;
7676

7777
req.rq_buf_addr = qp->kern_qp.rq_buf_dma_addr;
@@ -140,12 +140,17 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
140140

141141
if (mr->type == ERDMA_MR_TYPE_FRMR ||
142142
mr->mem.page_cnt > ERDMA_MAX_INLINE_MTT_ENTRIES) {
143-
req.phy_addr[0] = mr->mem.mtt->buf_dma;
144-
mtt_level = ERDMA_MR_INDIRECT_MTT;
143+
if (mr->mem.mtt->continuous) {
144+
req.phy_addr[0] = mr->mem.mtt->buf_dma;
145+
mtt_level = ERDMA_MR_MTT_1LEVEL;
146+
} else {
147+
req.phy_addr[0] = sg_dma_address(mr->mem.mtt->sglist);
148+
mtt_level = mr->mem.mtt->level;
149+
}
145150
} else {
146151
memcpy(req.phy_addr, mr->mem.mtt->buf,
147152
MTT_SIZE(mr->mem.page_cnt));
148-
mtt_level = ERDMA_MR_INLINE_MTT;
153+
mtt_level = ERDMA_MR_MTT_0LEVEL;
149154
}
150155

151156
req.cfg0 = FIELD_PREP(ERDMA_CMD_MR_VALID_MASK, mr->valid) |
@@ -167,6 +172,14 @@ static int regmr_cmd(struct erdma_dev *dev, struct erdma_mr *mr)
167172
req.size = mr->mem.len;
168173
}
169174

175+
if (!mr->mem.mtt->continuous && mr->mem.mtt->level > 1) {
176+
req.cfg0 |= FIELD_PREP(ERDMA_CMD_MR_VERSION_MASK, 1);
177+
req.cfg2 |= FIELD_PREP(ERDMA_CMD_REGMR_MTT_PAGESIZE_MASK,
178+
PAGE_SHIFT - ERDMA_HW_PAGE_SHIFT);
179+
req.size_h = upper_32_bits(mr->mem.len);
180+
req.mtt_cnt_h = mr->mem.page_cnt >> 20;
181+
}
182+
170183
post_cmd:
171184
return erdma_post_cmd_wait(&dev->cmdq, &req, sizeof(req), NULL, NULL);
172185
}
@@ -194,7 +207,7 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
194207

195208
req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK, 1) |
196209
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
197-
ERDMA_MR_INLINE_MTT);
210+
ERDMA_MR_MTT_0LEVEL);
198211

199212
req.first_page_offset = 0;
200213
req.cq_db_info_addr =
@@ -209,13 +222,13 @@ static int create_cq_cmd(struct erdma_ucontext *uctx, struct erdma_cq *cq)
209222
req.qbuf_addr_h = upper_32_bits(mem->mtt->buf[0]);
210223
req.cfg1 |=
211224
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
212-
ERDMA_MR_INLINE_MTT);
225+
ERDMA_MR_MTT_0LEVEL);
213226
} else {
214227
req.qbuf_addr_l = lower_32_bits(mem->mtt->buf_dma);
215228
req.qbuf_addr_h = upper_32_bits(mem->mtt->buf_dma);
216229
req.cfg1 |=
217230
FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_LEVEL_MASK,
218-
ERDMA_MR_INDIRECT_MTT);
231+
ERDMA_MR_MTT_1LEVEL);
219232
}
220233
req.cfg1 |= FIELD_PREP(ERDMA_CMD_CREATE_CQ_MTT_CNT_MASK,
221234
mem->mtt_nents);
@@ -543,7 +556,6 @@ static struct erdma_mtt *erdma_create_cont_mtt(struct erdma_dev *dev,
543556
size_t size)
544557
{
545558
struct erdma_mtt *mtt;
546-
int ret = -ENOMEM;
547559

548560
mtt = kzalloc(sizeof(*mtt), GFP_KERNEL);
549561
if (!mtt)
@@ -565,6 +577,104 @@ static struct erdma_mtt *erdma_create_cont_mtt(struct erdma_dev *dev,
565577
err_free_mtt_buf:
566578
kfree(mtt->buf);
567579

580+
err_free_mtt:
581+
kfree(mtt);
582+
583+
return ERR_PTR(-ENOMEM);
584+
}
585+
586+
static void erdma_destroy_mtt_buf_sg(struct erdma_dev *dev,
587+
struct erdma_mtt *mtt)
588+
{
589+
dma_unmap_sg(&dev->pdev->dev, mtt->sglist, mtt->nsg, DMA_TO_DEVICE);
590+
vfree(mtt->sglist);
591+
}
592+
593+
static void erdma_destroy_scatter_mtt(struct erdma_dev *dev,
594+
struct erdma_mtt *mtt)
595+
{
596+
erdma_destroy_mtt_buf_sg(dev, mtt);
597+
vfree(mtt->buf);
598+
kfree(mtt);
599+
}
600+
601+
static void erdma_init_middle_mtt(struct erdma_mtt *mtt,
602+
struct erdma_mtt *low_mtt)
603+
{
604+
struct scatterlist *sg;
605+
u32 idx = 0, i;
606+
607+
for_each_sg(low_mtt->sglist, sg, low_mtt->nsg, i)
608+
mtt->buf[idx++] = sg_dma_address(sg);
609+
}
610+
611+
static int erdma_create_mtt_buf_sg(struct erdma_dev *dev, struct erdma_mtt *mtt)
612+
{
613+
struct scatterlist *sglist;
614+
void *buf = mtt->buf;
615+
u32 npages, i, nsg;
616+
struct page *pg;
617+
618+
/* Failed if buf is not page aligned */
619+
if ((uintptr_t)buf & ~PAGE_MASK)
620+
return -EINVAL;
621+
622+
npages = DIV_ROUND_UP(mtt->size, PAGE_SIZE);
623+
sglist = vzalloc(npages * sizeof(*sglist));
624+
if (!sglist)
625+
return -ENOMEM;
626+
627+
sg_init_table(sglist, npages);
628+
for (i = 0; i < npages; i++) {
629+
pg = vmalloc_to_page(buf);
630+
if (!pg)
631+
goto err;
632+
sg_set_page(&sglist[i], pg, PAGE_SIZE, 0);
633+
buf += PAGE_SIZE;
634+
}
635+
636+
nsg = dma_map_sg(&dev->pdev->dev, sglist, npages, DMA_TO_DEVICE);
637+
if (!nsg)
638+
goto err;
639+
640+
mtt->sglist = sglist;
641+
mtt->nsg = nsg;
642+
643+
return 0;
644+
err:
645+
vfree(sglist);
646+
647+
return -ENOMEM;
648+
}
649+
650+
static struct erdma_mtt *erdma_create_scatter_mtt(struct erdma_dev *dev,
651+
size_t size)
652+
{
653+
struct erdma_mtt *mtt;
654+
int ret = -ENOMEM;
655+
656+
mtt = kzalloc(sizeof(*mtt), GFP_KERNEL);
657+
if (!mtt)
658+
return NULL;
659+
660+
mtt->size = ALIGN(size, PAGE_SIZE);
661+
mtt->buf = vzalloc(mtt->size);
662+
mtt->continuous = false;
663+
if (!mtt->buf)
664+
goto err_free_mtt;
665+
666+
ret = erdma_create_mtt_buf_sg(dev, mtt);
667+
if (ret)
668+
goto err_free_mtt_buf;
669+
670+
ibdev_dbg(&dev->ibdev, "create scatter mtt, size:%lu, nsg:%u\n",
671+
mtt->size, mtt->nsg);
672+
673+
return mtt;
674+
675+
err_free_mtt_buf:
676+
vfree(mtt->buf);
677+
568678
err_free_mtt:
569679
kfree(mtt);
570680

@@ -574,28 +684,77 @@ static struct erdma_mtt *erdma_create_cont_mtt(struct erdma_dev *dev,
574684
static struct erdma_mtt *erdma_create_mtt(struct erdma_dev *dev, size_t size,
575685
bool force_continuous)
576686
{
687+
struct erdma_mtt *mtt, *tmp_mtt;
688+
int ret, level = 0;
689+
577690
ibdev_dbg(&dev->ibdev, "create_mtt, size:%lu, force cont:%d\n", size,
578691
force_continuous);
579692

693+
if (!(dev->attrs.cap_flags & ERDMA_DEV_CAP_FLAGS_MTT_VA))
694+
force_continuous = true;
695+
580696
if (force_continuous)
581697
return erdma_create_cont_mtt(dev, size);
582698

583-
return ERR_PTR(-EOPNOTSUPP);
699+
mtt = erdma_create_scatter_mtt(dev, size);
700+
if (IS_ERR(mtt))
701+
return mtt;
702+
level = 1;
703+
704+
/* Converge the MTT table: add a level until the top level has a single DMA segment. */
705+
while (mtt->nsg != 1 && level <= 3) {
706+
tmp_mtt = erdma_create_scatter_mtt(dev, MTT_SIZE(mtt->nsg));
707+
if (IS_ERR(tmp_mtt)) {
708+
ret = PTR_ERR(tmp_mtt);
709+
goto err_free_mtt;
710+
}
711+
erdma_init_middle_mtt(tmp_mtt, mtt);
712+
tmp_mtt->low_level = mtt;
713+
mtt = tmp_mtt;
714+
level++;
715+
}
716+
717+
if (level > 3) {
718+
ret = -ENOMEM;
719+
goto err_free_mtt;
720+
}
721+
722+
mtt->level = level;
723+
ibdev_dbg(&dev->ibdev, "top mtt: level:%d, dma_addr 0x%llx\n",
724+
mtt->level, mtt->sglist[0].dma_address);
725+
726+
return mtt;
727+
err_free_mtt:
728+
while (mtt) {
729+
tmp_mtt = mtt->low_level;
730+
erdma_destroy_scatter_mtt(dev, mtt);
731+
mtt = tmp_mtt;
732+
}
733+
734+
return ERR_PTR(ret);
584735
}
585736

586737
static void erdma_destroy_mtt(struct erdma_dev *dev, struct erdma_mtt *mtt)
587738
{
739+
struct erdma_mtt *tmp_mtt;
740+
588741
if (mtt->continuous) {
589742
dma_unmap_single(&dev->pdev->dev, mtt->buf_dma, mtt->size,
590743
DMA_TO_DEVICE);
591744
kfree(mtt->buf);
592745
kfree(mtt);
746+
} else {
747+
while (mtt) {
748+
tmp_mtt = mtt->low_level;
749+
erdma_destroy_scatter_mtt(dev, mtt);
750+
mtt = tmp_mtt;
751+
}
593752
}
594753
}
595754

596755
static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
597756
u64 start, u64 len, int access, u64 virt,
598-
unsigned long req_page_size, u8 force_indirect_mtt)
757+
unsigned long req_page_size, bool force_continuous)
599758
{
600759
int ret = 0;
601760

@@ -612,7 +771,8 @@ static int get_mtt_entries(struct erdma_dev *dev, struct erdma_mem *mem,
612771
mem->page_offset = start & (mem->page_size - 1);
613772
mem->mtt_nents = ib_umem_num_dma_blocks(mem->umem, mem->page_size);
614773
mem->page_cnt = mem->mtt_nents;
615-
mem->mtt = erdma_create_mtt(dev, MTT_SIZE(mem->page_cnt), true);
774+
mem->mtt = erdma_create_mtt(dev, MTT_SIZE(mem->page_cnt),
775+
force_continuous);
616776
if (IS_ERR(mem->mtt)) {
617777
ret = PTR_ERR(mem->mtt);
618778
goto error_ret;
@@ -717,7 +877,7 @@ static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx,
717877

718878
ret = get_mtt_entries(qp->dev, &qp->user_qp.sq_mem, va,
719879
qp->attrs.sq_size << SQEBB_SHIFT, 0, va,
720-
(SZ_1M - SZ_4K), 1);
880+
(SZ_1M - SZ_4K), true);
721881
if (ret)
722882
return ret;
723883

@@ -726,7 +886,7 @@ static int init_user_qp(struct erdma_qp *qp, struct erdma_ucontext *uctx,
726886

727887
ret = get_mtt_entries(qp->dev, &qp->user_qp.rq_mem, va + rq_offset,
728888
qp->attrs.rq_size << RQE_SHIFT, 0, va + rq_offset,
729-
(SZ_1M - SZ_4K), 1);
889+
(SZ_1M - SZ_4K), true);
730890
if (ret)
731891
goto put_sq_mtt;
732892

@@ -998,7 +1158,7 @@ struct ib_mr *erdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
9981158
return ERR_PTR(-ENOMEM);
9991159

10001160
ret = get_mtt_entries(dev, &mr->mem, start, len, access, virt,
1001-
SZ_2G - SZ_4K, 0);
1161+
SZ_2G - SZ_4K, false);
10021162
if (ret)
10031163
goto err_out_free;
10041164

@@ -1423,7 +1583,7 @@ static int erdma_init_user_cq(struct erdma_ucontext *ctx, struct erdma_cq *cq,
14231583

14241584
ret = get_mtt_entries(dev, &cq->user_cq.qbuf_mem, ureq->qbuf_va,
14251585
ureq->qbuf_len, 0, ureq->qbuf_va, SZ_64M - SZ_4K,
1426-
1);
1586+
true);
14271587
if (ret)
14281588
return ret;
14291589

drivers/infiniband/hw/erdma/erdma_verbs.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,8 @@ struct erdma_pd {
7373
#define ERDMA_MR_TYPE_FRMR 1
7474
#define ERDMA_MR_TYPE_DMA 2
7575

76-
#define ERDMA_MR_INLINE_MTT 0
77-
#define ERDMA_MR_INDIRECT_MTT 1
76+
#define ERDMA_MR_MTT_0LEVEL 0
77+
#define ERDMA_MR_MTT_1LEVEL 1
7878

7979
#define ERDMA_MR_ACC_RA BIT(0)
8080
#define ERDMA_MR_ACC_LR BIT(1)

0 commit comments

Comments
 (0)