Skip to content

Commit b272fc8

Browse files
committed
RDMA support for DMA handle
From Yishai: This patch series introduces a new DMA Handle (DMAH) object, along with corresponding APIs for its allocation and deallocation. The DMAH object encapsulates attributes relevant for DMA transactions. While initially intended to support TLP Processing Hints (TPH) [1], the design is extensible to accommodate future features such as PCI multipath for DMA, PCI UIO configurations, traffic class selection, and more. Additionally, we introduce a new ioctl method on the MR object: UVERBS_METHOD_REG_MR. This method consolidates multiple reg_mr variants under a single user-space ioctl interface, supporting: ibv_reg_mr(), ibv_reg_mr_iova(), ibv_reg_mr_iova2() and ibv_reg_dmabuf_mr(). It also enables passing a DMA handle as part of the registration process. Throughout the patch series, the following DMAH-related functionality is also added to the IB layer: - Association with a CPU ID and its memory type, for use with Steering Tags [2]. - Inclusion of Processing Hints (PH) data for TPH functionality [3]. - Security enforcement, ensuring that only tasks allowed to run on a given CPU may request a DMA handle for it. - Reference counting for DMAH life cycle management and safe usage across memory regions. mlx5 driver implementation: -------------------------- The series includes implementation of the above functionality in the mlx5 driver. In mlx5_core: - Enables TPH over PCIe when both firmware and OS support it. - Manages Steering Tags and corresponding indices by writing tag values to the PCI configuration space. - Exposes APIs to upper layers (e.g., mlx5_ib) to enable the PCIe TPH functionality. In mlx5_ib: - Adds full support for DMAH operations. - Utilizes mlx5_core's Steering Tag APIs to derive tag indices from input. - Stores the resulting index in a mlx5_dmah structure for use during MKEY creation with a DMA handle. - Adds support for allowing MKEYs to be created in conjunction with DMA handles. Additional details are provided in the commit messages. 
[1] Background, from PCIe specification 6.2. TLP Processing Hints (TPH) -------------------------- TLP Processing Hints is an optional feature that provides hints in Request TLP headers to facilitate optimized processing of Requests that target Memory Space. These Processing Hints enable the system hardware (e.g., the Root Complex and/or Endpoints) to optimize platform resources such as system and memory interconnect on a per TLP basis. Steering Tags are system-specific values used to identify a processing resource that a Requester explicitly targets. System software discovers and identifies TPH capabilities to determine the Steering Tag allocation for each Function that supports TPH. [2] Steering Tags Functions that intend to target a TLP towards a specific processing resource such as a host processor or system cache hierarchy require topological information of the target cache (e.g., which host cache). Steering Tags are system-specific values that provide information about the host or cache structure in the system cache hierarchy. These values are used to associate processing elements within the platform with the processing of Requests. [3] Processing Hints The Requester provides hints to the Root Complex or other targets about the intended use of data and data structures by the host and/or device. The hints are provided by the Requester, which has knowledge of upcoming Request patterns, and which the Completer would not be able to deduce autonomously (with good accuracy). Yishai Signed-off-by: Leon Romanovsky <[email protected]> * mlx5-next: net/mlx5: Add support for device steering tag net/mlx5: Expose IFC bits for TPH PCI/TPH: Expose pcie_tph_get_st_table_size() net/mlx5: Expose cable_length field in PFCC register net/mlx5: Add IFC bits and enums for buf_ownership net/mlx5: Add IFC bits to support RSS for IPSec offload net/mlx5: IFC updates for disabled host PF net/mlx5: Expose disciplined_fr_counter through HCA capabilities in mlx5_ifc
2 parents b834407 + 888a777 commit b272fc8

File tree

9 files changed

+277
-27
lines changed

9 files changed

+277
-27
lines changed

drivers/net/ethernet/mellanox/mlx5/core/Makefile

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,5 +167,10 @@ mlx5_core-$(CONFIG_MLX5_SF) += sf/vhca_event.o sf/dev/dev.o sf/dev/driver.o irq_
167167
#
168168
mlx5_core-$(CONFIG_MLX5_SF_MANAGER) += sf/cmd.o sf/hw_table.o sf/devlink.o
169169

170+
#
171+
# TPH support
172+
#
173+
mlx5_core-$(CONFIG_PCIE_TPH) += lib/st.o
174+
170175
obj-$(CONFIG_MLX5_DPLL) += mlx5_dpll.o
171176
mlx5_dpll-y := dpll.o
Lines changed: 164 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,164 @@
1+
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2+
/*
3+
* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4+
*/
5+
6+
#include <linux/mlx5/driver.h>
7+
#include <linux/mlx5/device.h>
8+
9+
#include "mlx5_core.h"
10+
#include "lib/mlx5.h"
11+
12+
/* Per-index bookkeeping stored as the value of mlx5_st::idx_xa. */
struct mlx5_st_idx_data {
13+
/* taken once per mlx5_st_alloc_index() caller sharing this index, dropped in mlx5_st_dealloc_index() */
refcount_t usecount;
14+
/* steering tag associated with this index; used to find an index to share */
u16 tag;
15+
};
16+
17+
/* Steering-tag index allocator state, pointed to by mlx5_core_dev::st. */
struct mlx5_st {
18+
/* serialize access upon alloc/free flows */
19+
struct mutex lock;
20+
/* range of indices xa_alloc() may hand out; set once in mlx5_st_create() */
struct xa_limit index_limit;
21+
struct xarray idx_xa; /* key == index, value == struct mlx5_st_idx_data */
22+
};
23+
24+
struct mlx5_st *mlx5_st_create(struct mlx5_core_dev *dev)
25+
{
26+
struct pci_dev *pdev = dev->pdev;
27+
struct mlx5_st *st;
28+
u16 num_entries;
29+
int ret;
30+
31+
if (!MLX5_CAP_GEN(dev, mkey_pcie_tph))
32+
return NULL;
33+
34+
#ifdef CONFIG_MLX5_SF
35+
if (mlx5_core_is_sf(dev))
36+
return dev->priv.parent_mdev->st;
37+
#endif
38+
39+
/* Checking whether the device is capable */
40+
if (!pdev->tph_cap)
41+
return NULL;
42+
43+
num_entries = pcie_tph_get_st_table_size(pdev);
44+
/* We need a reserved entry for non TPH cases */
45+
if (num_entries < 2)
46+
return NULL;
47+
48+
/* The OS doesn't support ST */
49+
ret = pcie_enable_tph(pdev, PCI_TPH_ST_DS_MODE);
50+
if (ret)
51+
return NULL;
52+
53+
st = kzalloc(sizeof(*st), GFP_KERNEL);
54+
if (!st)
55+
goto end;
56+
57+
mutex_init(&st->lock);
58+
xa_init_flags(&st->idx_xa, XA_FLAGS_ALLOC);
59+
/* entry 0 is reserved for non TPH cases */
60+
st->index_limit.min = MLX5_MKC_PCIE_TPH_NO_STEERING_TAG_INDEX + 1;
61+
st->index_limit.max = num_entries - 1;
62+
63+
return st;
64+
65+
end:
66+
pcie_disable_tph(dev->pdev);
67+
return NULL;
68+
}
69+
70+
void mlx5_st_destroy(struct mlx5_core_dev *dev)
71+
{
72+
struct mlx5_st *st = dev->st;
73+
74+
if (mlx5_core_is_sf(dev) || !st)
75+
return;
76+
77+
pcie_disable_tph(dev->pdev);
78+
WARN_ON_ONCE(!xa_empty(&st->idx_xa));
79+
kfree(st);
80+
}
81+
82+
/*
 * mlx5_st_alloc_index - get a steering-tag table index for a CPU/memory type.
 * @dev: mlx5 core device
 * @mem_type: memory type passed to pcie_tph_get_cpu_st()
 * @cpu_uid: CPU identifier passed to pcie_tph_get_cpu_st()
 * @st_index: output, the table index holding the resulting tag
 *
 * Indices are shared: if the tag obtained for the request is already stored
 * under some index, that index is returned with its use count raised.
 * Otherwise a free index is allocated and the tag is written to it with
 * pcie_tph_set_st_entry().
 *
 * Returns 0 on success, -EOPNOTSUPP when the device has no ST allocator,
 * -ENOMEM on allocation failure, or the error reported by the PCI TPH /
 * xarray helpers. @st_index is only written on success.
 */
int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
			unsigned int cpu_uid, u16 *st_index)
{
	struct mlx5_st *st = dev->st;
	struct mlx5_st_idx_data *entry;
	unsigned long pos;
	u32 new_idx;
	u16 tag;
	int err;

	if (!st)
		return -EOPNOTSUPP;

	err = pcie_tph_get_cpu_st(dev->pdev, mem_type, cpu_uid, &tag);
	if (err)
		return err;

	mutex_lock(&st->lock);

	/* Share an existing index when the same tag is already present */
	xa_for_each(&st->idx_xa, pos, entry) {
		if (entry->tag != tag)
			continue;

		refcount_inc(&entry->usecount);
		*st_index = pos;
		goto unlock;
	}

	entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	if (!entry) {
		err = -ENOMEM;
		goto unlock;
	}

	refcount_set(&entry->usecount, 1);
	entry->tag = tag;

	err = xa_alloc(&st->idx_xa, &new_idx, entry, st->index_limit, GFP_KERNEL);
	if (err)
		goto free_entry;

	err = pcie_tph_set_st_entry(dev->pdev, new_idx, tag);
	if (err) {
		/* The index was reserved but never programmed, give it back */
		xa_erase(&st->idx_xa, new_idx);
		goto free_entry;
	}

	*st_index = new_idx;
	goto unlock;

free_entry:
	kfree(entry);
unlock:
	mutex_unlock(&st->lock);
	return err;
}
137+
EXPORT_SYMBOL_GPL(mlx5_st_alloc_index);
138+
139+
/*
 * mlx5_st_dealloc_index - drop one reference on a steering-tag table index.
 * @dev: mlx5 core device
 * @st_index: index previously returned by mlx5_st_alloc_index()
 *
 * When the last reference goes away the index is removed from the allocator
 * and its bookkeeping entry is freed, making the index available again.
 *
 * Returns 0 on success, -EOPNOTSUPP when the device has no ST allocator, or
 * -EINVAL (with a one-time warning) when @st_index is not currently allocated.
 */
int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
{
	struct mlx5_st_idx_data *idx_data;
	struct mlx5_st *st = dev->st;
	int ret = 0;

	if (!st)
		return -EOPNOTSUPP;

	mutex_lock(&st->lock);
	idx_data = xa_load(&st->idx_xa, st_index);
	if (WARN_ON_ONCE(!idx_data)) {
		ret = -EINVAL;
		goto end;
	}

	if (refcount_dec_and_test(&idx_data->usecount)) {
		xa_erase(&st->idx_xa, st_index);
		/* We leave PCI config space as was before, no mkey will refer to it */
		/*
		 * The entry was allocated in mlx5_st_alloc_index() and is no
		 * longer reachable once erased, so it must be freed here.
		 */
		kfree(idx_data);
	}

end:
	mutex_unlock(&st->lock);
	return ret;
}
164+
EXPORT_SYMBOL_GPL(mlx5_st_dealloc_index);

drivers/net/ethernet/mellanox/mlx5/core/main.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1102,6 +1102,7 @@ static int mlx5_init_once(struct mlx5_core_dev *dev)
11021102
}
11031103

11041104
dev->dm = mlx5_dm_create(dev);
1105+
dev->st = mlx5_st_create(dev);
11051106
dev->tracer = mlx5_fw_tracer_create(dev);
11061107
dev->hv_vhca = mlx5_hv_vhca_create(dev);
11071108
dev->rsc_dump = mlx5_rsc_dump_create(dev);
@@ -1150,6 +1151,7 @@ static void mlx5_cleanup_once(struct mlx5_core_dev *dev)
11501151
mlx5_rsc_dump_destroy(dev);
11511152
mlx5_hv_vhca_destroy(dev->hv_vhca);
11521153
mlx5_fw_tracer_destroy(dev->tracer);
1154+
mlx5_st_destroy(dev);
11531155
mlx5_dm_cleanup(dev);
11541156
mlx5_fs_core_free(dev);
11551157
mlx5_sf_table_cleanup(dev);

drivers/net/ethernet/mellanox/mlx5/core/mlx5_core.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,6 +300,15 @@ int mlx5_set_mtppse(struct mlx5_core_dev *mdev, u8 pin, u8 arm, u8 mode);
300300
struct mlx5_dm *mlx5_dm_create(struct mlx5_core_dev *dev);
301301
void mlx5_dm_cleanup(struct mlx5_core_dev *dev);
302302

303+
/*
 * Steering-tag allocator lifecycle. The real implementation is only built
 * with CONFIG_PCIE_TPH; otherwise the stubs below make creation yield NULL
 * and destruction a no-op.
 */
#ifdef CONFIG_PCIE_TPH
304+
struct mlx5_st *mlx5_st_create(struct mlx5_core_dev *dev);
305+
void mlx5_st_destroy(struct mlx5_core_dev *dev);
306+
#else
307+
/* Without PCIe TPH support there is never an allocator object */
static inline struct mlx5_st *
308+
mlx5_st_create(struct mlx5_core_dev *dev) { return NULL; }
309+
static inline void mlx5_st_destroy(struct mlx5_core_dev *dev) { return; }
310+
#endif
311+
303312
void mlx5_toggle_port_link(struct mlx5_core_dev *dev);
304313
int mlx5_set_port_admin_status(struct mlx5_core_dev *dev,
305314
enum mlx5_port_status status);

drivers/net/ethernet/mellanox/mlx5/core/steering/hws/definer.c

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -727,8 +727,9 @@ hws_definer_conv_outer(struct mlx5hws_definer_conv_data *cd,
727727
u32 *s_ipv6, *d_ipv6;
728728

729729
if (HWS_IS_FLD_SET_SZ(match_param, outer_headers.l4_type, 0x2) ||
730-
HWS_IS_FLD_SET_SZ(match_param, outer_headers.reserved_at_c2, 0xe) ||
731-
HWS_IS_FLD_SET_SZ(match_param, outer_headers.reserved_at_c4, 0x4)) {
730+
HWS_IS_FLD_SET_SZ(match_param, outer_headers.l4_type_ext, 0x4) ||
731+
HWS_IS_FLD_SET_SZ(match_param, outer_headers.reserved_at_c6, 0xa) ||
732+
HWS_IS_FLD_SET_SZ(match_param, outer_headers.reserved_at_d4, 0x4)) {
732733
mlx5hws_err(cd->ctx, "Unsupported outer parameters set\n");
733734
return -EINVAL;
734735
}
@@ -903,8 +904,9 @@ hws_definer_conv_inner(struct mlx5hws_definer_conv_data *cd,
903904
u32 *s_ipv6, *d_ipv6;
904905

905906
if (HWS_IS_FLD_SET_SZ(match_param, inner_headers.l4_type, 0x2) ||
906-
HWS_IS_FLD_SET_SZ(match_param, inner_headers.reserved_at_c2, 0xe) ||
907-
HWS_IS_FLD_SET_SZ(match_param, inner_headers.reserved_at_c4, 0x4)) {
907+
HWS_IS_FLD_SET_SZ(match_param, inner_headers.l4_type_ext, 0x4) ||
908+
HWS_IS_FLD_SET_SZ(match_param, inner_headers.reserved_at_c6, 0xa) ||
909+
HWS_IS_FLD_SET_SZ(match_param, inner_headers.reserved_at_d4, 0x4)) {
908910
mlx5hws_err(cd->ctx, "Unsupported inner parameters set\n");
909911
return -EINVAL;
910912
}
@@ -1279,7 +1281,8 @@ hws_definer_conv_misc2(struct mlx5hws_definer_conv_data *cd,
12791281
struct mlx5hws_definer_fc *curr_fc;
12801282

12811283
if (HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1a0, 0x8) ||
1282-
HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1b8, 0x8) ||
1284+
HWS_IS_FLD_SET_SZ(match_param,
1285+
misc_parameters_2.ipsec_next_header, 0x8) ||
12831286
HWS_IS_FLD_SET_SZ(match_param, misc_parameters_2.reserved_at_1c0, 0x40) ||
12841287
HWS_IS_FLD_SET(match_param, misc_parameters_2.macsec_syndrome) ||
12851288
HWS_IS_FLD_SET(match_param, misc_parameters_2.ipsec_syndrome)) {

drivers/pci/tph.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ static u32 get_st_table_loc(struct pci_dev *pdev)
168168
* Return the size of ST table. If ST table is not in TPH Requester Extended
169169
* Capability space, return 0. Otherwise return the ST Table Size + 1.
170170
*/
171-
static u16 get_st_table_size(struct pci_dev *pdev)
171+
u16 pcie_tph_get_st_table_size(struct pci_dev *pdev)
172172
{
173173
u32 reg;
174174
u32 loc;
@@ -185,6 +185,7 @@ static u16 get_st_table_size(struct pci_dev *pdev)
185185

186186
return FIELD_GET(PCI_TPH_CAP_ST_MASK, reg) + 1;
187187
}
188+
EXPORT_SYMBOL(pcie_tph_get_st_table_size);
188189

189190
/* Return device's Root Port completer capability */
190191
static u8 get_rp_completer_type(struct pci_dev *pdev)
@@ -211,7 +212,7 @@ static int write_tag_to_st_table(struct pci_dev *pdev, int index, u16 tag)
211212
int offset;
212213

213214
/* Check if index is out of bound */
214-
st_table_size = get_st_table_size(pdev);
215+
st_table_size = pcie_tph_get_st_table_size(pdev);
215216
if (index >= st_table_size)
216217
return -ENXIO;
217218

@@ -443,7 +444,7 @@ void pci_restore_tph_state(struct pci_dev *pdev)
443444
pci_write_config_dword(pdev, pdev->tph_cap + PCI_TPH_CTRL, *cap++);
444445
st_entry = (u16 *)cap;
445446
offset = PCI_TPH_BASE_SIZEOF;
446-
num_entries = get_st_table_size(pdev);
447+
num_entries = pcie_tph_get_st_table_size(pdev);
447448
for (i = 0; i < num_entries; i++) {
448449
pci_write_config_word(pdev, pdev->tph_cap + offset,
449450
*st_entry++);
@@ -475,7 +476,7 @@ void pci_save_tph_state(struct pci_dev *pdev)
475476
/* Save all ST entries in extended capability structure */
476477
st_entry = (u16 *)cap;
477478
offset = PCI_TPH_BASE_SIZEOF;
478-
num_entries = get_st_table_size(pdev);
479+
num_entries = pcie_tph_get_st_table_size(pdev);
479480
for (i = 0; i < num_entries; i++) {
480481
pci_read_config_word(pdev, pdev->tph_cap + offset,
481482
st_entry++);
@@ -499,7 +500,7 @@ void pci_tph_init(struct pci_dev *pdev)
499500
if (!pdev->tph_cap)
500501
return;
501502

502-
num_entries = get_st_table_size(pdev);
503+
num_entries = pcie_tph_get_st_table_size(pdev);
503504
save_size = sizeof(u32) + num_entries * sizeof(u16);
504505
pci_add_ext_cap_save_buffer(pdev, PCI_EXT_CAP_ID_TPH, save_size);
505506
}

include/linux/mlx5/driver.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include <linux/kernel.h>
3737
#include <linux/completion.h>
3838
#include <linux/pci.h>
39+
#include <linux/pci-tph.h>
3940
#include <linux/irq.h>
4041
#include <linux/spinlock_types.h>
4142
#include <linux/semaphore.h>
@@ -688,6 +689,7 @@ struct mlx5_fw_tracer;
688689
struct mlx5_vxlan;
689690
struct mlx5_geneve;
690691
struct mlx5_hv_vhca;
692+
struct mlx5_st;
691693

692694
#define MLX5_LOG_SW_ICM_BLOCK_SIZE(dev) (MLX5_CAP_DEV_MEM(dev, log_sw_icm_alloc_granularity))
693695
#define MLX5_SW_ICM_BLOCK_SIZE(dev) (1 << MLX5_LOG_SW_ICM_BLOCK_SIZE(dev))
@@ -757,6 +759,7 @@ struct mlx5_core_dev {
757759
u32 issi;
758760
struct mlx5e_resources mlx5e_res;
759761
struct mlx5_dm *dm;
762+
struct mlx5_st *st;
760763
struct mlx5_vxlan *vxlan;
761764
struct mlx5_geneve *geneve;
762765
struct {
@@ -1160,6 +1163,23 @@ int mlx5_dm_sw_icm_alloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
11601163
int mlx5_dm_sw_icm_dealloc(struct mlx5_core_dev *dev, enum mlx5_sw_icm_type type,
11611164
u64 length, u16 uid, phys_addr_t addr, u32 obj_id);
11621165

1166+
/*
 * Steering-tag index API. Without CONFIG_PCIE_TPH both calls are stubs that
 * report -EOPNOTSUPP.
 */
#ifdef CONFIG_PCIE_TPH
1167+
int mlx5_st_alloc_index(struct mlx5_core_dev *dev, enum tph_mem_type mem_type,
1168+
unsigned int cpu_uid, u16 *st_index);
1169+
int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index);
1170+
#else
1171+
/* Stub: no PCIe TPH support, *st_index is left untouched */
static inline int mlx5_st_alloc_index(struct mlx5_core_dev *dev,
1172+
enum tph_mem_type mem_type,
1173+
unsigned int cpu_uid, u16 *st_index)
1174+
{
1175+
return -EOPNOTSUPP;
1176+
}
1177+
/* Stub: no PCIe TPH support, nothing could have been allocated */
static inline int mlx5_st_dealloc_index(struct mlx5_core_dev *dev, u16 st_index)
1178+
{
1179+
return -EOPNOTSUPP;
1180+
}
1181+
#endif
1182+
11631183
struct mlx5_core_dev *mlx5_vf_get_core_dev(struct pci_dev *pdev);
11641184
void mlx5_vf_put_core_dev(struct mlx5_core_dev *mdev);
11651185

0 commit comments

Comments
 (0)