Skip to content

Commit f405349

Browse files
dtatulea authored and kuba-moo committed
net/mlx5e: Make PCIe congestion event thresholds configurable
Add devlink driverinit parameters for configuring the thresholds for PCIe congestion events. These parameters are registered only when the firmware supports this feature. Update the mlx5 devlink docs as well on these new params. Signed-off-by: Dragos Tatulea <[email protected]> Signed-off-by: Tariq Toukan <[email protected]> Reviewed-by: Simon Horman <[email protected]> Link: https://patch.msgid.link/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
1 parent 04d1ff1 commit f405349

File tree

4 files changed

+226
-8
lines changed

4 files changed

+226
-8
lines changed

Documentation/networking/devlink/mlx5.rst

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,58 @@ parameters.
146146
- u32
147147
- driverinit
148148
- Control the size (in packets) of the hairpin queues.
149+
* - ``pcie_cong_inbound_high``
150+
- u16
151+
- driverinit
152+
- High threshold configuration for PCIe congestion events. The firmware
153+
will send an event once device-side inbound PCIe traffic has stayed
154+
above the configured high threshold for a long enough period (at least
155+
200ms).
156+
157+
See pci_bw_inbound_high ethtool stat.
158+
159+
Units are 0.01 %. Accepted values are in range [0, 10000].
160+
pcie_cong_inbound_low < pcie_cong_inbound_high.
161+
Default value: 9000 (Corresponds to 90%).
162+
* - ``pcie_cong_inbound_low``
163+
- u16
164+
- driverinit
165+
- Low threshold configuration for PCIe congestion events. The firmware
166+
will send an event once device-side inbound PCIe traffic drops
167+
below the configured low threshold, only after having been previously in
168+
a congested state.
169+
170+
See pci_bw_inbound_low ethtool stat.
171+
172+
Units are 0.01 %. Accepted values are in range [0, 10000].
173+
pcie_cong_inbound_low < pcie_cong_inbound_high.
174+
Default value: 7500 (Corresponds to 75%).
175+
* - ``pcie_cong_outbound_high``
176+
- u16
177+
- driverinit
178+
- High threshold configuration for PCIe congestion events. The firmware
179+
will send an event once device-side outbound PCIe traffic has stayed
180+
above the configured high threshold for a long enough period (at least
181+
200ms).
182+
183+
See pci_bw_outbound_high ethtool stat.
184+
185+
Units are 0.01 %. Accepted values are in range [0, 10000].
186+
pcie_cong_outbound_low < pcie_cong_outbound_high.
187+
Default value: 9000 (Corresponds to 90%).
188+
* - ``pcie_cong_outbound_low``
189+
- u16
190+
- driverinit
191+
- Low threshold configuration for PCIe congestion events. The firmware
192+
will send an event once device-side outbound PCIe traffic drops
193+
below the configured low threshold, only after having been previously in
194+
a congested state.
195+
196+
See pci_bw_outbound_low ethtool stat.
197+
198+
Units are 0.01 %. Accepted values are in range [0, 10000].
199+
pcie_cong_outbound_low < pcie_cong_outbound_high.
200+
Default value: 7500 (Corresponds to 75%).
149201

150202
* - ``cqe_compress_type``
151203
- string

drivers/net/ethernet/mellanox/mlx5/core/devlink.c

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,105 @@ static void mlx5_devlink_eth_params_unregister(struct devlink *devlink)
651651
ARRAY_SIZE(mlx5_devlink_eth_params));
652652
}
653653

654+
#define MLX5_PCIE_CONG_THRESH_MAX 10000
655+
#define MLX5_PCIE_CONG_THRESH_DEF_LOW 7500
656+
#define MLX5_PCIE_CONG_THRESH_DEF_HIGH 9000
657+
658+
static int
659+
mlx5_devlink_pcie_cong_thresh_validate(struct devlink *devl, u32 id,
660+
union devlink_param_value val,
661+
struct netlink_ext_ack *extack)
662+
{
663+
if (val.vu16 > MLX5_PCIE_CONG_THRESH_MAX) {
664+
NL_SET_ERR_MSG_FMT_MOD(extack, "Value %u > max supported (%u)",
665+
val.vu16, MLX5_PCIE_CONG_THRESH_MAX);
666+
667+
return -EINVAL;
668+
}
669+
670+
switch (id) {
671+
case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW:
672+
case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH:
673+
case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW:
674+
case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH:
675+
break;
676+
default:
677+
return -EOPNOTSUPP;
678+
}
679+
680+
return 0;
681+
}
682+
683+
static void mlx5_devlink_pcie_cong_init_values(struct devlink *devlink)
684+
{
685+
union devlink_param_value value;
686+
u32 id;
687+
688+
value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_LOW;
689+
id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW;
690+
devl_param_driverinit_value_set(devlink, id, value);
691+
692+
value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_HIGH;
693+
id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH;
694+
devl_param_driverinit_value_set(devlink, id, value);
695+
696+
value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_LOW;
697+
id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW;
698+
devl_param_driverinit_value_set(devlink, id, value);
699+
700+
value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_HIGH;
701+
id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH;
702+
devl_param_driverinit_value_set(devlink, id, value);
703+
}
704+
705+
/*
 * The four PCIe congestion threshold parameters share the same type (u16),
 * configuration mode (driverinit), absent get/set callbacks and validator;
 * only the id and the user-visible name differ.
 */
#define MLX5_DEVLINK_PCIE_CONG_PARAM(_id, _name)			\
	DEVLINK_PARAM_DRIVER(_id, _name, DEVLINK_PARAM_TYPE_U16,	\
			     BIT(DEVLINK_PARAM_CMODE_DRIVERINIT),	\
			     NULL, NULL,				\
			     mlx5_devlink_pcie_cong_thresh_validate)

static const struct devlink_param mlx5_devlink_pcie_cong_params[] = {
	MLX5_DEVLINK_PCIE_CONG_PARAM(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW,
				     "pcie_cong_inbound_low"),
	MLX5_DEVLINK_PCIE_CONG_PARAM(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH,
				     "pcie_cong_inbound_high"),
	MLX5_DEVLINK_PCIE_CONG_PARAM(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW,
				     "pcie_cong_outbound_low"),
	MLX5_DEVLINK_PCIE_CONG_PARAM(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH,
				     "pcie_cong_outbound_high"),
};
723+
724+
static int mlx5_devlink_pcie_cong_params_register(struct devlink *devlink)
725+
{
726+
struct mlx5_core_dev *dev = devlink_priv(devlink);
727+
int err;
728+
729+
if (!mlx5_pcie_cong_event_supported(dev))
730+
return 0;
731+
732+
err = devl_params_register(devlink, mlx5_devlink_pcie_cong_params,
733+
ARRAY_SIZE(mlx5_devlink_pcie_cong_params));
734+
if (err)
735+
return err;
736+
737+
mlx5_devlink_pcie_cong_init_values(devlink);
738+
739+
return 0;
740+
}
741+
742+
static void mlx5_devlink_pcie_cong_params_unregister(struct devlink *devlink)
743+
{
744+
struct mlx5_core_dev *dev = devlink_priv(devlink);
745+
746+
if (!mlx5_pcie_cong_event_supported(dev))
747+
return;
748+
749+
devl_params_unregister(devlink, mlx5_devlink_pcie_cong_params,
750+
ARRAY_SIZE(mlx5_devlink_pcie_cong_params));
751+
}
752+
654753
static int mlx5_devlink_enable_rdma_validate(struct devlink *devlink, u32 id,
655754
union devlink_param_value val,
656755
struct netlink_ext_ack *extack)
@@ -896,13 +995,19 @@ int mlx5_devlink_params_register(struct devlink *devlink)
896995
if (err)
897996
goto max_uc_list_err;
898997

998+
err = mlx5_devlink_pcie_cong_params_register(devlink);
999+
if (err)
1000+
goto pcie_cong_err;
1001+
8991002
err = mlx5_nv_param_register_dl_params(devlink);
9001003
if (err)
9011004
goto nv_param_err;
9021005

9031006
return 0;
9041007

9051008
nv_param_err:
1009+
mlx5_devlink_pcie_cong_params_unregister(devlink);
1010+
pcie_cong_err:
9061011
mlx5_devlink_max_uc_list_params_unregister(devlink);
9071012
max_uc_list_err:
9081013
mlx5_devlink_auxdev_params_unregister(devlink);
@@ -915,6 +1020,7 @@ int mlx5_devlink_params_register(struct devlink *devlink)
9151020
void mlx5_devlink_params_unregister(struct devlink *devlink)
9161021
{
9171022
mlx5_nv_param_unregister_dl_params(devlink);
1023+
mlx5_devlink_pcie_cong_params_unregister(devlink);
9181024
mlx5_devlink_max_uc_list_params_unregister(devlink);
9191025
mlx5_devlink_auxdev_params_unregister(devlink);
9201026
devl_params_unregister(devlink, mlx5_devlink_params,

drivers/net/ethernet/mellanox/mlx5/core/devlink.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ enum mlx5_devlink_param_id {
2222
MLX5_DEVLINK_PARAM_ID_ESW_MULTIPORT,
2323
MLX5_DEVLINK_PARAM_ID_HAIRPIN_NUM_QUEUES,
2424
MLX5_DEVLINK_PARAM_ID_HAIRPIN_QUEUE_SIZE,
25+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW,
26+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH,
27+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW,
28+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH,
2529
MLX5_DEVLINK_PARAM_ID_CQE_COMPRESSION_TYPE
2630
};
2731

drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c

Lines changed: 64 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
22
// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
33

4+
#include "../devlink.h"
45
#include "en.h"
56
#include "pcie_cong_event.h"
67

@@ -41,13 +42,6 @@ struct mlx5e_pcie_cong_event {
4142
struct mlx5e_pcie_cong_stats stats;
4243
};
4344

44-
/* In units of 0.01 % */
45-
static const struct mlx5e_pcie_cong_thresh default_thresh_config = {
46-
.inbound_high = 9000,
47-
.inbound_low = 7500,
48-
.outbound_high = 9000,
49-
.outbound_low = 7500,
50-
};
5145

5246
static const struct counter_desc mlx5e_pcie_cong_stats_desc[] = {
5347
{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
@@ -249,15 +243,77 @@ static int mlx5e_pcie_cong_event_handler(struct notifier_block *nb,
249243
return NOTIFY_OK;
250244
}
251245

246+
static int
247+
mlx5e_pcie_cong_get_thresh_config(struct mlx5_core_dev *dev,
248+
struct mlx5e_pcie_cong_thresh *config)
249+
{
250+
u32 ids[4] = {
251+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW,
252+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH,
253+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW,
254+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH,
255+
};
256+
struct devlink *devlink = priv_to_devlink(dev);
257+
union devlink_param_value val[4];
258+
259+
for (int i = 0; i < 4; i++) {
260+
u32 id = ids[i];
261+
int err;
262+
263+
err = devl_param_driverinit_value_get(devlink, id, &val[i]);
264+
if (err)
265+
return err;
266+
}
267+
268+
config->inbound_low = val[0].vu16;
269+
config->inbound_high = val[1].vu16;
270+
config->outbound_low = val[2].vu16;
271+
config->outbound_high = val[3].vu16;
272+
273+
return 0;
274+
}
275+
276+
static int
277+
mlx5e_thresh_config_validate(struct mlx5_core_dev *mdev,
278+
const struct mlx5e_pcie_cong_thresh *config)
279+
{
280+
int err = 0;
281+
282+
if (config->inbound_low >= config->inbound_high) {
283+
err = -EINVAL;
284+
mlx5_core_err(mdev, "PCIe inbound congestion threshold configuration invalid: low (%u) >= high (%u).\n",
285+
config->inbound_low, config->inbound_high);
286+
}
287+
288+
if (config->outbound_low >= config->outbound_high) {
289+
err = -EINVAL;
290+
mlx5_core_err(mdev, "PCIe outbound congestion threshold configuration invalid: low (%u) >= high (%u).\n",
291+
config->outbound_low, config->outbound_high);
292+
}
293+
294+
return err;
295+
}
296+
252297
int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv)
253298
{
299+
struct mlx5e_pcie_cong_thresh thresh_config = {};
254300
struct mlx5e_pcie_cong_event *cong_event;
255301
struct mlx5_core_dev *mdev = priv->mdev;
256302
int err;
257303

258304
if (!mlx5_pcie_cong_event_supported(mdev))
259305
return 0;
260306

307+
err = mlx5e_pcie_cong_get_thresh_config(mdev, &thresh_config);
308+
if (WARN_ON(err))
309+
return err;
310+
311+
err = mlx5e_thresh_config_validate(mdev, &thresh_config);
312+
if (err) {
313+
mlx5_core_err(mdev, "PCIe congestion event feature disabled\n");
314+
return err;
315+
}
316+
261317
cong_event = kvzalloc_node(sizeof(*cong_event), GFP_KERNEL,
262318
mdev->priv.numa_node);
263319
if (!cong_event)
@@ -269,7 +325,7 @@ int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv)
269325

270326
cong_event->priv = priv;
271327

272-
err = mlx5_cmd_pcie_cong_event_set(mdev, &default_thresh_config,
328+
err = mlx5_cmd_pcie_cong_event_set(mdev, &thresh_config,
273329
&cong_event->obj_id);
274330
if (err) {
275331
mlx5_core_warn(mdev, "Error creating a PCIe congestion event object\n");

0 commit comments

Comments
 (0)