Skip to content

Commit cf71bdf

Browse files
committed
Merge branch 'net-mlx5e-add-pcie-congestion-event-extras'
Tariq Toukan says:

====================
net/mlx5e: Add pcie congestion event extras

This small series by Dragos covers gaps requested in the initial pcie
congestion series [1]:
- Make pcie congestion thresholds configurable via devlink.
- Add a counter for stale pcie congestion events.

[1] https://lore.kernel.org/[email protected]
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
2 parents 04d1ff1 + cdc4927 commit cf71bdf

File tree

5 files changed

+238
-10
lines changed

5 files changed

+238
-10
lines changed

Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1348,7 +1348,7 @@ Device Counters
13481348
is in a congested state.
13491349
If pci_bw_inbound_high == pci_bw_inbound_low then the device is not congested.
13501350
If pci_bw_inbound_high > pci_bw_inbound_low then the device is congested.
1351-
- Tnformative
1351+
- Informative
13521352

13531353
* - `pci_bw_inbound_low`
13541354
- The number of times the device crossed the low inbound PCIe bandwidth
@@ -1373,3 +1373,8 @@ Device Counters
13731373
If pci_bw_outbound_high == pci_bw_outbound_low then the device is not congested.
13741374
If pci_bw_outbound_high > pci_bw_outbound_low then the device is congested.
13751375
- Informative
1376+
1377+
* - `pci_bw_stale_event`
1378+
- The number of times the device fired a PCIe congestion event but on query
1379+
there was no change in state.
1380+
- Informative

Documentation/networking/devlink/mlx5.rst

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,58 @@ parameters.
146146
- u32
147147
- driverinit
148148
- Control the size (in packets) of the hairpin queues.
149+
* - ``pcie_cong_inbound_high``
150+
- u16
151+
- driverinit
152+
- High threshold configuration for PCIe congestion events. The firmware
153+
will send an event once device-side inbound PCIe traffic has stayed
154+
above the configured high threshold for a long enough period (at least
155+
200ms).
156+
157+
See pci_bw_inbound_high ethtool stat.
158+
159+
Units are 0.01 %. Accepted values are in range [0, 10000].
160+
Must satisfy pcie_cong_inbound_low < pcie_cong_inbound_high.
161+
Default value: 9000 (Corresponds to 90%).
162+
* - ``pcie_cong_inbound_low``
163+
- u16
164+
- driverinit
165+
- Low threshold configuration for PCIe congestion events. The firmware
166+
will send an event once device-side inbound PCIe traffic drops
167+
below the configured low threshold, only after having been previously in
168+
a congested state.
169+
170+
See pci_bw_inbound_low ethtool stat.
171+
172+
Units are 0.01 %. Accepted values are in range [0, 10000].
173+
Must satisfy pcie_cong_inbound_low < pcie_cong_inbound_high.
174+
Default value: 7500 (Corresponds to 75%).
175+
* - ``pcie_cong_outbound_high``
176+
- u16
177+
- driverinit
178+
- High threshold configuration for PCIe congestion events. The firmware
179+
will send an event once device-side outbound PCIe traffic has stayed
180+
above the configured high threshold for a long enough period (at least
181+
200ms).
182+
183+
See pci_bw_outbound_high ethtool stat.
184+
185+
Units are 0.01 %. Accepted values are in range [0, 10000].
186+
Must satisfy pcie_cong_outbound_low < pcie_cong_outbound_high.
187+
Default value: 9000 (Corresponds to 90%).
188+
* - ``pcie_cong_outbound_low``
189+
- u16
190+
- driverinit
191+
- Low threshold configuration for PCIe congestion events. The firmware
192+
will send an event once device-side outbound PCIe traffic drops
193+
below the configured low threshold, only after having been previously in
194+
a congested state.
195+
196+
See pci_bw_outbound_low ethtool stat.
197+
198+
Units are 0.01 %. Accepted values are in range [0, 10000].
199+
Must satisfy pcie_cong_outbound_low < pcie_cong_outbound_high.
200+
Default value: 7500 (Corresponds to 75%).
149201

150202
* - ``cqe_compress_type``
151203
- string

drivers/net/ethernet/mellanox/mlx5/core/devlink.c

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -651,6 +651,105 @@ static void mlx5_devlink_eth_params_unregister(struct devlink *devlink)
651651
ARRAY_SIZE(mlx5_devlink_eth_params));
652652
}
653653

654+
/* PCIe congestion threshold bound and defaults, in units of 0.01 %. */
#define MLX5_PCIE_CONG_THRESH_MAX 10000
655+
#define MLX5_PCIE_CONG_THRESH_DEF_LOW 7500
656+
#define MLX5_PCIE_CONG_THRESH_DEF_HIGH 9000
657+
658+
/*
 * Validate callback shared by the four PCIe congestion threshold params.
 *
 * Returns -EINVAL (with an extack message) when the u16 value exceeds
 * MLX5_PCIE_CONG_THRESH_MAX, -EOPNOTSUPP when @id is not one of the four
 * PCIE_CONG param ids, and 0 otherwise.
 *
 * Only the upper bound of a single value is checked here; the relation
 * between a low threshold and its matching high threshold is not checked
 * in this function.
 */
static int
659+
mlx5_devlink_pcie_cong_thresh_validate(struct devlink *devl, u32 id,
660+
union devlink_param_value val,
661+
struct netlink_ext_ack *extack)
662+
{
663+
if (val.vu16 > MLX5_PCIE_CONG_THRESH_MAX) {
664+
NL_SET_ERR_MSG_FMT_MOD(extack, "Value %u > max supported (%u)",
665+
val.vu16, MLX5_PCIE_CONG_THRESH_MAX);
666+
667+
return -EINVAL;
668+
}
669+
670+
/* Accept only the PCIe congestion param ids; anything else is unsupported. */
switch (id) {
671+
case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW:
672+
case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH:
673+
case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW:
674+
case MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH:
675+
break;
676+
default:
677+
return -EOPNOTSUPP;
678+
}
679+
680+
return 0;
681+
}
682+
683+
/*
 * Set the driverinit value of each of the four PCIe congestion threshold
 * params to its default: MLX5_PCIE_CONG_THRESH_DEF_LOW for the inbound and
 * outbound low thresholds, MLX5_PCIE_CONG_THRESH_DEF_HIGH for the inbound
 * and outbound high thresholds.
 */
static void mlx5_devlink_pcie_cong_init_values(struct devlink *devlink)
684+
{
685+
union devlink_param_value value;
686+
u32 id;
687+
688+
/* Inbound low threshold. */
value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_LOW;
689+
id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW;
690+
devl_param_driverinit_value_set(devlink, id, value);
691+
692+
/* Inbound high threshold. */
value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_HIGH;
693+
id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH;
694+
devl_param_driverinit_value_set(devlink, id, value);
695+
696+
/* Outbound low threshold. */
value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_LOW;
697+
id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW;
698+
devl_param_driverinit_value_set(devlink, id, value);
699+
700+
/* Outbound high threshold. */
value.vu16 = MLX5_PCIE_CONG_THRESH_DEF_HIGH;
701+
id = MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH;
702+
devl_param_driverinit_value_set(devlink, id, value);
703+
}
704+
705+
/*
 * The four PCIe congestion threshold devlink params (inbound/outbound x
 * low/high). Each is a u16 param exposed in driverinit cmode only and uses
 * mlx5_devlink_pcie_cong_thresh_validate() as its validate callback.
 * NOTE(review): the two NULL arguments are presumably the get/set
 * callbacks - confirm against the DEVLINK_PARAM_DRIVER definition.
 */
static const struct devlink_param mlx5_devlink_pcie_cong_params[] = {
706+
DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW,
707+
"pcie_cong_inbound_low", DEVLINK_PARAM_TYPE_U16,
708+
BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL,
709+
mlx5_devlink_pcie_cong_thresh_validate),
710+
DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH,
711+
"pcie_cong_inbound_high", DEVLINK_PARAM_TYPE_U16,
712+
BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL,
713+
mlx5_devlink_pcie_cong_thresh_validate),
714+
DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW,
715+
"pcie_cong_outbound_low", DEVLINK_PARAM_TYPE_U16,
716+
BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL,
717+
mlx5_devlink_pcie_cong_thresh_validate),
718+
DEVLINK_PARAM_DRIVER(MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH,
719+
"pcie_cong_outbound_high", DEVLINK_PARAM_TYPE_U16,
720+
BIT(DEVLINK_PARAM_CMODE_DRIVERINIT), NULL, NULL,
721+
mlx5_devlink_pcie_cong_thresh_validate),
722+
};
723+
724+
/*
 * Register the PCIe congestion threshold params and seed them with their
 * default driverinit values.
 *
 * Returns 0 without registering anything when
 * mlx5_pcie_cong_event_supported() reports no support, the error from
 * devl_params_register() on failure, and 0 on success.
 */
static int mlx5_devlink_pcie_cong_params_register(struct devlink *devlink)
725+
{
726+
struct mlx5_core_dev *dev = devlink_priv(devlink);
727+
int err;
728+
729+
/* Nothing to expose when the feature is unsupported; not an error. */
if (!mlx5_pcie_cong_event_supported(dev))
730+
return 0;
731+
732+
err = devl_params_register(devlink, mlx5_devlink_pcie_cong_params,
733+
ARRAY_SIZE(mlx5_devlink_pcie_cong_params));
734+
if (err)
735+
return err;
736+
737+
mlx5_devlink_pcie_cong_init_values(devlink);
738+
739+
return 0;
740+
}
741+
742+
/*
 * Unregister the PCIe congestion threshold params. Guarded by the same
 * mlx5_pcie_cong_event_supported() check as the register path, so nothing
 * is unregistered when the register path skipped registration.
 */
static void mlx5_devlink_pcie_cong_params_unregister(struct devlink *devlink)
743+
{
744+
struct mlx5_core_dev *dev = devlink_priv(devlink);
745+
746+
if (!mlx5_pcie_cong_event_supported(dev))
747+
return;
748+
749+
devl_params_unregister(devlink, mlx5_devlink_pcie_cong_params,
750+
ARRAY_SIZE(mlx5_devlink_pcie_cong_params));
751+
}
752+
654753
static int mlx5_devlink_enable_rdma_validate(struct devlink *devlink, u32 id,
655754
union devlink_param_value val,
656755
struct netlink_ext_ack *extack)
@@ -896,13 +995,19 @@ int mlx5_devlink_params_register(struct devlink *devlink)
896995
if (err)
897996
goto max_uc_list_err;
898997

998+
err = mlx5_devlink_pcie_cong_params_register(devlink);
999+
if (err)
1000+
goto pcie_cong_err;
1001+
8991002
err = mlx5_nv_param_register_dl_params(devlink);
9001003
if (err)
9011004
goto nv_param_err;
9021005

9031006
return 0;
9041007

9051008
nv_param_err:
1009+
mlx5_devlink_pcie_cong_params_unregister(devlink);
1010+
pcie_cong_err:
9061011
mlx5_devlink_max_uc_list_params_unregister(devlink);
9071012
max_uc_list_err:
9081013
mlx5_devlink_auxdev_params_unregister(devlink);
@@ -915,6 +1020,7 @@ int mlx5_devlink_params_register(struct devlink *devlink)
9151020
void mlx5_devlink_params_unregister(struct devlink *devlink)
9161021
{
9171022
mlx5_nv_param_unregister_dl_params(devlink);
1023+
mlx5_devlink_pcie_cong_params_unregister(devlink);
9181024
mlx5_devlink_max_uc_list_params_unregister(devlink);
9191025
mlx5_devlink_auxdev_params_unregister(devlink);
9201026
devl_params_unregister(devlink, mlx5_devlink_params,

drivers/net/ethernet/mellanox/mlx5/core/devlink.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@ enum mlx5_devlink_param_id {
2222
MLX5_DEVLINK_PARAM_ID_ESW_MULTIPORT,
2323
MLX5_DEVLINK_PARAM_ID_HAIRPIN_NUM_QUEUES,
2424
MLX5_DEVLINK_PARAM_ID_HAIRPIN_QUEUE_SIZE,
25+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW,
26+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH,
27+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW,
28+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH,
2529
MLX5_DEVLINK_PARAM_ID_CQE_COMPRESSION_TYPE
2630
};
2731

drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c

Lines changed: 70 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
22
// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.
33

4+
#include "../devlink.h"
45
#include "en.h"
56
#include "pcie_cong_event.h"
67

@@ -23,6 +24,7 @@ struct mlx5e_pcie_cong_stats {
2324
u32 pci_bw_inbound_low;
2425
u32 pci_bw_outbound_high;
2526
u32 pci_bw_outbound_low;
27+
u32 pci_bw_stale_event;
2628
};
2729

2830
struct mlx5e_pcie_cong_event {
@@ -41,13 +43,6 @@ struct mlx5e_pcie_cong_event {
4143
struct mlx5e_pcie_cong_stats stats;
4244
};
4345

44-
/* In units of 0.01 % */
45-
static const struct mlx5e_pcie_cong_thresh default_thresh_config = {
46-
.inbound_high = 9000,
47-
.inbound_low = 7500,
48-
.outbound_high = 9000,
49-
.outbound_low = 7500,
50-
};
5146

5247
static const struct counter_desc mlx5e_pcie_cong_stats_desc[] = {
5348
{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
@@ -58,6 +53,8 @@ static const struct counter_desc mlx5e_pcie_cong_stats_desc[] = {
5853
pci_bw_outbound_high) },
5954
{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
6055
pci_bw_outbound_low) },
56+
{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
57+
pci_bw_stale_event) },
6158
};
6259

6360
#define NUM_PCIE_CONG_COUNTERS ARRAY_SIZE(mlx5e_pcie_cong_stats_desc)
@@ -218,8 +215,10 @@ static void mlx5e_pcie_cong_event_work(struct work_struct *work)
218215
}
219216

220217
changes = cong_event->state ^ new_cong_state;
221-
if (!changes)
218+
if (!changes) {
219+
cong_event->stats.pci_bw_stale_event++;
222220
return;
221+
}
223222

224223
cong_event->state = new_cong_state;
225224

@@ -249,15 +248,77 @@ static int mlx5e_pcie_cong_event_handler(struct notifier_block *nb,
249248
return NOTIFY_OK;
250249
}
251250

251+
/*
 * Read the four PCIe congestion threshold driverinit values from devlink
 * into @config.
 *
 * Returns 0 on success, or the first error returned by
 * devl_param_driverinit_value_get(); in the error case @config is left
 * unmodified because it is only written after all four reads succeed.
 */
static int
252+
mlx5e_pcie_cong_get_thresh_config(struct mlx5_core_dev *dev,
253+
struct mlx5e_pcie_cong_thresh *config)
254+
{
255+
/* Order matters: val[i] below is indexed in this same order. */
u32 ids[4] = {
256+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_LOW,
257+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_IN_HIGH,
258+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_LOW,
259+
MLX5_DEVLINK_PARAM_ID_PCIE_CONG_OUT_HIGH,
260+
};
261+
struct devlink *devlink = priv_to_devlink(dev);
262+
union devlink_param_value val[4];
263+
264+
for (int i = 0; i < 4; i++) {
265+
u32 id = ids[i];
266+
int err;
267+
268+
err = devl_param_driverinit_value_get(devlink, id, &val[i]);
269+
if (err)
270+
return err;
271+
}
272+
273+
/* val[] indices follow the ids[] order above. */
config->inbound_low = val[0].vu16;
274+
config->inbound_high = val[1].vu16;
275+
config->outbound_low = val[2].vu16;
276+
config->outbound_high = val[3].vu16;
277+
278+
return 0;
279+
}
280+
281+
/*
 * Check that each low threshold is strictly below its matching high
 * threshold, for both the inbound and the outbound direction.
 *
 * Both directions are always checked, so every violation is logged via
 * mlx5_core_err() before returning. Returns 0 when both pairs are valid,
 * -EINVAL when at least one pair has low >= high.
 */
static int
282+
mlx5e_thresh_config_validate(struct mlx5_core_dev *mdev,
283+
const struct mlx5e_pcie_cong_thresh *config)
284+
{
285+
int err = 0;
286+
287+
if (config->inbound_low >= config->inbound_high) {
288+
err = -EINVAL;
289+
mlx5_core_err(mdev, "PCIe inbound congestion threshold configuration invalid: low (%u) >= high (%u).\n",
290+
config->inbound_low, config->inbound_high);
291+
}
292+
293+
/* No early return above: report an outbound violation as well. */
if (config->outbound_low >= config->outbound_high) {
294+
err = -EINVAL;
295+
mlx5_core_err(mdev, "PCIe outbound congestion threshold configuration invalid: low (%u) >= high (%u).\n",
296+
config->outbound_low, config->outbound_high);
297+
}
298+
299+
return err;
300+
}
301+
252302
int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv)
253303
{
304+
struct mlx5e_pcie_cong_thresh thresh_config = {};
254305
struct mlx5e_pcie_cong_event *cong_event;
255306
struct mlx5_core_dev *mdev = priv->mdev;
256307
int err;
257308

258309
if (!mlx5_pcie_cong_event_supported(mdev))
259310
return 0;
260311

312+
err = mlx5e_pcie_cong_get_thresh_config(mdev, &thresh_config);
313+
if (WARN_ON(err))
314+
return err;
315+
316+
err = mlx5e_thresh_config_validate(mdev, &thresh_config);
317+
if (err) {
318+
mlx5_core_err(mdev, "PCIe congestion event feature disabled\n");
319+
return err;
320+
}
321+
261322
cong_event = kvzalloc_node(sizeof(*cong_event), GFP_KERNEL,
262323
mdev->priv.numa_node);
263324
if (!cong_event)
@@ -269,7 +330,7 @@ int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv)
269330

270331
cong_event->priv = priv;
271332

272-
err = mlx5_cmd_pcie_cong_event_set(mdev, &default_thresh_config,
333+
err = mlx5_cmd_pcie_cong_event_set(mdev, &thresh_config,
273334
&cong_event->obj_id);
274335
if (err) {
275336
mlx5_core_warn(mdev, "Error creating a PCIe congestion event object\n");

0 commit comments

Comments
 (0)