Commit cd03135

Merge branch 'net-mlx5e-add-support-for-pcie-congestion-events'

Tariq Toukan says:

====================
net/mlx5e: Add support for PCIe congestion events

Dragos says:

PCIe congestion events are events generated by the firmware when the
device side has sustained PCIe inbound or outbound traffic above certain
thresholds.

The high and low thresholds are hysteresis thresholds to prevent
flapping: once the high threshold has been reached, a low threshold
event will be triggered only after the bandwidth usage has gone below
the low threshold.

This series adds support for receiving and exposing such events as
ethtool counters. Two new pairs of counters are exposed:
pci_bw_in/outbound_high/low. These should help the user understand
whether the device's PCIe link is under pressure.

Planned follow-up patches:
- Allow configuration of thresholds through devlink.
- Add an ethtool counter for wakeups which did not result in any state
  change.
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
2 parents 7272580 + 8890ee6 commit cd03135
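
The two-threshold hysteresis described in the cover letter can be sketched in a few lines of C. This is an illustrative sketch only, not code from the patch: the struct, helper name, and bandwidth-usage parameter are hypothetical, while the example threshold values (9000 and 7500, in units of 0.01 %, i.e. 90.00 % and 75.00 %) match the driver defaults added below.

/* Illustrative only: a two-threshold hysteresis as described above.
 * Once usage crosses the high threshold, a "low" event can only fire
 * after usage drops below the low threshold, which prevents flapping.
 */
#include <stdbool.h>

struct cong_hysteresis {
	unsigned int high_thresh; /* e.g. 9000 -> 90.00 % */
	unsigned int low_thresh;  /* e.g. 7500 -> 75.00 % */
	bool congested;           /* last reported state */
};

/* Returns true when the state changes, i.e. when an event would be raised. */
bool cong_hysteresis_update(struct cong_hysteresis *h, unsigned int bw_usage)
{
	if (!h->congested && bw_usage >= h->high_thresh) {
		h->congested = true;  /* high-threshold event */
		return true;
	}
	if (h->congested && bw_usage < h->low_thresh) {
		h->congested = false; /* low-threshold event */
		return true;
	}
	return false; /* between the thresholds: no event */
}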

File tree

10 files changed: +381 -1 lines changed

Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst

Lines changed: 32 additions & 0 deletions
@@ -1341,3 +1341,35 @@ Device Counters
      - The number of times the device owned queue had not enough buffers
        allocated.
      - Error
+
+   * - `pci_bw_inbound_high`
+     - The number of times the device crossed the high inbound PCIe bandwidth
+       threshold. To be compared to pci_bw_inbound_low to check if the device
+       is in a congested state.
+       If pci_bw_inbound_high == pci_bw_inbound_low then the device is not congested.
+       If pci_bw_inbound_high > pci_bw_inbound_low then the device is congested.
+     - Informative
+
+   * - `pci_bw_inbound_low`
+     - The number of times the device crossed the low inbound PCIe bandwidth
+       threshold. To be compared to pci_bw_inbound_high to check if the device
+       is in a congested state.
+       If pci_bw_inbound_high == pci_bw_inbound_low then the device is not congested.
+       If pci_bw_inbound_high > pci_bw_inbound_low then the device is congested.
+     - Informative
+
+   * - `pci_bw_outbound_high`
+     - The number of times the device crossed the high outbound PCIe bandwidth
+       threshold. To be compared to pci_bw_outbound_low to check if the device
+       is in a congested state.
+       If pci_bw_outbound_high == pci_bw_outbound_low then the device is not congested.
+       If pci_bw_outbound_high > pci_bw_outbound_low then the device is congested.
+     - Informative
+
+   * - `pci_bw_outbound_low`
+     - The number of times the device crossed the low outbound PCIe bandwidth
+       threshold. To be compared to pci_bw_outbound_high to check if the device
+       is in a congested state.
+       If pci_bw_outbound_high == pci_bw_outbound_low then the device is not congested.
+       If pci_bw_outbound_high > pci_bw_outbound_low then the device is congested.
+     - Informative

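A minimal sketch of how the counter pairs documented above can be interpreted in a monitoring tool, assuming the two values have already been read (for example from `ethtool -S`). The helper below is hypothetical and not part of the driver.

/* Illustrative only: a direction is currently congested iff the device
 * has crossed the high threshold more times than it has fallen back
 * below the low threshold; equal counts mean it has already recovered.
 */
#include <stdbool.h>
#include <stdint.h>

bool pcie_dir_congested(uint32_t bw_high_events, uint32_t bw_low_events)
{
	return bw_high_events > bw_low_events;
}
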
drivers/net/ethernet/mellanox/mlx5/core/Makefile

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ mlx5_core-$(CONFIG_MLX5_CORE_EN) += en/rqt.o en/tir.o en/rss.o en/rx_res.o \
 		en/reporter_tx.o en/reporter_rx.o en/params.o en/xsk/pool.o \
 		en/xsk/setup.o en/xsk/rx.o en/xsk/tx.o en/devlink.o en/ptp.o \
 		en/qos.o en/htb.o en/trap.o en/fs_tt_redirect.o en/selq.o \
-		lib/crypto.o lib/sd.o
+		lib/crypto.o lib/sd.o en/pcie_cong_event.o
 
 #
 # Netdev extra

drivers/net/ethernet/mellanox/mlx5/core/en.h

Lines changed: 2 additions & 0 deletions
@@ -920,6 +920,8 @@ struct mlx5e_priv {
 	struct notifier_block events_nb;
 	struct notifier_block blocking_events_nb;
 
+	struct mlx5e_pcie_cong_event *cong_event;
+
 	struct udp_tunnel_nic_info nic_info;
 #ifdef CONFIG_MLX5_CORE_EN_DCB
 	struct mlx5e_dcbx dcbx;
drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c

Lines changed: 315 additions & 0 deletions
@@ -0,0 +1,315 @@
// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
// Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES.

#include "en.h"
#include "pcie_cong_event.h"

#define MLX5E_CONG_HIGH_STATE 0x7

enum {
	MLX5E_INBOUND_CONG = BIT(0),
	MLX5E_OUTBOUND_CONG = BIT(1),
};

struct mlx5e_pcie_cong_thresh {
	u16 inbound_high;
	u16 inbound_low;
	u16 outbound_high;
	u16 outbound_low;
};

struct mlx5e_pcie_cong_stats {
	u32 pci_bw_inbound_high;
	u32 pci_bw_inbound_low;
	u32 pci_bw_outbound_high;
	u32 pci_bw_outbound_low;
};

struct mlx5e_pcie_cong_event {
	u64 obj_id;

	struct mlx5e_priv *priv;

	/* For event notifier and workqueue. */
	struct work_struct work;
	struct mlx5_nb nb;

	/* Stores last read state. */
	u8 state;

	/* For ethtool stats group. */
	struct mlx5e_pcie_cong_stats stats;
};

/* In units of 0.01 % */
static const struct mlx5e_pcie_cong_thresh default_thresh_config = {
	.inbound_high = 9000,
	.inbound_low = 7500,
	.outbound_high = 9000,
	.outbound_low = 7500,
};

static const struct counter_desc mlx5e_pcie_cong_stats_desc[] = {
	{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
			     pci_bw_inbound_high) },
	{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
			     pci_bw_inbound_low) },
	{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
			     pci_bw_outbound_high) },
	{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
			     pci_bw_outbound_low) },
};

#define NUM_PCIE_CONG_COUNTERS ARRAY_SIZE(mlx5e_pcie_cong_stats_desc)

static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(pcie_cong)
{
	return priv->cong_event ? NUM_PCIE_CONG_COUNTERS : 0;
}

static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(pcie_cong) {}

static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(pcie_cong)
{
	if (!priv->cong_event)
		return;

	for (int i = 0; i < NUM_PCIE_CONG_COUNTERS; i++)
		ethtool_puts(data, mlx5e_pcie_cong_stats_desc[i].format);
}

static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(pcie_cong)
{
	if (!priv->cong_event)
		return;

	for (int i = 0; i < NUM_PCIE_CONG_COUNTERS; i++) {
		u32 ctr = MLX5E_READ_CTR32_CPU(&priv->cong_event->stats,
					       mlx5e_pcie_cong_stats_desc,
					       i);

		mlx5e_ethtool_put_stat(data, ctr);
	}
}

MLX5E_DEFINE_STATS_GRP(pcie_cong, 0);

static int
mlx5_cmd_pcie_cong_event_set(struct mlx5_core_dev *dev,
			     const struct mlx5e_pcie_cong_thresh *config,
			     u64 *obj_id)
{
	u32 in[MLX5_ST_SZ_DW(pcie_cong_event_cmd_in)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
	void *cong_obj;
	void *hdr;
	int err;

	hdr = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, hdr);
	cong_obj = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, cong_obj);

	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);

	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type,
		 MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT);

	MLX5_SET(pcie_cong_event_obj, cong_obj, inbound_event_en, 1);
	MLX5_SET(pcie_cong_event_obj, cong_obj, outbound_event_en, 1);

	MLX5_SET(pcie_cong_event_obj, cong_obj,
		 inbound_cong_high_threshold, config->inbound_high);
	MLX5_SET(pcie_cong_event_obj, cong_obj,
		 inbound_cong_low_threshold, config->inbound_low);

	MLX5_SET(pcie_cong_event_obj, cong_obj,
		 outbound_cong_high_threshold, config->outbound_high);
	MLX5_SET(pcie_cong_event_obj, cong_obj,
		 outbound_cong_low_threshold, config->outbound_low);

	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
	if (err)
		return err;

	*obj_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);

	mlx5_core_dbg(dev, "PCIe congestion event (obj_id=%llu) created. Config: in: [%u, %u], out: [%u, %u]\n",
		      *obj_id,
		      config->inbound_high, config->inbound_low,
		      config->outbound_high, config->outbound_low);

	return 0;
}

static int mlx5_cmd_pcie_cong_event_destroy(struct mlx5_core_dev *dev,
					    u64 obj_id)
{
	u32 in[MLX5_ST_SZ_DW(pcie_cong_event_cmd_in)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)];
	void *hdr;

	hdr = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, hdr);
	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode,
		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type,
		 MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT);
	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, obj_id);

	return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
}

static int mlx5_cmd_pcie_cong_event_query(struct mlx5_core_dev *dev,
					  u64 obj_id,
					  u32 *state)
{
	u32 in[MLX5_ST_SZ_DW(pcie_cong_event_cmd_in)] = {};
	u32 out[MLX5_ST_SZ_DW(pcie_cong_event_cmd_out)];
	void *obj;
	void *hdr;
	u8 cong;
	int err;

	hdr = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, hdr);

	MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode,
		 MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type,
		 MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT);
	MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, obj_id);

	err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
	if (err)
		return err;

	obj = MLX5_ADDR_OF(pcie_cong_event_cmd_out, out, cong_obj);

	if (state) {
		cong = MLX5_GET(pcie_cong_event_obj, obj, inbound_cong_state);
		if (cong == MLX5E_CONG_HIGH_STATE)
			*state |= MLX5E_INBOUND_CONG;

		cong = MLX5_GET(pcie_cong_event_obj, obj, outbound_cong_state);
		if (cong == MLX5E_CONG_HIGH_STATE)
			*state |= MLX5E_OUTBOUND_CONG;
	}

	return 0;
}

static void mlx5e_pcie_cong_event_work(struct work_struct *work)
{
	struct mlx5e_pcie_cong_event *cong_event;
	struct mlx5_core_dev *dev;
	struct mlx5e_priv *priv;
	u32 new_cong_state = 0;
	u32 changes;
	int err;

	cong_event = container_of(work, struct mlx5e_pcie_cong_event, work);
	priv = cong_event->priv;
	dev = priv->mdev;

	err = mlx5_cmd_pcie_cong_event_query(dev, cong_event->obj_id,
					     &new_cong_state);
	if (err) {
		mlx5_core_warn(dev, "Error %d when querying PCIe cong event object (obj_id=%llu).\n",
			       err, cong_event->obj_id);
		return;
	}

	changes = cong_event->state ^ new_cong_state;
	if (!changes)
		return;

	cong_event->state = new_cong_state;

	if (changes & MLX5E_INBOUND_CONG) {
		if (new_cong_state & MLX5E_INBOUND_CONG)
			cong_event->stats.pci_bw_inbound_high++;
		else
			cong_event->stats.pci_bw_inbound_low++;
	}

	if (changes & MLX5E_OUTBOUND_CONG) {
		if (new_cong_state & MLX5E_OUTBOUND_CONG)
			cong_event->stats.pci_bw_outbound_high++;
		else
			cong_event->stats.pci_bw_outbound_low++;
	}
}

static int mlx5e_pcie_cong_event_handler(struct notifier_block *nb,
					 unsigned long event, void *eqe)
{
	struct mlx5e_pcie_cong_event *cong_event;

	cong_event = mlx5_nb_cof(nb, struct mlx5e_pcie_cong_event, nb);
	queue_work(cong_event->priv->wq, &cong_event->work);

	return NOTIFY_OK;
}

int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv)
{
	struct mlx5e_pcie_cong_event *cong_event;
	struct mlx5_core_dev *mdev = priv->mdev;
	int err;

	if (!mlx5_pcie_cong_event_supported(mdev))
		return 0;

	cong_event = kvzalloc_node(sizeof(*cong_event), GFP_KERNEL,
				   mdev->priv.numa_node);
	if (!cong_event)
		return -ENOMEM;

	INIT_WORK(&cong_event->work, mlx5e_pcie_cong_event_work);
	MLX5_NB_INIT(&cong_event->nb, mlx5e_pcie_cong_event_handler,
		     OBJECT_CHANGE);

	cong_event->priv = priv;

	err = mlx5_cmd_pcie_cong_event_set(mdev, &default_thresh_config,
					   &cong_event->obj_id);
	if (err) {
		mlx5_core_warn(mdev, "Error creating a PCIe congestion event object\n");
		goto err_free;
	}

	err = mlx5_eq_notifier_register(mdev, &cong_event->nb);
	if (err) {
		mlx5_core_warn(mdev, "Error registering notifier for the PCIe congestion event\n");
		goto err_obj_destroy;
	}

	priv->cong_event = cong_event;

	return 0;

err_obj_destroy:
	mlx5_cmd_pcie_cong_event_destroy(mdev, cong_event->obj_id);
err_free:
	kvfree(cong_event);

	return err;
}

void mlx5e_pcie_cong_event_cleanup(struct mlx5e_priv *priv)
{
	struct mlx5e_pcie_cong_event *cong_event = priv->cong_event;
	struct mlx5_core_dev *mdev = priv->mdev;

	if (!cong_event)
		return;

	priv->cong_event = NULL;

	mlx5_eq_notifier_unregister(mdev, &cong_event->nb);
	cancel_work_sync(&cong_event->work);

	if (mlx5_cmd_pcie_cong_event_destroy(mdev, cong_event->obj_id))
		mlx5_core_warn(mdev, "Error destroying PCIe congestion event (obj_id=%llu)\n",
			       cong_event->obj_id);

	kvfree(cong_event);
}
drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.h

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
/* SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB */
/* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. */

#ifndef __MLX5_PCIE_CONG_EVENT_H__
#define __MLX5_PCIE_CONG_EVENT_H__

int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv);
void mlx5e_pcie_cong_event_cleanup(struct mlx5e_priv *priv);

#endif /* __MLX5_PCIE_CONG_EVENT_H__ */
