Skip to content

Commit 8890ee6

Browse files
dtatuleakuba-moo
authored andcommitted
net/mlx5e: Add device PCIe congestion ethtool stats
Implement the PCIe Congestion Event notifier which triggers a work item to query the PCIe Congestion Event object. The result of the congestion state is reflected in the new ethtool stats: * pci_bw_inbound_high: the device has crossed the high threshold for inbound PCIe traffic. * pci_bw_inbound_low: the device has crossed the low threshold for inbound PCIe traffic * pci_bw_outbound_high: the device has crossed the high threshold for outbound PCIe traffic. * pci_bw_outbound_low: the device has crossed the low threshold for outbound PCIe traffic The high and low thresholds are currently configured at 90% and 75%. These are hysteresis thresholds which help to check if the PCI bus on the device side is in a congested state. If low + 1 = high then the device is in a congested state. If low == high then the device is not in a congested state. The counters are also documented. A follow-up patch will make the thresholds configurable. Signed-off-by: Dragos Tatulea <[email protected]> Signed-off-by: Tariq Toukan <[email protected]> Link: https://patch.msgid.link/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
1 parent ab2b0d4 commit 8890ee6

File tree

5 files changed

+212
-0
lines changed

5 files changed

+212
-0
lines changed

Documentation/networking/device_drivers/ethernet/mellanox/mlx5/counters.rst

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1341,3 +1341,35 @@ Device Counters
13411341
- The number of times the device owned queue had not enough buffers
13421342
allocated.
13431343
- Error
1344+
1345+
* - `pci_bw_inbound_high`
1346+
- The number of times the device crossed the high inbound PCIe bandwidth
1347+
threshold. To be compared to pci_bw_inbound_low to check if the device
1348+
is in a congested state.
1349+
If pci_bw_inbound_high == pci_bw_inbound_low then the device is not congested.
1350+
If pci_bw_inbound_high > pci_bw_inbound_low then the device is congested.
1351+
- Informative
1352+
1353+
* - `pci_bw_inbound_low`
1354+
- The number of times the device crossed the low inbound PCIe bandwidth
1355+
threshold. To be compared to pci_bw_inbound_high to check if the device
1356+
is in a congested state.
1357+
If pci_bw_inbound_high == pci_bw_inbound_low then the device is not congested.
1358+
If pci_bw_inbound_high > pci_bw_inbound_low then the device is congested.
1359+
- Informative
1360+
1361+
* - `pci_bw_outbound_high`
1362+
- The number of times the device crossed the high outbound PCIe bandwidth
1363+
threshold. To be compared to pci_bw_outbound_low to check if the device
1364+
is in a congested state.
1365+
If pci_bw_outbound_high == pci_bw_outbound_low then the device is not congested.
1366+
If pci_bw_outbound_high > pci_bw_outbound_low then the device is congested.
1367+
- Informative
1368+
1369+
* - `pci_bw_outbound_low`
1370+
- The number of times the device crossed the low outbound PCIe bandwidth
1371+
threshold. To be compared to pci_bw_outbound_high to check if the device
1372+
is in a congested state.
1373+
If pci_bw_outbound_high == pci_bw_outbound_low then the device is not congested.
1374+
If pci_bw_outbound_high > pci_bw_outbound_low then the device is congested.
1375+
- Informative

drivers/net/ethernet/mellanox/mlx5/core/en/pcie_cong_event.c

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,41 @@
44
#include "en.h"
55
#include "pcie_cong_event.h"
66

7+
/*
 * Value of the inbound_cong_state / outbound_cong_state object fields that
 * the query helper in this file treats as "congested".
 * NOTE(review): the encoding of the other field values is not visible here.
 */
#define MLX5E_CONG_HIGH_STATE 0x7
8+
9+
/* Per-direction bits of the congestion state mask built by the query helper. */
enum {
10+
MLX5E_INBOUND_CONG = BIT(0),
11+
MLX5E_OUTBOUND_CONG = BIT(1),
12+
};
13+
714
/*
 * PCIe congestion thresholds: one high/low pair per traffic direction.
 * The default configuration in this file gives the values in units of 0.01 %.
 */
struct mlx5e_pcie_cong_thresh {
815
u16 inbound_high;
916
u16 inbound_low;
1017
u16 outbound_high;
1118
u16 outbound_low;
1219
};
1320

21+
/*
 * Counters reported by the pcie_cong ethtool stats group.
 * For each direction, the work item increments the *_high counter when the
 * direction's congestion bit becomes set and the *_low counter when it
 * becomes clear.
 */
struct mlx5e_pcie_cong_stats {
22+
u32 pci_bw_inbound_high;
23+
u32 pci_bw_inbound_low;
24+
u32 pci_bw_outbound_high;
25+
u32 pci_bw_outbound_low;
26+
};
27+
1428
/* Per-netdev context for the PCIe congestion event object and its counters. */
struct mlx5e_pcie_cong_event {
1529
/* Id of the PCIe congestion event object, used by the query/destroy commands. */
u64 obj_id;
1630

1731
/* Owning priv; supplies the core device (mdev) and the workqueue (wq). */
struct mlx5e_priv *priv;
32+
33+
/* For event notifier and workqueue. */
34+
struct work_struct work;
35+
struct mlx5_nb nb;
36+
37+
/* Last state read by the work item: mask of MLX5E_INBOUND_CONG / MLX5E_OUTBOUND_CONG. */
38+
u8 state;
39+
40+
/* For ethtool stats group. */
41+
struct mlx5e_pcie_cong_stats stats;
1842
};
1943

2044
/* In units of 0.01 % */
@@ -25,6 +49,51 @@ static const struct mlx5e_pcie_cong_thresh default_thresh_config = {
2549
.outbound_low = 7500,
2650
};
2751

52+
/*
 * Descriptor table for the pcie_cong stats group: one entry per field of
 * struct mlx5e_pcie_cong_stats. The fill callbacks below iterate over this
 * table for both the counter names and the counter values.
 */
static const struct counter_desc mlx5e_pcie_cong_stats_desc[] = {
53+
{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
54+
pci_bw_inbound_high) },
55+
{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
56+
pci_bw_inbound_low) },
57+
{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
58+
pci_bw_outbound_high) },
59+
{ MLX5E_DECLARE_STAT(struct mlx5e_pcie_cong_stats,
60+
pci_bw_outbound_low) },
61+
};
62+
63+
/* Number of entries in mlx5e_pcie_cong_stats_desc. */
#define NUM_PCIE_CONG_COUNTERS ARRAY_SIZE(mlx5e_pcie_cong_stats_desc)
64+
65+
static MLX5E_DECLARE_STATS_GRP_OP_NUM_STATS(pcie_cong)
66+
{
67+
return priv->cong_event ? NUM_PCIE_CONG_COUNTERS : 0;
68+
}
69+
70+
/* No-op: this group's counters are incremented in mlx5e_pcie_cong_event_work(). */
static MLX5E_DECLARE_STATS_GRP_OP_UPDATE_STATS(pcie_cong) {}
71+
72+
static MLX5E_DECLARE_STATS_GRP_OP_FILL_STRS(pcie_cong)
73+
{
74+
if (!priv->cong_event)
75+
return;
76+
77+
for (int i = 0; i < NUM_PCIE_CONG_COUNTERS; i++)
78+
ethtool_puts(data, mlx5e_pcie_cong_stats_desc[i].format);
79+
}
80+
81+
static MLX5E_DECLARE_STATS_GRP_OP_FILL_STATS(pcie_cong)
82+
{
83+
if (!priv->cong_event)
84+
return;
85+
86+
for (int i = 0; i < NUM_PCIE_CONG_COUNTERS; i++) {
87+
u32 ctr = MLX5E_READ_CTR32_CPU(&priv->cong_event->stats,
88+
mlx5e_pcie_cong_stats_desc,
89+
i);
90+
91+
mlx5e_ethtool_put_stat(data, ctr);
92+
}
93+
}
94+
95+
/*
 * Define the pcie_cong stats group for the callbacks declared above.
 * NOTE(review): the meaning of the second argument (0) is defined by
 * MLX5E_DEFINE_STATS_GRP, which is not visible here - confirm.
 */
MLX5E_DEFINE_STATS_GRP(pcie_cong, 0);
96+
2897
static int
2998
mlx5_cmd_pcie_cong_event_set(struct mlx5_core_dev *dev,
3099
const struct mlx5e_pcie_cong_thresh *config,
@@ -89,6 +158,97 @@ static int mlx5_cmd_pcie_cong_event_destroy(struct mlx5_core_dev *dev,
89158
return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
90159
}
91160

161+
static int mlx5_cmd_pcie_cong_event_query(struct mlx5_core_dev *dev,
162+
u64 obj_id,
163+
u32 *state)
164+
{
165+
u32 in[MLX5_ST_SZ_DW(pcie_cong_event_cmd_in)] = {};
166+
u32 out[MLX5_ST_SZ_DW(pcie_cong_event_cmd_out)];
167+
void *obj;
168+
void *hdr;
169+
u8 cong;
170+
int err;
171+
172+
hdr = MLX5_ADDR_OF(pcie_cong_event_cmd_in, in, hdr);
173+
174+
MLX5_SET(general_obj_in_cmd_hdr, hdr, opcode,
175+
MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
176+
MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_type,
177+
MLX5_GENERAL_OBJECT_TYPES_PCIE_CONG_EVENT);
178+
MLX5_SET(general_obj_in_cmd_hdr, hdr, obj_id, obj_id);
179+
180+
err = mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
181+
if (err)
182+
return err;
183+
184+
obj = MLX5_ADDR_OF(pcie_cong_event_cmd_out, out, cong_obj);
185+
186+
if (state) {
187+
cong = MLX5_GET(pcie_cong_event_obj, obj, inbound_cong_state);
188+
if (cong == MLX5E_CONG_HIGH_STATE)
189+
*state |= MLX5E_INBOUND_CONG;
190+
191+
cong = MLX5_GET(pcie_cong_event_obj, obj, outbound_cong_state);
192+
if (cong == MLX5E_CONG_HIGH_STATE)
193+
*state |= MLX5E_OUTBOUND_CONG;
194+
}
195+
196+
return 0;
197+
}
198+
199+
static void mlx5e_pcie_cong_event_work(struct work_struct *work)
200+
{
201+
struct mlx5e_pcie_cong_event *cong_event;
202+
struct mlx5_core_dev *dev;
203+
struct mlx5e_priv *priv;
204+
u32 new_cong_state = 0;
205+
u32 changes;
206+
int err;
207+
208+
cong_event = container_of(work, struct mlx5e_pcie_cong_event, work);
209+
priv = cong_event->priv;
210+
dev = priv->mdev;
211+
212+
err = mlx5_cmd_pcie_cong_event_query(dev, cong_event->obj_id,
213+
&new_cong_state);
214+
if (err) {
215+
mlx5_core_warn(dev, "Error %d when querying PCIe cong event object (obj_id=%llu).\n",
216+
err, cong_event->obj_id);
217+
return;
218+
}
219+
220+
changes = cong_event->state ^ new_cong_state;
221+
if (!changes)
222+
return;
223+
224+
cong_event->state = new_cong_state;
225+
226+
if (changes & MLX5E_INBOUND_CONG) {
227+
if (new_cong_state & MLX5E_INBOUND_CONG)
228+
cong_event->stats.pci_bw_inbound_high++;
229+
else
230+
cong_event->stats.pci_bw_inbound_low++;
231+
}
232+
233+
if (changes & MLX5E_OUTBOUND_CONG) {
234+
if (new_cong_state & MLX5E_OUTBOUND_CONG)
235+
cong_event->stats.pci_bw_outbound_high++;
236+
else
237+
cong_event->stats.pci_bw_outbound_low++;
238+
}
239+
}
240+
241+
static int mlx5e_pcie_cong_event_handler(struct notifier_block *nb,
242+
unsigned long event, void *eqe)
243+
{
244+
struct mlx5e_pcie_cong_event *cong_event;
245+
246+
cong_event = mlx5_nb_cof(nb, struct mlx5e_pcie_cong_event, nb);
247+
queue_work(cong_event->priv->wq, &cong_event->work);
248+
249+
return NOTIFY_OK;
250+
}
251+
92252
int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv)
93253
{
94254
struct mlx5e_pcie_cong_event *cong_event;
@@ -103,6 +263,10 @@ int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv)
103263
if (!cong_event)
104264
return -ENOMEM;
105265

266+
INIT_WORK(&cong_event->work, mlx5e_pcie_cong_event_work);
267+
MLX5_NB_INIT(&cong_event->nb, mlx5e_pcie_cong_event_handler,
268+
OBJECT_CHANGE);
269+
106270
cong_event->priv = priv;
107271

108272
err = mlx5_cmd_pcie_cong_event_set(mdev, &default_thresh_config,
@@ -112,10 +276,18 @@ int mlx5e_pcie_cong_event_init(struct mlx5e_priv *priv)
112276
goto err_free;
113277
}
114278

279+
err = mlx5_eq_notifier_register(mdev, &cong_event->nb);
280+
if (err) {
281+
mlx5_core_warn(mdev, "Error registering notifier for the PCIe congestion event\n");
282+
goto err_obj_destroy;
283+
}
284+
115285
priv->cong_event = cong_event;
116286

117287
return 0;
118288

289+
err_obj_destroy:
290+
mlx5_cmd_pcie_cong_event_destroy(mdev, cong_event->obj_id);
119291
err_free:
120292
kvfree(cong_event);
121293

@@ -132,6 +304,9 @@ void mlx5e_pcie_cong_event_cleanup(struct mlx5e_priv *priv)
132304

133305
priv->cong_event = NULL;
134306

307+
mlx5_eq_notifier_unregister(mdev, &cong_event->nb);
308+
cancel_work_sync(&cong_event->work);
309+
135310
if (mlx5_cmd_pcie_cong_event_destroy(mdev, cong_event->obj_id))
136311
mlx5_core_warn(mdev, "Error destroying PCIe congestion event (obj_id=%llu)\n",
137312
cong_event->obj_id);

drivers/net/ethernet/mellanox/mlx5/core/en_stats.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2612,6 +2612,7 @@ mlx5e_stats_grp_t mlx5e_nic_stats_grps[] = {
26122612
#ifdef CONFIG_MLX5_MACSEC
26132613
&MLX5E_STATS_GRP(macsec_hw),
26142614
#endif
2615+
&MLX5E_STATS_GRP(pcie_cong),
26152616
};
26162617

26172618
unsigned int mlx5e_nic_stats_grps_num(struct mlx5e_priv *priv)

drivers/net/ethernet/mellanox/mlx5/core/en_stats.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,5 +535,6 @@ extern MLX5E_DECLARE_STATS_GRP(ipsec_hw);
535535
extern MLX5E_DECLARE_STATS_GRP(ipsec_sw);
536536
extern MLX5E_DECLARE_STATS_GRP(ptp);
537537
extern MLX5E_DECLARE_STATS_GRP(macsec_hw);
538+
extern MLX5E_DECLARE_STATS_GRP(pcie_cong);
538539

539540
#endif /* __MLX5_EN_STATS_H__ */

drivers/net/ethernet/mellanox/mlx5/core/eq.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,9 @@ static void gather_async_events_mask(struct mlx5_core_dev *dev, u64 mask[4])
585585
async_event_mask |=
586586
(1ull << MLX5_EVENT_TYPE_OBJECT_CHANGE);
587587

588+
if (mlx5_pcie_cong_event_supported(dev))
589+
async_event_mask |= (1ull << MLX5_EVENT_TYPE_OBJECT_CHANGE);
590+
588591
mask[0] = async_event_mask;
589592

590593
if (MLX5_CAP_GEN(dev, event_cap))

0 commit comments

Comments
 (0)