Skip to content

Commit 85d6164

Browse files
konradknitteranguy11
authored andcommitted
ice: add fw and port health reporters
Firmware generates events for global events or port specific events. Driver shall subscribe for health status events from firmware on supported FW versions >= 1.7.6. Driver shall expose those under specific health reporter, two new reporters are introduced: - FW health reporter shall represent global events (problems with the image, recovery mode); - Port health reporter shall represent port-specific events (module failure). Firmware only reports problems when those are detected, it does not store active fault list. Driver will hold only last global and last port-specific event. Driver will report all events via devlink health report, so in case of multiple events of the same source they can be reviewed using devlink autodump feature. $ devlink health pci/0000:b1:00.3: reporter fw state healthy error 0 recover 0 auto_dump true reporter port state error error 1 recover 0 last_dump_date 2024-03-17 last_dump_time 09:29:29 auto_dump true $ devlink health diagnose pci/0000:b1:00.3 reporter port Syndrome: 262 Description: Module is not present. Possible Solution: Check that the module is inserted correctly. Port Number: 0 Tested on Intel Corporation Ethernet Controller E810-C for SFP Reviewed-by: Marcin Szycik <[email protected]> Co-developed-by: Sharon Haroni <[email protected]> Signed-off-by: Sharon Haroni <[email protected]> Co-developed-by: Nicholas Nunley <[email protected]> Signed-off-by: Nicholas Nunley <[email protected]> Co-developed-by: Brett Creeley <[email protected]> Signed-off-by: Brett Creeley <[email protected]> Signed-off-by: Konrad Knitter <[email protected]> Tested-by: Rinitha S <[email protected]> (A Contingent worker at Intel) Signed-off-by: Tony Nguyen <[email protected]>
1 parent e81e1d7 commit 85d6164

File tree

7 files changed

+437
-8
lines changed

7 files changed

+437
-8
lines changed

drivers/net/ethernet/intel/ice/devlink/health.c

Lines changed: 288 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,270 @@
11
// SPDX-License-Identifier: GPL-2.0
22
/* Copyright (c) 2024, Intel Corporation. */
33

4-
#include "health.h"
54
#include "ice.h"
5+
#include "ice_adminq_cmd.h" /* for enum ice_aqc_health_status_elem */
6+
#include "health.h"
67

78
#define ICE_DEVLINK_FMSG_PUT_FIELD(fmsg, obj, name) \
89
devlink_fmsg_put(fmsg, #name, (obj)->name)
910

11+
#define ICE_HEALTH_STATUS_DATA_SIZE 2
12+
13+
struct ice_health_status {
14+
enum ice_aqc_health_status code;
15+
const char *description;
16+
const char *solution;
17+
const char *data_label[ICE_HEALTH_STATUS_DATA_SIZE];
18+
};
19+
20+
/*
 * In addition to the health status codes provided below, the firmware might
 * generate Health Status Codes that are not pertinent to the end-user.
 * For instance, Health Code 0x1002 is triggered when the command fails.
 * Such codes should be disregarded by the end-user.
 * The lookup table below (ice_health_status_lookup) must be kept sorted by
 * code, because ice_get_health_status() searches it with bsearch().
 */
27+
28+
/*
 * Strings shared by several entries of the health status lookup table.
 *
 * They are arrays rather than "const char *const" pointers on purpose: an
 * array name is an address constant and therefore a valid initializer for an
 * object with static storage duration in standard C. The *value* of a
 * const-qualified pointer variable is not a constant expression, and compilers
 * that do not accept it as an extension fail with "initializer element is not
 * constant" when such a variable is used inside a static table.
 */
static const char ice_common_port_solutions[] =
	"Check your cable connection. Change or replace the module or cable. Manually set speed and duplex.";
static const char ice_port_number_label[] = "Port Number";
static const char ice_update_nvm_solution[] = "Update to the latest NVM image.";
32+
33+
static const struct ice_health_status ice_health_status_lookup[] = {
34+
{ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_STRICT, "An unsupported module was detected.",
35+
ice_common_port_solutions, {ice_port_number_label}},
36+
{ICE_AQC_HEALTH_STATUS_ERR_MOD_TYPE, "Module type is not supported.",
37+
"Change or replace the module or cable.", {ice_port_number_label}},
38+
{ICE_AQC_HEALTH_STATUS_ERR_MOD_QUAL, "Module is not qualified.",
39+
ice_common_port_solutions, {ice_port_number_label}},
40+
{ICE_AQC_HEALTH_STATUS_ERR_MOD_COMM,
41+
"Device cannot communicate with the module.",
42+
"Check your cable connection. Change or replace the module or cable. Manually set speed and duplex.",
43+
{ice_port_number_label}},
44+
{ICE_AQC_HEALTH_STATUS_ERR_MOD_CONFLICT, "Unresolved module conflict.",
45+
"Manually set speed/duplex or change the port option. If the problem persists, use a cable/module that is found in the supported modules and cables list for this device.",
46+
{ice_port_number_label}},
47+
{ICE_AQC_HEALTH_STATUS_ERR_MOD_NOT_PRESENT, "Module is not present.",
48+
"Check that the module is inserted correctly. If the problem persists, use a cable/module that is found in the supported modules and cables list for this device.",
49+
{ice_port_number_label}},
50+
{ICE_AQC_HEALTH_STATUS_INFO_MOD_UNDERUTILIZED, "Underutilized module.",
51+
"Change or replace the module or cable. Change the port option.",
52+
{ice_port_number_label}},
53+
{ICE_AQC_HEALTH_STATUS_ERR_UNKNOWN_MOD_LENIENT, "An unsupported module was detected.",
54+
ice_common_port_solutions, {ice_port_number_label}},
55+
{ICE_AQC_HEALTH_STATUS_ERR_INVALID_LINK_CFG, "Invalid link configuration.",
56+
NULL, {ice_port_number_label}},
57+
{ICE_AQC_HEALTH_STATUS_ERR_PORT_ACCESS, "Port hardware access error.",
58+
ice_update_nvm_solution, {ice_port_number_label}},
59+
{ICE_AQC_HEALTH_STATUS_ERR_PORT_UNREACHABLE, "A port is unreachable.",
60+
"Change the port option. Update to the latest NVM image."},
61+
{ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_MOD_LIMITED, "Port speed is limited due to module.",
62+
"Change the module or configure the port option to match the current module speed. Change the port option.",
63+
{ice_port_number_label}},
64+
{ICE_AQC_HEALTH_STATUS_ERR_PARALLEL_FAULT,
65+
"All configured link modes were attempted but failed to establish link. The device will restart the process to establish link.",
66+
"Check link partner connection and configuration.",
67+
{ice_port_number_label}},
68+
{ICE_AQC_HEALTH_STATUS_INFO_PORT_SPEED_PHY_LIMITED,
69+
"Port speed is limited by PHY capabilities.",
70+
"Change the module to align to port option.", {ice_port_number_label}},
71+
{ICE_AQC_HEALTH_STATUS_ERR_NETLIST_TOPO, "LOM topology netlist is corrupted.",
72+
ice_update_nvm_solution, {ice_port_number_label}},
73+
{ICE_AQC_HEALTH_STATUS_ERR_NETLIST, "Unrecoverable netlist error.",
74+
ice_update_nvm_solution, {ice_port_number_label}},
75+
{ICE_AQC_HEALTH_STATUS_ERR_TOPO_CONFLICT, "Port topology conflict.",
76+
"Change the port option. Update to the latest NVM image."},
77+
{ICE_AQC_HEALTH_STATUS_ERR_LINK_HW_ACCESS, "Unrecoverable hardware access error.",
78+
ice_update_nvm_solution, {ice_port_number_label}},
79+
{ICE_AQC_HEALTH_STATUS_ERR_LINK_RUNTIME, "Unrecoverable runtime error.",
80+
ice_update_nvm_solution, {ice_port_number_label}},
81+
{ICE_AQC_HEALTH_STATUS_ERR_DNL_INIT, "Link management engine failed to initialize.",
82+
ice_update_nvm_solution, {ice_port_number_label}},
83+
{ICE_AQC_HEALTH_STATUS_ERR_PHY_FW_LOAD,
84+
"Failed to load the firmware image in the external PHY.",
85+
ice_update_nvm_solution, {ice_port_number_label}},
86+
{ICE_AQC_HEALTH_STATUS_INFO_RECOVERY, "The device is in firmware recovery mode.",
87+
ice_update_nvm_solution, {"Extended Error"}},
88+
{ICE_AQC_HEALTH_STATUS_ERR_FLASH_ACCESS, "The flash chip cannot be accessed.",
89+
"If issue persists, call customer support.", {"Access Type"}},
90+
{ICE_AQC_HEALTH_STATUS_ERR_NVM_AUTH, "NVM authentication failed.",
91+
ice_update_nvm_solution},
92+
{ICE_AQC_HEALTH_STATUS_ERR_OROM_AUTH, "Option ROM authentication failed.",
93+
ice_update_nvm_solution},
94+
{ICE_AQC_HEALTH_STATUS_ERR_DDP_AUTH, "DDP package authentication failed.",
95+
"Update to latest base driver and DDP package."},
96+
{ICE_AQC_HEALTH_STATUS_ERR_NVM_COMPAT, "NVM image is incompatible.",
97+
ice_update_nvm_solution},
98+
{ICE_AQC_HEALTH_STATUS_ERR_OROM_COMPAT, "Option ROM is incompatible.",
99+
ice_update_nvm_solution, {"Expected PCI Device ID", "Expected Module ID"}},
100+
{ICE_AQC_HEALTH_STATUS_ERR_DCB_MIB,
101+
"Supplied MIB file is invalid. DCB reverted to default configuration.",
102+
"Disable FW-LLDP and check DCBx system configuration.",
103+
{ice_port_number_label, "MIB ID"}},
104+
};
105+
106+
static int ice_health_status_lookup_compare(const void *a, const void *b)
107+
{
108+
return ((struct ice_health_status *)a)->code - ((struct ice_health_status *)b)->code;
109+
}
110+
111+
/**
 * ice_get_health_status - find the lookup entry for a health status code
 * @code: health status code in CPU byte order
 *
 * Return: pointer to the matching entry of ice_health_status_lookup, or NULL
 * when the table has no entry for @code.
 */
static const struct ice_health_status *ice_get_health_status(u16 code)
{
	struct ice_health_status needle = { .code = code };

	return bsearch(&needle, ice_health_status_lookup,
		       ARRAY_SIZE(ice_health_status_lookup),
		       sizeof(ice_health_status_lookup[0]),
		       ice_health_status_lookup_compare);
}
118+
119+
static void ice_describe_status_code(struct devlink_fmsg *fmsg,
120+
struct ice_aqc_health_status_elem *hse)
121+
{
122+
static const char *const aux_label[] = { "Aux Data 1", "Aux Data 2" };
123+
const struct ice_health_status *health_code;
124+
u32 internal_data[2];
125+
u16 status_code;
126+
127+
status_code = le16_to_cpu(hse->health_status_code);
128+
129+
devlink_fmsg_put(fmsg, "Syndrome", status_code);
130+
if (status_code) {
131+
internal_data[0] = le32_to_cpu(hse->internal_data1);
132+
internal_data[1] = le32_to_cpu(hse->internal_data2);
133+
134+
health_code = ice_get_health_status(status_code);
135+
if (!health_code)
136+
return;
137+
138+
devlink_fmsg_string_pair_put(fmsg, "Description", health_code->description);
139+
if (health_code->solution)
140+
devlink_fmsg_string_pair_put(fmsg, "Possible Solution",
141+
health_code->solution);
142+
143+
for (size_t i = 0; i < ICE_HEALTH_STATUS_DATA_SIZE; i++) {
144+
if (internal_data[i] != ICE_AQC_HEALTH_STATUS_UNDEFINED_DATA)
145+
devlink_fmsg_u32_pair_put(fmsg,
146+
health_code->data_label[i] ?
147+
health_code->data_label[i] :
148+
aux_label[i],
149+
internal_data[i]);
150+
}
151+
}
152+
}
153+
154+
static int
155+
ice_port_reporter_diagnose(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg,
156+
struct netlink_ext_ack *extack)
157+
{
158+
struct ice_pf *pf = devlink_health_reporter_priv(reporter);
159+
160+
ice_describe_status_code(fmsg, &pf->health_reporters.port_status);
161+
return 0;
162+
}
163+
164+
static int
165+
ice_port_reporter_dump(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg,
166+
void *priv_ctx, struct netlink_ext_ack __always_unused *extack)
167+
{
168+
struct ice_pf *pf = devlink_health_reporter_priv(reporter);
169+
170+
ice_describe_status_code(fmsg, &pf->health_reporters.port_status);
171+
return 0;
172+
}
173+
174+
static int
175+
ice_fw_reporter_diagnose(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg,
176+
struct netlink_ext_ack *extack)
177+
{
178+
struct ice_pf *pf = devlink_health_reporter_priv(reporter);
179+
180+
ice_describe_status_code(fmsg, &pf->health_reporters.fw_status);
181+
return 0;
182+
}
183+
184+
static int
185+
ice_fw_reporter_dump(struct devlink_health_reporter *reporter, struct devlink_fmsg *fmsg,
186+
void *priv_ctx, struct netlink_ext_ack *extack)
187+
{
188+
struct ice_pf *pf = devlink_health_reporter_priv(reporter);
189+
190+
ice_describe_status_code(fmsg, &pf->health_reporters.fw_status);
191+
return 0;
192+
}
193+
194+
/**
 * ice_config_health_events - enable or disable FW health status events
 * @pf: pointer to the PF structure
 * @enable: true to request PF-specific and global events, false to request none
 *
 * A failure of the admin queue command is logged and not propagated.
 */
static void ice_config_health_events(struct ice_pf *pf, bool enable)
{
	u8 event_mask = enable ? (ICE_AQC_HEALTH_STATUS_SET_PF_SPECIFIC_MASK |
				  ICE_AQC_HEALTH_STATUS_SET_GLOBAL_MASK) : 0;
	int err;

	err = ice_aq_set_health_status_cfg(&pf->hw, event_mask);
	if (!err)
		return;

	dev_err(ice_pf_to_dev(pf), "Failed to %s firmware health events, err %d aq_err %s\n",
		str_enable_disable(enable), err,
		ice_aq_str(pf->hw.adminq.sq_last_status));
}
209+
210+
/**
211+
* ice_process_health_status_event - Process the health status event from FW
212+
* @pf: pointer to the PF structure
213+
* @event: event structure containing the Health Status Event opcode
214+
*
215+
* Decode the Health Status Events and print the associated messages
216+
*/
217+
void ice_process_health_status_event(struct ice_pf *pf, struct ice_rq_event_info *event)
218+
{
219+
const struct ice_aqc_health_status_elem *health_info;
220+
u16 count;
221+
222+
health_info = (struct ice_aqc_health_status_elem *)event->msg_buf;
223+
count = le16_to_cpu(event->desc.params.get_health_status.health_status_count);
224+
225+
if (count > (event->buf_len / sizeof(*health_info))) {
226+
dev_err(ice_pf_to_dev(pf), "Received a health status event with invalid element count\n");
227+
return;
228+
}
229+
230+
for (size_t i = 0; i < count; i++) {
231+
const struct ice_health_status *health_code;
232+
u16 status_code;
233+
234+
status_code = le16_to_cpu(health_info->health_status_code);
235+
health_code = ice_get_health_status(status_code);
236+
237+
if (health_code) {
238+
switch (le16_to_cpu(health_info->event_source)) {
239+
case ICE_AQC_HEALTH_STATUS_GLOBAL:
240+
pf->health_reporters.fw_status = *health_info;
241+
devlink_health_report(pf->health_reporters.fw,
242+
"FW syndrome reported", NULL);
243+
break;
244+
case ICE_AQC_HEALTH_STATUS_PF:
245+
case ICE_AQC_HEALTH_STATUS_PORT:
246+
pf->health_reporters.port_status = *health_info;
247+
devlink_health_report(pf->health_reporters.port,
248+
"Port syndrome reported", NULL);
249+
break;
250+
default:
251+
dev_err(ice_pf_to_dev(pf), "Health code with unknown source\n");
252+
}
253+
} else {
254+
u32 data1, data2;
255+
u16 source;
256+
257+
source = le16_to_cpu(health_info->event_source);
258+
data1 = le32_to_cpu(health_info->internal_data1);
259+
data2 = le32_to_cpu(health_info->internal_data2);
260+
dev_dbg(ice_pf_to_dev(pf),
261+
"Received internal health status code 0x%08x, source: 0x%08x, data1: 0x%08x, data2: 0x%08x",
262+
status_code, source, data1, data2);
263+
}
264+
health_info++;
265+
}
266+
}
267+
10268
/**
11269
* ice_devlink_health_report - boilerplate to call given @reporter
12270
*
@@ -203,14 +461,26 @@ ice_init_devlink_rep(struct ice_pf *pf,
203461
return rep;
204462
}
205463

206-
#define ICE_DEFINE_HEALTH_REPORTER_OPS(_name) \
207-
static const struct devlink_health_reporter_ops ice_ ## _name ## _reporter_ops = { \
464+
#define ICE_HEALTH_REPORTER_OPS_FIELD(_name, _field) \
465+
._field = ice_##_name##_reporter_##_field,
466+
467+
#define ICE_DEFINE_HEALTH_REPORTER_OPS_1(_name, _field1) \
468+
static const struct devlink_health_reporter_ops ice_##_name##_reporter_ops = { \
208469
.name = #_name, \
209-
.dump = ice_ ## _name ## _reporter_dump, \
210-
}
470+
ICE_HEALTH_REPORTER_OPS_FIELD(_name, _field1) \
471+
}
472+
473+
#define ICE_DEFINE_HEALTH_REPORTER_OPS_2(_name, _field1, _field2) \
474+
static const struct devlink_health_reporter_ops ice_##_name##_reporter_ops = { \
475+
.name = #_name, \
476+
ICE_HEALTH_REPORTER_OPS_FIELD(_name, _field1) \
477+
ICE_HEALTH_REPORTER_OPS_FIELD(_name, _field2) \
478+
}
211479

212-
ICE_DEFINE_HEALTH_REPORTER_OPS(mdd);
213-
ICE_DEFINE_HEALTH_REPORTER_OPS(tx_hang);
480+
ICE_DEFINE_HEALTH_REPORTER_OPS_1(mdd, dump);
481+
ICE_DEFINE_HEALTH_REPORTER_OPS_1(tx_hang, dump);
482+
ICE_DEFINE_HEALTH_REPORTER_OPS_2(fw, dump, diagnose);
483+
ICE_DEFINE_HEALTH_REPORTER_OPS_2(port, dump, diagnose);
214484

215485
/**
216486
* ice_health_init - allocate and init all ice devlink health reporters and
@@ -224,6 +494,12 @@ void ice_health_init(struct ice_pf *pf)
224494

225495
reps->mdd = ice_init_devlink_rep(pf, &ice_mdd_reporter_ops);
226496
reps->tx_hang = ice_init_devlink_rep(pf, &ice_tx_hang_reporter_ops);
497+
498+
if (ice_is_fw_health_report_supported(&pf->hw)) {
499+
reps->fw = ice_init_devlink_rep(pf, &ice_fw_reporter_ops);
500+
reps->port = ice_init_devlink_rep(pf, &ice_port_reporter_ops);
501+
ice_config_health_events(pf, true);
502+
}
227503
}
228504

229505
/**
@@ -246,6 +522,11 @@ void ice_health_deinit(struct ice_pf *pf)
246522
{
247523
ice_deinit_devl_reporter(pf->health_reporters.mdd);
248524
ice_deinit_devl_reporter(pf->health_reporters.tx_hang);
525+
if (ice_is_fw_health_report_supported(&pf->hw)) {
526+
ice_deinit_devl_reporter(pf->health_reporters.fw);
527+
ice_deinit_devl_reporter(pf->health_reporters.port);
528+
ice_config_health_events(pf, false);
529+
}
249530
}
250531

251532
static

drivers/net/ethernet/intel/ice/devlink/health.h

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,10 @@
1313
* devlink health mechanism for ice driver.
1414
*/
1515

16+
struct ice_aqc_health_status_elem;
1617
struct ice_pf;
1718
struct ice_tx_ring;
19+
struct ice_rq_event_info;
1820

1921
enum ice_mdd_src {
2022
ICE_MDD_SRC_TX_PQM,
@@ -25,26 +27,37 @@ enum ice_mdd_src {
2527

2628
/**
2729
* struct ice_health - stores ice devlink health reporters and accompanied data
28-
* @tx_hang: devlink health reporter for tx_hang event
30+
* @fw: devlink health reporter for FW Health Status events
2931
* @mdd: devlink health reporter for MDD detection event
32+
* @port: devlink health reporter for Port Health Status events
33+
* @tx_hang: devlink health reporter for tx_hang event
3034
* @tx_hang_buf: pre-allocated place to put info for Tx hang reporter from
3135
* non-sleeping context
3236
* @tx_ring: ring that the hang occurred on
3337
* @head: descriptor head
3438
* @intr: interrupt register value
3539
* @vsi_num: VSI owning the queue that the hang occurred on
40+
* @fw_status: buffer for last received FW Status event
41+
* @port_status: buffer for last received Port Status event
3642
*/
3743
struct ice_health {
44+
struct devlink_health_reporter *fw;
3845
struct devlink_health_reporter *mdd;
46+
struct devlink_health_reporter *port;
3947
struct devlink_health_reporter *tx_hang;
4048
struct_group_tagged(ice_health_tx_hang_buf, tx_hang_buf,
4149
struct ice_tx_ring *tx_ring;
4250
u32 head;
4351
u32 intr;
4452
u16 vsi_num;
4553
);
54+
struct ice_aqc_health_status_elem fw_status;
55+
struct ice_aqc_health_status_elem port_status;
4656
};
4757

58+
void ice_process_health_status_event(struct ice_pf *pf,
59+
struct ice_rq_event_info *event);
60+
4861
void ice_health_init(struct ice_pf *pf);
4962
void ice_health_deinit(struct ice_pf *pf);
5063
void ice_health_clear(struct ice_pf *pf);

0 commit comments

Comments
 (0)