Skip to content

Commit f8910ff

Browse files
Xianting Tian authored and cminyard committed
ipmi:msghandler: retry to get device id on an error
We fail to get the BMC's device id with low probability when loading the ipmi driver, and this causes BMC device registration to fail. When this issue occurs we get the kernel prints below: [Wed Sep 9 19:52:03 2020] ipmi_si IPI0001:00: IPMI message handler: device id demangle failed: -22 [Wed Sep 9 19:52:03 2020] IPMI BT: using default values [Wed Sep 9 19:52:03 2020] IPMI BT: req2rsp=5 secs retries=2 [Wed Sep 9 19:52:03 2020] ipmi_si IPI0001:00: Unable to get the device id: -5 [Wed Sep 9 19:52:04 2020] ipmi_si IPI0001:00: Unable to register device: error -5 When this issue happens, we want to manually unload the driver and try to load it again, but it can't be unloaded by 'rmmod' as it is already 'in use'. We added a print in handle_one_recv_msg(); when this issue happens, the msg we receive is "Recv: 1c 01 d5", which means the data_len is 1 and data[0] is 0xd5 (completion code), which means "bmc cannot execute command. Command, or request parameter(s), not supported in present state". Debug code: static int handle_one_recv_msg(struct ipmi_smi *intf, struct ipmi_smi_msg *msg) { printk("Recv: %*ph\n", msg->rsp_size, msg->rsp); ... ... } Then ipmi_demangle_device_id() returned '-EINVAL' because 'data_len < 7' and 'data[0] != 0'. We created this patch to retry the get device id command when this error happens. We reproduced this issue again and the retry succeeded on the first attempt; we finally got the correct msg and then all was ok: Recv: 1c 01 00 01 81 05 84 02 af db 07 00 01 00 b9 00 10 00 So use a retry mechanism in this patch to give the bmc more opportunities to respond correctly to the kernel when we receive specific completion codes. Signed-off-by: Xianting Tian <[email protected]> Message-Id: <[email protected]> [Cleaned up the verbiage a bit in the header and prints.] Signed-off-by: Corey Minyard <[email protected]>
1 parent c2b1e76 commit f8910ff

File tree

2 files changed

+27
-4
lines changed

2 files changed

+27
-4
lines changed

drivers/char/ipmi/ipmi_msghandler.c

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include <linux/uuid.h>
3535
#include <linux/nospec.h>
3636
#include <linux/vmalloc.h>
37+
#include <linux/delay.h>
3738

3839
#define IPMI_DRIVER_VERSION "39.2"
3940

@@ -60,6 +61,9 @@ enum ipmi_panic_event_op {
6061
#else
6162
#define IPMI_PANIC_DEFAULT IPMI_SEND_PANIC_EVENT_NONE
6263
#endif
64+
65+
#define GET_DEVICE_ID_MAX_RETRY 5
66+
6367
static enum ipmi_panic_event_op ipmi_send_panic_event = IPMI_PANIC_DEFAULT;
6468

6569
static int panic_op_write_handler(const char *val,
@@ -317,6 +321,7 @@ struct bmc_device {
317321
int dyn_guid_set;
318322
struct kref usecount;
319323
struct work_struct remove_work;
324+
char cc; /* completion code */
320325
};
321326
#define to_bmc_device(x) container_of((x), struct bmc_device, pdev.dev)
322327

@@ -2381,6 +2386,8 @@ static void bmc_device_id_handler(struct ipmi_smi *intf,
23812386
msg->msg.data, msg->msg.data_len, &intf->bmc->fetch_id);
23822387
if (rv) {
23832388
dev_warn(intf->si_dev, "device id demangle failed: %d\n", rv);
2389+
/* record completion code when error */
2390+
intf->bmc->cc = msg->msg.data[0];
23842391
intf->bmc->dyn_id_set = 0;
23852392
} else {
23862393
/*
@@ -2426,19 +2433,34 @@ send_get_device_id_cmd(struct ipmi_smi *intf)
24262433
static int __get_device_id(struct ipmi_smi *intf, struct bmc_device *bmc)
24272434
{
24282435
int rv;
2429-
2430-
bmc->dyn_id_set = 2;
2436+
unsigned int retry_count = 0;
24312437

24322438
intf->null_user_handler = bmc_device_id_handler;
24332439

2440+
retry:
2441+
bmc->cc = 0;
2442+
bmc->dyn_id_set = 2;
2443+
24342444
rv = send_get_device_id_cmd(intf);
24352445
if (rv)
24362446
goto out_reset_handler;
24372447

24382448
wait_event(intf->waitq, bmc->dyn_id_set != 2);
24392449

2440-
if (!bmc->dyn_id_set)
2450+
if (!bmc->dyn_id_set) {
2451+
if ((bmc->cc == IPMI_DEVICE_IN_FW_UPDATE_ERR
2452+
|| bmc->cc == IPMI_DEVICE_IN_INIT_ERR
2453+
|| bmc->cc == IPMI_NOT_IN_MY_STATE_ERR)
2454+
&& ++retry_count <= GET_DEVICE_ID_MAX_RETRY) {
2455+
msleep(500);
2456+
dev_warn(intf->si_dev,
2457+
"BMC returned 0x%2.2x, retry get bmc device id\n",
2458+
bmc->cc);
2459+
goto retry;
2460+
}
2461+
24412462
rv = -EIO; /* Something went wrong in the fetch. */
2463+
}
24422464

24432465
/* dyn_id_set makes the id data available. */
24442466
smp_rmb();
@@ -3246,7 +3268,6 @@ channel_handler(struct ipmi_smi *intf, struct ipmi_recv_msg *msg)
32463268
/* It's the one we want */
32473269
if (msg->msg.data[0] != 0) {
32483270
/* Got an error from the channel, just go on. */
3249-
32503271
if (msg->msg.data[0] == IPMI_INVALID_COMMAND_ERR) {
32513272
/*
32523273
* If the MC does not support this

include/uapi/linux/ipmi_msgdefs.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,8 @@
6969
#define IPMI_ERR_MSG_TRUNCATED 0xc6
7070
#define IPMI_REQ_LEN_INVALID_ERR 0xc7
7171
#define IPMI_REQ_LEN_EXCEEDED_ERR 0xc8
72+
#define IPMI_DEVICE_IN_FW_UPDATE_ERR 0xd1
73+
#define IPMI_DEVICE_IN_INIT_ERR 0xd2
7274
#define IPMI_NOT_IN_MY_STATE_ERR 0xd5 /* IPMI 2.0 */
7375
#define IPMI_LOST_ARBITRATION_ERR 0x81
7476
#define IPMI_BUS_ERR 0x82

0 commit comments

Comments
 (0)