Skip to content

Commit 3a6f729

Browse files
committed
Some robust fixes for USB
This commit makes the following robustness changes for rshim/USB, mainly on the DPU BMC. - Reset USB at runtime if a read still times out after all the retries. - Reset USB during probe if a low-speed device is found. This avoids an issue where rshim on the DPU BMC is sometimes stuck at low speed for the device detected by UEFI and never gets a chance to re-probe and switch to high speed (EHCI). RM #4829848
1 parent 1fee512 commit 3a6f729

File tree

2 files changed

+70
-7
lines changed

2 files changed

+70
-7
lines changed

src/rshim.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2908,7 +2908,7 @@ int rshim_init(int *epollfd, int *timerfd)
29082908
}
29092909

29102910
/* Add periodic timer. */
2911-
timer_fd = timerfd_create(CLOCK_MONOTONIC, 0);
2911+
timer_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC);
29122912
if (timer_fd == -1) {
29132913
RSHIM_ERR("timerfd_create failed\n");
29142914
exit(1);

src/rshim_usb.c

Lines changed: 69 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
#define USB_BLUEFIELD_2_PRODUCT_ID 0x0214 /* Mellanox Bluefield-2 */
1919
#define USB_BLUEFIELD_3_PRODUCT_ID 0x021c /* Mellanox Bluefield-3 */
2020

21-
#define READ_RETRIES 5
21+
#define READ_RETRIES 3
2222
#define WRITE_RETRIES 5
2323

2424
#define BF_MMIO_BASE 0x1000
@@ -57,7 +57,7 @@ typedef struct {
5757

5858
static libusb_context *rshim_usb_ctx;
5959
static int rshim_usb_epoll_fd;
60-
static bool rshim_usb_need_probe;
60+
static volatile bool rshim_usb_need_probe;
6161

6262
int rshim_usb_timeout = RSHIM_USB_TIMEOUT;
6363
#define RSHIM_USB_TIMEOUT_MS (rshim_usb_timeout * 1000)
@@ -341,6 +341,7 @@ static void rshim_usb_fifo_read_callback(struct libusb_transfer *urb)
341341
rshim_usb_t *dev = urb->user_data;
342342
rshim_backend_t *bd = &dev->bd;
343343
bool lock;
344+
int rc;
344345

345346
RSHIM_DBG("rshim%d(fifo_read_callback) %s urb completed, status %d, "
346347
"actual length %d, intr buf 0x%x\n",
@@ -383,18 +384,17 @@ static void rshim_usb_fifo_read_callback(struct libusb_transfer *urb)
383384
case LIBUSB_TRANSFER_STALL:
384385
case LIBUSB_TRANSFER_OVERFLOW:
385386
if (dev->read_or_intr_retries < READ_RETRIES && urb->actual_length == 0) {
387+
RSHIM_INFO("rshim%d(fifo_read_callback) retry\n", bd->index);
386388
/*
387389
* We got an error which could benefit from being retried.
388390
* Just submit the same urb again. Note that we don't
389391
* handle partial reads; it's hard, and we haven't really
390392
* seen them.
391393
*/
392-
int rc;
393-
394394
dev->read_or_intr_retries++;
395395
rc = libusb_submit_transfer(urb);
396396
if (rc) {
397-
RSHIM_DBG("rshim%d(fifo_read_callback) failed to resubmit urb(%d)\n",
397+
RSHIM_DBG("rshim%d(fifo_read_callback) failed to retry(%d)\n",
398398
bd->index, rc);
399399
/*
400400
* In this case, we won't try again; signal the
@@ -405,6 +405,17 @@ static void rshim_usb_fifo_read_callback(struct libusb_transfer *urb)
405405
bd->spin_flags |= RSH_SFLG_READING;
406406
}
407407
break;
408+
} else {
409+
RSHIM_ERR("rshim%d(fifo_read_callback) retry timeout\n", bd->index);
410+
dev->read_or_intr_retries = 0;
411+
rc = libusb_clear_halt(dev->handle, urb->endpoint);
412+
if (rc) {
413+
RSHIM_ERR("rshim%d clear_halt failed: %s\n",
414+
bd->index, libusb_error_name(rc));
415+
rshim_notify(bd, RSH_EVENT_FIFO_ERR,
416+
urb->status > 0 ? -urb->status : urb->status);
417+
}
418+
break;
408419
}
409420

410421
case LIBUSB_TRANSFER_CANCELLED:
@@ -1017,6 +1028,10 @@ static void rshim_usb_disconnect(struct libusb_device *usb_dev)
10171028
libusb_cancel_transfer(dev->write_urb);
10181029
dev->write_urb = NULL;
10191030

1031+
pthread_mutex_lock(&bd->ringlock);
1032+
bd->spin_flags &= ~RSH_SFLG_READING;
1033+
pthread_mutex_unlock(&bd->ringlock);
1034+
10201035
free(dev->intr_buf);
10211036
dev->intr_buf = NULL;
10221037

@@ -1118,11 +1133,36 @@ static int rshim_hotplug_callback(struct libusb_context *ctx,
11181133
}
11191134
#endif
11201135

1136+
#ifdef __arm__
1137+
static int rshim_usb_reset(libusb_device *dev)
1138+
{
1139+
libusb_device_handle *handle = NULL;
1140+
int rc;
1141+
1142+
rc = libusb_open(dev, &handle);
1143+
if (rc < 0 || !handle)
1144+
return rc;
1145+
1146+
rc = libusb_reset_device(handle);
1147+
if (rc < 0) {
1148+
RSHIM_WARN("Failed to reset: %s\n", libusb_error_name(rc));
1149+
}
1150+
1151+
libusb_close(handle);
1152+
1153+
return rc;
1154+
}
1155+
#endif
1156+
11211157
static bool rshim_usb_probe(void)
11221158
{
11231159
libusb_context *ctx = rshim_usb_ctx;
11241160
libusb_device **devs, *dev;
1125-
int rc, i = 0, j, num;
1161+
int rc, i = 0, j, num, speed;
1162+
#ifdef __arm__
1163+
bool need_retry = false;
1164+
static int retries;
1165+
#endif
11261166

11271167
rc = libusb_get_device_list(ctx, &devs);
11281168
if (rc < 0) {
@@ -1140,13 +1180,36 @@ static bool rshim_usb_probe(void)
11401180
if (desc.idVendor != USB_TILERA_VENDOR_ID)
11411181
continue;
11421182

1183+
speed = libusb_get_device_speed(dev);
1184+
if (speed < LIBUSB_SPEED_HIGH) {
1185+
RSHIM_INFO("Detect low-speed rshim USB: %d-%d, speed = %d\n",
1186+
libusb_get_bus_number(dev),
1187+
libusb_get_device_address(dev), speed);
1188+
#ifdef __arm__
1189+
need_retry = true;
1190+
if (retries < 3) {
1191+
retries++;
1192+
rshim_usb_reset(dev);
1193+
rshim_usb_need_probe = true;
1194+
sleep(1);
1195+
rshim_work_signal(NULL);
1196+
return true;
1197+
}
1198+
#endif
1199+
}
1200+
11431201
num = sizeof(rshim_usb_product_ids) / sizeof(rshim_usb_product_ids[0]);
11441202
for (j = 0; j < num; j++) {
11451203
if (desc.idProduct == rshim_usb_product_ids[j])
11461204
rshim_usb_probe_one(ctx, dev, &desc);
11471205
}
11481206
}
11491207

1208+
#ifdef __arm__
1209+
if (!need_retry)
1210+
retries = 0;
1211+
#endif
1212+
11501213
rc = rshim_usb_add_poll(ctx);
11511214
if (rc)
11521215
return false;

0 commit comments

Comments
 (0)