Skip to content

Commit 27e3040

Browse files
committed
btl/usnic: cap the number of resends per progress iteration
New MCA param: btl_usnic_max_resends_per_iteration. This is the maximum number of resends we'll do in a single pass through usNIC component progress. This prevents progress from getting stuck in an endless loop of retransmissions (i.e., when sending retransmissions itself triggers more retransmissions). Specifically: we need to leave the resend loop so that receives can happen; those receives may ACK messages we have sent previously and therefore render pending resends moot. Signed-off-by: Jeff Squyres <[email protected]>
1 parent 3cc95d8 commit 27e3040

File tree

3 files changed

+13
-2
lines changed

3 files changed

+13
-2
lines changed

opal/mca/btl/usnic/btl_usnic.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,10 @@ typedef struct opal_btl_usnic_component_t {
190190
/** retrans characteristics */
191191
int retrans_timeout;
192192

193+
/** max number of messages re-sent during a single progress
194+
iteration */
195+
int max_resends_per_iteration;
196+
193197
/** minimum number of times through component progress before
194198
checking to see if standalone ACKs need to be sent */
195199
int ack_iteration_delay;

opal/mca/btl/usnic/btl_usnic_mca.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -249,6 +249,10 @@ int opal_btl_usnic_component_register(void)
249249
100000, &mca_btl_usnic_component.retrans_timeout,
250250
REGINT_GE_ONE, OPAL_INFO_LVL_5));
251251

252+
CHECK(reg_int("max_resends_per_iteration", "Maximum number of frames to resend in a single iteration through usNIC component progress",
253+
16, &mca_btl_usnic_component.max_resends_per_iteration,
254+
REGINT_GE_ONE, OPAL_INFO_LVL_5));
255+
252256
CHECK(reg_int("ack_iteration_delay", "Minimum number of times through usNIC \"progress\" function before checking to see if standalone ACKs need to be sent",
253257
0, &mca_btl_usnic_component.ack_iteration_delay,
254258
REGINT_GE_ZERO, OPAL_INFO_LVL_5));

opal/mca/btl/usnic/btl_usnic_module.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -954,11 +954,12 @@ usnic_do_resends(
954954
opal_btl_usnic_send_segment_t *sseg;
955955
opal_btl_usnic_endpoint_t *endpoint;
956956
struct opal_btl_usnic_channel_t *data_channel;
957-
int ret;
957+
int ret, count;
958958

959959
data_channel = &module->mod_channels[USNIC_DATA_CHANNEL];
960960

961-
while ((get_send_credits(data_channel) > 1) &&
961+
count = mca_btl_usnic_component.max_resends_per_iteration;
962+
while (count > 0 && (get_send_credits(data_channel) > 1) &&
962963
!opal_list_is_empty(&module->pending_resend_segs)) {
963964

964965
/*
@@ -999,6 +1000,8 @@ usnic_do_resends(
9991000
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
10001001
opal_btl_usnic_util_abort("hotel checkin failed\n", __FILE__, __LINE__);
10011002
}
1003+
1004+
--count;
10021005
}
10031006
}
10041007

0 commit comments

Comments
 (0)