Skip to content

Commit a49ae7f

Browse files
authored
Merge pull request #7038 from jsquyres/pr/usnic-fixes-and-optimizations
btl/usnic fixes and optimizations
2 parents b774b47 + fe7f772 commit a49ae7f

File tree

6 files changed

+61
-33
lines changed

6 files changed

+61
-33
lines changed

opal/mca/btl/usnic/btl_usnic.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ extern uint64_t opal_btl_usnic_ticks;
5757
extern opal_recursive_mutex_t btl_usnic_lock;
5858

5959
static inline uint64_t
60-
get_nsec(void)
60+
get_ticks(void)
6161
{
6262
return opal_btl_usnic_ticks;
6363
}
@@ -190,6 +190,14 @@ typedef struct opal_btl_usnic_component_t {
190190
/** retrans characteristics */
191191
int retrans_timeout;
192192

193+
/** max number of messages re-sent during a single progress
194+
iteration */
195+
int max_resends_per_iteration;
196+
197+
/** minimum number of times through component progress before
198+
checking to see if standalone ACKs need to be sent */
199+
int ack_iteration_delay;
200+
193201
/** transport header length for all usNIC devices on this server
194202
(it is guaranteed that all usNIC devices on a single server
195203
will have the same underlying transport, and therefore the

opal/mca/btl/usnic/btl_usnic_component.c

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -380,8 +380,9 @@ static int check_usnic_config(opal_btl_usnic_module_t *module,
380380

381381
static void usnic_clock_callback(int fd, short flags, void *timeout)
382382
{
383-
/* 1ms == 1,000,000 ns */
384-
opal_btl_usnic_ticks += 1000000;
383+
/* Increase by so many ticks that we will definitely force sending
384+
any ACKs that are pending */
385+
opal_btl_usnic_ticks += 1000;
385386

386387
/* run progress to make sure time change gets noticed */
387388
usnic_component_progress();
@@ -1128,7 +1129,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
11281129
*/
11291130
static int usnic_handle_completion(opal_btl_usnic_module_t* module,
11301131
opal_btl_usnic_channel_t *channel, struct fi_cq_entry *completion);
1131-
static int usnic_component_progress_2(void);
1132+
static int usnic_component_progress_2(bool check_priority);
11321133
static void usnic_handle_cq_error(opal_btl_usnic_module_t* module,
11331134
opal_btl_usnic_channel_t *channel, int cq_ret);
11341135

@@ -1141,9 +1142,7 @@ static int usnic_component_progress(void)
11411142
struct fi_cq_entry completion;
11421143
opal_btl_usnic_channel_t *channel;
11431144
static bool fastpath_ok = true;
1144-
1145-
/* update our simulated clock */
1146-
opal_btl_usnic_ticks += 5000;
1145+
bool check_priority = true;
11471146

11481147
count = 0;
11491148
if (fastpath_ok) {
@@ -1176,10 +1175,11 @@ static int usnic_component_progress(void)
11761175
usnic_handle_cq_error(module, channel, ret);
11771176
}
11781177
}
1178+
check_priority = false;
11791179
}
11801180

11811181
fastpath_ok = true;
1182-
return count + usnic_component_progress_2();
1182+
return count + usnic_component_progress_2(check_priority);
11831183
}
11841184

11851185
static int usnic_handle_completion(
@@ -1300,7 +1300,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module,
13001300
}
13011301
}
13021302

1303-
static int usnic_component_progress_2(void)
1303+
static int usnic_component_progress_2(bool check_priority)
13041304
{
13051305
int i, j, count = 0, num_events, ret;
13061306
opal_btl_usnic_module_t* module;
@@ -1309,15 +1309,18 @@ static int usnic_component_progress_2(void)
13091309
int rc;
13101310
int c;
13111311

1312-
/* update our simulated clock */
1313-
opal_btl_usnic_ticks += 5000;
1312+
opal_btl_usnic_ticks += 1;
1313+
1314+
/* If we need to check priority, start with the priority channel.
1315+
Otherwise, just check the data channel. */
1316+
int c_start = check_priority ? USNIC_PRIORITY_CHANNEL : USNIC_DATA_CHANNEL;
13141317

13151318
/* Poll for completions */
13161319
for (i = 0; i < mca_btl_usnic_component.num_modules; i++) {
13171320
module = mca_btl_usnic_component.usnic_active_modules[i];
13181321

13191322
/* poll each channel */
1320-
for (c=0; c<USNIC_NUM_CHANNELS; ++c) {
1323+
for (c=c_start; c<USNIC_NUM_CHANNELS; ++c) {
13211324
channel = &module->mod_channels[c];
13221325

13231326
if (channel->chan_deferred_recv != NULL) {

opal/mca/btl/usnic/btl_usnic_frag.h

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -138,18 +138,24 @@ typedef struct {
138138
the length of the packet to meet a minimum size */
139139
uint16_t payload_len;
140140

141-
/* If this is an emulated PUT, store at this address on receiver */
142-
char *put_addr;
143-
144141
/* Type of BTL header (see enum, above) */
145142
uint8_t payload_type;
146143

147144
/* true if there is piggy-backed ACK */
148145
uint8_t ack_present;
149146

147+
/* This field is ordered here so that we have no holes in the
148+
struct. Technically this doesn't matter, because we're using
149+
the __packed__ attribute (so there will be no holes anyway),
150+
but ordering things nicely in the struct prevents the need for
151+
unaligned reads/writes when using _packed__. */
152+
/* If this is an emulated PUT, store at this address on
153+
receiver */
154+
char *put_addr;
155+
150156
/* tag for upper layer */
151157
mca_btl_base_tag_t tag;
152-
} opal_btl_usnic_btl_header_t;
158+
} __opal_attribute_packed__ opal_btl_usnic_btl_header_t;
153159

154160
/**
155161
* BTL header for a chunk of a fragment

opal/mca/btl/usnic/btl_usnic_mca.c

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,9 +246,17 @@ int opal_btl_usnic_component_register(void)
246246
mca_btl_usnic_component.udp_port_base = (int) udp_port_base;
247247

248248
CHECK(reg_int("retrans_timeout", "Number of microseconds before retransmitting a frame",
249-
5000, &mca_btl_usnic_component.retrans_timeout,
249+
100000, &mca_btl_usnic_component.retrans_timeout,
250250
REGINT_GE_ONE, OPAL_INFO_LVL_5));
251251

252+
CHECK(reg_int("max_resends_per_iteration", "Maximum number of frames to resend in a single iteration through usNIC component progress",
253+
16, &mca_btl_usnic_component.max_resends_per_iteration,
254+
REGINT_GE_ONE, OPAL_INFO_LVL_5));
255+
256+
CHECK(reg_int("ack_iteration_delay", "Minimum number of times through usNIC \"progress\" function before checking to see if standalone ACKs need to be sent",
257+
0, &mca_btl_usnic_component.ack_iteration_delay,
258+
REGINT_GE_ZERO, OPAL_INFO_LVL_5));
259+
252260
CHECK(reg_int("priority_limit", "Max size of \"priority\" messages (0 = use pre-set defaults; depends on number and type of devices available)",
253261
0, &max_tiny_msg_size,
254262
REGINT_GE_ZERO, OPAL_INFO_LVL_5));

opal/mca/btl/usnic/btl_usnic_module.c

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -954,11 +954,12 @@ usnic_do_resends(
954954
opal_btl_usnic_send_segment_t *sseg;
955955
opal_btl_usnic_endpoint_t *endpoint;
956956
struct opal_btl_usnic_channel_t *data_channel;
957-
int ret;
957+
int ret, count;
958958

959959
data_channel = &module->mod_channels[USNIC_DATA_CHANNEL];
960960

961-
while ((get_send_credits(data_channel) > 1) &&
961+
count = mca_btl_usnic_component.max_resends_per_iteration;
962+
while (count > 0 && (get_send_credits(data_channel) > 1) &&
962963
!opal_list_is_empty(&module->pending_resend_segs)) {
963964

964965
/*
@@ -999,6 +1000,8 @@ usnic_do_resends(
9991000
if (OPAL_UNLIKELY(OPAL_SUCCESS != ret)) {
10001001
opal_btl_usnic_util_abort("hotel checkin failed\n", __FILE__, __LINE__);
10011002
}
1003+
1004+
--count;
10021005
}
10031006
}
10041007

@@ -1226,7 +1229,7 @@ opal_btl_usnic_module_progress_sends(
12261229

12271230
/* Is it time to send ACK? */
12281231
if (endpoint->endpoint_acktime == 0 ||
1229-
endpoint->endpoint_acktime <= get_nsec()) {
1232+
endpoint->endpoint_acktime <= get_ticks()) {
12301233
if (OPAL_LIKELY(opal_btl_usnic_ack_send(module, endpoint) == OPAL_SUCCESS)) {
12311234
opal_btl_usnic_remove_from_endpoints_needing_ack(endpoint);
12321235
} else {
@@ -2344,14 +2347,14 @@ static void init_freelists(opal_btl_usnic_module_t *module)
23442347
uint32_t segsize;
23452348

23462349
segsize = (module->local_modex.max_msg_size +
2347-
opal_cache_line_size - 1) &
2350+
mca_btl_usnic_component.prefix_send_offset +
2351+
opal_cache_line_size - 1) &
23482352
~(opal_cache_line_size - 1);
23492353

23502354
/* Send frags freelists */
23512355
OBJ_CONSTRUCT(&module->small_send_frags, opal_free_list_t);
23522356
rc = usnic_compat_free_list_init(&module->small_send_frags,
2353-
sizeof(opal_btl_usnic_small_send_frag_t) +
2354-
mca_btl_usnic_component.prefix_send_offset,
2357+
sizeof(opal_btl_usnic_small_send_frag_t),
23552358
opal_cache_line_size,
23562359
OBJ_CLASS(opal_btl_usnic_small_send_frag_t),
23572360
segsize,
@@ -2368,8 +2371,7 @@ static void init_freelists(opal_btl_usnic_module_t *module)
23682371

23692372
OBJ_CONSTRUCT(&module->large_send_frags, opal_free_list_t);
23702373
rc = usnic_compat_free_list_init(&module->large_send_frags,
2371-
sizeof(opal_btl_usnic_large_send_frag_t) +
2372-
mca_btl_usnic_component.prefix_send_offset,
2374+
sizeof(opal_btl_usnic_large_send_frag_t),
23732375
opal_cache_line_size,
23742376
OBJ_CLASS(opal_btl_usnic_large_send_frag_t),
23752377
0, /* payload size */
@@ -2386,8 +2388,7 @@ static void init_freelists(opal_btl_usnic_module_t *module)
23862388

23872389
OBJ_CONSTRUCT(&module->put_dest_frags, opal_free_list_t);
23882390
rc = usnic_compat_free_list_init(&module->put_dest_frags,
2389-
sizeof(opal_btl_usnic_put_dest_frag_t) +
2390-
mca_btl_usnic_component.prefix_send_offset,
2391+
sizeof(opal_btl_usnic_put_dest_frag_t),
23912392
opal_cache_line_size,
23922393
OBJ_CLASS(opal_btl_usnic_put_dest_frag_t),
23932394
0, /* payload size */
@@ -2405,8 +2406,7 @@ static void init_freelists(opal_btl_usnic_module_t *module)
24052406
/* list of segments to use for sending */
24062407
OBJ_CONSTRUCT(&module->chunk_segs, opal_free_list_t);
24072408
rc = usnic_compat_free_list_init(&module->chunk_segs,
2408-
sizeof(opal_btl_usnic_chunk_segment_t) +
2409-
mca_btl_usnic_component.prefix_send_offset,
2409+
sizeof(opal_btl_usnic_chunk_segment_t),
24102410
opal_cache_line_size,
24112411
OBJ_CLASS(opal_btl_usnic_chunk_segment_t),
24122412
segsize,
@@ -2424,11 +2424,11 @@ static void init_freelists(opal_btl_usnic_module_t *module)
24242424
/* ACK segments freelist */
24252425
uint32_t ack_segment_len;
24262426
ack_segment_len = (sizeof(opal_btl_usnic_btl_header_t) +
2427+
mca_btl_usnic_component.prefix_send_offset +
24272428
opal_cache_line_size - 1) & ~(opal_cache_line_size - 1);
24282429
OBJ_CONSTRUCT(&module->ack_segs, opal_free_list_t);
24292430
rc = usnic_compat_free_list_init(&module->ack_segs,
2430-
sizeof(opal_btl_usnic_ack_segment_t) +
2431-
mca_btl_usnic_component.prefix_send_offset,
2431+
sizeof(opal_btl_usnic_ack_segment_t),
24322432
opal_cache_line_size,
24332433
OBJ_CLASS(opal_btl_usnic_ack_segment_t),
24342434
ack_segment_len,

opal/mca/btl/usnic/btl_usnic_recv.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,9 +112,12 @@ opal_btl_usnic_update_window(
112112
opal_btl_usnic_add_to_endpoints_needing_ack(endpoint);
113113
}
114114

115-
/* give this process a chance to send something before ACKing */
115+
/* A hueristic: set to send this ACK after we have checked our
116+
incoming DATA_CHANNEL component.act_iteration_delay times
117+
(i.e., so we can piggyback an ACK on an outgoing send) */
116118
if (0 == endpoint->endpoint_acktime) {
117-
endpoint->endpoint_acktime = get_nsec() + 50000; /* 50 usec */
119+
endpoint->endpoint_acktime =
120+
get_ticks() + mca_btl_usnic_component.ack_iteration_delay;
118121
}
119122

120123
/* Save this incoming segment in the received segmentss array on the

0 commit comments

Comments
 (0)