Skip to content

Commit 5ecd905

Browse files
committed
mtl/portals4: move opal_timer_base_get_usec() out of the fast path
Rearrange the receive frag timeout logic to avoid calling opal_timer_base_get_usec() in read_msg(). Instead, read the timer and compute the frag's absolute timeout on the first retry. Signed-off-by: Todd Kordenbrock <[email protected]>
1 parent 37766d7 commit 5ecd905

File tree

2 files changed

+19
-9
lines changed

2 files changed

+19
-9
lines changed

ompi/mca/mtl/portals4/mtl_portals4_recv.c

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
8282
frag->frag_remote_offset = remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl;
8383

8484
frag->event_callback = ompi_mtl_portals4_rndv_get_frag_progress;
85-
frag->frag_start_time_usec = opal_timer_base_get_usec();
85+
frag->frag_abs_timeout_usec = 0;
8686

8787
OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d, size %ld) send",
8888
i + 1, frag_count, frag->frag_length));
@@ -337,17 +337,26 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev,
337337
__FILE__, __LINE__, ev->ni_fail_type);
338338

339339
if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_DROPPED)) {
340-
mtl_ptl_error(1, "PTL_EVENT_REPLY with ni_fail_type: %s"
341-
" => cannot retry",
342-
name_of_err[ev->ni_fail_type]);
340+
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
341+
"PTL_EVENT_REPLY with ni_fail_type: %u => cannot retry",
342+
(uint32_t)ev->ni_fail_type);
343343
ret = PTL_FAIL;
344344
goto callback_error;
345345
}
346346

347-
opal_timer_t time = opal_timer_base_get_usec() - rndv_get_frag->frag_start_time_usec;
348-
if (time > (unsigned int) ompi_mtl_portals4.get_retransmit_timeout) {
349-
mtl_ptl_error(1, "timeout retrying GET");
350-
return OMPI_ERROR;
347+
if (0 == rndv_get_frag->frag_abs_timeout_usec) {
348+
/* this is the first retry of the frag. start the timer. */
349+
/* instead of recording the start time, record the end time
350+
* and avoid addition on each retry. */
351+
rndv_get_frag->frag_abs_timeout_usec = opal_timer_base_get_usec() + ompi_mtl_portals4.get_retransmit_timeout;
352+
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
353+
"setting frag timeout at %lu",
354+
rndv_get_frag->frag_abs_timeout_usec);
355+
} else if (opal_timer_base_get_usec() >= rndv_get_frag->frag_abs_timeout_usec) {
356+
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
357+
"timeout retrying GET");
358+
ret = PTL_FAIL;
359+
goto callback_error;
351360
}
352361

353362
OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,

ompi/mca/mtl/portals4/mtl_portals4_request.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,8 @@ struct ompi_mtl_portals4_rndv_get_frag_t {
9494
ptl_process_t frag_target;
9595
ptl_hdr_data_t frag_match_bits;
9696
ptl_size_t frag_remote_offset;
97-
opal_timer_t frag_start_time_usec;
97+
/* the absolute time at which this frag times out */
98+
opal_timer_t frag_abs_timeout_usec;
9899

99100
int (*event_callback)(ptl_event_t *ev, struct ompi_mtl_portals4_rndv_get_frag_t*);
100101

0 commit comments

Comments
 (0)