Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ompi/mca/mtl/portals4/mtl_portals4.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ struct mca_mtl_portals4_module_t {

/* free list of rendezvous get fragments */
opal_free_list_t fl_rndv_get_frag;
int get_retransmit_timeout;

/** Network interface handle for matched interface */
ptl_handle_ni_t ni_h;
Expand Down
10 changes: 10 additions & 0 deletions ompi/mca/mtl/portals4/mtl_portals4_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,16 @@ ompi_mtl_portals4_component_register(void)
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_portals4.max_msg_size_mtl);

ompi_mtl_portals4.get_retransmit_timeout=10000;
(void) mca_base_component_var_register(&mca_mtl_portals4_component.mtl_version,
"get_retransmit_timeout",
"PtlGET retransmission timeout in usec",
MCA_BASE_VAR_TYPE_INT,
NULL, 0, 0,
OPAL_INFO_LVL_5,
MCA_BASE_VAR_SCOPE_READONLY,
&ompi_mtl_portals4.get_retransmit_timeout);

OBJ_RELEASE(new_enum);
if (0 > ret) {
return OMPI_ERR_NOT_SUPPORTED;
Expand Down
28 changes: 27 additions & 1 deletion ompi/mca/mtl/portals4/mtl_portals4_recv.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "ompi/mca/mtl/base/base.h"
#include "ompi/mca/mtl/base/mtl_base_datatype.h"
#include "ompi/message/message.h"
#include "opal/mca/timer/base/base.h"

#include "mtl_portals4.h"
#include "mtl_portals4_endpoint.h"
Expand Down Expand Up @@ -81,6 +82,7 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
frag->frag_remote_offset = remote_offset + i * ompi_mtl_portals4.max_msg_size_mtl;

frag->event_callback = ompi_mtl_portals4_rndv_get_frag_progress;
frag->frag_abs_timeout_usec = 0;

OPAL_OUTPUT_VERBOSE((90, ompi_mtl_base_framework.framework_output, "GET (fragment %d/%d, size %ld) send",
i + 1, frag_count, frag->frag_length));
Expand Down Expand Up @@ -322,17 +324,41 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev,
ompi_mtl_portals4_recv_request_t* ptl_request =
(ompi_mtl_portals4_recv_request_t*) rndv_get_frag->request;

assert(ev->type==PTL_EVENT_REPLY);
assert(PTL_EVENT_REPLY == ev->type);

OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Recv %lu (0x%lx) got reply event",
ptl_request->opcount, ptl_request->hdr_data));


if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_OK)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: PTL_EVENT_REPLY with ni_fail_type: %d",
__FILE__, __LINE__, ev->ni_fail_type);

if (OPAL_UNLIKELY(ev->ni_fail_type != PTL_NI_DROPPED)) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"PTL_EVENT_REPLY with ni_fail_type: %u => cannot retry",
(uint32_t)ev->ni_fail_type);
ret = PTL_FAIL;
goto callback_error;
}

if (0 == rndv_get_frag->frag_abs_timeout_usec) {
/* This is the first retry of this frag: start the timer. */
/* Record the deadline (end time) rather than the start time, so that
 * later retries need only a comparison and no addition. */
rndv_get_frag->frag_abs_timeout_usec = opal_timer_base_get_usec() + ompi_mtl_portals4.get_retransmit_timeout;
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"setting frag timeout at %lu",
rndv_get_frag->frag_abs_timeout_usec);
} else if (opal_timer_base_get_usec() >= rndv_get_frag->frag_abs_timeout_usec) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"timeout retrying GET");
ret = PTL_FAIL;
goto callback_error;
}

OPAL_OUTPUT_VERBOSE((50, ompi_mtl_base_framework.framework_output,
"Rendezvous Get Failed: Reissuing frag #%u", rndv_get_frag->frag_num));

Expand Down
3 changes: 3 additions & 0 deletions ompi/mca/mtl/portals4/mtl_portals4_request.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@

#include "opal/datatype/opal_convertor.h"
#include "ompi/mca/mtl/mtl.h"
#include "opal/mca/timer/base/base.h"

struct ompi_mtl_portals4_message_t;
struct ompi_mtl_portals4_pending_request_t;
Expand Down Expand Up @@ -93,6 +94,8 @@ struct ompi_mtl_portals4_rndv_get_frag_t {
ptl_process_t frag_target;
ptl_hdr_data_t frag_match_bits;
ptl_size_t frag_remote_offset;
/* absolute time, in usec, at which retries of this frag time out;
 * 0 means the retry timer has not been started yet */
opal_timer_t frag_abs_timeout_usec;

int (*event_callback)(ptl_event_t *ev, struct ompi_mtl_portals4_rndv_get_frag_t*);

Expand Down