@@ -82,7 +82,7 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
8282 frag -> frag_remote_offset = remote_offset + i * ompi_mtl_portals4 .max_msg_size_mtl ;
8383
8484 frag -> event_callback = ompi_mtl_portals4_rndv_get_frag_progress ;
85- frag -> frag_start_time_usec = opal_timer_base_get_usec () ;
85+ frag -> frag_abs_timeout_usec = 0 ;
8686
8787 OPAL_OUTPUT_VERBOSE ((90 , ompi_mtl_base_framework .framework_output , "GET (fragment %d/%d, size %ld) send" ,
8888 i + 1 , frag_count , frag -> frag_length ));
@@ -337,17 +337,26 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev,
337337 __FILE__ , __LINE__ , ev -> ni_fail_type );
338338
339339 if (OPAL_UNLIKELY (ev -> ni_fail_type != PTL_NI_DROPPED )) {
340- mtl_ptl_error (1 , "PTL_EVENT_REPLY with ni_fail_type: %s"
341- " => cannot retry" ,
342- name_of_err [ ev -> ni_fail_type ] );
340+ opal_output_verbose (1 , ompi_mtl_base_framework . framework_output ,
341+ "PTL_EVENT_REPLY with ni_fail_type: %u => cannot retry" ,
342+ ( uint32_t ) ev -> ni_fail_type );
343343 ret = PTL_FAIL ;
344344 goto callback_error ;
345345 }
346346
347- opal_timer_t time = opal_timer_base_get_usec () - rndv_get_frag -> frag_start_time_usec ;
348- if (time > (unsigned int ) ompi_mtl_portals4 .get_retransmit_timeout ) {
349- mtl_ptl_error (1 , "timeout retrying GET" );
350- return OMPI_ERROR ;
347+ if (0 == rndv_get_frag -> frag_abs_timeout_usec ) {
348+ /* this is the first retry of the frag. start the timer. */
349+ /* instead of recording the start time, record the end time
350+ * and avoid addition on each retry. */
351+ rndv_get_frag -> frag_abs_timeout_usec = opal_timer_base_get_usec () + ompi_mtl_portals4 .get_retransmit_timeout ;
352+ opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
353+ "setting frag timeout at %lu" ,
354+ rndv_get_frag -> frag_abs_timeout_usec );
355+ } else if (opal_timer_base_get_usec () >= rndv_get_frag -> frag_abs_timeout_usec ) {
356+ opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
357+ "timeout retrying GET" );
358+ ret = PTL_FAIL ;
359+ goto callback_error ;
351360 }
352361
353362 OPAL_OUTPUT_VERBOSE ((50 , ompi_mtl_base_framework .framework_output ,
0 commit comments