@@ -82,7 +82,7 @@ read_msg(void *start, ptl_size_t length, ptl_process_t target,
82
82
frag -> frag_remote_offset = remote_offset + i * ompi_mtl_portals4 .max_msg_size_mtl ;
83
83
84
84
frag -> event_callback = ompi_mtl_portals4_rndv_get_frag_progress ;
85
- frag -> frag_start_time_usec = opal_timer_base_get_usec () ;
85
+ frag -> frag_abs_timeout_usec = 0 ;
86
86
87
87
OPAL_OUTPUT_VERBOSE ((90 , ompi_mtl_base_framework .framework_output , "GET (fragment %d/%d, size %ld) send" ,
88
88
i + 1 , frag_count , frag -> frag_length ));
@@ -337,17 +337,26 @@ ompi_mtl_portals4_rndv_get_frag_progress(ptl_event_t *ev,
337
337
__FILE__ , __LINE__ , ev -> ni_fail_type );
338
338
339
339
if (OPAL_UNLIKELY (ev -> ni_fail_type != PTL_NI_DROPPED )) {
340
- mtl_ptl_error (1 , "PTL_EVENT_REPLY with ni_fail_type: %s"
341
- " => cannot retry" ,
342
- name_of_err [ ev -> ni_fail_type ] );
340
+ opal_output_verbose (1 , ompi_mtl_base_framework . framework_output ,
341
+ "PTL_EVENT_REPLY with ni_fail_type: %u => cannot retry" ,
342
+ ( uint32_t ) ev -> ni_fail_type );
343
343
ret = PTL_FAIL ;
344
344
goto callback_error ;
345
345
}
346
346
347
- opal_timer_t time = opal_timer_base_get_usec () - rndv_get_frag -> frag_start_time_usec ;
348
- if (time > (unsigned int ) ompi_mtl_portals4 .get_retransmit_timeout ) {
349
- mtl_ptl_error (1 , "timeout retrying GET" );
350
- return OMPI_ERROR ;
347
+ if (0 == rndv_get_frag -> frag_abs_timeout_usec ) {
348
+ /* this is the first retry of the frag. start the timer. */
349
+ /* instead of recording the start time, record the end time
350
+ * and avoid addition on each retry. */
351
+ rndv_get_frag -> frag_abs_timeout_usec = opal_timer_base_get_usec () + ompi_mtl_portals4 .get_retransmit_timeout ;
352
+ opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
353
+ "setting frag timeout at %lu" ,
354
+ rndv_get_frag -> frag_abs_timeout_usec );
355
+ } else if (opal_timer_base_get_usec () >= rndv_get_frag -> frag_abs_timeout_usec ) {
356
+ opal_output_verbose (1 , ompi_mtl_base_framework .framework_output ,
357
+ "timeout retrying GET" );
358
+ ret = PTL_FAIL ;
359
+ goto callback_error ;
351
360
}
352
361
353
362
OPAL_OUTPUT_VERBOSE ((50 , ompi_mtl_base_framework .framework_output ,
0 commit comments