33 * Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
44 * University Research and Technology
55 * Corporation. All rights reserved.
6- * Copyright (c) 2004-2019 The University of Tennessee and The University
6+ * Copyright (c) 2004-2022 The University of Tennessee and The University
77 * of Tennessee Research Foundation. All rights
88 * reserved.
99 * Copyright (c) 2004-2007 High Performance Computing Center Stuttgart,
@@ -370,7 +370,7 @@ int mca_pml_ob1_revoke_comm( struct ompi_communicator_t* ompi_comm, bool coll_on
370370 /* note this is not an ompi_proc, but a ob1_comm_proc, thus we don't
371371 * use ompi_proc_is_sentinel to verify if initialized. */
372372 if ( NULL == proc ) continue ;
373- /* remove the frag from the unexpected list, add to the nack list
373+ /* remove the frag from the unexpected list, add to the nack list
374374 * so that we can send the nack as needed to remote cancel the send
375375 * from outside the match lock.
376376 */
@@ -385,7 +385,7 @@ int mca_pml_ob1_revoke_comm( struct ompi_communicator_t* ompi_comm, bool coll_on
385385 }
386386 }
387387 /* same for the cantmatch queue/heap; this list is more complicated
388- * Keep it simple: we pop all of the complex list, put the bad items
388+ * Keep it simple: we pop all of the complex list, put the bad items
389389 * in the nack_list, and keep the good items in the keep_list;
390390 * then we reinsert the good items in the cantmatch heaplist */
391391 mca_pml_ob1_recv_frag_t * frag ;
@@ -519,7 +519,7 @@ void mca_pml_ob1_recv_frag_callback_match (mca_btl_base_module_t *btl,
519519 }
520520#endif
521521
522- if (!OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE (comm_ptr )) {
522+ if (!OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE (comm_ptr ) || 0 > hdr -> hdr_tag ) {
523523 /* get sequence number of next message that can be processed.
524524 * If this frag is out of sequence, queue it up in the list
525525 * now as we still have the lock.
@@ -637,6 +637,38 @@ void mca_pml_ob1_recv_frag_callback_match (mca_btl_base_module_t *btl,
637637 }
638638}
639639
640+ /**
641+ * Merge all out of sequence fragments into the matching queue, as if they were received now.
642+ */
643+ int mca_pml_ob1_merge_cant_match ( ompi_communicator_t * ompi_comm )
644+ {
645+ mca_pml_ob1_comm_t * pml_comm = (mca_pml_ob1_comm_t * )ompi_comm -> c_pml_comm ;
646+ mca_pml_ob1_recv_frag_t * frag , * frags_cant_match ;
647+ mca_pml_ob1_comm_proc_t * proc ;
648+ int cnt = 0 ;
649+
650+ for (uint32_t i = 0 ; i < pml_comm -> num_procs ; i ++ ) {
651+ if ((NULL == (proc = pml_comm -> procs [i ])) || (NULL != proc -> frags_cant_match )) {
652+ continue ;
653+ }
654+
655+ OB1_MATCHING_LOCK (& pml_comm -> matching_lock );
656+ /* Acquire all cant_match frags from the peer */
657+ frags_cant_match = proc -> frags_cant_match ;
658+ proc -> frags_cant_match = NULL ;
659+ while (NULL != (frag = remove_head_from_ordered_list (& frags_cant_match ))) {
660+ /* mca_pml_ob1_recv_frag_match_proc() will release the lock. */
661+ mca_pml_ob1_recv_frag_match_proc (frag -> btl , ompi_comm , proc ,
662+ & frag -> hdr .hdr_match ,
663+ frag -> segments , frag -> num_segments ,
664+ frag -> hdr .hdr_match .hdr_common .hdr_type , frag );
665+ OB1_MATCHING_LOCK (& pml_comm -> matching_lock );
666+ cnt ++ ;
667+ }
668+ }
669+ OB1_MATCHING_UNLOCK (& pml_comm -> matching_lock );
670+ return cnt ;
671+ }
640672
641673void mca_pml_ob1_recv_frag_callback_rndv (mca_btl_base_module_t * btl ,
642674 const mca_btl_base_receive_descriptor_t * descriptor )
@@ -1092,7 +1124,7 @@ static int mca_pml_ob1_recv_frag_match (mca_btl_base_module_t *btl,
10921124 frag_msg_seq = hdr -> hdr_seq ;
10931125 next_msg_seq_expected = (uint16_t )proc -> expected_sequence ;
10941126
1095- if (!OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE (comm_ptr )) {
1127+ if (!OMPI_COMM_CHECK_ASSERT_ALLOW_OVERTAKE (comm_ptr ) || 0 > hdr -> hdr_tag ) {
10961128 /* If the sequence number is wrong, queue it up for later. */
10971129 if (OPAL_UNLIKELY (frag_msg_seq != next_msg_seq_expected )) {
10981130 mca_pml_ob1_recv_frag_t * frag ;
0 commit comments