@@ -417,6 +417,26 @@ ompi_mtl_ofi_send_callback(struct fi_cq_tagged_entry *wc,
417417 return OMPI_SUCCESS ;
418418}
419419
420+
421+ /*
422+ * special send callback for excid send operation.
423+ * Since the send excid operation cannot block
424+ * waiting for completion of the send operation,
425+ * we have to free the internal message buffer allocated
426+ * as part of the excid operation here as well as the
427+ * request itself.
428+ */
429+ __opal_attribute_always_inline__ static inline int
430+ ompi_mtl_ofi_send_excid_callback (struct fi_cq_tagged_entry * wc ,
431+ ompi_mtl_ofi_request_t * ofi_req )
432+ {
433+ assert (ofi_req -> completion_count > 0 );
434+ free (ofi_req -> buffer );
435+ ofi_req -> completion_count -- ; /* no one's waiting on this */
436+ free (ofi_req );
437+ return OMPI_SUCCESS ;
438+ }
439+
420440__opal_attribute_always_inline__ static inline int
421441ompi_mtl_ofi_send_error_callback (struct fi_cq_err_entry * error ,
422442 ompi_mtl_ofi_request_t * ofi_req )
@@ -668,6 +688,13 @@ ompi_mtl_ofi_ssend_recv(ompi_mtl_ofi_request_t *ack_req,
668688 return OMPI_SUCCESS ;
669689}
670690
691+ /*
692+ * this routine is invoked in the case of communicators which are not using a
693+ * global cid, i.e. those created using MPI_Comm_create_from_group/
694+ * MPI_Intercomm_create_from_groups in order to exchange the local cid used
695+ * by the sender for this supplied communicator. This function is only invoked
696+ * for the first message sent to a given receiver.
697+ */
671698static int
672699ompi_mtl_ofi_send_excid (struct mca_mtl_base_module_t * mtl ,
673700 struct ompi_communicator_t * comm ,
@@ -676,14 +703,26 @@ ompi_mtl_ofi_send_excid(struct mca_mtl_base_module_t *mtl,
676703 bool is_send )
677704{
678705 ssize_t ret = OMPI_SUCCESS ;
679- ompi_mtl_ofi_request_t * ofi_req = malloc ( sizeof ( ompi_mtl_ofi_request_t )) ;
706+ ompi_mtl_ofi_request_t * ofi_req = NULL ;
680707 int ctxt_id = 0 ;
681- mca_mtl_ofi_cid_hdr_t * start = malloc ( sizeof ( mca_mtl_ofi_cid_hdr_t )) ;
708+ mca_mtl_ofi_cid_hdr_t * start = NULL ;
682709 ompi_proc_t * ompi_proc = NULL ;
683710 mca_mtl_ofi_endpoint_t * endpoint = NULL ;
684711 fi_addr_t sep_peer_fiaddr = 0 ;
685712 mca_mtl_comm_t * mtl_comm ;
686713
714+ ofi_req = (ompi_mtl_ofi_request_t * )malloc (sizeof (ompi_mtl_ofi_request_t ));
715+ if (NULL == ofi_req ) {
716+ ret = OMPI_ERR_OUT_OF_RESOURCE ;
717+ goto fn_exit ;
718+ }
719+
720+ start = (mca_mtl_ofi_cid_hdr_t * )malloc (sizeof (mca_mtl_ofi_cid_hdr_t ));
721+ if (NULL == start ) {
722+ ret = OMPI_ERR_OUT_OF_RESOURCE ;
723+ goto fn_exit ;
724+ }
725+
687726 mtl_comm = comm -> c_mtl_comm ;
688727
689728 ctxt_id = 0 ;
@@ -692,8 +731,9 @@ ompi_mtl_ofi_send_excid(struct mca_mtl_base_module_t *mtl,
692731 /**
693732 * Create a send request, start it and wait until it completes.
694733 */
695- ofi_req -> event_callback = ompi_mtl_ofi_send_callback ;
734+ ofi_req -> event_callback = ompi_mtl_ofi_send_excid_callback ;
696735 ofi_req -> error_callback = ompi_mtl_ofi_send_error_callback ;
736+ ofi_req -> buffer = start ;
697737
698738 ompi_proc = ompi_comm_peer_lookup (comm , dest );
699739 endpoint = ompi_mtl_ofi_get_endpoint (mtl , ompi_proc );
@@ -719,12 +759,10 @@ ompi_mtl_ofi_send_excid(struct mca_mtl_base_module_t *mtl,
719759 opal_show_help ("help-mtl-ofi.txt" ,
720760 "message too big" , false,
721761 length , endpoint -> mtl_ofi_module -> max_msg_size );
722- return OMPI_ERROR ;
762+ ret = OMPI_ERROR ;
763+ goto fn_exit ;
723764 }
724765
725- if (OPAL_UNLIKELY (ofi_req -> status .MPI_ERROR != OMPI_SUCCESS ))
726- return ofi_req -> status .MPI_ERROR ;
727-
728766 if (ompi_mtl_ofi .max_inject_size >= length ) {
729767 if (ofi_cq_data ) {
730768 MTL_OFI_RETRY_UNTIL_DONE (fi_injectdata (ompi_mtl_ofi .ofi_ctxt [0 ].tx_ep ,
@@ -743,8 +781,6 @@ ompi_mtl_ofi_send_excid(struct mca_mtl_base_module_t *mtl,
743781 ofi_cq_data ? "fi_injectdata failed"
744782 : "fi_inject failed" );
745783
746- ofi_req -> status .MPI_ERROR = ompi_mtl_ofi_get_error (ret );
747- return ofi_req -> status .MPI_ERROR ;
748784 }
749785 } else {
750786 ofi_req -> completion_count = 1 ;
@@ -768,11 +804,20 @@ ompi_mtl_ofi_send_excid(struct mca_mtl_base_module_t *mtl,
768804 MTL_OFI_LOG_FI_ERR (ret ,
769805 ofi_cq_data ? "fi_tsenddata failed"
770806 : "fi_tsend failed" );
771- ofi_req -> status .MPI_ERROR = ompi_mtl_ofi_get_error (ret );
772807 }
773808 }
774809
775- return ofi_req -> status .MPI_ERROR ;
810+ ret = ompi_mtl_ofi_get_error (ret );
811+ ofi_req -> status .MPI_ERROR = ret ;
812+
813+ fn_exit :
814+
815+ if ((OMPI_SUCCESS != ret ) || (ofi_req -> completion_count == 0 )) {
816+ if (NULL != ofi_req ) free (ofi_req );
817+ if (NULL != start ) free (start );
818+ }
819+
820+ return ret ;
776821}
777822
778823__opal_attribute_always_inline__ static inline int
0 commit comments