@@ -82,11 +82,46 @@ mca_pml_ucx_module_t ompi_pml_ucx = {
 #define PML_UCX_REQ_ALLOCA() \
     ((char *)alloca(ompi_pml_ucx.request_size) + ompi_pml_ucx.request_size);
 
+#if HAVE_UCP_WORKER_ADDRESS_FLAGS
+static int mca_pml_ucx_send_worker_address_type(int addr_flags, int modex_scope)
+{
+    ucs_status_t status;
+    ucp_worker_attr_t attrs;
+    int rc;
+
+    attrs.field_mask    = UCP_WORKER_ATTR_FIELD_ADDRESS |
+                          UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS;
+    attrs.address_flags = addr_flags;
+
+    status = ucp_worker_query(ompi_pml_ucx.ucp_worker, &attrs);
+    if (UCS_OK != status) {
+        PML_UCX_ERROR("Failed to query UCP worker address");
+        return OMPI_ERROR;
+    }
+
+    OPAL_MODEX_SEND(rc, modex_scope, &mca_pml_ucx_component.pmlm_version,
+                    (void*)attrs.address, attrs.address_length);
+
+    ucp_worker_release_address(ompi_pml_ucx.ucp_worker, attrs.address);
+
+    if (OMPI_SUCCESS != rc) {
+        return OMPI_ERROR;
+    }
+
+    PML_UCX_VERBOSE(2, "Pack %s worker address, size %ld",
+                    (modex_scope == OPAL_PMIX_LOCAL) ? "local" : "remote",
+                    attrs.address_length);
+
+    return OMPI_SUCCESS;
+}
+#endif
 
 static int mca_pml_ucx_send_worker_address(void)
 {
-    ucp_address_t *address;
     ucs_status_t status;
+
+#if !HAVE_UCP_WORKER_ADDRESS_FLAGS
+    ucp_address_t *address;
     size_t addrlen;
     int rc;
 
@@ -96,16 +131,35 @@ static int mca_pml_ucx_send_worker_address(void)
         return OMPI_ERROR;
     }
 
+    PML_UCX_VERBOSE(2, "Pack worker address, size %ld", addrlen);
+
     OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
                     &mca_pml_ucx_component.pmlm_version, (void*)address, addrlen);
+
+    ucp_worker_release_address(ompi_pml_ucx.ucp_worker, address);
+
     if (OMPI_SUCCESS != rc) {
-        PML_UCX_ERROR("Open MPI couldn't distribute EP connection details");
-        return OMPI_ERROR;
+        goto err;
+    }
+#else
+    /* Pack just network device addresses for remote node peers */
+    status = mca_pml_ucx_send_worker_address_type(UCP_WORKER_ADDRESS_FLAG_NET_ONLY,
+                                                  OPAL_PMIX_REMOTE);
+    if (UCS_OK != status) {
+        goto err;
     }
 
-    ucp_worker_release_address(ompi_pml_ucx.ucp_worker, address);
+    status = mca_pml_ucx_send_worker_address_type(0, OPAL_PMIX_LOCAL);
+    if (UCS_OK != status) {
+        goto err;
+    }
+#endif
 
     return OMPI_SUCCESS;
+
+err:
+    PML_UCX_ERROR("Open MPI couldn't distribute EP connection details");
+    return OMPI_ERROR;
 }
 
 static int mca_pml_ucx_recv_worker_address(ompi_proc_t *proc,
@@ -121,6 +175,9 @@ static int mca_pml_ucx_recv_worker_address(ompi_proc_t *proc,
         PML_UCX_ERROR("Failed to receive UCX worker address: %s (%d)",
                       opal_strerror(ret), ret);
     }
+
+    PML_UCX_VERBOSE(2, "Got proc %d address, size %ld",
+                    proc->super.proc_name.vpid, *addrlen_p);
     return ret;
 }
 
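For reference, below is a minimal standalone sketch (not OMPI code; the setup boilerplate and the helper name print_address_size are illustrative) of the UCP pattern the new mca_pml_ucx_send_worker_address_type() helper relies on: ucp_worker_query() with UCP_WORKER_ATTR_FIELD_ADDRESS fills attrs.address with a packed worker address filtered by attrs.address_flags, and the caller must release it with ucp_worker_release_address(). UCP_WORKER_ADDRESS_FLAG_NET_ONLY yields a network-only (typically smaller) address, which the patch publishes for inter-node peers, while flags of 0 yield the full address published for on-node peers.

#include <stdio.h>
#include <stdint.h>
#include <ucp/api/ucp.h>

/* Query the worker for a packed address filtered by addr_flags, print its
 * size, and release it -- the same query/release pattern the patch uses. */
static ucs_status_t print_address_size(ucp_worker_h worker, uint64_t addr_flags,
                                       const char *label)
{
    ucp_worker_attr_t attrs;
    ucs_status_t status;

    attrs.field_mask    = UCP_WORKER_ATTR_FIELD_ADDRESS |
                          UCP_WORKER_ATTR_FIELD_ADDRESS_FLAGS;
    attrs.address_flags = addr_flags;

    status = ucp_worker_query(worker, &attrs);
    if (UCS_OK != status) {
        return status;
    }

    printf("%s address: %zu bytes\n", label, attrs.address_length);

    /* The queried address is owned by the caller and must be released */
    ucp_worker_release_address(worker, attrs.address);
    return UCS_OK;
}

int main(void)
{
    ucp_params_t params = {
        .field_mask = UCP_PARAM_FIELD_FEATURES,
        .features   = UCP_FEATURE_TAG
    };
    ucp_worker_params_t wparams = {
        .field_mask  = UCP_WORKER_PARAM_FIELD_THREAD_MODE,
        .thread_mode = UCS_THREAD_MODE_SINGLE
    };
    ucp_context_h context;
    ucp_worker_h worker;

    if (UCS_OK != ucp_init(&params, NULL, &context)) {
        return 1;
    }
    if (UCS_OK != ucp_worker_create(context, &wparams, &worker)) {
        ucp_cleanup(context);
        return 1;
    }

    /* Net-only address, as the patch publishes for remote (inter-node) peers */
    print_address_size(worker, UCP_WORKER_ADDRESS_FLAG_NET_ONLY, "net-only");
    /* Full address (all transports), as published for local peers */
    print_address_size(worker, 0, "full");

    ucp_worker_destroy(worker);
    ucp_cleanup(context);
    return 0;
}

Publishing the two variants under different modex scopes (OPAL_PMIX_REMOTE vs. OPAL_PMIX_LOCAL) lets each peer fetch only the address it needs, shrinking the modex payload exchanged across nodes.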