@@ -176,6 +176,7 @@ static int spml_ucx_init(void)
176176 }
177177
178178 OBJ_CONSTRUCT (& (mca_spml_ucx .ctx_list ), opal_list_t );
179+ OBJ_CONSTRUCT (& (mca_spml_ucx .idle_ctx_list ), opal_list_t );
179180 SHMEM_MUTEX_INIT (mca_spml_ucx .internal_mutex );
180181
181182 wkr_params .field_mask = UCP_WORKER_PARAM_FIELD_THREAD_MODE ;
@@ -224,42 +225,81 @@ mca_spml_ucx_component_init(int* priority,
224225 return & mca_spml_ucx .super ;
225226}
226227
/*
 * _ctx_cleanup: release the per-peer UCX resources held by one context item.
 *
 * For each of the oshmem_num_procs() peers of ctx_item->ctx this:
 *   - destroys every non-NULL rkey in mkeys[0 .. MCA_MEMHEAP_SEG_COUNT),
 *   - records the peer's endpoint (ucp_conn) and its index in del_procs[],
 *   - clears the peer's ucp_conn pointer.
 * The collected endpoints are then passed to opal_common_ucx_del_procs_nb()
 * together with this context's worker, and both the temporary del_procs
 * array and the ucp_peers array are freed.
 *
 * This function does not destroy ctx_item->ctx.ucp_worker and does not
 * release or unlink ctx_item itself.
 *
 * NOTE(review): the "_nb" suffix presumably means the disconnect is
 * non-blocking and must be progressed by the caller -- confirm against the
 * opal_common_ucx implementation.
 */
228+ static void _ctx_cleanup (mca_spml_ucx_ctx_list_item_t * ctx_item )
229+ {
230+ int i , j , nprocs = oshmem_num_procs ();
231+ opal_common_ucx_del_proc_t * del_procs ;
232+
233+ del_procs = malloc (sizeof (* del_procs ) * nprocs ); /* NOTE(review): result is not checked; the loop below writes through del_procs, so a failed allocation would dereference NULL */
234+
235+ for (i = 0 ; i < nprocs ; ++ i ) {
236+ for (j = 0 ; j < MCA_MEMHEAP_SEG_COUNT ; j ++ ) {
237+ if (ctx_item -> ctx .ucp_peers [i ].mkeys [j ].key .rkey != NULL ) {
238+ ucp_rkey_destroy (ctx_item -> ctx .ucp_peers [i ].mkeys [j ].key .rkey ); /* the rkey field itself is left unchanged after destroy */
239+ }
240+ }
241+
242+ del_procs [i ].ep = ctx_item -> ctx .ucp_peers [i ].ucp_conn ; /* hand the endpoint over to the disconnect list */
243+ del_procs [i ].vpid = i ;
244+ ctx_item -> ctx .ucp_peers [i ].ucp_conn = NULL ; /* the context no longer references the endpoint */
245+ }
246+
247+ opal_common_ucx_del_procs_nb (del_procs , nprocs , oshmem_my_proc_id (),
248+ mca_spml_ucx .num_disconnect ,
249+ ctx_item -> ctx .ucp_worker );
250+ free (del_procs );
251+ free (ctx_item -> ctx .ucp_peers ); /* NOTE(review): ucp_peers is not reset to NULL here, so ctx_item keeps a dangling pointer afterwards */
252+ }
253+
227254static int mca_spml_ucx_component_fini (void )
228255{
229256 mca_spml_ucx_ctx_list_item_t * ctx_item , * next ;
230- size_t i , j , nprocs = oshmem_num_procs ();
257+ int fenced = 0 ;
258+ int ret = OSHMEM_SUCCESS ;
231259
232260 opal_progress_unregister (spml_ucx_progress );
233261
234262 if (!mca_spml_ucx .enabled )
235263 return OSHMEM_SUCCESS ; /* never selected.. return success.. */
236264
237265 /* delete context objects from list */
238- OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .ctx_list ),
266+ OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .idle_ctx_list ),
239267 mca_spml_ucx_ctx_list_item_t ) {
240- opal_list_remove_item (& (mca_spml_ucx .ctx_list ), & ctx_item -> super );
268+ _ctx_cleanup (ctx_item );
269+ }
241270
242- opal_common_ucx_del_proc_t * del_procs ;
243- del_procs = malloc (sizeof (* del_procs ) * nprocs );
271+ OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .ctx_list ),
272+ mca_spml_ucx_ctx_list_item_t ) {
273+ _ctx_cleanup (ctx_item );
274+ }
244275
245- for (i = 0 ; i < nprocs ; ++ i ) {
246- for (j = 0 ; j < MCA_MEMHEAP_SEG_COUNT ; j ++ ) {
247- if (ctx_item -> ctx .ucp_peers [i ].mkeys [j ].key .rkey != NULL ) {
248- ucp_rkey_destroy (ctx_item -> ctx .ucp_peers [i ].mkeys [j ].key .rkey );
249- }
250- }
276+ ret = opal_common_ucx_mca_pmix_fence_nb (& fenced );
277+ if (OPAL_SUCCESS != ret ) {
278+ return ret ;
279+ }
251280
252- del_procs [i ].ep = ctx_item -> ctx .ucp_peers [i ].ucp_conn ;
253- del_procs [i ].vpid = i ;
254- ctx_item -> ctx .ucp_peers [i ].ucp_conn = NULL ;
281+ while (!fenced ) {
282+ OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .ctx_list ),
283+ mca_spml_ucx_ctx_list_item_t ) {
284+ ucp_worker_progress (ctx_item -> ctx .ucp_worker );
255285 }
286+ OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .idle_ctx_list ),
287+ mca_spml_ucx_ctx_list_item_t ) {
288+ ucp_worker_progress (ctx_item -> ctx .ucp_worker );
289+ }
290+ ucp_worker_progress (mca_spml_ucx_ctx_default .ucp_worker );
291+ }
256292
257- opal_common_ucx_del_procs (del_procs , nprocs , oshmem_my_proc_id (),
258- mca_spml_ucx .num_disconnect ,
259- ctx_item -> ctx .ucp_worker );
260- free (del_procs );
261- free (ctx_item -> ctx .ucp_peers );
262-
293+ /* delete all workers */
294+ OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .idle_ctx_list ),
295+ mca_spml_ucx_ctx_list_item_t ) {
296+ opal_list_remove_item (& (mca_spml_ucx .idle_ctx_list ), & ctx_item -> super );
297+ ucp_worker_destroy (ctx_item -> ctx .ucp_worker );
298+ OBJ_RELEASE (ctx_item );
299+ }
300+ OPAL_LIST_FOREACH_SAFE (ctx_item , next , & (mca_spml_ucx .ctx_list ),
301+ mca_spml_ucx_ctx_list_item_t ) {
302+ opal_list_remove_item (& (mca_spml_ucx .ctx_list ), & ctx_item -> super );
263303 ucp_worker_destroy (ctx_item -> ctx .ucp_worker );
264304 OBJ_RELEASE (ctx_item );
265305 }
@@ -271,6 +311,7 @@ static int mca_spml_ucx_component_fini(void)
271311 mca_spml_ucx .enabled = false; /* not anymore */
272312
273313 OBJ_DESTRUCT (& (mca_spml_ucx .ctx_list ));
314+ OBJ_DESTRUCT (& (mca_spml_ucx .idle_ctx_list ));
274315 SHMEM_MUTEX_DESTROY (mca_spml_ucx .internal_mutex );
275316
276317 if (mca_spml_ucx .ucp_context ) {
0 commit comments