@@ -292,6 +292,17 @@ static void mca_spml_ucx_rkey_store_put(mca_spml_ucx_rkey_store_t *store,
292292 ucp_rkey_destroy (rkey );
293293}
294294
295+ static void mca_spml_ucx_team_world_init ()
296+ {
297+ int rc = mca_spml_ucx_team_split_strided (NULL , 0 , 1 , oshmem_num_procs (), NULL , 0 ,
298+ & SHMEM_TEAM_WORLD );
299+
300+ if (rc != OSHMEM_SUCCESS ) {
301+ SPML_UCX_ERROR ("mca_spml_ucx_team_split_strided failed (SHMEM_TEAM_WORLD creation)" );
302+ oshmem_shmem_abort (-1 );
303+ }
304+ }
305+
295306int mca_spml_ucx_enable (bool enable )
296307{
297308 SPML_UCX_VERBOSE (50 , "*** ucx ENABLED ****" );
@@ -315,8 +326,7 @@ void mca_spml_ucx_peer_mkey_cache_init(mca_spml_ucx_ctx_t *ucx_ctx, int pe)
315326int mca_spml_ucx_peer_mkey_cache_add (ucp_peer_t * ucp_peer , int index )
316327{
317328 /* Allocate an array to hold the pointers to the ucx_cached_mkey */
318- if (index >= (int )ucp_peer -> mkeys_cnt ){
319- int old_size = ucp_peer -> mkeys_cnt ;
329+ if (index >= (int )ucp_peer -> mkeys_cnt ){ int old_size = ucp_peer -> mkeys_cnt ;
320330 ucp_peer -> mkeys_cnt = index + 1 ;
321331 ucp_peer -> mkeys = realloc (ucp_peer -> mkeys , sizeof (ucp_peer -> mkeys [0 ]) * ucp_peer -> mkeys_cnt );
322332 if (NULL == ucp_peer -> mkeys ) {
@@ -451,6 +461,14 @@ int mca_spml_ucx_ctx_mkey_del(mca_spml_ucx_ctx_t *ucx_ctx, int pe, uint32_t segn
451461 return OSHMEM_SUCCESS ;
452462}
453463
464+ static void mca_spml_ucx_team_world_destroy ()
465+ {
466+ if (SHMEM_TEAM_WORLD != NULL ) {
467+ mca_spml_ucx_team_destroy (SHMEM_TEAM_WORLD );
468+ SHMEM_TEAM_WORLD = NULL ;
469+ }
470+ }
471+
454472int mca_spml_ucx_del_procs (oshmem_group_t * group , size_t nprocs )
455473{
456474 size_t ucp_workers = mca_spml_ucx .ucp_workers ;
@@ -460,6 +478,8 @@ int mca_spml_ucx_del_procs(oshmem_group_t* group, size_t nprocs)
460478
461479 oshmem_shmem_barrier ();
462480
481+ mca_spml_ucx_team_world_destroy ();
482+
463483 if (!mca_spml_ucx_ctx_default .ucp_peers ) {
464484 return OSHMEM_SUCCESS ;
465485 }
@@ -1163,8 +1183,6 @@ int mca_spml_ucx_ctx_create(long options, shmem_ctx_t *ctx)
11631183 }
11641184 SHMEM_MUTEX_UNLOCK (mca_spml_ucx .internal_mutex );
11651185 }
1166-
1167- mca_spml_ucx_team_world_init ();
11681186
11691187 (* ctx ) = (shmem_ctx_t )ucx_ctx ;
11701188 return OSHMEM_SUCCESS ;
@@ -1183,8 +1201,6 @@ void mca_spml_ucx_ctx_destroy(shmem_ctx_t ctx)
11831201 opal_progress_unregister (spml_ucx_ctx_progress );
11841202 }
11851203
1186- mca_spml_ucx_team_world_destroy ();
1187-
11881204 SHMEM_MUTEX_UNLOCK (mca_spml_ucx .internal_mutex );
11891205}
11901206
@@ -1754,25 +1770,6 @@ size_t mca_spml_ucx_test_some_vector(void *ivars, int cmp,
17541770 RUNTIME_SHMEM_NOT_IMPLEMENTED_API_ABORT_RET_SIZE_T ();
17551771}
17561772
1757- void mca_spml_ucx_team_world_init ()
1758- {
1759- int rc = mca_spml_ucx_team_split_strided (NULL , 0 , 1 , oshmem_num_procs (), NULL , 0 ,
1760- & SHMEM_TEAM_WORLD );
1761-
1762- if (rc != OSHMEM_SUCCESS ) {
1763- SPML_UCX_ERROR ("mca_spml_ucx_team_split_strided failed (SHMEM_TEAM_WORLD creation)" );
1764- oshmem_shmem_abort (-1 );
1765- }
1766- }
1767-
1768- void mca_spml_ucx_team_world_destroy ()
1769- {
1770- if (SHMEM_TEAM_WORLD != NULL ) {
1771- mca_spml_ucx_team_destroy (SHMEM_TEAM_WORLD );
1772- SHMEM_TEAM_WORLD = NULL ;
1773- }
1774- }
1775-
17761773/* This routine is not implemented */
17771774int mca_spml_ucx_team_sync (shmem_team_t team )
17781775{
@@ -1820,16 +1817,16 @@ int mca_spml_ucx_team_translate_pe(shmem_team_t src_team, int src_pe,
18201817 mca_spml_ucx_team_t * ucx_dest_team = (mca_spml_ucx_team_t * ) dest_team ;
18211818 int global_pe ;
18221819
1823- if (src_pe == SPML_UCX_PE_NOT_IN_TEAM || (src_team == dest_team )) {
1820+ if (( src_pe == SPML_UCX_PE_NOT_IN_TEAM ) || (src_team == dest_team )) {
18241821 return src_pe ;
18251822 }
18261823
1827- if (src_team == dest_team ) {
1828- return src_pe ;
1829- }
1830-
18311824 global_pe = ucx_src_team -> start + src_pe * ucx_src_team -> stride ;
18321825
1826+ SPML_UCX_WARN ("team_translate_pe(src_team=%p, src_pe=%d, dest_team=%p), global pe: %d, "
1827+ "src_team->start: %d, src pe: %d, src_team->stride: %d" ,
1828+ src_team , src_pe , dest_team , global_pe , ucx_src_team -> start , src_pe , ucx_src_team -> stride );
1829+
18331830 if (dest_team == SHMEM_TEAM_WORLD ) {
18341831 return global_pe ;
18351832 }
@@ -1849,32 +1846,44 @@ int mca_spml_ucx_team_split_strided(shmem_team_t parent_team, int start, int
18491846 mca_spml_ucx_team_t * ucx_parent_team ;
18501847 mca_spml_ucx_team_t * ucx_new_team ;
18511848 int my_pe ;
1852- int n_pes ;
18531849
18541850 SPML_UCX_ASSERT (((start + size * stride ) <= oshmem_num_procs ()) && (start < size ) && (stride > 0 ) && (size > 0 ));
18551851
1852+ SPML_UCX_WARN ("team_split_strided(parent_team=%p, start=%d, stride=%d, size=%d, config=%p, "
1853+ "config_mask=%ld, new_team=%p)" ,
1854+ parent_team , start , stride , size , config , config_mask , new_team );
1855+
1856+ ucx_new_team = (mca_spml_ucx_team_t * )malloc (sizeof (mca_spml_ucx_team_t ));
1857+ ucx_new_team -> start = start ;
1858+ ucx_new_team -> stride = stride ;
1859+
18561860 if (parent_team == NULL ) {
18571861 my_pe = shmem_my_pe ();
18581862 } else {
18591863 ucx_parent_team = (mca_spml_ucx_team_t * ) parent_team ;
1864+
18601865 SPML_UCX_VALIDATE_TEAM (parent_team );
18611866 if (mca_spml_ucx_is_pe_in_strided_team (ucx_parent_team -> my_pe , start , stride , size )) {
18621867 my_pe = (ucx_parent_team -> my_pe - start ) / stride ;
1868+ SPML_UCX_WARN ("split: my_pe at parent team: %d, start: %d, stride: %d, size: %d, "
1869+ "my_pe at new team: %d" , ucx_parent_team -> my_pe , start , stride , size , my_pe );
18631870 } else {
18641871 /* not in team, according to spec it should be SHMEM_TEAM_INVALID but its value is NULL which
18651872 can be also interpreted as 0 (first pe), therefore -1 is used */
1873+
1874+ SPML_UCX_WARN ("pe #%d is not part of the new team" , ucx_parent_team -> my_pe );
18661875 my_pe = SPML_UCX_PE_NOT_IN_TEAM ;
18671876 }
1877+
1878+ /* In order to simplify pe translations start and stride are calculated with respect to
1879+ * world_team */
1880+ ucx_new_team -> start += ucx_parent_team -> start ;
1881+ ucx_new_team -> stride *= ucx_parent_team -> stride ;
18681882 }
18691883
1870- ucx_new_team = (mca_spml_ucx_team_t * )malloc (sizeof (mca_spml_ucx_team_t ));
18711884 ucx_new_team -> n_pes = size ;
18721885 ucx_new_team -> my_pe = my_pe ;
18731886
1874- /* In order to simplify pe translations start and stride are calculated with respect to
1875- * world_team */
1876- ucx_new_team -> start = ucx_parent_team -> start + start ;
1877- ucx_new_team -> stride = ucx_parent_team -> stride * stride ;
18781887 ucx_new_team -> config = calloc (1 , sizeof (mca_spml_ucx_team_config_t ));
18791888
18801889 if (config != NULL ) {
0 commit comments