2222 * and Technology (RIST). All rights reserved.
2323 * Copyright (c) 2014-2020 Intel, Inc. All rights reserved.
2424 * Copyright (c) 2015 Mellanox Technologies. All rights reserved.
25- * Copyright (c) 2017 IBM Corporation. All rights reserved.
25+ * Copyright (c) 2017-2022 IBM Corporation. All rights reserved.
2626 * Copyright (c) 2021 Nanook Consulting. All rights reserved.
2727 * Copyright (c) 2018-2022 Triad National Security, LLC. All rights
2828 * reserved.
5757
5858#include "ompi/runtime/params.h"
5959
/**
 * One row of the MPI_COMM_TYPE_HW_GUIDED lookup table.
 *
 * Pairs a string that may be supplied as the value of the
 * "mpi_hw_resource_type" info key with the split type that
 * ompi_comm_split_type() substitutes when that string is matched.
 */
typedef struct ompi_comm_split_type_hw_guided_t {
    /** Info value string to match; NULL marks the end of a table. */
    const char *info_value;
    /** Split type used in place of MPI_COMM_TYPE_HW_GUIDED on a match. */
    int split_type;
} ompi_comm_split_type_hw_guided_t;
65+
66+ static const ompi_comm_split_type_hw_guided_t ompi_comm_split_type_hw_guided_support [] = {
67+ {.info_value = "mpi_shared_memory" , .split_type = MPI_COMM_TYPE_SHARED },
68+ {.info_value = "hwthread" , .split_type = OMPI_COMM_TYPE_HWTHREAD },
69+ {.info_value = "core" , .split_type = OMPI_COMM_TYPE_CORE },
70+ {.info_value = "l1cache" , .split_type = OMPI_COMM_TYPE_L1CACHE },
71+ {.info_value = "l2cache" , .split_type = OMPI_COMM_TYPE_L2CACHE },
72+ {.info_value = "l3cache" , .split_type = OMPI_COMM_TYPE_L3CACHE },
73+ {.info_value = "socket" , .split_type = OMPI_COMM_TYPE_SOCKET },
74+ {.info_value = "numanode" , .split_type = OMPI_COMM_TYPE_NUMA },
75+ {.info_value = "board" , .split_type = OMPI_COMM_TYPE_BOARD },
76+ {.info_value = "host" , .split_type = OMPI_COMM_TYPE_HOST },
77+ {.info_value = "cu" , .split_type = OMPI_COMM_TYPE_CU },
78+ {.info_value = "cluster" , .split_type = OMPI_COMM_TYPE_CLUSTER },
79+ {.info_value = NULL },
80+ };
81+
6082/*
6183** sort-function for MPI_Comm_split
6284*/
@@ -764,6 +786,15 @@ static int ompi_comm_split_type_get_part (ompi_group_t *group, const int split_t
764786 case OMPI_COMM_TYPE_CLUSTER :
765787 include = OPAL_PROC_ON_LOCAL_CLUSTER (locality );
766788 break ;
789+ case MPI_COMM_TYPE_HW_GUIDED :
790+ case MPI_COMM_TYPE_HW_UNGUIDED :
791+ /*
792+ * MPI_COMM_TYPE_HW_(UN)GUIDED handled in calling function.
793+ * We should not get here as the split type will be changed
794+ * at a higher level.
795+ */
796+ opal_output (0 , "Error: in ompi_comm_split_type_get_part() unexpected split_type=%d" , split_type );
797+ return OMPI_ERR_BAD_PARAM ;
767798 }
768799
769800 if (include ) {
@@ -837,8 +868,9 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
837868 ompi_communicator_t * newcomp = MPI_COMM_NULL ;
838869 int my_size , my_rsize = 0 , mode , inter ;
839870 int * lranks = NULL , * rranks = NULL ;
840- int global_split_type , ok , tmp [4 ];
871+ int global_split_type , ok , tmp [6 ];
841872 int rc ;
873+ int orig_split_type = split_type ;
842874
843875 /* silence clang warning. newcomm should never be NULL */
844876 if (OPAL_UNLIKELY (NULL == newcomm )) {
@@ -847,14 +879,58 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
847879
848880 inter = OMPI_COMM_IS_INTER (comm );
849881
882+ /* Step 0: Convert MPI_COMM_TYPE_HW_GUIDED to the internal type */
883+ if (MPI_COMM_TYPE_HW_GUIDED == split_type ) {
884+ int flag ;
885+ opal_cstring_t * value = NULL ;
886+
887+ opal_info_get (info , "mpi_hw_resource_type" , & value , & flag );
888+ /* If key is not in the 'info', then return MPI_COMM_NULL.
889+ * This is caught at the MPI interface level, but it doesn't hurt to
890+ * check it again.
891+ */
892+ if (!flag ) {
893+ * newcomm = MPI_COMM_NULL ;
894+ return OMPI_SUCCESS ;
895+ }
896+
897+ /* Verify the value associated with the "mpi_hw_resource_type" key
898+ * - is supported, and
899+ * - is the same value at all ranks
900+ *
901+ * If not supported, then return MPI_COMM_NULL.
902+ * If not the same at all ranks, throw an error.
903+ */
904+ flag = 0 ;
905+ for (int i = 0 ; ompi_comm_split_type_hw_guided_support [i ].info_value ; ++ i ) {
906+ if (0 == strncasecmp (value -> string , ompi_comm_split_type_hw_guided_support [i ].info_value , strlen (ompi_comm_split_type_hw_guided_support [i ].info_value ))) {
907+ split_type = ompi_comm_split_type_hw_guided_support [i ].split_type ;
908+ flag = 1 ;
909+ break ;
910+ }
911+ }
912+ /* If not supported, then return MPI_COMM_NULL. */
913+ if (0 == flag ) {
914+ * newcomm = MPI_COMM_NULL ;
915+ return OMPI_SUCCESS ;
916+ }
917+ }
918+
850919 /* Step 1: verify all ranks have supplied the same value for split type. All split types
851920 * must be the same or MPI_UNDEFINED (which is negative). */
852- tmp [0 ] = split_type ;
853- tmp [1 ] = - split_type ;
921+ tmp [0 ] = orig_split_type ;
922+ tmp [1 ] = - orig_split_type ;
854923 tmp [2 ] = key ;
855924 tmp [3 ] = - key ;
925+ /* For MPI_COMM_TYPE_HW_GUIDED, verify all ranks have supplied the same
926+ * split_type (represented by orig_split_type) and info 'value' (represented by split_type).
927+ *
928+ * For split_type != MPI_COMM_TYPE_HW_GUIDED then orig_split_type == split_type.
929+ */
930+ tmp [4 ] = split_type ;
931+ tmp [5 ] = - split_type ;
856932
857- rc = comm -> c_coll -> coll_allreduce (MPI_IN_PLACE , & tmp , 4 , MPI_INT , MPI_MAX , comm ,
933+ rc = comm -> c_coll -> coll_allreduce (MPI_IN_PLACE , & tmp , 6 , MPI_INT , MPI_MAX , comm ,
858934 comm -> c_coll -> coll_allreduce_module );
859935 if (OPAL_UNLIKELY (OMPI_SUCCESS != rc )) {
860936 return rc ;
@@ -899,6 +975,26 @@ int ompi_comm_split_type (ompi_communicator_t *comm, int split_type, int key,
899975 return OMPI_SUCCESS ;
900976 }
901977
978+ /* MPI_COMM_TYPE_HW_GUIDED: Check if 'value' is the same at all ranks */
979+ if (tmp [4 ] != - tmp [5 ]) {
980+ if (0 == ompi_comm_rank (comm )) {
981+ opal_output (0 , "Error: Mismatched info values for MPI_COMM_TYPE_HW_GUIDED" );
982+ }
983+ return OMPI_ERR_BAD_PARAM ;
984+ }
985+
986+ /* TODO: Make this better...
987+ *
988+ * See Example 7.4 in the MPI 4.0 standard for example usage.
989+ *
990+ * Stage 0: Recognized, but not implemented.
991+ * Stage 1: Do better than that
992+ */
993+ if (MPI_COMM_TYPE_HW_UNGUIDED == global_split_type ) {
994+ * newcomm = MPI_COMM_NULL ;
995+ return OMPI_SUCCESS ;
996+ }
997+
902998 /* Step 2: Build potential communicator groups. If any ranks will not be part of
903999 * the ultimate communicator we will drop them later. This saves doing an extra
9041000 * allgather on the whole communicator. By using ompi_comm_split() later only
0 commit comments