diff --git a/ompi/communicator/comm.c b/ompi/communicator/comm.c index bfb16202e43..aae8e94cf5a 100644 --- a/ompi/communicator/comm.c +++ b/ompi/communicator/comm.c @@ -26,7 +26,7 @@ * Copyright (c) 2021 Nanook Consulting. All rights reserved. * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. - * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -56,6 +56,7 @@ #include "ompi/communicator/communicator.h" #include "ompi/mca/pml/pml.h" #include "ompi/request/request.h" +#include "ompi/info/info_memkind.h" #include "ompi/runtime/params.h" @@ -444,6 +445,7 @@ int ompi_comm_create_w_info (ompi_communicator_t *comm, ompi_group_t *group, opa /* Copy info if there is one. */ newcomp->super.s_info = OBJ_NEW(opal_info_t); + ompi_info_memkind_copy_or_set (&comm->super, &newcomp->super, info); if (info) { opal_info_dup(info, &(newcomp->super.s_info)); } @@ -699,8 +701,9 @@ int ompi_comm_split_with_info( ompi_communicator_t* comm, int color, int key, ompi_comm_print_cid (newcomp), ompi_comm_print_cid (comm)); /* Copy info if there is one */ + newcomp->super.s_info = OBJ_NEW(opal_info_t); + ompi_info_memkind_copy_or_set (&comm->super, &newcomp->super, info); if (info) { - newcomp->super.s_info = OBJ_NEW(opal_info_t); opal_info_dup(info, &(newcomp->super.s_info)); } @@ -991,6 +994,7 @@ static int ompi_comm_split_type_core(ompi_communicator_t *comm, ompi_comm_assert_subscribe (newcomp, OMPI_COMM_ASSERT_LAZY_BARRIER); ompi_comm_assert_subscribe (newcomp, OMPI_COMM_ASSERT_ACTIVE_POLL); + ompi_info_memkind_copy_or_set (&comm->super, &newcomp->super, info); if (info) { opal_infosubscribe_change_info(&newcomp->super, info); } @@ -1344,6 +1348,7 @@ int ompi_comm_dup_with_info ( ompi_communicator_t * comm, opal_info_t *info, omp // Copy info if there is one. 
ompi_comm_assert_subscribe (newcomp, OMPI_COMM_ASSERT_LAZY_BARRIER); ompi_comm_assert_subscribe (newcomp, OMPI_COMM_ASSERT_ACTIVE_POLL); + ompi_info_memkind_copy_or_set (&comm->super, &newcomp->super, info); if (info) { opal_infosubscribe_change_info(&newcomp->super, info); } @@ -1434,6 +1439,7 @@ static int ompi_comm_idup_internal (ompi_communicator_t *comm, ompi_group_t *gro { ompi_communicator_t *newcomp = context->newcomp; newcomp->super.s_info = OBJ_NEW(opal_info_t); + ompi_info_memkind_copy_or_set (&comm->super, &newcomp->super, info); if (info) { opal_info_dup(info, &(newcomp->super.s_info)); } @@ -1588,6 +1594,7 @@ int ompi_comm_create_from_group (ompi_group_t *group, const char *tag, opal_info if (NULL == newcomp->super.s_info) { return OMPI_ERR_OUT_OF_RESOURCE; } + ompi_info_memkind_copy_or_set (&group->grp_instance->super, &newcomp->super, info); /* activate communicator and init coll-module. use the group allreduce implementation as * no collective module has yet been selected. the tag does not matter as any tag will @@ -1885,6 +1892,7 @@ int ompi_intercomm_create_from_groups (ompi_group_t *local_group, int local_lead // Copy info if there is one. newcomp->super.s_info = OBJ_NEW(opal_info_t); + ompi_info_memkind_copy_or_set (&local_comm->super, &newcomp->super, info); if (info) { opal_info_dup(info, &(newcomp->super.s_info)); } diff --git a/ompi/communicator/comm_init.c b/ompi/communicator/comm_init.c index 498bf4a1e70..82edbb03631 100644 --- a/ompi/communicator/comm_init.c +++ b/ompi/communicator/comm_init.c @@ -25,7 +25,7 @@ * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2018-2024 Triad National Security, LLC. All rights * reserved. - * Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. + * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All rights reserved. * Copyright (c) 2023 NVIDIA Corporation. All rights reserved. 
* $COPYRIGHT$ * @@ -53,6 +53,7 @@ #include "ompi/dpm/dpm.h" #include "ompi/memchecker.h" #include "ompi/instance/instance.h" +#include "ompi/info/info_memkind.h" /* ** Table for Fortran <-> C communicator handle conversion @@ -266,6 +267,7 @@ int ompi_comm_init_mpi3 (void) free(str); } } + /* Setup MPI_COMM_SELF */ OBJ_CONSTRUCT(&ompi_mpi_comm_self, ompi_communicator_t); assert(ompi_mpi_comm_self.comm.c_f_to_c_index == 1); @@ -300,6 +302,15 @@ int ompi_comm_init_mpi3 (void) MPI_COMM_SELF, the keyhash will automatically be created. */ ompi_mpi_comm_self.comm.c_keyhash = NULL; + char *memkind_requested = getenv ("OMPI_MCA_mpi_memory_alloc_kinds"); + if (NULL != memkind_requested) { + char *memkind_provided; + + ompi_info_memkind_process (memkind_requested, &memkind_provided); + opal_infosubscribe_subscribe (&ompi_mpi_comm_world.comm.super, "mpi_memory_alloc_kinds", memkind_provided, ompi_info_memkind_cb); + opal_infosubscribe_subscribe (&ompi_mpi_comm_self.comm.super, "mpi_memory_alloc_kinds", memkind_provided, ompi_info_memkind_cb); + } + /* * finally here we set the predefined attribute keyvals */ diff --git a/ompi/file/file.c b/ompi/file/file.c index 9026fbea751..351aa00b705 100644 --- a/ompi/file/file.c +++ b/ompi/file/file.c @@ -18,6 +18,7 @@ * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2024 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -34,7 +35,7 @@ #include "ompi/runtime/params.h" #include "ompi/mca/io/base/base.h" #include "ompi/info/info.h" - +#include "ompi/info/info_memkind.h" opal_mutex_t ompi_mpi_file_bootstrap_mutex = OPAL_MUTEX_STATIC_INIT; @@ -120,6 +121,7 @@ int ompi_file_open(struct ompi_communicator_t *comm, const char *filename, /* Copy the info for the info layer */ file->super.s_info = OBJ_NEW(opal_info_t); + ompi_info_memkind_copy_or_set (&comm->super, &file->super, info); if (info) { opal_info_dup(info, &(file->super.s_info)); } diff --git a/ompi/info/Makefile.am b/ompi/info/Makefile.am index e4af170dcf8..171d7877185 100644 --- a/ompi/info/Makefile.am +++ b/ompi/info/Makefile.am @@ -21,7 +21,9 @@ # This makefile.am does not stand on its own - it is included from ompi/Makefile.am headers += \ - info/info.h + info/info.h \ + info/info_memkind.h lib@OMPI_LIBMPI_NAME@_la_SOURCES += \ - info/info.c + info/info.c \ + info/info_memkind.c diff --git a/ompi/info/info.c b/ompi/info/info.c index 577910da840..15b4d50033f 100644 --- a/ompi/info/info.c +++ b/ompi/info/info.c @@ -51,6 +51,7 @@ #include "opal/util/info.h" #include "ompi/info/info.h" +#include "ompi/info/info_memkind.h" #include "ompi/runtime/mpiruntime.h" #include "ompi/runtime/params.h" #include "ompi/runtime/ompi_rte.h" @@ -351,6 +352,9 @@ int ompi_mpiinfo_finalize(void) } } + /* Release the array of available memkind objects */ + ompi_info_memkind_free_available(); + /* All done -- destroy the table */ OBJ_DESTRUCT(&ompi_info_f_to_c_table); diff --git a/ompi/info/info_memkind.c b/ompi/info/info_memkind.c new file mode 100644 index 00000000000..d325dd16ded --- /dev/null +++ b/ompi/info/info_memkind.c @@ -0,0 +1,545 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
+ * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#include +#include +#include +#include +#include + +#include "info/info_memkind.h" +#include "opal/util/argv.h" +#include "opal/mca/accelerator/accelerator.h" +#include "opal/mca/accelerator/base/base.h" +#include "opal/util/printf.h" +#include "ompi/errhandler/errcode.h" +#include "ompi/constants.h" + +static opal_mutex_t ompi_info_memkind_mutex = OPAL_MUTEX_STATIC_INIT; +static ompi_memkind_t *ompi_info_memkind_available; +static int ompi_info_memkind_num_available = 0; + +#define FREE_STR_ARR(_str_arr) { \ + int _i=0; \ + char *_t = _str_arr[_i]; \ + while (NULL != _t) { \ + free (_t); \ + _t = _str_arr[++_i]; \ + } \ + free (_str_arr); \ +} + +static int ompi_info_memkind_get_num_unique (char **memkind_strs, int num_max) +{ + int iter = 0, pos = 0; + int num_unique = 0; + char **tmp_arr; + + tmp_arr = (char**) malloc ( (num_max + 1) * sizeof(char*)); + + char *m = memkind_strs[iter]; + while (m != NULL) { + char **tmp_str = opal_argv_split (m, ':'); + tmp_arr[pos++] = strdup (tmp_str[0]); + + FREE_STR_ARR(tmp_str); + m = memkind_strs[++iter]; + } + tmp_arr[pos] = NULL; + + iter = 0; + m = tmp_arr[iter]; + while (NULL != m) { + bool already_seen = false; + + for (int i = 0; i < iter; i++) { + if (!strncmp(m, tmp_arr[i], strlen(m))) { + already_seen = true; + break; + } + } + if (!already_seen) { + num_unique++; + } + m = tmp_arr[++iter]; + } + FREE_STR_ARR(tmp_arr); + + return num_unique; +} + +#if 0 +static void ompi_info_memkind_dump (const char *var_name, int num_memkinds, ompi_memkind_t *memkinds) +{ + for (int i = 0; i < num_memkinds; i++) { + printf("[%d] %s memkind[%d].name: %s ", getpid(), var_name, i, memkinds[i].im_name); + if (memkinds[i].im_num_restrictors > 0) { + printf("restrictors: "); + for (int j = 0; j < memkinds[i].im_num_restrictors; j++) { + printf("%c %s", (j == 0 ? 
' ': ','), memkinds[i].im_restrictors[j]);
+            }
+        } else {
+            printf("no restrictors.");
+        }
+        printf("\n");
+    }
+}
+#endif
+
+/*
+** Parse a memory-kind string into an array of ompi_memkind_t.
+**
+** @param[IN]:  memkind_str   comma-separated list of "kind" or "kind:restrictor" entries
+** @param[OUT]: num_memkinds  number of valid entries stored in *memkinds
+** @param[OUT]: memkinds      newly allocated array (NULL if nothing could be
+**                            extracted); released by the caller with
+**                            ompi_info_memkind_free()
+*/
+static void ompi_info_memkind_extract (const char* memkind_str, int *num_memkinds, ompi_memkind_t **memkinds)
+{
+    /* The memkind string is a comma-separated list of memkinds, which can have two forms:
+    **  - standalone memkind type, which implies that all restrictors of the memkind are requested
+    **    (or looking at it the other way around, no restrictions are imposed on that memory kind)
+    **  - memkind:restrictor
+    ** The same memkind type can appear multiple times, e.g.
+    **    memkind_a:restrictor_1,memkind_a:restrictor_2;
+    */
+    ompi_memkind_t *memkind_arr = NULL;
+    int num_extracted = 0;
+
+    /* Separate requested_str into an array of individual entries */
+    char **memkind_combos = opal_argv_split(memkind_str, ',');
+    if (NULL == memkind_combos) {
+        /* Nothing to parse (e.g. empty string): report an empty result */
+        goto err_exit;
+    }
+
+    int max_num_memkinds = opal_argv_count(memkind_combos);
+    int num_unique_memkinds = ompi_info_memkind_get_num_unique (memkind_combos, max_num_memkinds);
+    if (num_unique_memkinds <= 0) {
+        goto err_exit;
+    }
+
+    /* calloc so that every entry starts with a NULL name and zero restrictors,
+    ** which keeps ompi_info_memkind_free() safe on a partially filled array */
+    memkind_arr = (ompi_memkind_t *) calloc(num_unique_memkinds, sizeof(ompi_memkind_t));
+    if (NULL == memkind_arr) {
+        goto err_exit;
+    }
+
+    for (int iter = 0; NULL != memkind_combos[iter]; iter++) {
+        char **tmp_str = opal_argv_split (memkind_combos[iter], ':');
+        if (NULL == tmp_str) {
+            /* entry consisted only of separators */
+            continue;
+        }
+
+        /* Look for an entry with this name. The comparison deliberately
+        ** mirrors the one in ompi_info_memkind_get_num_unique(), since that
+        ** function determined the size of memkind_arr. */
+        int pos = -1;
+        for (int i = 0; i < num_extracted; i++) {
+            if (!strncmp(tmp_str[0], memkind_arr[i].im_name, strlen(tmp_str[0]))) {
+                pos = i;
+                break;
+            }
+        }
+
+        /* First occurrence of this memkind: open a new slot, but never
+        ** beyond the number of slots that were allocated */
+        if (pos < 0 && num_extracted < num_unique_memkinds) {
+            char *name = strdup (tmp_str[0]);
+            if (NULL != name) {
+                pos = num_extracted++;
+                memkind_arr[pos].im_name = name;
+            }
+        }
+
+        if (pos >= 0) {
+            ompi_memkind_t *mk = &memkind_arr[pos];
+
+            if (NULL == tmp_str[1]) {
+                /* Standalone memkind: all restrictors requested. Release
+                ** restrictors collected so far before resetting the count. */
+                for (int j = 0; j < mk->im_num_restrictors; j++) {
+                    free (mk->im_restrictors[j]);
+                    mk->im_restrictors[j] = NULL;
+                }
+                mk->im_num_restrictors = 0;
+            } else if (mk->im_num_restrictors < OMPI_MAX_NUM_MEMKIND_RESTRICTORS) {
+                char *restrictor = strdup(tmp_str[1]);
+                if (NULL != restrictor) {
+                    mk->im_restrictors[mk->im_num_restrictors++] = restrictor;
+                }
+            }
+            /* else: im_restrictors[] is full. Additional restrictors are
+            ** dropped instead of being written past the end of the array. */
+        }
+
+        opal_argv_free (tmp_str);
+    }
+
+ err_exit:
+    opal_argv_free (memkind_combos);
+    *num_memkinds = num_extracted;
+    *memkinds = memkind_arr;
+
+    return;
+}
+
+/*
+** Return the table of memory kinds supported by this library. The table is
+** built on the first call (under ompi_info_memkind_mutex) and cached in
+** ompi_info_memkind_available; it is released by
+** ompi_info_memkind_free_available().
+**
+** @param[OUT]: num_memkinds  number of entries in the table
+** @param[OUT]: memkinds      pointer to the cached table (not owned by caller)
+**
+** @return: OMPI_SUCCESS, or OMPI_ERROR if the table could not be allocated
+*/
+static int ompi_info_memkind_get_available(int *num_memkinds, ompi_memkind_t **memkinds)
+{
+    int ret = OMPI_SUCCESS;
+    if (ompi_info_memkind_num_available > 0) {
+        goto exit_no_lock;
+    }
+
+    OPAL_THREAD_LOCK (&ompi_info_memkind_mutex);
+    /* re-check: another thread may have built the table in the meantime */
+    if (ompi_info_memkind_num_available > 0) {
+        goto exit;
+    }
+
+    /* "system" and "mpi" are always available; one more entry is added
+    ** if an accelerator component other than "null" was selected */
+    int tmp_num = 2;
+    if (0 != strcmp(opal_accelerator_base_selected_component.base_version.mca_component_name, "null")) {
+        tmp_num++;
+    }
+
+    ompi_info_memkind_available = (ompi_memkind_t *) malloc (tmp_num * sizeof(ompi_memkind_t));
+    if (NULL == ompi_info_memkind_available) {
+        *num_memkinds = 0;
+        *memkinds = NULL;
+        OPAL_THREAD_UNLOCK(&ompi_info_memkind_mutex);
+        return OMPI_ERROR;
+    }
+
+    /* The system and mpi memory kinds are defined in MPI 4.1 section 12.4.3 */
+    ompi_info_memkind_available[0].im_name = strdup ("system");
+    ompi_info_memkind_available[0].im_num_restrictors = 0;
+
+    ompi_info_memkind_available[1].im_name = strdup ("mpi");
+    ompi_info_memkind_available[1].im_num_restrictors = 3;
+    ompi_info_memkind_available[1].im_restrictors[0] = strdup ("alloc_mem");
+    ompi_info_memkind_available[1].im_restrictors[1] = strdup ("win_allocate");
+    ompi_info_memkind_available[1].im_restrictors[2] = strdup ("win_allocate_shared");
+
+    if (tmp_num > 2) {
+        /* in: capacity of im_restrictors[]; out: number of restrictors set */
+        ompi_info_memkind_available[2].im_num_restrictors = OMPI_MAX_NUM_MEMKIND_RESTRICTORS;
+        opal_accelerator.get_memkind (&(ompi_info_memkind_available[2].im_name),
+                                      &(ompi_info_memkind_available[2].im_num_restrictors),
+                                      (char**)ompi_info_memkind_available[2].im_restrictors);
+    }
+    ompi_info_memkind_num_available = tmp_num;
+
+ exit:
+    OPAL_THREAD_UNLOCK(&ompi_info_memkind_mutex);
+ exit_no_lock:
+    *num_memkinds = ompi_info_memkind_num_available;
+    *memkinds = ompi_info_memkind_available;
+    return ret;
+}
+
+static void ompi_info_memkind_free (int num, ompi_memkind_t *memkind_arr)
+{
+    for (int i = 0; i < num; i++) {
+        free (memkind_arr[i].im_name);
+        for (int j = 0; j <
memkind_arr[i].im_num_restrictors; j++) {
+            free (memkind_arr[i].im_restrictors[j]);
+        }
+    }
+    free (memkind_arr);
+}
+
+/*
+** Build the comma-separated string representation of an array of memkinds.
+** A memkind without restrictors is printed as "name", otherwise one
+** "name:restrictor" element is emitted per restrictor.
+**
+** @param[IN]:  num_memkinds  number of entries in memkinds
+** @param[IN]:  memkinds      array of memkinds
+** @param[OUT]: memkind_str   newly allocated string (NULL on allocation
+**                            failure); released by the caller
+*/
+static void ompi_info_memkind_str_create (int num_memkinds, ompi_memkind_t *memkinds, char** memkind_str)
+{
+    int num_elems = 0;
+
+    for (int i = 0; i < num_memkinds; i++) {
+        if (memkinds[i].im_num_restrictors == 0) {
+            num_elems++;
+        } else {
+            num_elems += memkinds[i].im_num_restrictors;
+        }
+    }
+
+    /* one extra slot for the terminating NULL expected by opal_argv_join */
+    char **tmp_str_arr = (char**) malloc ((num_elems+1) * sizeof (*tmp_str_arr));
+    if (NULL == tmp_str_arr) {
+        *memkind_str = NULL;
+        return;
+    }
+
+    int c = 0;
+    for (int i = 0; i < num_memkinds; i++) {
+        if (memkinds[i].im_num_restrictors == 0) {
+            opal_asprintf(&tmp_str_arr[c++], "%s",memkinds[i].im_name);
+        } else {
+            for (int j = 0; j < memkinds[i].im_num_restrictors; j++) {
+                opal_asprintf(&tmp_str_arr[c++], "%s:%s",memkinds[i].im_name,
+                              memkinds[i].im_restrictors[j]);
+            }
+        }
+    }
+    tmp_str_arr[num_elems] = NULL;
+
+    char *tmp_str = opal_argv_join(tmp_str_arr, ',');
+    FREE_STR_ARR (tmp_str_arr);
+
+    *memkind_str = tmp_str;
+    return;
+}
+
+#define COPY_MEMKIND(_to,_from) { \
+    _to.im_name = strdup(_from.im_name); \
+    _to.im_num_restrictors = _from.im_num_restrictors; \
+    for (int _i = 0; _i < _from.im_num_restrictors; _i++) { \
+        _to.im_restrictors[_i] = strdup (_from.im_restrictors[_i]); \
+    } \
+}
+
+/*
+** Compute the list of provided memkinds from the list requested by the user.
+**
+** The result always starts with the "system" and "mpi" memkinds, copied from
+** available_memkinds[0] and available_memkinds[1], followed by every other
+** requested memkind whose name and restrictors are all known.
+**
+** @param[IN]:  num_requested       number of entries in requested_memkinds
+** @param[IN]:  requested_memkinds  memkinds requested by the user
+** @param[IN]:  num_available       number of entries in available_memkinds (>= 2)
+** @param[IN]:  available_memkinds  memkinds supported by the library
+** @param[OUT]: num_provided        number of entries in *provided_memkinds
+** @param[OUT]: provided_memkinds   newly allocated array; released by the
+**                                  caller with ompi_info_memkind_free()
+**
+** @return: OMPI_SUCCESS, OMPI_ERROR if the available list lacks the two
+**          mandatory entries, or OMPI_ERR_OUT_OF_RESOURCE
+*/
+static int ompi_info_memkind_remove_unsupported (int num_requested, ompi_memkind_t *requested_memkinds,
+                                                 int num_available, ompi_memkind_t *available_memkinds,
+                                                 int *num_provided, ompi_memkind_t **provided_memkinds)
+{
+    int pos = 0;
+
+    /* The copies of "system" and "mpi" below rely on these two entries */
+    if (num_available < 2 || NULL == available_memkinds) {
+        return OMPI_ERROR;
+    }
+
+    /* Indices of accepted requests. Allocate at least one element so that
+    ** an empty request cannot be mistaken for an allocation failure. */
+    int *apos = malloc ((num_requested > 0 ? num_requested : 1) * sizeof(int));
+    if (NULL == apos) {
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    /*
+    ** Check whether we support the memkinds requested by the user.
+    */
+    for (int i = 0; i < num_requested; i++) {
+        const char *req_name = requested_memkinds[i].im_name;
+        bool found_name = false;
+        bool found_all_requested_restrictors = true;
+        int j;
+
+        /* "system" and "mpi" are always added to the list of supported
+        ** memory_alloc_kinds below, so there is nothing to check here */
+        if (0 == strcmp(req_name, "system") || 0 == strcmp(req_name, "mpi")) {
+            continue;
+        }
+
+        // Check for memory_alloc_kind name first
+        for (j = 0; j < num_available; j++) {
+            if (NULL != available_memkinds[j].im_name &&
+                0 == strcmp(req_name, available_memkinds[j].im_name)) {
+                found_name = true;
+                break;
+            }
+        }
+        if (!found_name) {
+            continue;
+        }
+
+        // Check whether we recognize all restrictors requested by user for
+        // this memory_alloc_kind
+        for (int k = 0; k < requested_memkinds[i].im_num_restrictors; k++) {
+            /* must be reset for every requested restrictor */
+            bool found_this_restrictor = false;
+            for (int l = 0; l < available_memkinds[j].im_num_restrictors; l++) {
+                if (0 == strcmp(requested_memkinds[i].im_restrictors[k],
+                                available_memkinds[j].im_restrictors[l])) {
+                    found_this_restrictor = true;
+                    break;
+                }
+            }
+            if (!found_this_restrictor) {
+                found_all_requested_restrictors = false;
+                break;
+            }
+        }
+        if (found_all_requested_restrictors) {
+            apos[pos++] = i;
+        }
+    }
+
+    // "system" and "mpi" memkinds are supported, even if not requested by user
+    ompi_memkind_t *final = (ompi_memkind_t*) malloc ((pos + 2) * sizeof(ompi_memkind_t));
+    if (NULL == final) {
+        free (apos);
+        return OMPI_ERR_OUT_OF_RESOURCE;
+    }
+
+    /* available_memkinds[0] is "system" and available_memkinds[1] is "mpi",
+    ** see ompi_info_memkind_get_available() */
+    int offset = 0;
+    COPY_MEMKIND(final[offset], available_memkinds[0]);
+    offset++;
+    COPY_MEMKIND(final[offset], available_memkinds[1]);
+    offset++;
+
+    for (int i = 0; i < pos; i++) {
+        COPY_MEMKIND (final[offset], requested_memkinds[apos[i]]);
+        offset++;
+    }
+    free (apos);
+
+    /* report exactly the number of entries that were filled in */
+    *num_provided = offset;
+    *provided_memkinds = final;
+    return OMPI_SUCCESS;
+}
+
+/*
+** Check whether every memkind (and every restrictor) listed in subset also
+** appears in superset.
+**
+** @return: true if subset is contained in superset, false otherwise
+*/
+static bool ompi_info_memkind_is_subset (int num_subset, ompi_memkind_t *subset,
+                                         int num_superset, ompi_memkind_t *superset)
+{
+    bool ret = true;
+
+    for (int i = 0; i < num_subset; i++) {
+        bool found_name = false;
+        int j = -1;
+
+        // Check for memory_alloc_kind name first
+        for (j = 0; j < num_superset; j++) {
+            if (0 == strcmp(subset[i].im_name, superset[j].im_name)) {
+                found_name = true;
+                break;
+            }
+        }
+        if (found_name) {
+            /* Check whether all restrictors listed in the subset are also
+            ** present in the superset. Note, that the superset might not
+            ** have any restrictors set, in which case all restrictors are accepted
+            */
+            if (0 == superset[j].im_num_restrictors) {
+                continue;
+            }
+            for (int k = 0; k < subset[i].im_num_restrictors; k++) {
+                bool found_this_restrictor = false;
+                for (int l = 0; l < superset[j].im_num_restrictors; l++) {
+                    if (0 == strcmp(subset[i].im_restrictors[k], superset[j].im_restrictors[l])) {
+                        found_this_restrictor = true;
+                        break;
+                    }
+                }
+                if (!found_this_restrictor) {
+                    ret = false;
+                    goto exit;
+                }
+            }
+        } else {
+            ret = false;
+            goto exit;
+        }
+    }
+
+ exit:
+    return ret;
+}
+
+static bool ompi_info_memkind_validate (const char *assert_str, const char *parent_str)
+{
+    int num_assert_memkinds = 0, num_parent_memkinds = 0;
+    ompi_memkind_t *assert_memkinds = NULL;
+    ompi_memkind_t *parent_memkinds = NULL;
+    bool ret;
+
+    ompi_info_memkind_extract (assert_str, &num_assert_memkinds, &assert_memkinds);
+    ompi_info_memkind_extract (parent_str, &num_parent_memkinds, &parent_memkinds);
+
+
ret = ompi_info_memkind_is_subset (num_assert_memkinds, assert_memkinds, + num_parent_memkinds, parent_memkinds); + + if (NULL != assert_memkinds) { + ompi_info_memkind_free(num_assert_memkinds, assert_memkinds); + } + if (NULL != parent_memkinds) { + ompi_info_memkind_free(num_parent_memkinds, parent_memkinds); + } + + return ret; +} + + +int ompi_info_memkind_process (const char* requested_str, char **provided_str) +{ + int err; + char *tmp_str = NULL; + + int num_requested_memkinds, num_available_memkinds, num_provided_memkinds; + ompi_memkind_t *requested_memkinds = NULL ; + ompi_memkind_t *available_memkinds = NULL; + ompi_memkind_t *provided_memkinds = NULL; + + if (NULL == requested_str) { + *provided_str = NULL; + return OMPI_SUCCESS; + } + + ompi_info_memkind_extract (requested_str, &num_requested_memkinds, &requested_memkinds); + err = ompi_info_memkind_get_available (&num_available_memkinds, &available_memkinds); + if (OMPI_SUCCESS != err) { + goto exit; + } + + err = ompi_info_memkind_remove_unsupported (num_requested_memkinds, requested_memkinds, + num_available_memkinds, available_memkinds, + &num_provided_memkinds, &provided_memkinds); + if (OMPI_SUCCESS != err) { + goto exit; + } + + ompi_info_memkind_str_create(num_provided_memkinds, provided_memkinds, &tmp_str); + + exit: + if (NULL != requested_memkinds) { + ompi_info_memkind_free(num_requested_memkinds, requested_memkinds); + } + if (NULL != provided_memkinds) { + ompi_info_memkind_free(num_provided_memkinds, provided_memkinds); + } + // Don't free the available_memkinds, they will be released in info_finalize; + + *provided_str = tmp_str; + return err; +} + +const char *ompi_info_memkind_cb (opal_infosubscriber_t *obj, const char *key, const char *value) +{ + char *result; + ompi_info_memkind_process (value, &result); + return result; +} + +/* +** Algorithm is a bit convoluted: +** +** - retrieve mpi_memory_alloc_kinds from parent object. 
+** - if info object passed in as argument to this routine contains
+**   mpi_assert_memory_alloc_kinds key/value pair:
+**     - validate that we recognize all memory kinds listed
+**     - if that is the case, use the value of mpi_assert_memory_alloc_kinds
+**       when setting mpi_memory_alloc_kinds on the child object
+**     - else ignore the mpi_assert_memory_alloc_kinds. (Quote:
+**       "If the MPI library does not support one or more of the allocation kinds associated
+**       with the mpi_assert_memory_alloc_kinds info key, it will ignore this info key".
+**       So we are supposed to drop the entire key, not just the memory kinds that we did
+**       not recognize.)
+** - else use the same memkinds as in mpi_memory_alloc_kinds on the parent object on the
+**   child object (i.e. we just copy it over)
+**
+** To summarize, the value of one info key (mpi_assert_memory_alloc_kinds) can influence the
+** value of another info key (mpi_memory_alloc_kinds).
+**
+** Nothing is set on the child if the parent has no info object or does not
+** carry the mpi_memory_alloc_kinds key. Always returns OMPI_SUCCESS.
+*/
+int ompi_info_memkind_copy_or_set (opal_infosubscriber_t *parent, opal_infosubscriber_t *child,
+                                   opal_info_t *info)
+{
+    opal_cstring_t *parent_val = NULL;
+    opal_cstring_t *assert_val = NULL;
+    char *final_str = NULL;
+    bool have_assert = false;
+    int flag = 0;
+
+    /* A parent without an info object (e.g. an instance that was created
+    ** without one) has no memory kinds to inherit */
+    if (NULL == parent || NULL == parent->s_info) {
+        return OMPI_SUCCESS;
+    }
+
+    opal_info_get(parent->s_info, "mpi_memory_alloc_kinds", &parent_val, &flag);
+    if (0 == flag) {
+        return OMPI_SUCCESS;
+    }
+    final_str = (char*) parent_val->string;
+
+    if (NULL != info) {
+        opal_info_get(info, "mpi_assert_memory_alloc_kinds", &assert_val, &flag);
+        have_assert = (0 != flag);
+
+        // Use the asserted memory kinds only if all of them validate
+        // against the parent; otherwise the key is ignored
+        if (have_assert &&
+            ompi_info_memkind_validate (assert_val->string, parent_val->string)) {
+            final_str = (char*) assert_val->string;
+        }
+    }
+
+    opal_infosubscribe_subscribe (child, "mpi_memory_alloc_kinds", final_str,
+                                  ompi_info_memkind_cb);
+
+    /* opal_info_get hands out a reference on the value objects; drop them
+    ** only now, since final_str points into one of them */
+    if (have_assert) {
+        OBJ_RELEASE(assert_val);
+    }
+    OBJ_RELEASE(parent_val);
+
+    return OMPI_SUCCESS;
+}
+
+/*
+** Release the cached table of available memkinds and reset the cache, so
+** that a later ompi_info_memkind_get_available() rebuilds it instead of
+** returning a dangling pointer.
+*/
+void ompi_info_memkind_free_available (void)
+{
+    ompi_info_memkind_free (ompi_info_memkind_num_available, ompi_info_memkind_available);
+    ompi_info_memkind_available = NULL;
+    ompi_info_memkind_num_available = 0;
+}
diff --git
a/ompi/info/info_memkind.h b/ompi/info/info_memkind.h new file mode 100644 index 00000000000..866b5fad141 --- /dev/null +++ b/ompi/info/info_memkind.h @@ -0,0 +1,69 @@ +/* -*- Mode: C; c-basic-offset:4 ; indent-tabs-mode:nil -*- */ +/* + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. + * $COPYRIGHT$ + * + * Additional copyrights may follow + * + * $HEADER$ + */ + +#ifndef OMPI_INFO_MEMKIND_H +#define OMPI_INFO_MEMKIND_H + +#include "ompi_config.h" +#include "opal/util/info_subscriber.h" + +BEGIN_C_DECLS + +#define OMPI_MAX_NUM_MEMKIND_RESTRICTORS 3 +struct ompi_memkind_t { + char *im_name; + int im_num_restrictors; + char *im_restrictors[OMPI_MAX_NUM_MEMKIND_RESTRICTORS]; +}; +typedef struct ompi_memkind_t ompi_memkind_t; + +/* +** Given a string of user requested memory alloc kinds, create +** a string with the memory kinds actually supported by the library. +** +** @param[IN]: requested_str input string +** @param[OUT]: provided_str result string +** +** @return: OMPI_SUCCESS or error on failure +*/ +OMPI_DECLSPEC int ompi_info_memkind_process (const char* requested_str, + char **provided_str); +/* +** Set the memory_alloc_kind info object on the child object, either +** by copying it from the parent object, or adjusting it based +** on the assert_memory_alloc_kind info object provided by the code +** during object creation +** +** @param[IN]: parent parent object (e.g. comm->super, file->super, etc.) +** @param [INOUT]: child child object +** @param[IN]: info info object provided by code during object creation +** (e.g. MPI_Comm_dup_with_info, MPI_File_open, etc.) +** +** @return: OMPI_SUCCESS or error on failure +*/ +OMPI_DECLSPEC int ompi_info_memkind_copy_or_set (opal_infosubscriber_t *parent, + opal_infosubscriber_t *child, + opal_info_t *info); + +/* +** free the array of available memkinds when shutting down the info +** infrastructure. 
+*/ +OMPI_DECLSPEC void ompi_info_memkind_free_available (void); + +/* +** Callback function used when registering memkind info object +*/ +OMPI_DECLSPEC const char *ompi_info_memkind_cb (opal_infosubscriber_t *obj, const char *key, const char *value); + +END_C_DECLS + +#endif /* OMPI_INFO_MEMKIND_H */ + diff --git a/ompi/instance/instance.c b/ompi/instance/instance.c index adf2e8ace89..a8646481244 100644 --- a/ompi/instance/instance.c +++ b/ompi/instance/instance.c @@ -33,6 +33,7 @@ #include "ompi/errhandler/errcode.h" #include "ompi/message/message.h" #include "ompi/info/info.h" +#include "ompi/info/info_memkind.h" #include "ompi/attribute/attribute.h" #include "ompi/op/op.h" #include "ompi/dpm/dpm.h" @@ -857,7 +858,18 @@ int ompi_mpi_instance_init (int ts_level, opal_info_t *info, ompi_errhandler_t /* Copy info if there is one. */ if (OPAL_UNLIKELY(NULL != info)) { + opal_cstring_t *memkind_requested; + int flag; + new_instance->super.s_info = OBJ_NEW(opal_info_t); + opal_info_get(info, "mpi_memory_alloc_kinds", &memkind_requested, &flag); + if (1 == flag) { + char *memkind_provided; + ompi_info_memkind_process (memkind_requested->string, &memkind_provided); + opal_infosubscribe_subscribe (&new_instance->super, "mpi_memory_alloc_kinds", + memkind_provided, ompi_info_memkind_cb); + } + if (info) { opal_info_dup(info, &new_instance->super.s_info); } diff --git a/ompi/win/win.c b/ompi/win/win.c index 2f0974ac016..4bd24e3c86a 100644 --- a/ompi/win/win.c +++ b/ompi/win/win.c @@ -19,6 +19,7 @@ * Copyright (c) 2016-2017 IBM Corporation. All rights reserved. * Copyright (c) 2018-2019 Triad National Security, LLC. All rights * reserved. + * Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -38,6 +39,7 @@ #include "ompi/attribute/attribute.h" #include "ompi/group/group.h" #include "ompi/info/info.h" +#include "ompi/info/info_memkind.h" #include "ompi/mca/osc/base/base.h" #include "ompi/mca/osc/osc.h" @@ -164,6 +166,7 @@ static int alloc_window(struct ompi_communicator_t *comm, opal_info_t *info, int /* Copy the info for the info layer */ win->super.s_info = OBJ_NEW(opal_info_t); + ompi_info_memkind_copy_or_set (&comm->super, &win->super, info); if (info) { opal_info_dup(info, &(win->super.s_info)); } diff --git a/opal/mca/accelerator/accelerator.h b/opal/mca/accelerator/accelerator.h index 6279b7c615e..a9da208e697 100644 --- a/opal/mca/accelerator/accelerator.h +++ b/opal/mca/accelerator/accelerator.h @@ -4,7 +4,7 @@ * reserved. * Copyright (c) Amazon.com, Inc. or its affiliates. * All Rights reserved. - * Copyright (c) 2023 Advanced Micro Devices, Inc. All Rights reserved. + * Copyright (c) 2023-2024 Advanced Micro Devices, Inc. All Rights reserved. * Copyright (c) 2024 The University of Tennessee and The University * of Tennessee Research Foundation. All rights * reserved. @@ -654,6 +654,22 @@ typedef int (*opal_accelerator_base_module_get_num_devices_fn_t)(int *num_device */ typedef int (*opal_accelerator_base_module_get_mem_bw_fn_t)(int device, float *bw); +/** + * Get the memkind information of the accelerator component. + * + * @param[OUT] name Name of memory alloc kinds supported by component. + * This field will have to be released by the calling function. + * @param[INOUT] num_restrictors As input, this parameter represents the length of the + * restrictors array allocated by the caller. + * At return, this variable will indicate the number of + * restrictors set by the function + * @param[OUT] restrictors Array of restrictors supported by the component. + * The array of char* pointers has been allocated by the caller. + * The elements of the array will have to be released by the caller. 
+ * + */ +typedef void (*opal_accelerator_base_module_get_memkind_fn_t)(char **name, int *num_restrictors, + char **restrictors); /* * the standard public API data structure @@ -700,6 +716,7 @@ typedef struct { opal_accelerator_base_module_get_num_devices_fn_t num_devices; opal_accelerator_base_module_get_mem_bw_fn_t get_mem_bw; + opal_accelerator_base_module_get_memkind_fn_t get_memkind; } opal_accelerator_base_module_t; /** diff --git a/opal/mca/accelerator/cuda/accelerator_cuda.c b/opal/mca/accelerator/cuda/accelerator_cuda.c index 45358ef337e..9bdb106be94 100644 --- a/opal/mca/accelerator/cuda/accelerator_cuda.c +++ b/opal/mca/accelerator/cuda/accelerator_cuda.c @@ -26,6 +26,7 @@ #include "opal/mca/rcache/rcache.h" #include "opal/util/show_help.h" #include "opal/util/proc.h" + /* Accelerator API's */ static int accelerator_cuda_check_addr(const void *addr, int *dev_id, uint64_t *flags); static int accelerator_cuda_create_stream(int dev_id, opal_accelerator_stream_t **stream); @@ -80,10 +81,14 @@ static int accelerator_cuda_get_buffer_id(int dev_id, const void *addr, opal_acc static int accelerator_cuda_sync_stream(opal_accelerator_stream_t *stream); static int accelerator_cuda_get_num_devices(int *num_devices); static int accelerator_cuda_get_mem_bw(int device, float *bw); +static void accelerator_cuda_get_memkind(char **name, int *num_restrictors, char **restrictors); #define GET_STREAM(_stream) \ ((_stream) == MCA_ACCELERATOR_STREAM_DEFAULT ? 
0 : *((CUstream *) (_stream)->stream)) +// This value is based on the memory kind MPI side document +#define MCA_ACCELERATOR_CUDA_NUM_RESTRICTORS 3 + opal_accelerator_base_module_t opal_accelerator_cuda_module = { accelerator_cuda_check_addr, @@ -125,7 +130,8 @@ opal_accelerator_base_module_t opal_accelerator_cuda_module = accelerator_cuda_get_buffer_id, accelerator_cuda_get_num_devices, - accelerator_cuda_get_mem_bw + accelerator_cuda_get_mem_bw, + accelerator_cuda_get_memkind }; static inline int opal_accelerator_cuda_delayed_init_check(void) @@ -1218,3 +1224,24 @@ static int accelerator_cuda_get_mem_bw(int device, float *bw) *bw = opal_accelerator_cuda_mem_bw[device]; return OPAL_SUCCESS; } + +static void accelerator_cuda_get_memkind (char **name, int *num_restrictors, char **restrictors) +{ + int n_restrictors = *num_restrictors > MCA_ACCELERATOR_CUDA_NUM_RESTRICTORS ? + MCA_ACCELERATOR_CUDA_NUM_RESTRICTORS : *num_restrictors; + + *name = strdup("cuda"); + + if (n_restrictors > 0) { + restrictors[0] = strdup("host"); + } + if (n_restrictors > 1) { + restrictors[1] = strdup("device"); + } + if (n_restrictors > 2) { + restrictors[2] = strdup("managed"); + } + *num_restrictors = n_restrictors; + + return; +} diff --git a/opal/mca/accelerator/null/accelerator_null_component.c b/opal/mca/accelerator/null/accelerator_null_component.c index 8a6f0f8d810..acf9f4daee2 100644 --- a/opal/mca/accelerator/null/accelerator_null_component.c +++ b/opal/mca/accelerator/null/accelerator_null_component.c @@ -94,6 +94,7 @@ static int accelerator_null_sync_stream(opal_accelerator_stream_t *stream); static int accelerator_null_get_num_devices(int *num_devices); static int accelerator_null_get_mem_bw(int device, float *bw); +static void accelerator_null_get_memkind(char **name, int *num_restrictors, char **restrictors); /* * Instantiate the public struct with all of our public information @@ -174,7 +175,8 @@ opal_accelerator_base_module_t opal_accelerator_null_module = 
/*
 * Memory-kind query for the null accelerator component.
 *
 * No memory kind is reported: the name output is cleared to NULL and the
 * restrictor count is set to zero. The restrictors array is never touched.
 */
static void accelerator_null_get_memkind (char **name, int *num_restrictors, char **restrictors)
{
    /* The array is part of the common signature but has nothing to receive. */
    (void) restrictors;

    *num_restrictors = 0;
    *name = NULL;
}
/**
 * Report the memory kind provided by the ROCm accelerator component.
 *
 * @param[out]    name             Receives a strdup()'ed copy of "rocm", or
 *                                 NULL if that allocation fails.
 * @param[in,out] num_restrictors  On entry, the number of slots available in
 *                                 restrictors. On return, the number of slots
 *                                 that were actually filled (never negative).
 * @param[out]    restrictors      Array whose leading entries receive
 *                                 strdup()'ed restrictor names, in the order
 *                                 "host", "device", "managed". May be NULL,
 *                                 in which case no restrictor is reported.
 *
 * Nothing allocated here is released by this function; freeing the strings is
 * presumably the caller's job (confirm against the call sites).
 */
static void mca_accelerator_rocm_get_memkind (char **name, int *num_restrictors, char **restrictors)
{
    /* Restrictors reported for the "rocm" memory kind, in output order. The
     * table length doubles as the upper bound so that the names and the
     * limit cannot drift apart. */
    static const char *const rocm_restrictors[] = { "host", "device", "managed" };
    const int max_restrictors = (int) (sizeof(rocm_restrictors) / sizeof(rocm_restrictors[0]));
    int capacity = *num_restrictors;
    int filled = 0;

    /* Start from "nothing reported" so every early return leaves a valid count. */
    *num_restrictors = 0;

    *name = strdup("rocm");
    if (NULL == *name) {
        return;
    }

    /* A non-positive capacity or a missing array means there is nowhere to write. */
    if (NULL == restrictors || capacity <= 0) {
        return;
    }
    if (capacity > max_restrictors) {
        capacity = max_restrictors;
    }

    /* Stop at the first failed allocation so the count only covers valid entries. */
    while (filled < capacity) {
        restrictors[filled] = strdup(rocm_restrictors[filled]);
        if (NULL == restrictors[filled]) {
            break;
        }
        filled++;
    }
    *num_restrictors = filled;
}
/**
 * Report the memory kind provided by the Level Zero accelerator component.
 *
 * @param[out]    name             Receives a strdup()'ed copy of "level_zero",
 *                                 or NULL if that allocation fails.
 * @param[in,out] num_restrictors  On entry, the number of slots available in
 *                                 restrictors. On return, the number of slots
 *                                 that were actually filled (never negative).
 * @param[out]    restrictors      Array whose leading entries receive
 *                                 strdup()'ed restrictor names, in the order
 *                                 "host", "device", "shared". May be NULL,
 *                                 in which case no restrictor is reported.
 *
 * Nothing allocated here is released by this function; freeing the strings is
 * presumably the caller's job (confirm against the call sites).
 */
static void mca_accelerator_ze_get_memkind (char **name, int *num_restrictors, char **restrictors)
{
    /* Restrictors reported for the "level_zero" memory kind, in output order.
     * The table length doubles as the upper bound so that the names and the
     * limit cannot drift apart. */
    static const char *const ze_restrictors[] = { "host", "device", "shared" };
    const int max_restrictors = (int) (sizeof(ze_restrictors) / sizeof(ze_restrictors[0]));
    int capacity = *num_restrictors;
    int filled = 0;

    /* Start from "nothing reported" so every early return leaves a valid count. */
    *num_restrictors = 0;

    *name = strdup("level_zero");
    if (NULL == *name) {
        return;
    }

    /* A non-positive capacity or a missing array means there is nowhere to write. */
    if (NULL == restrictors || capacity <= 0) {
        return;
    }
    if (capacity > max_restrictors) {
        capacity = max_restrictors;
    }

    /* Stop at the first failed allocation so the count only covers valid entries. */
    while (filled < capacity) {
        restrictors[filled] = strdup(ze_restrictors[filled]);
        if (NULL == restrictors[filled]) {
            break;
        }
        filled++;
    }
    *num_restrictors = filled;
}