|
35 | 35 | #include "ompi_config.h" |
36 | 36 |
|
37 | 37 | #include <string.h> |
| 38 | +#include <stdlib.h> |
38 | 39 |
|
39 | 40 | #include "osc_rdma.h" |
40 | 41 | #include "osc_rdma_frag.h" |
@@ -84,7 +85,6 @@ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_o |
84 | 85 | static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, const char *key, const char *value); |
85 | 86 |
|
86 | 87 | static char *ompi_osc_rdma_full_connectivity_btls; |
87 | | -static char *ompi_osc_rdma_btl_alternate_names; |
88 | 88 |
|
89 | 89 | static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = { |
90 | 90 | {.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"}, |
@@ -257,14 +257,6 @@ static int ompi_osc_rdma_component_register (void) |
257 | 257 | MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_full_connectivity_btls); |
258 | 258 | free(description_str); |
259 | 259 |
|
260 | | - ompi_osc_rdma_btl_alternate_names = "sm,tcp"; |
261 | | - opal_asprintf(&description_str, "Comma-delimited list of alternate BTL component names to allow without verifying " |
262 | | - "connectivity (default: %s)", ompi_osc_rdma_btl_alternate_names); |
263 | | - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "alternate_btls", description_str, |
264 | | - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, |
265 | | - MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_alternate_names); |
266 | | - free(description_str); |
267 | | - |
268 | 260 | if (0 == access ("/dev/shm", W_OK)) { |
269 | 261 | mca_osc_rdma_component.backing_directory = "/dev/shm"; |
270 | 262 | } else { |
@@ -875,76 +867,97 @@ static void ompi_osc_rdma_ensure_local_add_procs (void) |
875 | 867 | free(procs); |
876 | 868 | } |
877 | 869 |
|
| 870 | + |
| 871 | +/* |
| 872 | + * qsort() sorting function for ompi_osc_rdma_query_alternate_btls(), |
| 873 | + * using latency as the sorting metric. |
| 874 | + */ |
| 875 | +static int btl_latency_sort_fn(const void *a, const void *b) |
| 876 | +{ |
| 877 | + const struct mca_btl_base_module_t *btl_a = a; |
| 878 | + const struct mca_btl_base_module_t *btl_b = b; |
| 879 | + |
| 880 | + if (btl_a->btl_latency < btl_b->btl_latency) { |
| 881 | + return -1; |
| 882 | + } else if (btl_a->btl_latency == btl_b->btl_latency) { |
| 883 | + return 0; |
| 884 | + } else { |
| 885 | + return 1; |
| 886 | + } |
| 887 | +} |
| 888 | + |
| 889 | + |
878 | 890 | /** |
879 | 891 | * @brief query for alternate BTLs |
880 | 892 | * |
881 | 893 | * @in comm Communicator to query |
882 | | - * @out module OSC module to store BTLs/count to (optional) |
883 | | - * @out |
| 894 | + * @inout module OSC module to store BTLs/count to (optional) |
884 | 895 | * |
885 | 896 | * @return OMPI_SUCCESS if BTLs can be found |
886 | 897 | * @return OMPI_ERR_UNREACH if no BTLs can be found that match |
887 | 898 | * |
888 | | - * In this case an "alternate" BTL is a BTL does not meet the |
889 | | - * requirements of a BTL outlined in ompi_osc_rdma_query_accelerated_btls(). |
890 | | - * Either it does not provide connectivity to all peers, provide |
891 | | - * remote completion, or natively support put/get/atomic.. Since more |
892 | | - * than one BTL may be needed for this support the OSC component will |
893 | | - * disable the use of registration-based RDMA (these BTLs will not be |
894 | | - * used) and will use any remaining BTL. By default the BTLs used will |
895 | | - * be tcp and sm but any single (or pair) of BTLs may be used. |
| 899 | + * We directly use the active message rdma wrappers for alternate |
| 900 | + * BTLs, in all cases. This greatly simplifies the alternate BTL |
| 901 | + * impementation, at the expense of some performance. With the |
| 902 | + * AM wrappers, we can always enforce remote completion and the lack |
| 903 | + * of memory registration, at some performance cost. But we can use |
| 904 | + * as many BTLs as we like. The module's btl list is sorted by |
| 905 | + * latency, so that ompi_osc_rdma_peer_btl_endpoint() picks the lowest |
| 906 | + * available latency btl to communicate with the peer. Unlike the OB1 |
| 907 | + * PML, we only use one BTL per peer. |
| 908 | + * |
| 909 | + * Like the OB1 PML, there is no verification that there is at least |
| 910 | + * one BTL that can communicate with every other peer in the window. |
896 | 911 | */ |
897 | 912 | static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) |
898 | 913 | { |
899 | 914 | mca_btl_base_selected_module_t *item; |
900 | | - char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); |
901 | | - int btls_found = 0; |
902 | | - |
903 | | - btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); |
904 | | - if (NULL == btls_to_use) { |
905 | | - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, |
906 | | - "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names); |
907 | | - return OMPI_ERR_UNREACH; |
908 | | - } |
| 915 | + int ret; |
909 | 916 |
|
910 | | - if (module) { |
911 | | - module->btls_in_use = 0; |
| 917 | + /* shortcut the trivial query case */ |
| 918 | + if (NULL == module) { |
| 919 | + if (opal_list_is_empty(&mca_btl_base_modules_initialized)) { |
| 920 | + return OMPI_ERR_UNREACH; |
| 921 | + } |
| 922 | + return OMPI_SUCCESS; |
912 | 923 | } |
913 | 924 |
|
914 | | - /* rdma and atomics are only supported with BTLs at the moment */ |
915 | | - for (int i = 0 ; btls_to_use[i] ; ++i) { |
916 | | - opal_output_verbose(MCA_BASE_VERBOSE_INFO, "checking for btl %s", btls_to_use[i]); |
917 | | - OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { |
918 | | - if (NULL != item->btl_module->btl_register_mem) { |
919 | | - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, |
920 | | - "skipping RDMA btl when searching for alternate BTL"); |
921 | | - continue; |
922 | | - } |
923 | | - |
924 | | - if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) { |
925 | | - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, |
926 | | - "skipping btl %s", |
927 | | - item->btl_module->btl_component->btl_version.mca_component_name); |
928 | | - continue; |
929 | | - } |
930 | | - |
931 | | - opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, |
932 | | - "found alternate btl %s", btls_to_use[i]); |
933 | | - |
934 | | - ++btls_found; |
935 | | - if (module) { |
936 | | - mca_btl_base_am_rdma_init(item->btl_module); |
937 | | - ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); |
938 | | - } |
939 | | - |
| 925 | + module->btls_in_use = 0; |
| 926 | + |
| 927 | + /* add all alternate btls to the selected_btls list, not worrying |
| 928 | + about ordering yet. We have to add all btls unless we want to |
| 929 | + iterate over all endpoints to build the minimum set of btls |
| 930 | + needed to communicate with all peers. An MCA parameter just |
| 931 | + for osc rdma also wouldn't work, as the BML can decide not to |
| 932 | + add an endpoint for a btl given the priority of another btl. |
| 933 | + For example, it is not uncommon that the only endpoint created |
| 934 | + to a peer on the same host is the sm btl's endpoint. If we |
| 935 | + had an osc rdma specific parameter list, and the user |
| 936 | + specified a combination not including sm, that would result in |
| 937 | + an eventual failure, as no btl would be found to talk to ranks |
| 938 | + on the same host.*/ |
| 939 | + OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { |
| 940 | + opal_output_verbose(MCA_BASE_VERBOSE_INFO, ompi_osc_base_framework.framework_output, |
| 941 | + "found alternate btl %s", |
| 942 | + item->btl_module->btl_component->btl_version.mca_component_name); |
| 943 | + ret = mca_btl_base_am_rdma_init(item->btl_module); |
| 944 | + if (OMPI_SUCCESS != ret) { |
| 945 | + return ret; |
940 | 946 | } |
| 947 | + ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); |
941 | 948 | } |
942 | 949 |
|
943 | | - opal_argv_free (btls_to_use); |
| 950 | + /* sort based on latency, lowest first */ |
| 951 | + qsort(module->selected_btls, module->btls_in_use, |
| 952 | + sizeof(struct mca_btl_base_module_t*), btl_latency_sort_fn); |
944 | 953 |
|
945 | | - return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; |
| 954 | + /* osc/rdma always use active message RDMA/atomics on alternate btls, whic does not require explicit memory registration */ |
| 955 | + module->use_memory_registration = false; |
| 956 | + |
| 957 | + return module->btls_in_use > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; |
946 | 958 | } |
947 | 959 |
|
| 960 | + |
948 | 961 | /* Check for BTL requirements: |
949 | 962 | * 1) RDMA (put/get) and ATOMIC operations. We only require cswap |
950 | 963 | * and fetch and add and will emulate other opterations with those |
|
0 commit comments