diff --git a/opal/mca/btl/tcp/btl_tcp.c b/opal/mca/btl/tcp/btl_tcp.c index f007565be38..3ba02039741 100644 --- a/opal/mca/btl/tcp/btl_tcp.c +++ b/opal/mca/btl/tcp/btl_tcp.c @@ -15,6 +15,8 @@ * Copyright (c) 2016-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. * Copyright (c) 2016 Intel, Inc. All rights reserved. + * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights + * reserved. * * $COPYRIGHT$ * @@ -90,12 +92,6 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl, continue; } - /* - * Check to make sure that the peer has at least as many interface - * addresses exported as we are trying to use. If not, then - * don't bind this BTL instance to the proc. - */ - OPAL_THREAD_LOCK(&tcp_proc->proc_lock); for (uint32_t j = 0 ; j < (uint32_t)tcp_proc->proc_endpoint_count ; ++j) { diff --git a/opal/mca/btl/tcp/btl_tcp.h b/opal/mca/btl/tcp/btl_tcp.h index a3ca945e9c6..559f1d536a7 100644 --- a/opal/mca/btl/tcp/btl_tcp.h +++ b/opal/mca/btl/tcp/btl_tcp.h @@ -15,6 +15,8 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights * reserved. + * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -107,6 +109,7 @@ struct mca_btl_tcp_component_t { uint32_t tcp_num_btls; /**< number of interfaces available to the TCP component */ unsigned int tcp_num_links; /**< number of logical links per physical device */ struct mca_btl_tcp_module_t **tcp_btls; /**< array of available BTL modules */ + opal_list_t local_ifs; /**< opal list of local opal_if_t interfaces */ int tcp_free_list_num; /**< initial size of free lists */ int tcp_free_list_max; /**< maximum size of free lists */ int tcp_free_list_inc; /**< number of elements to alloc when growing free lists */ @@ -163,6 +166,9 @@ OPAL_MODULE_DECLSPEC extern mca_btl_tcp_component_t mca_btl_tcp_component; */ struct mca_btl_tcp_module_t { mca_btl_base_module_t super; /**< base BTL interface */ + uint32_t btl_index; /**< Local BTL module index, used for vertex + data and used as a hash key when + solving module matching problem */ uint16_t tcp_ifkindex; /** #endif - +#include /** * Modex address structure. * * One of these structures will be sent for every btl module in use by - * the local BTL TCP component. + * the local BTL TCP component. This is used to construct an opal_if_t + * structure for the reachability component as well as populate the + * mca_btl_tcp_addr_t structure on remote procs. These will be used + * for interface matching and filling out the mca_btl_base_endpoint_t + * structure. */ struct mca_btl_tcp_modex_addr_t { uint8_t addr[16]; /* endpoint address. for addr_family of MCA_BTL_TCP_AF_INET, only the first 4 bytes have meaning. */ uint32_t addr_ifkindex; /* endpoint kernel index */ + uint32_t addr_mask; /* ip mask */ + uint32_t addr_bandwidth; /* interface bandwidth */ uint16_t addr_port; /* endpoint listen port */ uint8_t addr_family; /* endpoint address family. Note that this is MCA_BTL_TCP_AF_{INET,INET6}, not the traditional AF_INET/AF_INET6. 
*/ - uint8_t padding[1]; /* padd out to an 8-byte word */ + uint8_t padding[1]; /* pad out to an 8-byte word */ }; typedef struct mca_btl_tcp_modex_addr_t mca_btl_tcp_modex_addr_t; +_Static_assert(sizeof(struct mca_btl_tcp_modex_addr_t) == 32, "mca_btl_tcp_modex_addr_t"); /** * Remote peer address structure * * One of these structures will be allocated for every remote endpoint * associated with a remote proc. The data is pulled from the - * mca_btl_tcp_modex_addr_t structure, except for the addr_inuse - * field, which is local. + * mca_btl_tcp_modex_addr_t structure. */ struct mca_btl_tcp_addr_t { union { @@ -73,7 +82,6 @@ struct mca_btl_tcp_addr_t { int addr_ifkindex; /**< remote interface index assigned with this address */ uint8_t addr_family; /**< AF_INET or AF_INET6 */ - bool addr_inuse; /**< local meaning only */ }; typedef struct mca_btl_tcp_addr_t mca_btl_tcp_addr_t; diff --git a/opal/mca/btl/tcp/btl_tcp_component.c b/opal/mca/btl/tcp/btl_tcp_component.c index a8a5b719da0..476c45c4951 100644 --- a/opal/mca/btl/tcp/btl_tcp_component.c +++ b/opal/mca/btl/tcp/btl_tcp_component.c @@ -19,7 +19,8 @@ * Copyright (c) 2014-2015 Intel, Inc. All rights reserved. * Copyright (c) 2014-2017 Research Organization for Information Science * and Technology (RIST). All rights reserved. - * Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -69,6 +70,7 @@ #include "opal/util/net.h" #include "opal/util/fd.h" #include "opal/util/show_help.h" +#include "opal/util/string_copy.h" #include "opal/util/printf.h" #include "opal/constants.h" #include "opal/mca/btl/btl.h" @@ -76,6 +78,7 @@ #include "opal/mca/mpool/base/base.h" #include "opal/mca/btl/base/btl_base_error.h" #include "opal/mca/pmix/pmix.h" +#include "opal/mca/reachable/base/base.h" #include "opal/threads/threads.h" #include "opal/constants.h" @@ -368,6 +371,7 @@ static int mca_btl_tcp_component_open(void) mca_btl_tcp_component.tcp_btls = NULL; /* initialize objects */ + OBJ_CONSTRUCT(&mca_btl_tcp_component.local_ifs, opal_list_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_proc_table_t); OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t); @@ -477,6 +481,7 @@ static int mca_btl_tcp_component_close(void) OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_max); OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user); OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock); + OBJ_DESTRUCT(&mca_btl_tcp_component.local_ifs); #if OPAL_CUDA_SUPPORT mca_common_cuda_fini(); @@ -493,8 +498,9 @@ static int mca_btl_tcp_component_close(void) static int mca_btl_tcp_create(const int if_kindex, const char* if_name) { struct mca_btl_tcp_module_t* btl; + opal_if_t *copied_interface, *selected_interface; char param[256]; - int i; + int i, if_index; struct sockaddr_storage addr; bool found = false; @@ -515,18 +521,15 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name) * 10.1.0.1 as the one that is published in the modex and used for * connection. 
*/ - for (i = opal_ifbegin() ; i >= 0 ; i = opal_ifnext(i)) { - int ret; - - if (if_kindex != opal_ifindextokindex(i)) { + OPAL_LIST_FOREACH(selected_interface, &opal_if_list, opal_if_t) { + if (if_kindex != selected_interface->if_kernel_index) { continue; } - ret = opal_ifindextoaddr(i, (struct sockaddr*)&addr, - sizeof(struct sockaddr_storage)); - if (OPAL_SUCCESS != ret) { - return ret; - } + if_index = selected_interface->if_index; + + memcpy((struct sockaddr*)&addr, &selected_interface->if_addr, + MIN(sizeof(struct sockaddr_storage), sizeof(selected_interface->if_addr))); if (addr.ss_family == AF_INET && 4 != mca_btl_tcp_component.tcp_disable_family) { @@ -548,12 +551,19 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name) btl = (struct mca_btl_tcp_module_t *)malloc(sizeof(mca_btl_tcp_module_t)); if(NULL == btl) return OPAL_ERR_OUT_OF_RESOURCE; + copied_interface = OBJ_NEW(opal_if_t); + if (NULL == copied_interface) { + free(btl); + return OPAL_ERR_OUT_OF_RESOURCE; + } memcpy(btl, &mca_btl_tcp_module, sizeof(mca_btl_tcp_module)); OBJ_CONSTRUCT(&btl->tcp_endpoints, opal_list_t); OBJ_CONSTRUCT(&btl->tcp_endpoints_mutex, opal_mutex_t); mca_btl_tcp_component.tcp_btls[mca_btl_tcp_component.tcp_num_btls++] = btl; /* initialize the btl */ + /* This index is used as a key for a hash table used for interface matching. 
*/ + btl->btl_index = mca_btl_tcp_component.tcp_num_btls - 1; btl->tcp_ifkindex = (uint16_t) if_kindex; #if MCA_BTL_TCP_STATISTICS btl->tcp_bytes_recv = 0; @@ -562,6 +572,7 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name) #endif memcpy(&btl->tcp_ifaddr, &addr, sizeof(struct sockaddr_storage)); + btl->tcp_ifmask = selected_interface->if_mask; /* allow user to specify interface bandwidth */ sprintf(param, "bandwidth_%s", if_name); @@ -603,6 +614,21 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name) } } + /* Add another entry to the local interface list */ + opal_string_copy(copied_interface->if_name, if_name, OPAL_IF_NAMESIZE); + copied_interface->if_index = if_index; + copied_interface->if_kernel_index = btl->tcp_ifkindex; + copied_interface->af_family = btl->tcp_ifaddr.ss_family; + copied_interface->if_flags = selected_interface->if_flags; + copied_interface->if_speed = selected_interface->if_speed; + memcpy(&copied_interface->if_addr, &btl->tcp_ifaddr, sizeof(struct sockaddr_storage)); + copied_interface->if_mask = selected_interface->if_mask; + copied_interface->if_bandwidth = btl->super.btl_bandwidth; + memcpy(&copied_interface->if_mac, &selected_interface->if_mac, sizeof(copied_interface->if_mac)); + copied_interface->ifmtu = selected_interface->ifmtu; + + opal_list_append(&mca_btl_tcp_component.local_ifs, &(copied_interface->super)); + opal_output_verbose(5, opal_btl_base_framework.framework_output, "btl:tcp: %p: if %s kidx %d cnt %i addr %s %s bw %d lt %d\n", (void*)btl, if_name, (int) btl->tcp_ifkindex, i, @@ -1188,7 +1214,6 @@ static int mca_btl_tcp_component_exchange(void) memcpy(&addrs[i].addr, &(inaddr6->sin6_addr), sizeof(struct in6_addr)); addrs[i].addr_port = mca_btl_tcp_component.tcp6_listen_port; - addrs[i].addr_ifkindex = btl->tcp_ifkindex; addrs[i].addr_family = MCA_BTL_TCP_AF_INET6; opal_output_verbose(5, opal_btl_base_framework.framework_output, "btl: tcp: exchange: %d %d IPv6 %s", @@ -1202,7 
+1227,6 @@ static int mca_btl_tcp_component_exchange(void) memcpy(&addrs[i].addr, &(inaddr->sin_addr), sizeof(struct in_addr)); addrs[i].addr_port = mca_btl_tcp_component.tcp_listen_port; - addrs[i].addr_ifkindex = btl->tcp_ifkindex; addrs[i].addr_family = MCA_BTL_TCP_AF_INET; opal_output_verbose(5, opal_btl_base_framework.framework_output, "btl: tcp: exchange: %d %d IPv4 %s", @@ -1212,6 +1236,10 @@ static int mca_btl_tcp_component_exchange(void) BTL_ERROR(("Unexpected address family: %d", addr->sa_family)); return OPAL_ERR_BAD_PARAM; } + + addrs[i].addr_ifkindex = btl->tcp_ifkindex; + addrs[i].addr_mask = btl->tcp_ifmask; + addrs[i].addr_bandwidth = btl->super.btl_bandwidth; } OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL, diff --git a/opal/mca/btl/tcp/btl_tcp_proc.c b/opal/mca/btl/tcp/btl_tcp_proc.c index 5162933c3e2..c31372324b1 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.c +++ b/opal/mca/btl/tcp/btl_tcp_proc.c @@ -16,8 +16,11 @@ * and Technology (RIST). All rights reserved. * Copyright (c) 2015-2016 Los Alamos National Security, LLC. All rights * reserved. - * Copyright (c) 2015-2018 Cisco Systems, Inc. All rights reserved - * Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights reserved. + * Copyright (c) 2013-2018 Cisco Systems, Inc. All rights reserved + * Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights + * reserved. + * Copyright (c) 2006 Sandia National Laboratories. All rights + * reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -36,6 +39,7 @@ #include "opal/class/opal_hash_table.h" #include "opal/mca/btl/base/btl_base_error.h" +#include "opal/mca/reachable/base/base.h" #include "opal/mca/pmix/pmix.h" #include "opal/util/arch.h" #include "opal/util/argv.h" @@ -44,6 +48,8 @@ #include "opal/util/proc.h" #include "opal/util/show_help.h" #include "opal/util/printf.h" +#include "opal/util/string_copy.h" +#include "opal/util/bipartite_graph.h" #include "btl_tcp.h" #include "btl_tcp_proc.h" @@ -51,21 +57,6 @@ static void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* proc); static void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* proc); -struct mca_btl_tcp_proc_data_t { - mca_btl_tcp_interface_t** local_interfaces; - opal_hash_table_t local_kindex_to_index; - size_t num_local_interfaces, max_local_interfaces; - size_t num_peer_interfaces; - opal_hash_table_t peer_kindex_to_index; - unsigned int *best_assignment; - int max_assignment_weight; - int max_assignment_cardinality; - enum mca_btl_tcp_connection_quality **weights; - struct mca_btl_tcp_addr_t ***best_addr; -}; - -typedef struct mca_btl_tcp_proc_data_t mca_btl_tcp_proc_data_t; - OBJ_CLASS_INSTANCE( mca_btl_tcp_proc_t, opal_list_item_t, mca_btl_tcp_proc_construct, @@ -79,6 +70,8 @@ void mca_btl_tcp_proc_construct(mca_btl_tcp_proc_t* tcp_proc) tcp_proc->proc_endpoints = NULL; tcp_proc->proc_endpoint_count = 0; OBJ_CONSTRUCT(&tcp_proc->proc_lock, opal_mutex_t); + OBJ_CONSTRUCT(&tcp_proc->btl_index_to_endpoint, opal_hash_table_t); + opal_hash_table_init(&tcp_proc->btl_index_to_endpoint, mca_btl_tcp_component.tcp_num_btls); } /* @@ -103,9 +96,270 @@ void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* tcp_proc) if(NULL != tcp_proc->proc_addrs) { free(tcp_proc->proc_addrs); } + OBJ_DESTRUCT(&tcp_proc->btl_index_to_endpoint); OBJ_DESTRUCT(&tcp_proc->proc_lock); } +static inline int mca_btl_tcp_proc_is_proc_left(opal_process_name_t a, + opal_process_name_t b) +{ + if (a.jobid != b.jobid) { + 
return (a.jobid < b.jobid); + } else { + return (a.vpid < b.vpid); + } +} + +#define MCA_BTL_TCP_PROC_LOCAL_VERTEX(index) (index) +#define MCA_BTL_TCP_PROC_REMOTE_VERTEX(index) (index + mca_btl_tcp_component.tcp_num_btls) + +/* This function builds a graph to match local and remote interfaces + * together. It also populates the remote proc object. + * + * @param btl_proc (IN) Remote proc information + * @param remote_addrs (IN) List of addresses from remote interfaces + * @param local_proc_is_left (IN) Boolean indicator. If true, we set local process + * interfaces to be on the left side of the graph. + * If false, we set remote process interfaces to + * be on the left side of the graph. + * @param graph_out (OUT) Constructed and populated bipartite interface + * graph with vertices as interfaces and negative + * reachability weights as costs for the edges. + * @return OPAL error code or success + * + * The vertices of this graph are the local and remote interfaces. Edges in + * this graph are connections between the interfaces. Costs are computed as + * negative weight which is calculated using the reachability framework. + * + * In order to mirror inputs on both the local and remote side when solving + * interface matching from both sides, we require local_proc_is_left to + * indicate whether the local interfaces should be on the left of the graph + * or not. + * + * The remote list and proc_addrs are assembled and populated here so that + * we can ensure that the vertex ordering matches the proc_addr ordering. + * This allows us to pass the correct pointers to the vertex data for storage. 
+ * + */ +static int mca_btl_tcp_proc_create_interface_graph(mca_btl_tcp_proc_t* btl_proc, + mca_btl_tcp_modex_addr_t* remote_addrs, + int local_proc_is_left, + opal_bp_graph_t **graph_out) +{ + opal_bp_graph_t *graph = NULL; + opal_reachable_t *results = NULL; + opal_list_t *local_list = &mca_btl_tcp_component.local_ifs; + opal_list_t *remote_list; + int rc, v_index, x, y, cost, u, v, num_edges = 0; + size_t i; + + remote_list = OBJ_NEW(opal_list_t); + if (NULL == remote_list) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + goto out; + } + + /* the modex and proc structures differ slightly, so copy the + fields needed in the proc version */ + for (i = 0 ; i < btl_proc->proc_addr_count ; i++) { + /* Construct opal_if_t objects for the remote interfaces */ + opal_if_t *interface = OBJ_NEW(opal_if_t); + if (NULL == interface) { + rc = OPAL_ERR_OUT_OF_RESOURCE; + goto out; + } + + if (MCA_BTL_TCP_AF_INET == remote_addrs[i].addr_family) { + memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet, + remote_addrs[i].addr, sizeof(struct in_addr)); + btl_proc->proc_addrs[i].addr_family = AF_INET; + + memcpy(&((struct sockaddr_in *)&(interface->if_addr))->sin_addr, + remote_addrs[i].addr, sizeof(struct in_addr)); + ((struct sockaddr *)&(interface->if_addr))->sa_family = AF_INET; + interface->af_family = AF_INET; + } else if (MCA_BTL_TCP_AF_INET6 == remote_addrs[i].addr_family) { +#if OPAL_ENABLE_IPV6 + memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet6, + remote_addrs[i].addr, sizeof(struct in6_addr)); + btl_proc->proc_addrs[i].addr_family = AF_INET6; + + memcpy(&((struct sockaddr_in6 *)&(interface->if_addr))->sin6_addr, + remote_addrs[i].addr, sizeof(struct in6_addr)); + ((struct sockaddr *)&(interface->if_addr))->sa_family = AF_INET6; + interface->af_family = AF_INET6; +#else + rc = OPAL_ERR_NOT_SUPPORTED; + OBJ_RELEASE(interface); + goto out; +#endif + } else { + BTL_ERROR(("Unexpected address family %d", + (int)remote_addrs[i].addr_family)); + rc = OPAL_ERR_BAD_PARAM; + 
OBJ_RELEASE(interface); + goto out; + } + + btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port; + btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex; + + interface->if_mask = remote_addrs[i].addr_mask; + interface->if_bandwidth = remote_addrs[i].addr_bandwidth; + + opal_list_append(remote_list, &(interface->super)); + } + + rc = opal_bp_graph_create(NULL, NULL, &graph); + if (OPAL_SUCCESS != rc) { + goto out; + } + results = opal_reachable.reachable(local_list, remote_list); + if (NULL == results) { + rc = OPAL_ERROR; + goto err_graph; + } + + /* Add vertices for each local node. These will store the btl index */ + for (x = 0; x < results->num_local; x++) { + rc = opal_bp_graph_add_vertex(graph, &mca_btl_tcp_component.tcp_btls[x]->btl_index, &v_index); + if (OPAL_SUCCESS != rc) { + goto err_graph; + } + } + + /* Add vertices for each remote node. These will store remote interface information */ + for (y = 0; y < results->num_remote; y++) { + rc = opal_bp_graph_add_vertex(graph, &btl_proc->proc_addrs[y], &v_index); + if (OPAL_SUCCESS != rc) { + goto err_graph; + } + } + + /* Add edges */ + for (x = 0; x < results->num_local; x++) { + for (y = 0; y < results->num_remote; y++) { + /* The bipartite assignment solver will optimize a graph for + * least cost. Since weights vary from 0 as no connection and + * higher weights as better connections (multiplied by some other + * factors), higher weight is better. Thus, to achieve least cost, + * we set cost as negative weight. 
+ */ + cost = -results->weights[x][y]; + /* Skip edges with no connections */ + if (0 == cost) { + continue; + } + if (local_proc_is_left) { + u = MCA_BTL_TCP_PROC_LOCAL_VERTEX(x); + v = MCA_BTL_TCP_PROC_REMOTE_VERTEX(y); + } else { + u = MCA_BTL_TCP_PROC_REMOTE_VERTEX(y); + v = MCA_BTL_TCP_PROC_LOCAL_VERTEX(x); + } + rc = opal_bp_graph_add_edge(graph, u, v, cost, 1, NULL); + if (OPAL_SUCCESS != rc) { + goto err_graph; + } + num_edges++; + } + } + + if (0 == num_edges) { + BTL_ERROR(("Unable to find reachable pairing between local and remote interfaces")); + rc = OPAL_ERR_UNREACH; + } + + *graph_out = graph; + goto out; + +err_graph: + if (NULL != graph) { + opal_bp_graph_free(graph); + } +out: + /* results is an OPAL object and remote_list owns its opal_if_t items */ + if (NULL != results) { + OBJ_RELEASE(results); + } + if (NULL != remote_list) { + OPAL_LIST_RELEASE(remote_list); + } + return rc; +} + +/* We store the matched interface data by using the btl_index as the key and + * a pointer to a mca_btl_tcp_addr_t struct. + */ +static int mca_btl_tcp_proc_store_matched_interfaces(mca_btl_tcp_proc_t *btl_proc, + int local_proc_is_left, + opal_bp_graph_t *graph, + int num_matched, int *matched_edges) +{ + int rc = OPAL_SUCCESS; + int i, local_vertex, remote_vertex; + uint32_t* local_index; + struct mca_btl_tcp_addr_t *remote_addr; + + for (i = 0; i < num_matched; i++) { + /* matched_edges holds (left, right) vertex pairs */ + if (local_proc_is_left) { + local_vertex = matched_edges[2 * i + 0]; + remote_vertex = matched_edges[2 * i + 1]; + } else { + remote_vertex = matched_edges[2 * i + 0]; + local_vertex = matched_edges[2 * i + 1]; + } + rc = opal_bp_graph_get_vertex_data(graph, local_vertex, (void **)&local_index); + if (OPAL_SUCCESS != rc) { + goto out; + } + rc = opal_bp_graph_get_vertex_data(graph, remote_vertex, (void **)&remote_addr); + if (OPAL_SUCCESS != rc) { + goto out; + } + /* do not silently drop a failed insert; mca_btl_tcp_proc_insert() looks up this table */ + rc = opal_hash_table_set_value_uint32(&btl_proc->btl_index_to_endpoint, *local_index, (void *)remote_addr); + if (OPAL_SUCCESS != rc) { + goto out; + } + } +out: + return 
rc; +} + +/* Build the interface graph, solve the matching, and store the matched pairs in btl_proc. */ +static int mca_btl_tcp_proc_handle_modex_addresses(mca_btl_tcp_proc_t* btl_proc, + mca_btl_tcp_modex_addr_t* remote_addrs, + int local_proc_is_left) +{ + opal_bp_graph_t *graph = NULL; + int rc = OPAL_SUCCESS; + int num_matched = 0; + int *matched_edges = NULL; + + rc = mca_btl_tcp_proc_create_interface_graph(btl_proc, remote_addrs, local_proc_is_left, &graph); + if (OPAL_SUCCESS != rc) { + goto cleanup; + } + + rc = opal_bp_graph_solve_bipartite_assignment(graph, &num_matched, &matched_edges); + if (OPAL_SUCCESS != rc) { + goto cleanup; + } + + rc = mca_btl_tcp_proc_store_matched_interfaces(btl_proc, local_proc_is_left, graph, num_matched, matched_edges); + +cleanup: + /* The solver allocates matched_edges and the caller owns it; + * free(NULL) is a no-op, so this is safe on every path. */ + free(matched_edges); + if (NULL != graph) { + opal_bp_graph_free(graph); + } + return rc; +} + /* * Create a TCP process structure. There is a one-to-one correspondence * between a opal_proc_t and a mca_btl_tcp_proc_t instance. We cache @@ -117,9 +371,9 @@ void mca_btl_tcp_proc_destruct(mca_btl_tcp_proc_t* tcp_proc) mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc) { mca_btl_tcp_proc_t* btl_proc; - int rc; + int rc, local_proc_is_left; mca_btl_tcp_modex_addr_t *remote_addrs = NULL; - size_t i, size; + size_t size; OPAL_THREAD_LOCK(&mca_btl_tcp_component.tcp_lock); rc = opal_proc_table_get_value(&mca_btl_tcp_component.tcp_procs, @@ -168,34 +422,20 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc) goto cleanup; } - /* the modex and proc structures differ slightly, so copy the - fields needed in the proc version */ - for (i = 0 ; i < btl_proc->proc_addr_count ; i++) { - if (MCA_BTL_TCP_AF_INET == remote_addrs[i].addr_family) { - memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet, - remote_addrs[i].addr, sizeof(struct in_addr)); - btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port; - btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex; - btl_proc->proc_addrs[i].addr_family = AF_INET; - btl_proc->proc_addrs[i].addr_inuse = false; - } else if 
(MCA_BTL_TCP_AF_INET6 == remote_addrs[i].addr_family) { -#if OPAL_ENABLE_IPV6 - memcpy(&btl_proc->proc_addrs[i].addr_union.addr_inet6, - remote_addrs[i].addr, sizeof(struct in6_addr)); - btl_proc->proc_addrs[i].addr_port = remote_addrs[i].addr_port; - btl_proc->proc_addrs[i].addr_ifkindex = remote_addrs[i].addr_ifkindex; - btl_proc->proc_addrs[i].addr_family = AF_INET6; - btl_proc->proc_addrs[i].addr_inuse = false; -#else - rc = OPAL_ERR_NOT_SUPPORTED; - goto cleanup; -#endif - } else { - BTL_ERROR(("Unexpected address family %d", - (int)remote_addrs[i].addr_family)); - rc = OPAL_ERR_BAD_PARAM; - goto cleanup; - } + /* When solving for bipartite assignment, a graph with equal weights + * can provide different outputs depending on the input parameters. + * Thus two processes can construct different interface matchings. + * To avoid this case, we put the process with the higher jobid on the + * left or if they are equal, we use the higher vpid on the left. + * + * The concept of mirroring the local and remote sides is borrowed + * from the usnic btl implementation of its bipartite assignment solver. 
+ */ + local_proc_is_left = mca_btl_tcp_proc_is_proc_left(proc->proc_name, opal_proc_local_get()->proc_name); + rc = mca_btl_tcp_proc_handle_modex_addresses(btl_proc, remote_addrs, local_proc_is_left); + + if (OPAL_SUCCESS != rc) { + goto cleanup; } /* allocate space for endpoint array - one for each exported address */ @@ -230,236 +470,33 @@ mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc) return btl_proc; } - - -static void evaluate_assignment(mca_btl_tcp_proc_data_t *proc_data, int *a) { - size_t i; - unsigned int max_interfaces = proc_data->num_local_interfaces; - int assignment_weight = 0; - int assignment_cardinality = 0; - - if(max_interfaces < proc_data->num_peer_interfaces) { - max_interfaces = proc_data->num_peer_interfaces; - } - - for(i = 0; i < max_interfaces; ++i) { - if(0 < proc_data->weights[i][a[i]-1]) { - ++assignment_cardinality; - assignment_weight += proc_data->weights[i][a[i]-1]; - } - } - - /* - * check wether current solution beats all previous solutions - */ - if(assignment_cardinality > proc_data->max_assignment_cardinality - || (assignment_cardinality == proc_data->max_assignment_cardinality - && assignment_weight > proc_data->max_assignment_weight)) { - - for(i = 0; i < max_interfaces; ++i) { - proc_data->best_assignment[i] = a[i]-1; - } - proc_data->max_assignment_weight = assignment_weight; - proc_data->max_assignment_cardinality = assignment_cardinality; - } -} - -static void visit(mca_btl_tcp_proc_data_t *proc_data, int k, int level, int siz, int *a) -{ - level = level+1; a[k] = level; - - if (level == siz) { - evaluate_assignment(proc_data, a); - } else { - int i; - for ( i = 0; i < siz; i++) - if (a[i] == 0) - visit(proc_data, i, level, siz, a); - } - - level = level-1; a[k] = 0; -} - - -static void mca_btl_tcp_initialise_interface(mca_btl_tcp_interface_t* tcp_interface, - int ifk_index, int index) -{ - tcp_interface->kernel_index = ifk_index; - tcp_interface->peer_interface = -1; - tcp_interface->ipv4_address = NULL; 
- tcp_interface->ipv6_address = NULL; - tcp_interface->index = index; - tcp_interface->inuse = 0; -} - -static mca_btl_tcp_interface_t** mca_btl_tcp_retrieve_local_interfaces(mca_btl_tcp_proc_data_t *proc_data) -{ - struct sockaddr_storage local_addr; - char local_if_name[OPAL_IF_NAMESIZE]; - char **include, **exclude, **argv; - int idx; - mca_btl_tcp_interface_t * local_interface; - - assert (NULL == proc_data->local_interfaces); - if( NULL != proc_data->local_interfaces ) - return proc_data->local_interfaces; - - proc_data->max_local_interfaces = MAX_KERNEL_INTERFACES; - proc_data->num_local_interfaces = 0; - proc_data->local_interfaces = (mca_btl_tcp_interface_t**)calloc( proc_data->max_local_interfaces, sizeof(mca_btl_tcp_interface_t*) ); - if( NULL == proc_data->local_interfaces ) - return NULL; - - /* Collect up the list of included and excluded interfaces, if any */ - include = opal_argv_split(mca_btl_tcp_component.tcp_if_include,','); - exclude = opal_argv_split(mca_btl_tcp_component.tcp_if_exclude,','); - - /* - * identify all kernel interfaces and the associated addresses of - * the local node - */ - for( idx = opal_ifbegin(); idx >= 0; idx = opal_ifnext (idx) ) { - int kindex; - uint64_t index; - bool skip = false; - - opal_ifindextoaddr (idx, (struct sockaddr*) &local_addr, sizeof (local_addr)); - opal_ifindextoname (idx, local_if_name, sizeof (local_if_name)); - - /* If we were given a list of included interfaces, then check - * to see if the current one is a member of this set. If so, - * drop down and complete processing. If not, skip it and - * continue on to the next one. Note that providing an include - * list will override providing an exclude list as the two are - * mutually exclusive. This matches how it works in - * mca_btl_tcp_component_create_instances() which is the function - * that exports the interfaces. 
*/ - if(NULL != include) { - argv = include; - skip = true; - while(argv && *argv) { - /* When comparing included interfaces, we look for exact matches. - That is why we are using strcmp() here. */ - if (0 == strcmp(*argv, local_if_name)) { - skip = false; - break; - } - argv++; - } - } else if (NULL != exclude) { - /* If we were given a list of excluded interfaces, then check to see if the - * current one is a member of this set. If not, drop down and complete - * processing. If so, skip it and continue on to the next one. */ - argv = exclude; - while(argv && *argv) { - /* When looking for interfaces to exclude, we only look at - * the number of characters equal to what the user provided. - * For example, excluding "lo" excludes "lo", "lo0" and - * anything that starts with "lo" */ - if(0 == strncmp(*argv, local_if_name, strlen(*argv))) { - skip = true; - break; - } - argv++; - } - } - if (true == skip) { - /* This interface is not part of the requested set, so skip it */ - continue; - } - - kindex = opal_ifindextokindex(idx); - int rc = opal_hash_table_get_value_uint32(&proc_data->local_kindex_to_index, kindex, (void**) &index); - - /* create entry for this kernel index previously not seen */ - if (OPAL_SUCCESS != rc) { - index = proc_data->num_local_interfaces++; - opal_hash_table_set_value_uint32(&proc_data->local_kindex_to_index, kindex, (void*)(uintptr_t) index); - - if( proc_data->num_local_interfaces == proc_data->max_local_interfaces ) { - proc_data->max_local_interfaces <<= 1; - proc_data->local_interfaces = (mca_btl_tcp_interface_t**)realloc( proc_data->local_interfaces, - proc_data->max_local_interfaces * sizeof(mca_btl_tcp_interface_t*) ); - if( NULL == proc_data->local_interfaces ) - goto cleanup; - } - proc_data->local_interfaces[index] = (mca_btl_tcp_interface_t *) malloc(sizeof(mca_btl_tcp_interface_t)); - assert(NULL != proc_data->local_interfaces[index]); - mca_btl_tcp_initialise_interface(proc_data->local_interfaces[index], kindex, index); - } - 
- local_interface = proc_data->local_interfaces[index]; - switch(local_addr.ss_family) { - case AF_INET: - /* if AF is disabled, skip it completely */ - if (4 == mca_btl_tcp_component.tcp_disable_family) { - continue; - } - - local_interface->ipv4_address = - (struct sockaddr_storage*) malloc(sizeof(local_addr)); - memcpy(local_interface->ipv4_address, - &local_addr, sizeof(local_addr)); - opal_ifindextomask(idx, - &local_interface->ipv4_netmask, - sizeof(int)); - break; - case AF_INET6: - /* if AF is disabled, skip it completely */ - if (6 == mca_btl_tcp_component.tcp_disable_family) { - continue; - } - - local_interface->ipv6_address - = (struct sockaddr_storage*) malloc(sizeof(local_addr)); - memcpy(local_interface->ipv6_address, - &local_addr, sizeof(local_addr)); - opal_ifindextomask(idx, - &local_interface->ipv6_netmask, - sizeof(int)); - break; - default: - opal_output(0, "unknown address family for tcp: %d\n", - local_addr.ss_family); - } - } -cleanup: - if (NULL != include) { - opal_argv_free(include); - } - if (NULL != exclude) { - opal_argv_free(exclude); - } - - return proc_data->local_interfaces; -} /* * Note that this routine must be called with the lock on the process * already held. Insert a btl instance into the proc array and assign * it an address. 
*/ -int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc, - mca_btl_base_endpoint_t* btl_endpoint ) +int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t* btl_proc, + mca_btl_base_endpoint_t* btl_endpoint) { - struct sockaddr_storage endpoint_addr_ss; + mca_btl_tcp_module_t* tcp_btl = btl_endpoint->endpoint_btl; const char *proc_hostname; - unsigned int perm_size = 0; - int rc, *a = NULL; - size_t i, j; - mca_btl_tcp_interface_t** peer_interfaces = NULL; - mca_btl_tcp_proc_data_t _proc_data, *proc_data=&_proc_data; - size_t max_peer_interfaces; - char str_local[128], str_remote[128]; + mca_btl_tcp_addr_t *remote_addr; + int rc = OPAL_SUCCESS; if (NULL == (proc_hostname = opal_get_proc_hostname(btl_proc->proc_opal))) { - return OPAL_ERR_UNREACH; + rc = OPAL_ERR_UNREACH; + goto out; } - memset(proc_data, 0, sizeof(mca_btl_tcp_proc_data_t)); - OBJ_CONSTRUCT(&_proc_data.local_kindex_to_index, opal_hash_table_t); - opal_hash_table_init(&_proc_data.local_kindex_to_index, 8); - OBJ_CONSTRUCT(&_proc_data.peer_kindex_to_index, opal_hash_table_t); - opal_hash_table_init(&_proc_data.peer_kindex_to_index, 8); + rc = opal_hash_table_get_value_uint32(&btl_proc->btl_index_to_endpoint, tcp_btl->btl_index, (void **)&remote_addr); + if (OPAL_SUCCESS != rc) { + opal_output_verbose(10, opal_btl_base_framework.framework_output, + "btl:tcp: host %s, process %s UNREACHABLE", + proc_hostname, + OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name)); + goto out; + } + btl_endpoint->endpoint_addr = remote_addr; #ifndef WORDS_BIGENDIAN /* if we are little endian and our peer is not so lucky, then we @@ -476,304 +513,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc, btl_endpoint->endpoint_proc = btl_proc; btl_proc->proc_endpoints[btl_proc->proc_endpoint_count++] = btl_endpoint; - /* sanity checks */ - if( NULL == mca_btl_tcp_retrieve_local_interfaces(proc_data) ) - return OPAL_ERR_OUT_OF_RESOURCE; - if( 0 == proc_data->num_local_interfaces ) { - return OPAL_ERR_UNREACH; - } - - 
max_peer_interfaces = proc_data->max_local_interfaces; - peer_interfaces = (mca_btl_tcp_interface_t**)calloc( max_peer_interfaces, sizeof(mca_btl_tcp_interface_t*) ); - if (NULL == peer_interfaces) { - max_peer_interfaces = 0; - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto exit; - } - proc_data->num_peer_interfaces = 0; - - /* - * identify all kernel interfaces and the associated addresses of - * the peer - */ - - for( i = 0; i < btl_proc->proc_addr_count; i++ ) { - - uint64_t index; - - mca_btl_tcp_addr_t* endpoint_addr = btl_proc->proc_addrs + i; - - mca_btl_tcp_proc_tosocks (endpoint_addr, &endpoint_addr_ss); - - rc = opal_hash_table_get_value_uint32(&proc_data->peer_kindex_to_index, endpoint_addr->addr_ifkindex, (void**) &index); - - if (OPAL_SUCCESS != rc) { - index = proc_data->num_peer_interfaces++; - opal_hash_table_set_value_uint32(&proc_data->peer_kindex_to_index, endpoint_addr->addr_ifkindex, (void*)(uintptr_t) index); - if( proc_data->num_peer_interfaces == max_peer_interfaces ) { - max_peer_interfaces <<= 1; - peer_interfaces = (mca_btl_tcp_interface_t**)realloc( peer_interfaces, - max_peer_interfaces * sizeof(mca_btl_tcp_interface_t*) ); - if( NULL == peer_interfaces ) { - return OPAL_ERR_OUT_OF_RESOURCE; - } - } - peer_interfaces[index] = (mca_btl_tcp_interface_t *) malloc(sizeof(mca_btl_tcp_interface_t)); - mca_btl_tcp_initialise_interface(peer_interfaces[index], - endpoint_addr->addr_ifkindex, index); - } - - /* - * in case the peer address has created all intended connections, - * mark the complete peer interface as 'not available' - */ - if(endpoint_addr->addr_inuse >= mca_btl_tcp_component.tcp_num_links) { - peer_interfaces[index]->inuse = 1; - } - - switch(endpoint_addr_ss.ss_family) { - case AF_INET: - peer_interfaces[index]->ipv4_address = (struct sockaddr_storage*) malloc(sizeof(endpoint_addr_ss)); - peer_interfaces[index]->ipv4_endpoint_addr = endpoint_addr; - memcpy(peer_interfaces[index]->ipv4_address, - &endpoint_addr_ss, 
sizeof(endpoint_addr_ss)); - break; - case AF_INET6: - peer_interfaces[index]->ipv6_address = (struct sockaddr_storage*) malloc(sizeof(endpoint_addr_ss)); - peer_interfaces[index]->ipv6_endpoint_addr = endpoint_addr; - memcpy(peer_interfaces[index]->ipv6_address, - &endpoint_addr_ss, sizeof(endpoint_addr_ss)); - break; - default: - opal_output(0, "unknown address family for tcp: %d\n", - endpoint_addr_ss.ss_family); - return OPAL_ERR_UNREACH; - } - } - - /* - * assign weights to each possible pair of interfaces - */ - - perm_size = proc_data->num_local_interfaces; - if(proc_data->num_peer_interfaces > perm_size) { - perm_size = proc_data->num_peer_interfaces; - } - - proc_data->weights = (enum mca_btl_tcp_connection_quality**) malloc(perm_size - * sizeof(enum mca_btl_tcp_connection_quality*)); - assert(NULL != proc_data->weights); - - proc_data->best_addr = (mca_btl_tcp_addr_t ***) malloc(perm_size - * sizeof(mca_btl_tcp_addr_t **)); - assert(NULL != proc_data->best_addr); - for(i = 0; i < perm_size; ++i) { - proc_data->weights[i] = (enum mca_btl_tcp_connection_quality*) calloc(perm_size, - sizeof(enum mca_btl_tcp_connection_quality)); - assert(NULL != proc_data->weights[i]); - - proc_data->best_addr[i] = (mca_btl_tcp_addr_t **) calloc(perm_size, - sizeof(mca_btl_tcp_addr_t *)); - assert(NULL != proc_data->best_addr[i]); - } - - - for( i = 0; i < proc_data->num_local_interfaces; ++i ) { - mca_btl_tcp_interface_t* local_interface = proc_data->local_interfaces[i]; - for( j = 0; j < proc_data->num_peer_interfaces; ++j ) { - - /* initially, assume no connection is possible */ - proc_data->weights[i][j] = CQ_NO_CONNECTION; - - /* check state of ipv4 address pair */ - if(NULL != proc_data->local_interfaces[i]->ipv4_address && - NULL != peer_interfaces[j]->ipv4_address) { - - /* Convert the IPv4 addresses into nicely-printable strings for verbose debugging output */ - inet_ntop(AF_INET, &(((struct sockaddr_in*) proc_data->local_interfaces[i]->ipv4_address))->sin_addr, - 
str_local, sizeof(str_local)); - inet_ntop(AF_INET, &(((struct sockaddr_in*) peer_interfaces[j]->ipv4_address))->sin_addr, - str_remote, sizeof(str_remote)); - - if(opal_net_addr_isipv4public((struct sockaddr*) local_interface->ipv4_address) && - opal_net_addr_isipv4public((struct sockaddr*) peer_interfaces[j]->ipv4_address)) { - if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address, - (struct sockaddr*) peer_interfaces[j]->ipv4_address, - local_interface->ipv4_netmask)) { - proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK; - opal_output_verbose(20, opal_btl_base_framework.framework_output, - "btl:tcp: path from %s to %s: IPV4 PUBLIC SAME NETWORK", - str_local, str_remote); - } else { - proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK; - opal_output_verbose(20, opal_btl_base_framework.framework_output, - "btl:tcp: path from %s to %s: IPV4 PUBLIC DIFFERENT NETWORK", - str_local, str_remote); - } - proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr; - continue; - } - if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv4_address, - (struct sockaddr*) peer_interfaces[j]->ipv4_address, - local_interface->ipv4_netmask)) { - proc_data->weights[i][j] = CQ_PRIVATE_SAME_NETWORK; - opal_output_verbose(20, opal_btl_base_framework.framework_output, - "btl:tcp: path from %s to %s: IPV4 PRIVATE SAME NETWORK", - str_local, str_remote); - } else { - proc_data->weights[i][j] = CQ_PRIVATE_DIFFERENT_NETWORK; - opal_output_verbose(20, opal_btl_base_framework.framework_output, - "btl:tcp: path from %s to %s: IPV4 PRIVATE DIFFERENT NETWORK", - str_local, str_remote); - } - proc_data->best_addr[i][j] = peer_interfaces[j]->ipv4_endpoint_addr; - continue; - } - - /* check state of ipv6 address pair - ipv6 is always public, - * since link-local addresses are skipped in opal_ifinit() - */ - if(NULL != local_interface->ipv6_address && - NULL != peer_interfaces[j]->ipv6_address) { - - /* Convert the IPv6 addresses into nicely-printable 
strings for verbose debugging output */ - inet_ntop(AF_INET6, &(((struct sockaddr_in6*) local_interface->ipv6_address))->sin6_addr, - str_local, sizeof(str_local)); - inet_ntop(AF_INET6, &(((struct sockaddr_in6*) peer_interfaces[j]->ipv6_address))->sin6_addr, - str_remote, sizeof(str_remote)); - - if(opal_net_samenetwork((struct sockaddr*) local_interface->ipv6_address, - (struct sockaddr*) peer_interfaces[j]->ipv6_address, - local_interface->ipv6_netmask)) { - proc_data->weights[i][j] = CQ_PUBLIC_SAME_NETWORK; - opal_output_verbose(20, opal_btl_base_framework.framework_output, - "btl:tcp: path from %s to %s: IPV6 PUBLIC SAME NETWORK", - str_local, str_remote); - } else { - proc_data->weights[i][j] = CQ_PUBLIC_DIFFERENT_NETWORK; - opal_output_verbose(20, opal_btl_base_framework.framework_output, - "btl:tcp: path from %s to %s: IPV6 PUBLIC DIFFERENT NETWORK", - str_local, str_remote); - } - proc_data->best_addr[i][j] = peer_interfaces[j]->ipv6_endpoint_addr; - continue; - } - - } /* for each peer interface */ - } /* for each local interface */ - - /* - * determine the size of the set to permute (max number of - * interfaces - */ - - proc_data->best_assignment = (unsigned int *) malloc (perm_size * sizeof(int)); - - a = (int *) malloc(perm_size * sizeof(int)); - if (NULL == a) { - rc = OPAL_ERR_OUT_OF_RESOURCE; - goto exit; - } - - /* Can only find the best set of connections when the number of - * interfaces is not too big. When it gets larger, we fall back - * to a simpler and faster (and not as optimal) algorithm. - * See ticket https://svn.open-mpi.org/trac/ompi/ticket/2031 - * for more details about this issue. 
*/ - if (perm_size <= MAX_PERMUTATION_INTERFACES) { - memset(a, 0, perm_size * sizeof(int)); - proc_data->max_assignment_cardinality = -1; - proc_data->max_assignment_weight = -1; - visit(proc_data, 0, -1, perm_size, a); - - rc = OPAL_ERR_UNREACH; - for(i = 0; i < perm_size; ++i) { - unsigned int best = proc_data->best_assignment[i]; - if(best > proc_data->num_peer_interfaces - || proc_data->weights[i][best] == CQ_NO_CONNECTION - || peer_interfaces[best]->inuse - || NULL == peer_interfaces[best]) { - continue; - } - peer_interfaces[best]->inuse++; - btl_endpoint->endpoint_addr = proc_data->best_addr[i][best]; - btl_endpoint->endpoint_addr->addr_inuse = true; - rc = OPAL_SUCCESS; - break; - } - } else { - enum mca_btl_tcp_connection_quality max; - int i_max = 0, j_max = 0; - /* Find the best connection that is not in use. Save away - * the indices of the best location. */ - max = CQ_NO_CONNECTION; - for(i=0; inum_local_interfaces; ++i) { - for(j=0; jnum_peer_interfaces; ++j) { - if (!peer_interfaces[j]->inuse) { - if (proc_data->weights[i][j] > max) { - max = proc_data->weights[i][j]; - i_max = i; - j_max = j; - } - } - } - } - /* Now see if there is a some type of connection available. */ - rc = OPAL_ERR_UNREACH; - if (CQ_NO_CONNECTION != max) { - peer_interfaces[j_max]->inuse++; - btl_endpoint->endpoint_addr = proc_data->best_addr[i_max][j_max]; - btl_endpoint->endpoint_addr->addr_inuse = true; - rc = OPAL_SUCCESS; - } - } - if (OPAL_ERR_UNREACH == rc) { - opal_output_verbose(10, opal_btl_base_framework.framework_output, - "btl:tcp: host %s, process %s UNREACHABLE", - proc_hostname, - OPAL_NAME_PRINT(btl_proc->proc_opal->proc_name)); - } - - exit: - // Ok to always free because proc_data() was memset() to 0 before - // any possible return (and free(NULL) is fine). 
- for(i = 0; i < perm_size; ++i) { - free(proc_data->weights[i]); - free(proc_data->best_addr[i]); - } - - for(i = 0; i < proc_data->num_peer_interfaces; ++i) { - if(NULL != peer_interfaces[i]->ipv4_address) { - free(peer_interfaces[i]->ipv4_address); - } - if(NULL != peer_interfaces[i]->ipv6_address) { - free(peer_interfaces[i]->ipv6_address); - } - free(peer_interfaces[i]); - } - free(peer_interfaces); - - for(i = 0; i < proc_data->num_local_interfaces; ++i) { - if(NULL != proc_data->local_interfaces[i]->ipv4_address) { - free(proc_data->local_interfaces[i]->ipv4_address); - } - if(NULL != proc_data->local_interfaces[i]->ipv6_address) { - free(proc_data->local_interfaces[i]->ipv6_address); - } - free(proc_data->local_interfaces[i]); - } - free(proc_data->local_interfaces); proc_data->local_interfaces = NULL; - proc_data->max_local_interfaces = 0; - - free(proc_data->weights); proc_data->weights = NULL; - free(proc_data->best_addr); proc_data->best_addr = NULL; - free(proc_data->best_assignment); proc_data->best_assignment = NULL; - - OBJ_DESTRUCT(&_proc_data.local_kindex_to_index); - OBJ_DESTRUCT(&_proc_data.peer_kindex_to_index); - - free(a); - +out: return rc; } @@ -796,12 +536,6 @@ int mca_btl_tcp_proc_remove(mca_btl_tcp_proc_t* btl_proc, mca_btl_base_endpoint_ OBJ_RELEASE(btl_proc); return OPAL_SUCCESS; } - /* The endpoint_addr may still be NULL if this endpoint is - being removed early in the wireup sequence (e.g., if it - is unreachable by all other procs) */ - if (NULL != btl_endpoint->endpoint_addr) { - btl_endpoint->endpoint_addr->addr_inuse = false; - } break; } } diff --git a/opal/mca/btl/tcp/btl_tcp_proc.h b/opal/mca/btl/tcp/btl_tcp_proc.h index d316134e2b2..abbb9baf864 100644 --- a/opal/mca/btl/tcp/btl_tcp_proc.h +++ b/opal/mca/btl/tcp/btl_tcp_proc.h @@ -10,6 +10,8 @@ * Copyright (c) 2004-2005 The Regents of the University of California. * All rights reserved. * Copyright (c) 2010 Oracle and/or its affiliates. 
All rights reserved + * Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights + * reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -53,57 +55,15 @@ struct mca_btl_tcp_proc_t { size_t proc_endpoint_count; /**< number of endpoints */ + opal_hash_table_t btl_index_to_endpoint; + /**< interface match table, matches btl_index to remote addresses of type mca_btl_tcp_addr_t */ + opal_mutex_t proc_lock; /**< lock to protect against concurrent access to proc state */ }; typedef struct mca_btl_tcp_proc_t mca_btl_tcp_proc_t; OBJ_CLASS_DECLARATION(mca_btl_tcp_proc_t); -/* the highest possible interface kernel index we can handle */ -#define MAX_KERNEL_INTERFACE_INDEX 65536 - -/* the maximum number of kernel interfaces we can handle */ -#define MAX_KERNEL_INTERFACES 8 - -/* The maximum number of interfaces that we can have and use the - * recursion code for determining the best set of connections. When - * the number is greater than this, we switch to a simpler algorithm - * to speed things up. 
*/ -#define MAX_PERMUTATION_INTERFACES 8 - -/* - * FIXME: this should probably be part of an ompi list, so we need the - * appropriate definitions - */ - -struct mca_btl_tcp_interface_t { - struct sockaddr_storage* ipv4_address; - struct sockaddr_storage* ipv6_address; - mca_btl_tcp_addr_t* ipv4_endpoint_addr; - mca_btl_tcp_addr_t* ipv6_endpoint_addr; - uint32_t ipv4_netmask; - uint32_t ipv6_netmask; - int kernel_index; - int peer_interface; - int index; - int inuse; -}; - -typedef struct mca_btl_tcp_interface_t mca_btl_tcp_interface_t; - -/* - * describes the quality of a possible connection between a local and - * a remote network interface - */ -enum mca_btl_tcp_connection_quality { - CQ_NO_CONNECTION, - CQ_PRIVATE_DIFFERENT_NETWORK, - CQ_PRIVATE_SAME_NETWORK, - CQ_PUBLIC_DIFFERENT_NETWORK, - CQ_PUBLIC_SAME_NETWORK -}; - - mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc); mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t* name); int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t*, mca_btl_base_endpoint_t*);