Skip to content

Commit fc8c7a5

Browse files
authored
Merge pull request #7134 from wckzhang/btl_tcp_interface_match
btl tcp: Use reachability and graph solving for global interface matching
2 parents 10f6a77 + e958f3c commit fc8c7a5

File tree

6 files changed

+372
-640
lines changed

6 files changed

+372
-640
lines changed

opal/mca/btl/tcp/btl_tcp.c

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
* Copyright (c) 2016-2017 Research Organization for Information Science
1616
* and Technology (RIST). All rights reserved.
1717
* Copyright (c) 2016 Intel, Inc. All rights reserved.
18+
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
19+
* reserved.
1820
*
1921
* $COPYRIGHT$
2022
*
@@ -101,12 +103,6 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl,
101103
continue;
102104
}
103105

104-
/*
105-
* Check to make sure that the peer has at least as many interface
106-
* addresses exported as we are trying to use. If not, then
107-
* don't bind this BTL instance to the proc.
108-
*/
109-
110106
OPAL_THREAD_LOCK(&tcp_proc->proc_lock);
111107

112108
for (uint32_t j = 0 ; j < (uint32_t)tcp_proc->proc_endpoint_count ; ++j) {

opal/mca/btl/tcp/btl_tcp.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
* and Technology (RIST). All rights reserved.
1616
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
1717
* reserved.
18+
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
19+
* reserved.
1820
* $COPYRIGHT$
1921
*
2022
* Additional copyrights may follow
@@ -107,6 +109,7 @@ struct mca_btl_tcp_component_t {
107109
uint32_t tcp_num_btls; /**< number of interfaces available to the TCP component */
108110
unsigned int tcp_num_links; /**< number of logical links per physical device */
109111
struct mca_btl_tcp_module_t **tcp_btls; /**< array of available BTL modules */
112+
opal_list_t local_ifs; /**< opal list of local opal_if_t interfaces */
110113
int tcp_free_list_num; /**< initial size of free lists */
111114
int tcp_free_list_max; /**< maximum size of free lists */
112115
int tcp_free_list_inc; /**< number of elements to alloc when growing free lists */
@@ -163,6 +166,9 @@ OPAL_MODULE_DECLSPEC extern mca_btl_tcp_component_t mca_btl_tcp_component;
163166
*/
164167
struct mca_btl_tcp_module_t {
165168
mca_btl_base_module_t super; /**< base BTL interface */
169+
uint32_t btl_index; /**< Local BTL module index, used for vertex
170+
data and used as a hash key when
171+
solving module matching problem */
166172
uint16_t tcp_ifkindex; /**< BTL kernel interface index */
167173
struct sockaddr_storage tcp_ifaddr; /**< First address
168174
discovered for this

opal/mca/btl/tcp/btl_tcp_addr.h

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99
* University of Stuttgart. All rights reserved.
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
12+
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
13+
* reserved.
14+
*
1215
* $COPYRIGHT$
1316
*
1417
* Additional copyrights may follow
@@ -30,37 +33,43 @@
3033
#ifdef HAVE_NETINET_IN_H
3134
#include <netinet/in.h>
3235
#endif
33-
36+
#include <assert.h>
3437

3538
/**
3639
* Modex address structure.
3740
*
3841
* One of these structures will be sent for every btl module in use by
39-
* the local BTL TCP component.
42+
* the local BTL TCP component. This is used to construct an opal_if_t
43+
* structure for the reachability component as well as populate the
44+
* mca_btl_tcp_addr_t structure on remote procs. These will be used
45+
* for interface matching and filling out the mca_btl_base_endpoint_t
46+
* structure.
4047
*/
4148
struct mca_btl_tcp_modex_addr_t {
4249
uint8_t addr[16]; /* endpoint address. For addr_family
4350
of MCA_BTL_TCP_AF_INET, only the
4451
first 4 bytes have meaning. */
4552
uint32_t addr_ifkindex; /* endpoint kernel index */
53+
uint32_t addr_mask; /* ip mask */
54+
uint32_t addr_bandwidth; /* interface bandwidth */
4655
uint16_t addr_port; /* endpoint listen port */
4756
uint8_t addr_family; /* endpoint address family. Note that
4857
this is
4958
MCA_BTL_TCP_AF_{INET,INET6}, not
5059
the traditional
5160
AF_INET/AF_INET6. */
52-
uint8_t padding[1]; /* padd out to an 8-byte word */
61+
uint8_t padding[1]; /* pad out to an 8-byte word */
5362
};
5463
typedef struct mca_btl_tcp_modex_addr_t mca_btl_tcp_modex_addr_t;
5564

65+
_Static_assert(sizeof(struct mca_btl_tcp_modex_addr_t) == 32, "mca_btl_tcp_modex_addr_t");
5666

5767
/**
5868
* Remote peer address structure
5969
*
6070
* One of these structures will be allocated for every remote endpoint
6171
* associated with a remote proc. The data is pulled from the
62-
* mca_btl_tcp_modex_addr_t structure, except for the addr_inuse
63-
* field, which is local.
72+
* mca_btl_tcp_modex_addr_t structure.
6473
*/
6574
struct mca_btl_tcp_addr_t {
6675
union {
@@ -73,7 +82,6 @@ struct mca_btl_tcp_addr_t {
7382
int addr_ifkindex; /**< remote interface index associated with
7483
this address */
7584
uint8_t addr_family; /**< AF_INET or AF_INET6 */
76-
bool addr_inuse; /**< local meaning only */
7785
};
7886
typedef struct mca_btl_tcp_addr_t mca_btl_tcp_addr_t;
7987

opal/mca/btl/tcp/btl_tcp_component.c

Lines changed: 41 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
2020
* Copyright (c) 2014-2017 Research Organization for Information Science
2121
* and Technology (RIST). All rights reserved.
22-
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
22+
* Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights
23+
* reserved.
2324
* $COPYRIGHT$
2425
*
2526
* Additional copyrights may follow
@@ -69,13 +70,15 @@
6970
#include "opal/util/net.h"
7071
#include "opal/util/fd.h"
7172
#include "opal/util/show_help.h"
73+
#include "opal/util/string_copy.h"
7274
#include "opal/util/printf.h"
7375
#include "opal/constants.h"
7476
#include "opal/mca/btl/btl.h"
7577
#include "opal/mca/btl/base/base.h"
7678
#include "opal/mca/mpool/base/base.h"
7779
#include "opal/mca/btl/base/btl_base_error.h"
7880
#include "opal/mca/pmix/pmix.h"
81+
#include "opal/mca/reachable/base/base.h"
7982
#include "opal/threads/threads.h"
8083

8184
#include "opal/constants.h"
@@ -368,6 +371,7 @@ static int mca_btl_tcp_component_open(void)
368371
mca_btl_tcp_component.tcp_btls = NULL;
369372

370373
/* initialize objects */
374+
OBJ_CONSTRUCT(&mca_btl_tcp_component.local_ifs, opal_list_t);
371375
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t);
372376
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_proc_table_t);
373377
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t);
@@ -477,6 +481,7 @@ static int mca_btl_tcp_component_close(void)
477481
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_max);
478482
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user);
479483
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock);
484+
OBJ_DESTRUCT(&mca_btl_tcp_component.local_ifs);
480485

481486
#if OPAL_CUDA_SUPPORT
482487
mca_common_cuda_fini();
@@ -493,8 +498,9 @@ static int mca_btl_tcp_component_close(void)
493498
static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
494499
{
495500
struct mca_btl_tcp_module_t* btl;
501+
opal_if_t *copied_interface, *selected_interface;
496502
char param[256];
497-
int i;
503+
int i, if_index;
498504
struct sockaddr_storage addr;
499505
bool found = false;
500506

@@ -515,18 +521,15 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
515521
* 10.1.0.1 as the one that is published in the modex and used for
516522
* connection.
517523
*/
518-
for (i = opal_ifbegin() ; i >= 0 ; i = opal_ifnext(i)) {
519-
int ret;
520-
521-
if (if_kindex != opal_ifindextokindex(i)) {
524+
OPAL_LIST_FOREACH(selected_interface, &opal_if_list, opal_if_t) {
525+
if (if_kindex != selected_interface->if_kernel_index) {
522526
continue;
523527
}
524528

525-
ret = opal_ifindextoaddr(i, (struct sockaddr*)&addr,
526-
sizeof(struct sockaddr_storage));
527-
if (OPAL_SUCCESS != ret) {
528-
return ret;
529-
}
529+
if_index = selected_interface->if_index;
530+
531+
memcpy((struct sockaddr*)&addr, &selected_interface->if_addr,
532+
MIN(sizeof(struct sockaddr_storage), sizeof(selected_interface->if_addr)));
530533

531534
if (addr.ss_family == AF_INET &&
532535
4 != mca_btl_tcp_component.tcp_disable_family) {
@@ -548,12 +551,19 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
548551
btl = (struct mca_btl_tcp_module_t *)malloc(sizeof(mca_btl_tcp_module_t));
549552
if(NULL == btl)
550553
return OPAL_ERR_OUT_OF_RESOURCE;
554+
copied_interface = OBJ_NEW(opal_if_t);
555+
if (NULL == copied_interface) {
556+
free(btl);
557+
return OPAL_ERR_OUT_OF_RESOURCE;
558+
}
551559
memcpy(btl, &mca_btl_tcp_module, sizeof(mca_btl_tcp_module));
552560
OBJ_CONSTRUCT(&btl->tcp_endpoints, opal_list_t);
553561
OBJ_CONSTRUCT(&btl->tcp_endpoints_mutex, opal_mutex_t);
554562
mca_btl_tcp_component.tcp_btls[mca_btl_tcp_component.tcp_num_btls++] = btl;
555563

556564
/* initialize the btl */
565+
/* This index is used as a key for a hash table used for interface matching. */
566+
btl->btl_index = mca_btl_tcp_component.tcp_num_btls - 1;
557567
btl->tcp_ifkindex = (uint16_t) if_kindex;
558568
#if MCA_BTL_TCP_STATISTICS
559569
btl->tcp_bytes_recv = 0;
@@ -562,6 +572,7 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
562572
#endif
563573

564574
memcpy(&btl->tcp_ifaddr, &addr, sizeof(struct sockaddr_storage));
575+
btl->tcp_ifmask = selected_interface->if_mask;
565576

566577
/* allow user to specify interface bandwidth */
567578
sprintf(param, "bandwidth_%s", if_name);
@@ -603,6 +614,21 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
603614
}
604615
}
605616

617+
/* Add another entry to the local interface list */
618+
opal_string_copy(copied_interface->if_name, if_name, OPAL_IF_NAMESIZE);
619+
copied_interface->if_index = if_index;
620+
copied_interface->if_kernel_index = btl->tcp_ifkindex;
621+
copied_interface->af_family = btl->tcp_ifaddr.ss_family;
622+
copied_interface->if_flags = selected_interface->if_flags;
623+
copied_interface->if_speed = selected_interface->if_speed;
624+
memcpy(&copied_interface->if_addr, &btl->tcp_ifaddr, sizeof(struct sockaddr_storage));
625+
copied_interface->if_mask = selected_interface->if_mask;
626+
copied_interface->if_bandwidth = btl->super.btl_bandwidth;
627+
memcpy(&copied_interface->if_mac, &selected_interface->if_mac, sizeof(copied_interface->if_mac));
628+
copied_interface->ifmtu = selected_interface->ifmtu;
629+
630+
opal_list_append(&mca_btl_tcp_component.local_ifs, &(copied_interface->super));
631+
606632
opal_output_verbose(5, opal_btl_base_framework.framework_output,
607633
"btl:tcp: %p: if %s kidx %d cnt %i addr %s %s bw %d lt %d\n",
608634
(void*)btl, if_name, (int) btl->tcp_ifkindex, i,
@@ -1188,7 +1214,6 @@ static int mca_btl_tcp_component_exchange(void)
11881214
memcpy(&addrs[i].addr, &(inaddr6->sin6_addr),
11891215
sizeof(struct in6_addr));
11901216
addrs[i].addr_port = mca_btl_tcp_component.tcp6_listen_port;
1191-
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
11921217
addrs[i].addr_family = MCA_BTL_TCP_AF_INET6;
11931218
opal_output_verbose(5, opal_btl_base_framework.framework_output,
11941219
"btl: tcp: exchange: %d %d IPv6 %s",
@@ -1202,7 +1227,6 @@ static int mca_btl_tcp_component_exchange(void)
12021227
memcpy(&addrs[i].addr, &(inaddr->sin_addr),
12031228
sizeof(struct in_addr));
12041229
addrs[i].addr_port = mca_btl_tcp_component.tcp_listen_port;
1205-
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
12061230
addrs[i].addr_family = MCA_BTL_TCP_AF_INET;
12071231
opal_output_verbose(5, opal_btl_base_framework.framework_output,
12081232
"btl: tcp: exchange: %d %d IPv4 %s",
@@ -1212,6 +1236,10 @@ static int mca_btl_tcp_component_exchange(void)
12121236
BTL_ERROR(("Unexpected address family: %d", addr->sa_family));
12131237
return OPAL_ERR_BAD_PARAM;
12141238
}
1239+
1240+
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
1241+
addrs[i].addr_mask = btl->tcp_ifmask;
1242+
addrs[i].addr_bandwidth = btl->super.btl_bandwidth;
12151243
}
12161244

12171245
OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,

0 commit comments

Comments
 (0)