diff --git a/src/ucs/sys/netlink.c b/src/ucs/sys/netlink.c index 23885742425..c3bc40e516d 100644 --- a/src/ucs/sys/netlink.c +++ b/src/ucs/sys/netlink.c @@ -29,6 +29,8 @@ typedef struct { const struct sockaddr *sa_remote; int if_index; int found; + int allow_default_gw; /* Allow matching default + gateway routes */ } ucs_netlink_route_info_t; @@ -174,7 +176,7 @@ ucs_netlink_send_request(int protocol, unsigned short nlmsg_type, static ucs_status_t ucs_netlink_get_route_info(const struct rtattr *rta, int len, int *if_index_p, - const void **dst_in_addr) + const void **dst_in_addr, size_t rtm_dst_len) { *if_index_p = -1; *dst_in_addr = NULL; @@ -187,7 +189,10 @@ ucs_netlink_get_route_info(const struct rtattr *rta, int len, int *if_index_p, } } - if ((*if_index_p == -1) || (*dst_in_addr == NULL)) { + if (/* Network interface index is not valid */ + (*if_index_p == -1) || + /* dst_in_addr required but not present */ + ((rtm_dst_len != 0) && (*dst_in_addr == NULL))) { return UCS_ERR_INVALID_PARAM; } @@ -206,7 +211,8 @@ ucs_netlink_parse_rt_entry_cb(const struct nlmsghdr *nlh, void *arg) int khret; if (ucs_netlink_get_route_info(RTM_RTA(rt_msg), RTM_PAYLOAD(nlh), - &iface_index, &dst_in_addr) != UCS_OK) { + &iface_index, &dst_in_addr, + rt_msg->rtm_dst_len) != UCS_OK) { return UCS_INPROGRESS; } @@ -228,12 +234,14 @@ ucs_netlink_parse_rt_entry_cb(const struct nlmsghdr *nlh, void *arg) ucs_error("could not allocate route entry"); return UCS_ERR_NO_MEMORY); - memset(&new_rule->dest, 0, sizeof(sizeof(new_rule->dest))); + memset(&new_rule->dest, 0, sizeof(new_rule->dest)); new_rule->dest.ss_family = rt_msg->rtm_family; - if (UCS_OK != ucs_sockaddr_set_inet_addr((struct sockaddr *)&new_rule->dest, - dst_in_addr)) { - ucs_array_pop_back(iface_rules); - return UCS_ERR_IO_ERROR; + if (dst_in_addr != NULL) { + if (ucs_sockaddr_set_inet_addr((struct sockaddr *)&new_rule->dest, + dst_in_addr) != UCS_OK) { + ucs_array_pop_back(iface_rules); + return UCS_ERR_IO_ERROR; + } } 
new_rule->subnet_prefix_len = rt_msg->rtm_dst_len; @@ -256,6 +264,13 @@ static void ucs_netlink_lookup_route(ucs_netlink_route_info_t *info) iface_rules = &kh_val(&ucs_netlink_routing_table_cache, iter); ucs_array_for_each(curr_entry, iface_rules) { + + if ((curr_entry->subnet_prefix_len == 0) && !info->allow_default_gw) { + ucs_trace("iface_index=%d: skipping default gateway route", + info->if_index); + continue; + } + if (ucs_sockaddr_is_same_subnet( info->sa_remote, (const struct sockaddr *)&curr_entry->dest, @@ -266,7 +281,8 @@ static void ucs_netlink_lookup_route(ucs_netlink_route_info_t *info) } } -int ucs_netlink_route_exists(int if_index, const struct sockaddr *sa_remote) +int ucs_netlink_route_exists(int if_index, const struct sockaddr *sa_remote, + int allow_default_gw) { static ucs_init_once_t init_once = UCS_INIT_ONCE_INITIALIZER; struct rtmsg rtm = {0}; @@ -285,9 +301,11 @@ int ucs_netlink_route_exists(int if_index, const struct sockaddr *sa_remote) NULL); } - info.if_index = if_index; - info.sa_remote = sa_remote; - info.found = 0; + info.if_index = if_index; + info.sa_remote = sa_remote; + info.found = 0; + info.allow_default_gw = allow_default_gw; + ucs_netlink_lookup_route(&info); return info.found; diff --git a/src/ucs/sys/netlink.h b/src/ucs/sys/netlink.h index eabcc265b1d..5d1103899aa 100644 --- a/src/ucs/sys/netlink.h +++ b/src/ucs/sys/netlink.h @@ -48,14 +48,17 @@ ucs_netlink_send_request(int protocol, unsigned short nlmsg_type, * Check whether a routing table rule exists for a given network * interface name and a destination address. * - * @param [in] if_index A global index representing the network interface, - as assigned by the system (e.g., obtained via - if_nametoindex()). - * @param [in] sa_remote Pointer to the destination address. + * @param [in] if_index A global index representing the network + interface, as assigned by the system + (e.g., obtained via if_nametoindex()). + * @param [in] sa_remote Pointer to the destination address. 
+ * @param [in] allow_default_gw Allow matching default gateway routes (1) or + * only specific subnet routes (0). * * @return 1 if rule exists, or 0 otherwise. */ -int ucs_netlink_route_exists(int if_index, const struct sockaddr *sa_remote); +int ucs_netlink_route_exists(int if_index, const struct sockaddr *sa_remote, + int allow_default_gw); END_C_DECLS diff --git a/src/ucs/sys/sys.c b/src/ucs/sys/sys.c index aa042425909..7c8b976b095 100644 --- a/src/ucs/sys/sys.c +++ b/src/ucs/sys/sys.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -177,6 +178,21 @@ ucs_status_t ucs_ifname_to_index(const char *ndev_name, unsigned *ndev_index_p) return UCS_OK; } +int ucs_netif_is_ipoib(const char *if_name) +{ + struct ifreq ifr; /* receives the SIOCGIFHWADDR reply */ + ucs_status_t status; + + status = ucs_netif_ioctl(if_name, SIOCGIFHWADDR, &ifr); + if (status != UCS_OK) { + /* If we can't determine the hardware type, assume it's not IPoIB */ + ucs_debug("failed to get hardware address for %s", if_name); + return 0; + } + + return ifr.ifr_hwaddr.sa_family == ARPHRD_INFINIBAND; /* link-layer type */ +} + static uint64_t ucs_get_mac_address() { static uint64_t mac_address = 0; diff --git a/src/ucs/sys/sys.h b/src/ucs/sys/sys.h index 986826660a7..9ad9b8991e4 100644 --- a/src/ucs/sys/sys.h +++ b/src/ucs/sys/sys.h @@ -192,6 +192,16 @@ uint32_t ucs_file_checksum(const char *filename); ucs_status_t ucs_ifname_to_index(const char *ndev_name, unsigned *ndev_index_p); +/** + * Check if a network interface is an IPoIB (IP over InfiniBand) device. + * + * @param [in] if_name Network interface name to check. + * + * @return 1 if the interface is IPoIB, 0 otherwise (including query failure). + */ +int ucs_netif_is_ipoib(const char *if_name); + + /** * Get a globally unique identifier of the machine running the current process. 
*/ diff --git a/src/uct/ib/base/ib_iface.c b/src/uct/ib/base/ib_iface.c index 3fb49df0964..55c847c8eda 100644 --- a/src/uct/ib/base/ib_iface.c +++ b/src/uct/ib/base/ib_iface.c @@ -710,7 +710,7 @@ uct_ib_iface_roce_is_routable(uct_ib_iface_t *iface, uint8_t gid_index, return 0; } - if (!ucs_netlink_route_exists(ndev_index, sa_remote)) { + if (!ucs_netlink_route_exists(ndev_index, sa_remote, 1)) { /* try to use loopback interface for reachability check, because it may * be used for routing in case of an interface with VRF is configured * and a RoCE IP interface uses this VRF table for routing. @@ -721,7 +721,7 @@ uct_ib_iface_roce_is_routable(uct_ib_iface_t *iface, uint8_t gid_index, return 0; } - if (!ucs_netlink_route_exists(lo_ndev_index, sa_remote)) { + if (!ucs_netlink_route_exists(lo_ndev_index, sa_remote, 1)) { uct_iface_fill_info_str_buf(params, "remote address %s is not routable " "neither by interface "UCT_IB_IFACE_FMT diff --git a/src/uct/tcp/tcp.h b/src/uct/tcp/tcp.h index e96591b7cee..73df0299862 100644 --- a/src/uct/tcp/tcp.h +++ b/src/uct/tcp/tcp.h @@ -295,7 +295,12 @@ typedef enum uct_tcp_device_addr_flags { * Device address is extended by additional information: * @ref uct_iface_local_addr_ns_t for loopback reachability */ - UCT_TCP_DEVICE_ADDR_FLAG_LOOPBACK = UCS_BIT(0) + UCT_TCP_DEVICE_ADDR_FLAG_LOOPBACK = UCS_BIT(0), + + /** + * Allow communication with default gateway + */ + UCT_TCP_DEVICE_ADDR_FLAG_ALLOW_DEFAULT_GW = UCS_BIT(1) } uct_tcp_device_addr_flags_t; diff --git a/src/uct/tcp/tcp_iface.c b/src/uct/tcp/tcp_iface.c index 2aff8d7895d..cc2cf010c32 100644 --- a/src/uct/tcp/tcp_iface.c +++ b/src/uct/tcp/tcp_iface.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -138,6 +139,11 @@ static ucs_status_t uct_tcp_iface_get_device_address(uct_iface_h tl_iface, dev_addr->flags = 0; dev_addr->sa_family = saddr->sa_family; + /* Default gateway is not relevant for IPoIB interfaces */ + if 
(!ucs_netif_is_ipoib(iface->if_name)) { + dev_addr->flags |= UCT_TCP_DEVICE_ADDR_FLAG_ALLOW_DEFAULT_GW; + } + if (ucs_sockaddr_is_inaddr_loopback(saddr)) { dev_addr->flags |= UCT_TCP_DEVICE_ADDR_FLAG_LOOPBACK; memset(pack_ptr, 0, sizeof(uct_iface_local_addr_ns_t)); @@ -205,6 +211,7 @@ uct_tcp_iface_is_reachable_v2(const uct_iface_h tl_iface, struct sockaddr_storage remote_addr; char remote_addr_str[UCS_SOCKADDR_STRING_LEN]; unsigned ndev_index; + int allow_default_gw; ucs_status_t status; if (!uct_iface_is_reachable_params_valid( @@ -263,8 +270,12 @@ uct_tcp_iface_is_reachable_v2(const uct_iface_h tl_iface, return 0; } + allow_default_gw = !!(tcp_dev_addr->flags & + UCT_TCP_DEVICE_ADDR_FLAG_ALLOW_DEFAULT_GW); + if (!ucs_netlink_route_exists(ndev_index, - (const struct sockaddr *)&remote_addr)) { + (const struct sockaddr *)&remote_addr, + allow_default_gw)) { uct_iface_fill_info_str_buf( params, "no route to %s", ucs_sockaddr_str((const struct sockaddr *)&remote_addr,