Skip to content

Commit 020b684

Browse files
authored
[ICRDMA] Add RDMA aware memory pool and related code. (#26448)
This commit contains: - RDMA aware memory pool - common wrapper for ibverbs context - link manager to select right context using ip address (it supports RoCEv2 only)
1 parent c38c114 commit 020b684

File tree

15 files changed

+1434
-35
lines changed

15 files changed

+1434
-35
lines changed
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
This directory contains RDMA (RoCEv2) related interconnect (IC) code.
2+
3+
link_manager.* - The component which scans all devices and ports during startup
4+
and allows getting the ibv context associated with a given IP address.
5+
6+
mem_pool.* - RDMA-aware memory pool. Allows allocating a memory region
7+
registered in the RDMA hardware.
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
#include "ctx.h"
2+
3+
#include <util/stream/output.h>
4+
#include <util/string/builder.h>
5+
6+
#include <arpa/inet.h>
7+
8+
namespace NInterconnect::NRdma {
9+
10+
// Stores the verbs context and the protection domain handles.
// Both handles are released by ~TDeviceCtx().
TDeviceCtx::TDeviceCtx(ibv_context* ctx, ibv_pd* pd)
    : Context(ctx), ProtDomain(pd)
{
}
14+
15+
// Captures everything that was queried for one (device, port, GID index) triple.
// The attributes and the GID are stored by value; the device context is shared.
TRdmaCtx::TRdmaCtx(
    std::shared_ptr<TDeviceCtx> deviceCtx,
    ibv_device_attr devAttr,
    const char* deviceName,
    ui32 portNum,
    ibv_port_attr portAttr,
    int gidIndex,
    ibv_gid gid
)
    : DeviceCtx(std::move(deviceCtx)), DevAttr(devAttr), DeviceName(deviceName)
    , PortNum(portNum), PortAttr(portAttr)
    , GidIndex(gidIndex), Gid(gid)
{
}
27+
28+
// Releases the handles in reverse order of acquisition:
// first the protection domain, then the device context itself.
TDeviceCtx::~TDeviceCtx()
{
    ibv_dealloc_pd(ProtDomain);
    ibv_close_device(Context);
}
32+
33+
std::shared_ptr<TRdmaCtx> TRdmaCtx::Create(std::shared_ptr<TDeviceCtx> deviceCtx, ui32 portNum, int gidIndex) {
34+
const char* deviceName = ibv_get_device_name(deviceCtx->Context->device);
35+
36+
ibv_device_attr devAttr;
37+
int err = ibv_query_device(deviceCtx->Context, &devAttr);
38+
if (err) {
39+
Cerr << "ibv_query_device failed on {device# " << deviceName << "} : " << strerror(errno) << Endl;
40+
return nullptr;
41+
}
42+
43+
ibv_port_attr portAttr;
44+
err = ibv_query_port(deviceCtx->Context, portNum, &portAttr);
45+
if (err) {
46+
Cerr << "ibv_query_port failed on {device# " << deviceName << "} : " << strerror(errno) << Endl;
47+
return nullptr;
48+
}
49+
50+
if (portAttr.link_layer != IBV_LINK_LAYER_ETHERNET) {
51+
Cerr << "{device# " << deviceName << ", port# " << (int)portNum << "} is not RoCE" << Endl;
52+
return nullptr;
53+
}
54+
55+
ibv_gid gid;
56+
err = ibv_query_gid(deviceCtx->Context, portNum, gidIndex, &gid);
57+
if (err) {
58+
Cerr << "ibv_query_gid failed on {device# " << deviceName << ", port# " << (int)portNum << ", gidIndex# " << gidIndex << "} : " << strerror(errno) << Endl;
59+
return nullptr;
60+
}
61+
62+
if (gid.global.interface_id == 0) {
63+
// there are a lot of devices with no GID, so we just skip them
64+
return nullptr;
65+
}
66+
67+
TRdmaCtx* ctx = new TRdmaCtx(std::move(deviceCtx), devAttr, deviceName, portNum, portAttr, gidIndex, gid);
68+
return std::shared_ptr<TRdmaCtx>(ctx);
69+
}
70+
71+
// Renders the context into a string using the same format as Output().
TString TRdmaCtx::ToString() const {
    TStringStream buffer;
    this->Output(buffer);
    return buffer.Str();
}
76+
77+
// Writes a one-line description of the context: device name, port number, GID index and GID.
void TRdmaCtx::Output(IOutputStream &str) const {
    str << "{device_name# " << GetDeviceName();
    str << " port_num# " << GetPortNum();
    str << " gid_index# " << GetGidIndex();
    str << " gid# " << GetGid() << "}";
}
83+
84+
}
85+
86+
// Prints the 16-byte GID in IPv6 textual notation.
IOutputStream& operator<<(IOutputStream& os, const ibv_gid& gid) {
    char text[INET6_ADDRSTRLEN];
    inet_ntop(AF_INET6, &gid, text, sizeof(text));
    return os << text;
}
92+
93+
// Stream insertion for TRdmaCtx; the formatting itself lives in TRdmaCtx::Output().
IOutputStream& operator<<(IOutputStream& os, const NInterconnect::NRdma::TRdmaCtx& rdmaCtx)
{
    rdmaCtx.Output(os);
    return os;
}
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
#pragma once
2+
3+
#include <memory>
4+
#include <util/generic/noncopyable.h>
5+
#include <util/system/types.h>
6+
#include <util/generic/string.h>
7+
8+
#include <contrib/libs/ibdrv/include/infiniband/verbs.h>
9+
10+
extern "C" {
11+
12+
struct ibv_context;
13+
struct ibv_pd;
14+
15+
}
16+
17+
namespace NInterconnect::NRdma {
18+
19+
namespace NLinkMgr {
20+
class TRdmaLinkManager;
21+
}
22+
23+
struct TDeviceCtx : public NNonCopyable::TNonCopyable {
24+
TDeviceCtx(ibv_context* ctx, ibv_pd* pd);
25+
26+
~TDeviceCtx();
27+
28+
ibv_context* const Context;
29+
ibv_pd* const ProtDomain;
30+
};
31+
32+
class TRdmaCtx : public NNonCopyable::TNonCopyable {
33+
friend class NLinkMgr::TRdmaLinkManager;
34+
TRdmaCtx(
35+
std::shared_ptr<TDeviceCtx> deviceCtx, ibv_device_attr devAttr, const char* deviceName,
36+
ui32 portNum, ibv_port_attr portAttr, int gidIndex, ibv_gid gid
37+
);
38+
39+
public:
40+
static std::shared_ptr<TRdmaCtx> Create(std::shared_ptr<TDeviceCtx> deviceCtx, ui32 portNum, int gidIndex);
41+
42+
~TRdmaCtx() = default;
43+
44+
ibv_context* GetContext() const {
45+
return DeviceCtx->Context;
46+
}
47+
ibv_pd* GetProtDomain() const {
48+
return DeviceCtx->ProtDomain;
49+
}
50+
const ibv_device_attr& GetDevAttr() const {
51+
return DevAttr;
52+
}
53+
const char* GetDeviceName() const {
54+
return DeviceName;
55+
}
56+
ui32 GetPortNum() const {
57+
return PortNum;
58+
}
59+
const ibv_port_attr& GetPortAttr() const {
60+
return PortAttr;
61+
}
62+
int GetGidIndex() const {
63+
return GidIndex;
64+
}
65+
const ibv_gid& GetGid() const {
66+
return Gid;
67+
}
68+
size_t GetDeviceIndex() const {
69+
return DeviceIndex;
70+
}
71+
72+
void Output(IOutputStream &str) const;
73+
TString ToString() const;
74+
75+
private:
76+
const std::shared_ptr<TDeviceCtx> DeviceCtx;
77+
const ibv_device_attr DevAttr;
78+
const char* DeviceName;
79+
const ui32 PortNum;
80+
const ibv_port_attr PortAttr;
81+
const int GidIndex;
82+
const ibv_gid Gid;
83+
size_t DeviceIndex;
84+
};
85+
86+
}
87+
88+
IOutputStream& operator<<(IOutputStream& os, const ibv_gid& gid);
89+
IOutputStream& operator<<(IOutputStream& os, const NInterconnect::NRdma::TRdmaCtx& rdmaCtx);
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
#include "link_manager.h"
2+
#include "ctx.h"
3+
4+
#include <contrib/libs/ibdrv/include/infiniband/verbs.h>
5+
6+
#include <util/generic/scope.h>
7+
#include <util/generic/string.h>
8+
#include <util/stream/output.h>
9+
#include <util/string/printf.h>
10+
#include <util/string/builder.h>
11+
12+
#include <errno.h>
13+
14+
#include <memory>
15+
16+
#include <util/network/address.h>
17+
18+
template <>
19+
struct std::less<ibv_gid> {
20+
std::size_t operator()(const ibv_gid& a, const ibv_gid& b) const {
21+
return std::tie(a.global.subnet_prefix, a.global.interface_id) <
22+
std::tie(b.global.subnet_prefix, b.global.interface_id);
23+
}
24+
};
25+
template <>
26+
struct std::equal_to<ibv_gid> {
27+
bool operator()(const ibv_gid& a, const ibv_gid& b) const {
28+
return a.global.interface_id == b.global.interface_id
29+
&& a.global.subnet_prefix == b.global.subnet_prefix;
30+
}
31+
};
32+
33+
namespace NInterconnect::NRdma::NLinkMgr {
34+
35+
static class TRdmaLinkManager {
36+
public:
37+
const TCtxsMap& GetAllCtxs() {
38+
return CtxMap;
39+
}
40+
41+
TRdmaCtx* GetCtx(const ibv_gid& gid) {
42+
auto it = std::lower_bound(
43+
CtxMap.begin(), CtxMap.end(),
44+
std::pair<ibv_gid, std::shared_ptr<NInterconnect::NRdma::TRdmaCtx>>{gid, nullptr},
45+
[](const auto& a, const auto& b) {
46+
return std::less<ibv_gid>()(a.first, b.first);
47+
}
48+
);
49+
if (it != CtxMap.end() && std::equal_to<ibv_gid>()(it->first, gid)) {
50+
return it->second.get();
51+
}
52+
Cerr << "No RDMA context found for GID: " << gid << Endl;
53+
return nullptr;
54+
}
55+
56+
TRdmaLinkManager() {
57+
ScanDevices();
58+
}
59+
private:
60+
TCtxsMap CtxMap;
61+
62+
void ScanDevices() {
63+
int numDevices = 0;
64+
int err;
65+
ibv_device** deviceList = ibv_get_device_list(&numDevices);
66+
if (!deviceList) {
67+
ErrNo = errno;
68+
Err = TString(strerror(errno));
69+
return;
70+
}
71+
72+
Y_DEFER{ ibv_free_device_list(deviceList); };
73+
74+
for (int i = 0; i < numDevices; i++) {
75+
ibv_device* dev = deviceList[i];
76+
ibv_context* ctx = ibv_open_device(dev);
77+
if (!ctx) {
78+
Err = Sprintf("Failed to open ib device '%s'", ibv_get_device_name(dev));
79+
continue;
80+
}
81+
82+
ibv_pd* pd = ibv_alloc_pd(ctx);
83+
if (!pd) {
84+
ibv_close_device(ctx);
85+
continue;
86+
}
87+
88+
auto deviceCtx = std::make_shared<TDeviceCtx>(ctx, pd);
89+
90+
ibv_device_attr devAttrs;
91+
err = ibv_query_device(ctx, &devAttrs);
92+
93+
if (err < 0) {
94+
continue;
95+
}
96+
97+
for (uint8_t portNum = 1; portNum <= devAttrs.phys_port_cnt; portNum++) {
98+
ibv_port_attr portAttrs;
99+
err = ibv_query_port(ctx, portNum, &portAttrs);
100+
if (err == 0) {
101+
for (int gidIndex = 0; gidIndex < portAttrs.gid_tbl_len; gidIndex++ ) {
102+
auto ctx = TRdmaCtx::Create(deviceCtx, portNum, gidIndex);
103+
if (!ctx) {
104+
continue;
105+
}
106+
107+
CtxMap.emplace_back(ctx->GetGid(), ctx);
108+
}
109+
}
110+
}
111+
}
112+
std::sort(CtxMap.begin(), CtxMap.end(),
113+
[](const auto& a, const auto& b) {
114+
return std::less<ibv_gid>()(a.first, b.first);
115+
});
116+
117+
// check for duplicates
118+
for (size_t i = 0; i < CtxMap.size(); ++i) {
119+
auto ctx = CtxMap[i].second;
120+
ctx->DeviceIndex = i;
121+
122+
if (i > 0) {
123+
auto prevCtx = CtxMap[i - 1].second;
124+
if (std::equal_to<ibv_gid>()(prevCtx->GetGid(), ctx->GetGid())) {
125+
Cerr << "Duplicate GID found: ctx1=" << prevCtx->ToString() << ", ctx2=" << ctx->ToString() << Endl;
126+
}
127+
}
128+
}
129+
}
130+
131+
int ErrNo = 0;
132+
TString Err;
133+
134+
} RdmaLinkManager;
135+
136+
// Looks up the RDMA context by the local address the socket is bound to.
//
// AF_INET6 addresses are used as is. AF_INET addresses are converted to the
// IPv4-mapped IPv6 form (::ffff:a.b.c.d) before the lookup. Any other address family,
// or a getsockname() failure, yields nullptr with a message on Cerr.
TRdmaCtx* GetCtx(int sockfd) {
    sockaddr_storage addr{};
    socklen_t addrLen = sizeof(addr);
    if (getsockname(sockfd, reinterpret_cast<sockaddr*>(&addr), &addrLen) < 0) {
        Cerr << "getsockname failed: " << strerror(errno) << Endl;
        return nullptr;
    }

    switch (addr.ss_family) {
        case AF_INET6: {
            const auto* v6 = reinterpret_cast<const sockaddr_in6*>(&addr);
            return GetCtx(v6->sin6_addr);
        }
        case AF_INET: {
            // sin_addr is stored in network byte order, so its bytes can be copied
            // verbatim into the last four bytes of the mapped address.
            const auto* v4 = reinterpret_cast<const sockaddr_in*>(&addr);
            const auto* octets = reinterpret_cast<const unsigned char*>(&v4->sin_addr.s_addr);
            in6_addr mapped{};
            mapped.s6_addr[10] = 0xff;
            mapped.s6_addr[11] = 0xff;
            for (int i = 0; i < 4; ++i) {
                mapped.s6_addr[12 + i] = octets[i];
            }
            return GetCtx(mapped);
        }
        default:
            Cerr << "getsockname returned unsupported address family: " << static_cast<int>(addr.ss_family) << Endl;
            return nullptr;
    }
}
148+
149+
// Looks up the RDMA context whose GID has the same 16 bytes as the given IPv6 address.
// Returns nullptr when no such context is registered.
TRdmaCtx* GetCtx(const in6_addr& ip) {
    static_assert(sizeof(ibv_gid) == sizeof(in6_addr), "GID and IPv6 address must have the same size");
    // Copy the bytes instead of casting the pointer: ibv_gid contains 64-bit members,
    // so an in6_addr is not guaranteed to be suitably aligned to be read through an ibv_gid*.
    ibv_gid gid;
    for (size_t i = 0; i < sizeof(gid.raw); ++i) {
        gid.raw[i] = ip.s6_addr[i];
    }
    return RdmaLinkManager.GetCtx(gid);
}
153+
154+
// Gives read-only access to every context found by the link manager.
const TCtxsMap& GetAllCtxs()
{
    const TCtxsMap& all = RdmaLinkManager.GetAllCtxs();
    return all;
}
157+
158+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#pragma once
2+
3+
#include <contrib/libs/ibdrv/include/infiniband/verbs.h>
4+
5+
#include <util/generic/fwd.h>
6+
7+
8+
struct in6_addr;
9+
10+
namespace NInterconnect::NRdma {
11+
class TRdmaCtx;
12+
}
13+
14+
// LinkManager is a component returning global context associated
15+
// with RDMA device by given ipv6 address.
16+
// An IPv4 address must be converted (mapped) to IPv6 before the lookup.
17+
namespace NInterconnect::NRdma::NLinkMgr {
18+
19+
using TCtxsMap = std::vector<std::pair<ibv_gid, std::shared_ptr<NInterconnect::NRdma::TRdmaCtx>>>;
20+
21+
TRdmaCtx* GetCtx(int sockfd);
22+
TRdmaCtx* GetCtx(const in6_addr& );
23+
const TCtxsMap& GetAllCtxs();
24+
25+
}

0 commit comments

Comments
 (0)