Skip to content
This repository was archived by the owner on Dec 9, 2025. It is now read-only.

Commit 244fa79

Browse files
committed
feat: discovery of rdma raw devices
Problem: we only see rdma devices associated with a netlink. Solution: do a discovery loop that also lists all rdma devices. Signed-off-by: vsoch <vsoch@users.noreply.github.com>
1 parent a15ec9b commit 244fa79

File tree

2 files changed

+162
-33
lines changed

2 files changed

+162
-33
lines changed

README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,14 @@ Install the latest stable version of DraNet using the provided manifest:
103103
kubectl apply -f https://raw.githubusercontent.com/google/dranet/refs/heads/main/install.yaml
104104
```
105105

106+
### Development
107+
108+
To build your own image for testing. Here is an example with a custom registry `ghcr.io/converged-computing`:
109+
110+
```sh
111+
REGISTRY=ghcr.io/converged-computing make image
112+
```
113+
106114
### How to Use It
107115

108116
Once DraNet is running, you can inspect the network interfaces and their

pkg/inventory/db.go

Lines changed: 154 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ import (
2020
"context"
2121
"fmt"
2222
"net"
23+
"os"
24+
"path/filepath"
2325
"strings"
2426
"sync"
2527
"time"
@@ -116,10 +118,6 @@ func (db *DB) GetPodNamespace(pod string) string {
116118
func (db *DB) Run(ctx context.Context) error {
117119
defer close(db.notifications)
118120

119-
nlHandle, err := netlink.NewHandle()
120-
if err != nil {
121-
return fmt.Errorf("error creating netlink handle %v", err)
122-
}
123121
// Resources are published periodically or if there is a netlink notification
124122
// indicating a new interfaces was added or changed
125123
nlChannel := make(chan netlink.LinkUpdate)
@@ -131,54 +129,66 @@ func (db *DB) Run(ctx context.Context) error {
131129

132130
// Obtain data that will not change after the startup
133131
db.instance = getInstanceProperties(ctx)
134-
// TODO: it is not common but may happen in edge cases that the default gateway changes
135-
// revisit once we have more evidence this can be a potential problem or break some use
136-
// cases.
137-
gwInterfaces := getDefaultGwInterfaces()
138132

139133
for {
134+
140135
err := db.rateLimiter.Wait(ctx)
141136
if err != nil {
142137
klog.Error(err, "unexpected rate limited error trying to get system interfaces")
143138
}
144139

145-
devices := []resourceapi.Device{}
146-
ifaces, err := nlHandle.LinkList()
140+
// Device lookup to prevent duplicated
141+
devices := make(map[string]*resourceapi.Device)
142+
143+
// We need to not register an RDMA device twice
144+
seenRdmaDevices := sets.New[string]()
145+
146+
// Kernel network interfaces
147+
err, netlinkDevices := db.discoverNetlinkDevices()
147148
if err != nil {
148-
klog.Error(err, "unexpected error trying to get system interfaces")
149+
return err
149150
}
150-
for _, iface := range ifaces {
151-
klog.V(7).InfoS("Checking network interface", "name", iface.Attrs().Name)
152-
if gwInterfaces.Has(iface.Attrs().Name) {
153-
klog.V(4).Infof("iface %s is an uplink interface", iface.Attrs().Name)
154-
continue
155-
}
151+
for pciAddr, device := range netlinkDevices {
156152

157-
if ignoredInterfaceNames.Has(iface.Attrs().Name) {
158-
klog.V(4).Infof("iface %s is in the list of ignored interfaces", iface.Attrs().Name)
153+
// I don't think we want to add un-named network interfaces
154+
ifName, ok := device.Basic.Attributes["dra.net/ifName"]
155+
if !ok {
159156
continue
160157
}
161158

162-
// skip loopback interfaces
163-
if iface.Attrs().Flags&net.FlagLoopback != 0 {
164-
continue
159+
// If it has RDMA, mark as seen
160+
if rdmaName, err := rdmamap.GetRdmaDeviceForNetdevice(*ifName.StringValue); err == nil && rdmaName != "" {
161+
klog.V(4).Infof("Found netdev '%s' with associated RDMA device '%s'. Merging.", ifName, rdmaName)
162+
163+
// This is the value from getPciAddress
164+
seenRdmaDevices.Insert(pciAddr)
165165
}
166+
devices[*ifName.StringValue] = device
167+
168+
}
169+
// We only allow rdma devices that have PCI address
170+
for pciAddr, rdmaDevice := range db.discoverRawRdmaDevices() {
166171

167-
// publish this network interface
168-
device, err := db.netdevToDRAdev(iface)
169-
if err != nil {
170-
klog.V(2).Infof("could not obtain attributes for iface %s : %v", iface.Attrs().Name, err)
172+
// Have we already seen it?
173+
_, ok := seenRdmaDevices[pciAddr]
174+
if ok {
171175
continue
172176
}
177+
devices[rdmaDevice.Name] = rdmaDevice
178+
}
173179

174-
devices = append(devices, *device)
175-
klog.V(4).Infof("Found following network interface %s", iface.Attrs().Name)
180+
// Create final list to publish
181+
finalDevices := make([]resourceapi.Device, 0, len(devices))
182+
for _, device := range devices {
183+
finalDevices = append(finalDevices, *device)
176184
}
177185

178-
klog.V(4).Infof("Found %d devices", len(devices))
179-
if len(devices) > 0 {
180-
db.notifications <- devices
186+
klog.V(4).Infof("Found %d devices", len(finalDevices))
187+
if len(finalDevices) > 0 {
188+
db.notifications <- finalDevices
181189
}
190+
191+
// Wait for the next event or timeout
182192
select {
183193
// trigger a reconcile
184194
case <-nlChannel:
@@ -197,6 +207,105 @@ func (db *DB) GetResources(ctx context.Context) <-chan []resourceapi.Device {
197207
return db.notifications
198208
}
199209

210+
// discoverNetlinkDevices scans for kernel network interfaces
211+
func (db *DB) discoverNetlinkDevices() (error, map[string]*resourceapi.Device) {
212+
klog.V(4).Info("Starting netlink device discovery...")
213+
devices := make(map[string]*resourceapi.Device)
214+
215+
// TODO: it is not common but may happen in edge cases that the default gateway changes
216+
// revisit once we have more evidence this can be a potential problem or break some use
217+
// cases.
218+
gwInterfaces := getDefaultGwInterfaces()
219+
nlHandle, err := netlink.NewHandle()
220+
if err != nil {
221+
return fmt.Errorf("error creating netlink handle %v", err), devices
222+
}
223+
224+
// Don't return early - we want print to user at end of function
225+
// @vsoch This is logic from previous refactored version.
226+
ifaces, err := nlHandle.LinkList()
227+
if err != nil {
228+
klog.Error(err, "unexpected error trying to get system interfaces")
229+
}
230+
231+
for _, iface := range ifaces {
232+
attrs := iface.Attrs()
233+
234+
klog.V(7).InfoS("Checking network interface", "name", attrs.Name)
235+
if gwInterfaces.Has(attrs.Name) {
236+
klog.V(4).Infof("iface %s is an uplink interface", attrs.Name)
237+
continue
238+
}
239+
240+
if ignoredInterfaceNames.Has(attrs.Name) {
241+
klog.V(4).Infof("iface %s is in the list of ignored interfaces", attrs.Name)
242+
continue
243+
}
244+
245+
// skip loopback interfaces
246+
if attrs.Flags&net.FlagLoopback != 0 {
247+
continue
248+
}
249+
250+
// publish this network interface
251+
device, err := db.netdevToDRAdev(iface)
252+
if err != nil {
253+
klog.V(2).Infof("could not obtain attributes for iface %s : %v", attrs.Name, err)
254+
continue
255+
}
256+
257+
// This could be error prone if a missing address leads to a second entry in rdma devices
258+
pciAddr, err := getPciAddress(attrs.Name, sysnetPath)
259+
if err != nil {
260+
klog.Warningf("could not get PCI address for netdev %s, using fallback key. error: %v", attrs.Name, err)
261+
pciAddr = "netdev-" + attrs.Name
262+
}
263+
devices[pciAddr] = device
264+
}
265+
klog.V(4).Infof("Finished netlink discovery. Found %d devices.", len(devices))
266+
return nil, devices
267+
}
268+
269+
// discoverRawRdmaDevices scans for raw RDMA devices using rdmamap listing
270+
func (db *DB) discoverRawRdmaDevices() map[string]*resourceapi.Device {
271+
klog.V(4).Info("Starting raw RDMA device discovery...")
272+
devices := make(map[string]*resourceapi.Device)
273+
274+
// This was tested to work to list an Infiniband device without an associated netlink
275+
deviceNames := rdmamap.GetRdmaDeviceList()
276+
277+
for _, rdmaName := range deviceNames {
278+
pciAddr, err := getPciAddress(rdmaName, rdmamap.RdmaClassDir)
279+
280+
// Assume that a missing PCI address would be missing for both netlink and rdma (not sure if this is true)
281+
// I think there are cases when we wouldn't have one, but I want to be conservative and only
282+
// allow RDMA interfaces with associated PCI addresses. This can change if needed.
283+
if err != nil {
284+
klog.Warningf("could not get PCI address for RDMA device %s, skipping: %v", rdmaName, err)
285+
continue
286+
}
287+
sanitizedName := names.SetDeviceName(rdmaName)
288+
289+
// Create a new resourceapi device for the RDMA raw device
290+
device := &resourceapi.Device{
291+
Name: sanitizedName,
292+
Basic: &resourceapi.BasicDevice{
293+
Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
294+
"dra.net/rdma": {BoolValue: ptr.To(true)},
295+
"dra.net/ifName": {StringValue: &rdmaName},
296+
// https://github.com/vishvananda/netlink/blob/master/nl/nl_linux.go#L143
297+
// This could also be ib, but "infiniband" is more clear
298+
"dra.net/type": {StringValue: ptr.To("infiniband")},
299+
},
300+
},
301+
}
302+
addPCIAttributes(device.Basic, rdmaName, rdmamap.RdmaClassDir)
303+
devices[pciAddr] = device
304+
}
305+
klog.V(4).Infof("Finished raw RDMA discovery. Found %d devices.", len(devices))
306+
return devices
307+
}
308+
200309
func (db *DB) netdevToDRAdev(link netlink.Link) (*resourceapi.Device, error) {
201310
ifName := link.Attrs().Name
202311
device := resourceapi.Device{
@@ -207,6 +316,7 @@ func (db *DB) netdevToDRAdev(link netlink.Link) (*resourceapi.Device, error) {
207316
}
208317
// Set the device name. It will be normalized only if necessary.
209318
device.Name = names.SetDeviceName(ifName)
319+
210320
// expose the real interface name as an attribute in case it is normalized.
211321
device.Basic.Attributes["dra.net/ifName"] = resourceapi.DeviceAttribute{StringValue: &ifName}
212322

@@ -291,6 +401,7 @@ func (db *DB) netdevToDRAdev(link netlink.Link) (*resourceapi.Device, error) {
291401
return &device, nil
292402
}
293403

404+
// addPCIAttributes adds dra.net device.Attributes
294405
func addPCIAttributes(device *resourceapi.BasicDevice, ifName string, path string) {
295406
device.Attributes["dra.net/virtual"] = resourceapi.DeviceAttribute{BoolValue: ptr.To(false)}
296407

@@ -303,7 +414,7 @@ func addPCIAttributes(device *resourceapi.BasicDevice, ifName string, path strin
303414
device.Attributes["resource.kubernetes.io/pcieRoot"] = resourceapi.DeviceAttribute{StringValue: &pcieRoot}
304415
}
305416
} else {
306-
klog.Infof("could not get pci root : %v", err)
417+
klog.Infof("could not get pci root for %s: %v", ifName, err)
307418
}
308419

309420
entry, err := ids(ifName, path)
@@ -318,11 +429,21 @@ func addPCIAttributes(device *resourceapi.BasicDevice, ifName string, path strin
318429
device.Attributes["dra.net/pciSubsystem"] = resourceapi.DeviceAttribute{StringValue: &entry.Subsystem}
319430
}
320431
} else {
321-
klog.Infof("could not get pci vendor information : %v", err)
432+
klog.Infof("could not get pci vendor information for %s: %v", ifName, err)
322433
}
323434

324435
numa, err := numaNode(ifName, path)
325436
if err == nil {
326437
device.Attributes["dra.net/numaNode"] = resourceapi.DeviceAttribute{IntValue: &numa}
327438
}
328439
}
440+
441+
// getPciAddress is a helper function to get PCI address
442+
func getPciAddress(ifName string, basePath string) (string, error) {
443+
devicePath := filepath.Join(basePath, ifName, "device")
444+
pciLinkTarget, err := os.Readlink(devicePath)
445+
if err != nil {
446+
return "", err
447+
}
448+
return filepath.Base(pciLinkTarget), nil
449+
}

0 commit comments

Comments
 (0)