-
Notifications
You must be signed in to change notification settings - Fork 55
Expand file tree
/
Copy pathlib.rs
More file actions
2214 lines (2047 loc) · 79.3 KB
/
lib.rs
File metadata and controls
2214 lines (2047 loc) · 79.3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
//! Rust API wrapping the `ibverbs` RDMA library.
//!
//! `libibverbs` is a library that allows userspace processes to use RDMA "verbs" to perform
//! high-throughput, low-latency network operations for both Infiniband (according to the
//! Infiniband specifications) and iWarp (iWARP verbs specifications). It handles the control path
//! of creating, modifying, querying and destroying resources such as Protection Domains,
//! Completion Queues, Queue-Pairs, Shared Receive Queues, Address Handles, and Memory Regions. It
//! also handles sending and receiving data posted to QPs and SRQs, and getting completions from
//! CQs using polling and completions events.
//!
//! A good place to start is to look at the programs in [`examples/`](examples/), and the upstream
//! [C examples]. You can test RDMA programs on modern Linux kernels even without specialized RDMA
//! hardware by using [SoftRoCE][soft].
//!
//! # For the detail-oriented
//!
//! The control path is implemented through system calls to the `uverbs` kernel module, which
//! further calls the low-level HW driver. The data path is implemented through calls made to
//! low-level HW library which, in most cases, interacts directly with the HW provides kernel and
//! network stack bypass (saving context/mode switches) along with zero copy and an asynchronous
//! I/O model.
//!
//! iWARP ethernet NICs support RDMA over hardware-offloaded TCP/IP, while InfiniBand is a general
//! high-throughput, low-latency networking technology. InfiniBand host channel adapters (HCAs) and
//! iWARP NICs commonly support direct hardware access from userspace (kernel bypass), and
//! `libibverbs` supports this when available.
//!
//! For more information on RDMA verbs, see the [InfiniBand Architecture Specification][infini]
//! vol. 1, especially chapter 11, and the RDMA Consortium's [RDMA Protocol Verbs
//! Specification][RFC5040]. See also the upstream [`libibverbs/verbs.h`] file for the original C
//! definitions, as well as the manpages for the `ibv_*` methods.
//!
//! # Library dependency
//!
//! `libibverbs` is usually available as a free-standing [library package]. It [used to be][1]
//! self-contained, but has recently been adopted into [`rdma-core`]. `cargo` will automatically
//! build the necessary library files and place them in `vendor/rdma-core/build/lib`. If a
//! system-wide installation is not available, those library files can be used instead by copying
//! them to `/usr/lib`, or by adding that path to the dynamic linking search path.
//!
//! # Thread safety
//!
//! All interfaces are `Sync` and `Send` since the underlying ibverbs API [is thread safe][safe].
//!
//! # Documentation
//!
//! Much of the documentation of this crate borrows heavily from the excellent posts over at
//! [RDMAmojo]. If you are going to be working a lot with ibverbs, chances are you will want to
//! head over there. In particular, [this overview post][2] may be a good place to start.
//!
//! [`rdma-core`]: https://github.com/linux-rdma/rdma-core
//! [`libibverbs/verbs.h`]: https://github.com/linux-rdma/rdma-core/blob/master/libibverbs/verbs.h
//! [library package]: https://launchpad.net/ubuntu/+source/libibverbs
//! [C examples]: https://github.com/linux-rdma/rdma-core/tree/master/libibverbs/examples
//! [1]: https://git.kernel.org/pub/scm/libs/infiniband/libibverbs.git/about/
//! [infini]: http://www.infinibandta.org/content/pages.php?pg=technology_public_specification
//! [RFC5040]: https://tools.ietf.org/html/rfc5040
//! [safe]: http://www.rdmamojo.com/2013/07/26/libibverbs-thread-safe-level/
//! [soft]: https://github.com/SoftRoCE/rxe-dev/wiki/rxe-dev:-Home
//! [RDMAmojo]: http://www.rdmamojo.com/
//! [2]: http://www.rdmamojo.com/2012/05/18/libibverbs/
#![deny(missing_docs)]
#![warn(rust_2018_idioms)]
// avoid warnings about RDMAmojo, iWARP, InfiniBand, etc. not being in backticks
#![allow(clippy::doc_markdown)]
use std::convert::TryInto;
use std::ffi::CStr;
use std::io;
use std::ops::RangeBounds;
use std::os::fd::BorrowedFd;
use std::os::raw::c_void;
use std::ptr;
use std::sync::Arc;
use std::time::Duration;
// Fixed physical port number used for every port/GID query in this crate
// (NOTE(review): assumes single-port devices; ibverbs port numbering starts at 1 — confirm).
const PORT_NUM: u8 = 1;
/// Direct access to low-level libverbs FFI.
pub use ffi::ibv_gid_type;
pub use ffi::ibv_mtu;
pub use ffi::ibv_qp_type;
pub use ffi::ibv_wc;
pub use ffi::ibv_wc_opcode;
pub use ffi::ibv_wc_status;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
/// Access flags for use with `QueuePair` and `MemoryRegion`.
pub use ffi::ibv_access_flags;
/// Default access flags.
///
/// Grants local write, remote write, remote read, remote atomics, and relaxed ordering.
pub const DEFAULT_ACCESS_FLAGS: ffi::ibv_access_flags = ffi::ibv_access_flags(
    ffi::ibv_access_flags::IBV_ACCESS_LOCAL_WRITE.0
        | ffi::ibv_access_flags::IBV_ACCESS_REMOTE_WRITE.0
        | ffi::ibv_access_flags::IBV_ACCESS_REMOTE_READ.0
        | ffi::ibv_access_flags::IBV_ACCESS_REMOTE_ATOMIC.0
        | ffi::ibv_access_flags::IBV_ACCESS_RELAXED_ORDERING.0,
);
/// Get list of available RDMA devices.
///
/// # Errors
///
/// - `EPERM`: Permission denied.
/// - `ENOMEM`: Insufficient memory to complete the operation.
/// - `ENOSYS`: No kernel support for RDMA.
pub fn devices() -> io::Result<DeviceList> {
    let mut count = 0i32;
    let list = unsafe { ffi::ibv_get_device_list(&mut count) };
    if list.is_null() {
        return Err(io::Error::last_os_error());
    }
    // SAFETY: on success, ibv_get_device_list returns an array holding `count` entries,
    // which remains valid until ibv_free_device_list is called (in `DeviceList::drop`).
    let devices = unsafe { std::slice::from_raw_parts_mut(list, count as usize) };
    Ok(DeviceList(devices))
}
/// List of available RDMA devices.
#[must_use]
pub struct DeviceList(&'static mut [*mut ffi::ibv_device]);

// SAFETY: the underlying ibverbs API is thread safe (see "Thread safety" in the crate docs).
unsafe impl Sync for DeviceList {}
unsafe impl Send for DeviceList {}

impl Drop for DeviceList {
    fn drop(&mut self) {
        // Releases the array allocated by ibv_get_device_list (see `devices()`).
        unsafe { ffi::ibv_free_device_list(self.0.as_mut_ptr()) };
    }
}
impl DeviceList {
/// Returns an iterator over all found devices.
pub fn iter(&self) -> DeviceListIter<'_> {
DeviceListIter { list: self, i: 0 }
}
/// Returns the number of devices.
pub fn len(&self) -> usize {
self.0.len()
}
/// Returns `true` if there are any devices.
pub fn is_empty(&self) -> bool {
self.0.is_empty()
}
/// Returns the device at the given `index`, or `None` if out of bounds.
pub fn get(&self, index: usize) -> Option<Device<'_>> {
self.0.get(index).map(|d| d.into())
}
}
impl<'a> IntoIterator for &'a DeviceList {
type Item = <DeviceListIter<'a> as Iterator>::Item;
type IntoIter = DeviceListIter<'a>;
fn into_iter(self) -> Self::IntoIter {
DeviceListIter { list: self, i: 0 }
}
}
/// Iterator over a `DeviceList`.
pub struct DeviceListIter<'iter> {
    // the list being iterated; borrowed so the devices outlive the iterator
    list: &'iter DeviceList,
    // index of the next device to yield
    i: usize,
}
impl<'iter> Iterator for DeviceListIter<'iter> {
    type Item = Device<'iter>;

    fn next(&mut self) -> Option<Self::Item> {
        // Out-of-bounds lookup ends the iteration; the cursor only advances on a hit.
        let device = self.list.0.get(self.i)?;
        self.i += 1;
        Some(device.into())
    }
}
/// An RDMA device.
pub struct Device<'devlist>(&'devlist *mut ffi::ibv_device);

// SAFETY: the underlying ibverbs API is thread safe (see "Thread safety" in the crate docs).
unsafe impl Sync for Device<'_> {}
unsafe impl Send for Device<'_> {}

impl<'d> From<&'d *mut ffi::ibv_device> for Device<'d> {
    /// Wraps a raw device pointer borrowed from a `DeviceList`.
    fn from(d: &'d *mut ffi::ibv_device) -> Self {
        Device(d)
    }
}
/// A Global unique identifier for ibv.
///
/// This struct acts as a rust wrapper for GUID value represented as `__be64` in
/// libibverbs. We introduce this struct, because u64 is stored in host
/// endianness, whereas ibverbs stores GUID in network order (big endian).
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Default, Copy, Clone, Debug, Eq, PartialEq, Hash)]
#[repr(transparent)]
pub struct Guid {
    raw: [u8; 8],
}

impl Guid {
    /// Upper 24 bits of the GUID are OUI (Organizationally Unique Identifier,
    /// http://standards-oui.ieee.org/oui/oui.txt). The function returns OUI as
    /// a 24-bit number inside a u32.
    pub fn oui(&self) -> u32 {
        // The first three (big-endian) bytes hold the OUI; left-pad with a zero
        // byte so they widen into a big-endian u32.
        let [a, b, c, ..] = self.raw;
        u32::from_be_bytes([0, a, b, c])
    }

    /// Returns `true` if this GUID is all zeroes, which is considered reserved.
    pub fn is_reserved(&self) -> bool {
        self.raw.iter().all(|&byte| byte == 0)
    }
}
impl From<u64> for Guid {
    /// Converts a host-endian `u64` into a GUID stored in network (big-endian) order.
    fn from(guid: u64) -> Self {
        Self {
            raw: guid.to_be_bytes(),
        }
    }
}

impl From<Guid> for u64 {
    /// Converts the big-endian GUID bytes back into a host-endian `u64`.
    fn from(guid: Guid) -> Self {
        u64::from_be_bytes(guid.raw)
    }
}

impl AsRef<ffi::__be64> for Guid {
    fn as_ref(&self) -> &ffi::__be64 {
        // SAFETY: `Guid` is `#[repr(transparent)]` over `[u8; 8]`, so the pointer cast
        // is layout-compatible (NOTE(review): assumes `__be64` is an 8-byte plain
        // integer type in the generated bindings — confirm).
        unsafe { &*self.raw.as_ptr().cast::<ffi::__be64>() }
    }
}
impl<'devlist> Device<'devlist> {
    /// Opens an RDMA device and creates a context for further use.
    ///
    /// This context will later be used to query its resources or for creating resources.
    ///
    /// Unlike what the verb name suggests, it doesn't actually open the device. This device was
    /// opened by the kernel low-level driver and may be used by other user/kernel level code. This
    /// verb only opens a context to allow user level applications to use it.
    ///
    /// # Errors
    ///
    /// - `EINVAL`: `PORT_NUM` is invalid (from `ibv_query_port_attr`).
    /// - `ENOMEM`: Out of memory (from `ibv_query_port_attr`).
    /// - `EMFILE`: Too many files are opened by this process (from `ibv_query_gid`).
    /// - Other: the device is not in `ACTIVE` or `ARMED` state.
    pub fn open(&self) -> io::Result<Context> {
        Context::with_device(*self.0)
    }

    /// Returns a string of the name, which is associated with this RDMA device.
    ///
    /// This name is unique within a specific machine (the same name cannot be assigned to more
    /// than one device). However, this name isn't unique across an InfiniBand fabric (this name
    /// can be found in different machines).
    ///
    /// When there are more than one RDMA devices in a computer, changing the device location in
    /// the computer (i.e. in the PCI bus) may result in a change in the names associated with the
    /// devices. In order to distinguish between the devices, it is recommended using the device
    /// GUID, returned by `Device::guid`.
    ///
    /// The name is composed from:
    ///
    /// - a *prefix* which describes the RDMA device vendor and model
    ///   - `cxgb3` - Chelsio Communications, T3 RDMA family
    ///   - `cxgb4` - Chelsio Communications, T4 RDMA family
    ///   - `ehca` - IBM, eHCA family
    ///   - `ipathverbs` - QLogic
    ///   - `mlx4` - Mellanox Technologies, ConnectX family
    ///   - `mthca` - Mellanox Technologies, InfiniHost family
    ///   - `nes` - Intel, Intel-NE family
    /// - an *index* that helps to differentiate between several devices from the same vendor and
    ///   family in the same computer
    pub fn name(&self) -> Option<&'devlist CStr> {
        let name_ptr = unsafe { ffi::ibv_get_device_name(*self.0) };
        if name_ptr.is_null() {
            None
        } else {
            Some(unsafe { CStr::from_ptr(name_ptr) })
        }
    }

    /// Returns the Global Unique IDentifier (GUID) of this RDMA device.
    ///
    /// This GUID, that was assigned to this device by its vendor during the manufacturing, is
    /// unique and can be used as an identifier to an RDMA device.
    ///
    /// From the prefix of the RDMA device GUID, one can know who is the vendor of that device
    /// using the [IEEE OUI](http://standards.ieee.org/develop/regauth/oui/oui.txt).
    ///
    /// # Errors
    ///
    /// - `EMFILE`: Too many files are opened by this process.
    pub fn guid(&self) -> io::Result<Guid> {
        let guid_int = unsafe { ffi::ibv_get_device_guid(*self.0) };
        let guid: Guid = guid_int.into();
        // An all-zero GUID is reserved and treated as failure.
        // NOTE(review): this reports `last_os_error()`, which is only meaningful if
        // ibv_get_device_guid actually set errno on failure — confirm.
        if guid.is_reserved() {
            Err(io::Error::last_os_error())
        } else {
            Ok(guid)
        }
    }

    /// Returns stable IB device index as it is assigned by the kernel
    /// # Errors
    ///
    /// - `ENOTSUP`: Stable index is not supported
    pub fn index(&self) -> io::Result<i32> {
        let idx = unsafe { ffi::ibv_get_device_index(*self.0) };
        // ibv_get_device_index reports -1 when the kernel cannot provide a stable index.
        if idx == -1 {
            Err(io::Error::new(
                io::ErrorKind::Unsupported,
                "device index not known",
            ))
        } else {
            Ok(idx)
        }
    }
}
// Shared, owning handle to an open `ibv_context`; the device context is closed on drop.
struct ContextInner {
    ctx: *mut ffi::ibv_context,
}
impl ContextInner {
    /// Queries port `PORT_NUM` and returns its attributes, verifying that the
    /// port is in a state whose GID table contents are meaningful.
    fn query_port(&self) -> io::Result<ffi::ibv_port_attr> {
        // TODO: from http://www.rdmamojo.com/2012/07/21/ibv_query_port/
        //
        // Most of the port attributes, returned by ibv_query_port(), aren't constant and may be
        // changed, mainly by the SM (in InfiniBand), or by the Hardware. It is highly
        // recommended avoiding saving the result of this query, or to flush them when a new SM
        // (re)configures the subnet.
        //
        let mut port_attr = ffi::ibv_port_attr::default();
        let errno = unsafe {
            ffi::ibv_query_port(
                self.ctx,
                PORT_NUM,
                &mut port_attr as *mut ffi::ibv_port_attr as *mut _,
            )
        };
        if errno != 0 {
            return Err(io::Error::from_raw_os_error(errno));
        }
        // From http://www.rdmamojo.com/2012/08/02/ibv_query_gid/:
        //
        // The content of the GID table is valid only when the port_attr.state is either
        // IBV_PORT_ARMED or IBV_PORT_ACTIVE. For other states of the port, the value of the GID
        // table is indeterminate.
        //
        if !matches!(
            port_attr.state,
            ffi::ibv_port_state::IBV_PORT_ACTIVE | ffi::ibv_port_state::IBV_PORT_ARMED
        ) {
            return Err(io::Error::other("port is not ACTIVE or ARMED"));
        }
        Ok(port_attr)
    }
}
impl Drop for ContextInner {
    fn drop(&mut self) {
        let ok = unsafe { ffi::ibv_close_device(self.ctx) };
        // ibv_close_device returns 0 on success; a failure here indicates a crate bug
        // (e.g. resources still referencing the context), so panicking is deliberate.
        assert_eq!(ok, 0);
    }
}

// SAFETY: the underlying ibverbs API is thread safe (see "Thread safety" in the crate docs).
unsafe impl Sync for ContextInner {}
unsafe impl Send for ContextInner {}

/// An RDMA context bound to a device.
#[must_use]
pub struct Context {
    inner: Arc<ContextInner>,
}
impl Context {
    /// Opens a context for the given device, and queries its port and gid.
    fn with_device(dev: *mut ffi::ibv_device) -> io::Result<Context> {
        assert!(!dev.is_null());
        let ctx = unsafe { ffi::ibv_open_device(dev) };
        if ctx.is_null() {
            return Err(io::Error::other("failed to open device"));
        }
        let inner = Arc::new(ContextInner { ctx });
        let ctx = Context { inner };
        // checks that the port is active/armed.
        ctx.inner.query_port()?;
        Ok(ctx)
    }

    /// Create a completion queue (CQ).
    ///
    /// When an outstanding Work Request, within a Send or Receive Queue, is completed, a Work
    /// Completion is being added to the CQ of that Work Queue. This Work Completion indicates that
    /// the outstanding Work Request has been completed (and no longer considered outstanding) and
    /// provides details on it (status, direction, opcode, etc.).
    ///
    /// A single CQ can be shared for sending, receiving, and sharing across multiple QPs. The Work
    /// Completion holds the information to specify the QP number and the Queue (Send or Receive)
    /// that it came from.
    ///
    /// `min_cq_entries` defines the minimum size of the CQ. The actual created size can be equal
    /// or higher than this value. `id` is an opaque identifier that is echoed by
    /// `CompletionQueue::poll`.
    ///
    /// # Errors
    ///
    /// - `EINVAL`: Invalid `min_cq_entries` (must be `1 <= cqe <= dev_cap.max_cqe`).
    /// - `ENOMEM`: Not enough resources to complete this operation.
    pub fn create_cq(&self, min_cq_entries: i32, id: isize) -> io::Result<CompletionQueue> {
        let cc = unsafe { ffi::ibv_create_comp_channel(self.inner.ctx) };
        if cc.is_null() {
            return Err(io::Error::last_os_error());
        }
        // Everything after this point must destroy `cc` on failure, otherwise the
        // completion channel (and its file descriptor) leaks. The fallible steps are
        // grouped in a closure so there is a single cleanup site below.
        let create = || -> io::Result<*mut ffi::ibv_cq> {
            let cc_fd = unsafe { *cc }.fd;
            // the file descriptor needs to be set to non-blocking because `ibv_get_cq_event()`
            // would block otherwise.
            let flags = nix::fcntl::fcntl(cc_fd, nix::fcntl::F_GETFL)?;
            let arg = nix::fcntl::FcntlArg::F_SETFL(
                nix::fcntl::OFlag::from_bits_retain(flags) | nix::fcntl::OFlag::O_NONBLOCK,
            );
            nix::fcntl::fcntl(cc_fd, arg)?;
            let cq = unsafe {
                ffi::ibv_create_cq(
                    self.inner.ctx,
                    min_cq_entries,
                    ptr::null::<c_void>().offset(id) as *mut _,
                    cc,
                    0,
                )
            };
            if cq.is_null() {
                return Err(io::Error::last_os_error());
            }
            Ok(cq)
        };
        match create() {
            Ok(cq) => Ok(CompletionQueue {
                inner: Arc::new(CompletionQueueInner {
                    _ctx: self.inner.clone(),
                    cc,
                    cq,
                }),
            }),
            Err(e) => {
                // Bug fix: previously `cc` leaked whenever fcntl or ibv_create_cq failed.
                unsafe { ffi::ibv_destroy_comp_channel(cc) };
                Err(e)
            }
        }
    }

    /// Allocate a protection domain (PDs) for the device's context.
    ///
    /// The created PD will be used primarily to create `QueuePair`s and `MemoryRegion`s.
    ///
    /// A protection domain is a means of protection, and helps you create a group of object that
    /// can work together. If several objects were created using PD1, and others were created using
    /// PD2, working with objects from group1 together with objects from group2 will not work.
    pub fn alloc_pd(&self) -> io::Result<ProtectionDomain> {
        let pd = unsafe { ffi::ibv_alloc_pd(self.inner.ctx) };
        if pd.is_null() {
            // Bug fix: the error message used to read "obv_alloc_pd".
            Err(io::Error::other("ibv_alloc_pd returned null"))
        } else {
            Ok(ProtectionDomain {
                inner: Arc::new(ProtectionDomainInner {
                    ctx: self.inner.clone(),
                    pd,
                }),
            })
        }
    }

    /// Returns the valid GID table entries of this RDMA device context.
    pub fn gid_table(&self) -> io::Result<Vec<GidEntry>> {
        // The port attributes bound the number of GID entries we need to allocate.
        let max_entries = self.inner.query_port()?.gid_tbl_len as usize;
        let mut gid_table = vec![ffi::ibv_gid_entry::default(); max_entries];
        let num_entries = unsafe {
            ffi::_ibv_query_gid_table(
                self.inner.ctx,
                gid_table.as_mut_ptr(),
                max_entries,
                0,
                size_of::<ffi::ibv_gid_entry>(),
            )
        };
        // A negative return value is the negated error code.
        if num_entries < 0 {
            return Err(io::Error::other(format!(
                "failed to query gid table, error={}",
                -num_entries
            )));
        }
        gid_table.truncate(num_entries as usize);
        let gid_table = gid_table.into_iter().map(GidEntry::from).collect();
        Ok(gid_table)
    }
}
// Owns the CQ and its completion channel; holds the context alive via `_ctx`.
struct CompletionQueueInner {
    _ctx: Arc<ContextInner>,
    cq: *mut ffi::ibv_cq,
    cc: *mut ffi::ibv_comp_channel,
}

impl Drop for CompletionQueueInner {
    fn drop(&mut self) {
        // The CQ must be destroyed before the channel it is attached to.
        // NOTE(review): destruction failure panics; during an unwind this aborts the
        // process — deliberate "can't safely continue" behavior, but worth confirming.
        let errno = unsafe { ffi::ibv_destroy_cq(self.cq) };
        if errno != 0 {
            let e = io::Error::from_raw_os_error(errno);
            panic!("{e}");
        }
        let errno = unsafe { ffi::ibv_destroy_comp_channel(self.cc) };
        if errno != 0 {
            let e = io::Error::from_raw_os_error(errno);
            panic!("{e}");
        }
    }
}

// SAFETY: the underlying ibverbs API is thread safe (see "Thread safety" in the crate docs).
unsafe impl Send for CompletionQueueInner {}
unsafe impl Sync for CompletionQueueInner {}

/// A completion queue that allows subscribing to the completion of queued sends and receives.
#[must_use]
pub struct CompletionQueue {
    inner: Arc<CompletionQueueInner>,
}
impl CompletionQueue {
    /// Poll for (possibly multiple) work completions.
    ///
    /// A Work Completion indicates that a Work Request in a Work Queue, and all of the outstanding
    /// unsignaled Work Requests that posted to that Work Queue, associated with this CQ have
    /// completed. Any Receive Requests, signaled Send Requests and Send Requests that ended with
    /// an error will generate Work Completions.
    ///
    /// When a Work Request ends, a Work Completion is added to the tail of the CQ that this Work
    /// Queue is associated with. `poll` checks if Work Completions are present in a CQ, and pop
    /// them from the head of the CQ in the order they entered it (FIFO) into `completions`. After
    /// a Work Completion was popped from a CQ, it cannot be returned to it. `poll` returns the
    /// subset of `completions` that successfully completed. If the returned slice has fewer
    /// elements than the provided `completions` slice, the CQ was emptied.
    ///
    /// Not all attributes of the completed `ibv_wc`'s are always valid. If the completion status
    /// is not `IBV_WC_SUCCESS`, only the following attributes are valid: `wr_id`, `status`,
    /// `qp_num`, and `vendor_err`.
    ///
    /// Callers must ensure the CQ does not overrun (exceed its capacity), as this triggers an
    /// `IBV_EVENT_CQ_ERR` async event, rendering the CQ unusable. You can do this by limiting
    /// the number of inflight Work Requests.
    ///
    /// Note that `poll` does not block or cause a context switch. This is why RDMA technologies
    /// can achieve very low latency (below 1 µs).
    #[inline]
    pub fn poll<'c>(
        &self,
        completions: &'c mut [ffi::ibv_wc],
    ) -> io::Result<&'c mut [ffi::ibv_wc]> {
        // TODO: from http://www.rdmamojo.com/2013/02/15/ibv_poll_cq/
        //
        // One should consume Work Completions at a rate that prevents the CQ from being overrun
        // (hold more Work Completions than the CQ size). In case of an CQ overrun, the async
        // event `IBV_EVENT_CQ_ERR` will be triggered, and the CQ cannot be used anymore.
        //
        // poll_cq is called through the driver's ops table instead of ffi::ibv_poll_cq
        // because the latter is a static-inline wrapper doing exactly this dispatch.
        let ctx: *mut ffi::ibv_context = unsafe { &*self.inner.cq }.context;
        let ops = &mut unsafe { &mut *ctx }.ops;
        let n = unsafe {
            ops.poll_cq.as_mut().unwrap()(
                self.inner.cq,
                completions.len() as i32,
                completions.as_mut_ptr(),
            )
        };
        // A negative return value signals a poll failure; otherwise `n` entries were filled.
        if n < 0 {
            Err(io::Error::other("ibv_poll_cq failed"))
        } else {
            Ok(&mut completions[0..n as usize])
        }
    }

    /// Waits for one or more work completions in a Completion Queue (CQ).
    ///
    /// Unlike `poll`, this method blocks until at least one work completion is available or the
    /// optional timeout expires. It is designed to wait efficiently for completions when polling
    /// alone is insufficient, such as in low-traffic scenarios.
    ///
    /// The returned slice reflects completed work requests (e.g., sends, receives) from the
    /// associated Work Queue. Not all fields in `ibv_wc` are valid unless the status is
    /// `IBV_WC_SUCCESS`.
    ///
    /// # Errors
    /// - `TimedOut`: If the timeout expires before any completions are available.
    /// - System errors: From underlying calls like `req_notify_cq`, `poll`, or `ibv_get_cq_event`.
    pub fn wait<'c>(
        &self,
        completions: &'c mut [ffi::ibv_wc],
        timeout: Option<Duration>,
    ) -> io::Result<&'c mut [ffi::ibv_wc]> {
        // The raw-pointer round-trip lets each loop iteration reborrow `completions`
        // (`&mut *c`) without the borrow checker tying the returned slice's lifetime
        // to a single iteration.
        let c = completions as *mut [ffi::ibv_wc];
        loop {
            let polled_completions = self.poll(unsafe { &mut *c })?;
            if !polled_completions.is_empty() {
                return Ok(polled_completions);
            }
            // SAFETY: dereferencing completion queue context, which is guaranteed to not have
            // been destroyed yet because we don't destroy it until in Drop, and given we have
            // self, Drop has not been called. The context is guaranteed to not have been destroyed
            // because the `CompletionQueue` holds a reference to the `Context` and we only destroy
            // the context in Drop implementation of the `Context`.
            let ctx = unsafe { *self.inner.cq }.context;
            let errno = unsafe {
                let ops = &mut { &mut *ctx }.ops;
                ops.req_notify_cq.as_mut().unwrap()(self.inner.cq, 0)
            };
            if errno != 0 {
                return Err(io::Error::from_raw_os_error(errno));
            }
            // We poll again to avoid a race when Work Completions arrive between the first `poll()` and `req_notify_cq()`.
            let polled_completions = self.poll(unsafe { &mut *c })?;
            if !polled_completions.is_empty() {
                return Ok(polled_completions);
            }
            let pollfd = nix::poll::PollFd::new(
                // SAFETY: dereferencing completion queue context, which is guaranteed to not have
                // been destroyed yet because we don't destroy it until in Drop, and given we have
                // self, Drop has not been called. `fd` is guaranteed to not have been destroyed
                // because only destroy it in the Drop implementation of this `CompletionQueue` and
                // we still hold `self` here.
                unsafe { BorrowedFd::borrow_raw({ *self.inner.cc }.fd) },
                nix::poll::PollFlags::POLLIN,
            );
            let ret = nix::poll::poll(
                &mut [pollfd],
                timeout
                    .map(nix::poll::PollTimeout::try_from)
                    .transpose()
                    .map_err(|_| io::Error::other("failed to convert timeout to PollTimeout"))?,
            )?;
            match ret {
                0 => {
                    return Err(io::Error::new(
                        io::ErrorKind::TimedOut,
                        "Timed out during completion queue wait",
                    ));
                }
                1 => {}
                _ => unreachable!("we passed 1 fd to poll, but it returned {ret}"),
            }
            let mut out_cq = std::ptr::null_mut();
            let mut out_cq_context = std::ptr::null_mut();
            // The Completion Notification must be read using ibv_get_cq_event(). The file descriptor of
            // `cq_context` was put into non-blocking mode to make `ibv_get_cq_event()` non-blocking.
            // SAFETY: c ffi call
            let rc =
                unsafe { ffi::ibv_get_cq_event(self.inner.cc, &mut out_cq, &mut out_cq_context) };
            if rc < 0 {
                let e = io::Error::last_os_error();
                // WouldBlock means the event raced with our second poll; retry the loop.
                if e.kind() == io::ErrorKind::WouldBlock {
                    continue;
                }
                return Err(e);
            }
            assert_eq!(self.inner.cq, out_cq);
            // cq_context is the opaque user defined identifier passed to `ibv_create_cq()`.
            assert!(out_cq_context.is_null());
            // All completion events returned by ibv_get_cq_event() must eventually be acknowledged with ibv_ack_cq_events().
            // SAFETY: c ffi call
            unsafe { ffi::ibv_ack_cq_events(self.inner.cq, 1) };
        }
    }
}
/// An unconfigured `QueuePair`.
///
/// A `QueuePairBuilder` is used to configure a `QueuePair` before it is allocated and initialized.
/// To construct one, use `ProtectionDomain::create_qp`. See also [RDMAmojo] for many more details.
///
/// [RDMAmojo]: http://www.rdmamojo.com/2013/01/12/ibv_modify_qp/
pub struct QueuePairBuilder {
    // opaque user context value; initialized to 0 in `new`
    // (NOTE(review): consumer is outside this chunk — confirm downstream use).
    ctx: isize,
    // protection domain the QP will be created in
    pd: Arc<ProtectionDomainInner>,
    // port attributes captured at builder creation time
    port_attr: ffi::ibv_port_attr,
    // completion queue for the Send Queue
    send: Arc<CompletionQueueInner>,
    max_send_wr: u32,
    // completion queue for the Receive Queue
    recv: Arc<CompletionQueueInner>,
    max_recv_wr: u32,
    gid_index: Option<u32>,
    max_send_sge: u32,
    max_recv_sge: u32,
    max_inline_data: u32,
    qp_type: ffi::ibv_qp_type,
    // carried along to handshake phase
    /// traffic class set in Global Routing Headers, only used if `gid_index` is set.
    traffic_class: u8,
    /// only valid for RC and UC
    access: Option<ffi::ibv_access_flags>,
    /// only valid for RC
    timeout: Option<u8>,
    /// only valid for RC
    retry_count: Option<u8>,
    /// only valid for RC
    rnr_retry: Option<u8>,
    /// only valid for RC
    min_rnr_timer: Option<u8>,
    /// only valid for RC
    max_rd_atomic: Option<u8>,
    /// only valid for RC
    max_dest_rd_atomic: Option<u8>,
    /// only valid for RC and UC
    path_mtu: Option<ibv_mtu>,
    /// only valid for RC and UC
    rq_psn: Option<u32>,
    /// service level (0-15). Higher value means higher priority.
    service_level: u8,
}
impl QueuePairBuilder {
/// Prepare a new `QueuePair` builder.
///
/// `max_send_wr` is the maximum number of outstanding Work Requests that can be posted to the
/// Send Queue in that Queue Pair. Value must be in `[0..dev_cap.max_qp_wr]`. There may be RDMA
/// devices that for specific transport types may support less outstanding Work Requests than
/// the maximum reported value.
///
/// Similarly, `max_recv_wr` is the maximum number of outstanding Work Requests that can be
/// posted to the Receive Queue in that Queue Pair. Value must be in `[0..dev_cap.max_qp_wr]`.
/// There may be RDMA devices that for specific transport types may support less outstanding
/// Work Requests than the maximum reported value. This value is ignored if the Queue Pair is
/// associated with an SRQ
#[allow(clippy::too_many_arguments)]
fn new(
pd: Arc<ProtectionDomainInner>,
port_attr: ffi::ibv_port_attr,
send: Arc<CompletionQueueInner>,
max_send_wr: u32,
recv: Arc<CompletionQueueInner>,
max_recv_wr: u32,
qp_type: ffi::ibv_qp_type,
max_send_sge: u32,
max_recv_sge: u32,
) -> QueuePairBuilder {
let port_active_mtu = port_attr.active_mtu;
QueuePairBuilder {
ctx: 0,
pd,
port_attr,
gid_index: None,
traffic_class: 0,
send,
max_send_wr,
recv,
max_recv_wr,
max_send_sge,
max_recv_sge,
max_inline_data: 0,
qp_type,
access: (qp_type == ffi::ibv_qp_type::IBV_QPT_RC
|| qp_type == ffi::ibv_qp_type::IBV_QPT_UC)
.then_some(ffi::ibv_access_flags::IBV_ACCESS_LOCAL_WRITE),
min_rnr_timer: (qp_type == ffi::ibv_qp_type::IBV_QPT_RC).then_some(16),
retry_count: (qp_type == ffi::ibv_qp_type::IBV_QPT_RC).then_some(6),
rnr_retry: (qp_type == ffi::ibv_qp_type::IBV_QPT_RC).then_some(6),
timeout: (qp_type == ffi::ibv_qp_type::IBV_QPT_RC).then_some(4),
max_rd_atomic: (qp_type == ffi::ibv_qp_type::IBV_QPT_RC).then_some(1),
max_dest_rd_atomic: (qp_type == ffi::ibv_qp_type::IBV_QPT_RC).then_some(1),
path_mtu: (qp_type == ffi::ibv_qp_type::IBV_QPT_RC
|| qp_type == ffi::ibv_qp_type::IBV_QPT_UC)
.then_some(port_active_mtu),
rq_psn: (qp_type == ffi::ibv_qp_type::IBV_QPT_RC
|| qp_type == ffi::ibv_qp_type::IBV_QPT_UC)
.then_some(0),
service_level: 0,
}
}
/// Set the access flags for the new `QueuePair`.
///
/// Valid only for RC and UC QPs.
///
/// Defaults to `IBV_ACCESS_LOCAL_WRITE`.
pub fn set_access(&mut self, access: ffi::ibv_access_flags) -> &mut Self {
if self.qp_type == ffi::ibv_qp_type::IBV_QPT_RC
|| self.qp_type == ffi::ibv_qp_type::IBV_QPT_UC
{
self.access = Some(access);
}
self
}
/// Set the access flags of the new `QueuePair` such that it allows remote reads and writes.
///
/// Valid only for RC and UC QPs.
pub fn allow_remote_rw(&mut self) -> &mut Self {
if self.qp_type == ffi::ibv_qp_type::IBV_QPT_RC
|| self.qp_type == ffi::ibv_qp_type::IBV_QPT_UC
{
self.access = Some(
self.access.expect("always set to Some in new")
| ffi::ibv_access_flags::IBV_ACCESS_REMOTE_WRITE
| ffi::ibv_access_flags::IBV_ACCESS_REMOTE_READ,
);
}
self
}
    /// Set the service level of the new `QueuePair`.
    ///
    /// service level (0-15). Higher value means higher priority.
    ///
    /// Defaults to 0.
    pub fn set_service_level(&mut self, service_level: u8) -> &mut Self {
        self.service_level = service_level;
        self
    }
    /// Sets the GID table index that should be used for the new `QueuePair`.
    ///
    /// The entry corresponds to the index in `Context::gid_table()`. This is only used if the
    /// `QueuePairEndpoint` that is passed to `QueuePair::handshake()` has a `gid`.
    ///
    /// Defaults to unset.
    pub fn set_gid_index(&mut self, gid_index: u32) -> &mut Self {
        self.gid_index = Some(gid_index);
        self
    }
/// Sets the traffic class of the Global Routing Headers (GRH).
///
/// Only used when a `gid_index` was specified. Through this value the originator of the
/// packets declares the delivery priority that routers should apply when handling them.
///
/// Defaults to 0.
pub fn set_traffic_class(&mut self, traffic_class: u8) -> &mut Self {
    self.traffic_class = traffic_class;
    self
}
/// Sets the minimum RNR NAK Timer Field Value for the new `QueuePair`.
///
/// Defaults to 16 (2.56 ms delay).
/// Valid only for RC QPs.
///
/// When an incoming message to this QP should consume a Work Request from the Receive Queue,
/// but no Work Request is outstanding on that Queue, the QP will send an RNR NAK packet to
/// the initiator. It does not affect RNR NAKs sent for other reasons. The value must be one of
/// the following values:
///
/// - 0 - 655.36 ms delay
/// - 1 - 0.01 ms delay
/// - 2 - 0.02 ms delay
/// - 3 - 0.03 ms delay
/// - 4 - 0.04 ms delay
/// - 5 - 0.06 ms delay
/// - 6 - 0.08 ms delay
/// - 7 - 0.12 ms delay
/// - 8 - 0.16 ms delay
/// - 9 - 0.24 ms delay
/// - 10 - 0.32 ms delay
/// - 11 - 0.48 ms delay
/// - 12 - 0.64 ms delay
/// - 13 - 0.96 ms delay
/// - 14 - 1.28 ms delay
/// - 15 - 1.92 ms delay
/// - 16 - 2.56 ms delay
/// - 17 - 3.84 ms delay
/// - 18 - 5.12 ms delay
/// - 19 - 7.68 ms delay
/// - 20 - 10.24 ms delay
/// - 21 - 15.36 ms delay
/// - 22 - 20.48 ms delay
/// - 23 - 30.72 ms delay
/// - 24 - 40.96 ms delay
/// - 25 - 61.44 ms delay
/// - 26 - 81.92 ms delay
/// - 27 - 122.88 ms delay
/// - 28 - 163.84 ms delay
/// - 29 - 245.76 ms delay
/// - 30 - 327.68 ms delay
/// - 31 - 491.52 ms delay
///
/// # Panics
///
/// Panics if a timer value higher than 31 is given (the field is 5 bits wide).
pub fn set_min_rnr_timer(&mut self, timer: u8) -> &mut Self {
    if self.qp_type == ffi::ibv_qp_type::IBV_QPT_RC {
        // 5-bit field: values above 31 are unrepresentable. Validate eagerly, consistent
        // with `set_retry_count`/`set_rnr_retry`, instead of failing later in modify_qp.
        assert!(timer <= 31);
        self.min_rnr_timer = Some(timer);
    }
    self
}
/// Sets the minimum timeout that the new `QueuePair` waits for ACK/NACK from remote QP before
/// retransmitting the packet.
///
/// Defaults to 4 (65.536µs).
/// Valid only for RC QPs.
///
/// The value zero is special value that waits an infinite time for the ACK/NACK (useful
/// for debugging). This means that if any packet in a message is being lost and no ACK or NACK
/// is being sent, no retry will ever occur and the QP will just stop sending data.
///
/// For any other value of timeout, the time calculation is `4.096*2^timeout`µs, giving:
///
/// - 0 - infinite
/// - 1 - 8.192 µs
/// - 2 - 16.384 µs
/// - 3 - 32.768 µs
/// - 4 - 65.536 µs
/// - 5 - 131.072 µs
/// - 6 - 262.144 µs
/// - 7 - 524.288 µs
/// - 8 - 1.048 ms
/// - 9 - 2.097 ms
/// - 10 - 4.194 ms
/// - 11 - 8.388 ms
/// - 12 - 16.777 ms
/// - 13 - 33.554 ms
/// - 14 - 67.108 ms
/// - 15 - 134.217 ms
/// - 16 - 268.435 ms
/// - 17 - 536.870 ms
/// - 18 - 1.07 s
/// - 19 - 2.14 s
/// - 20 - 4.29 s
/// - 21 - 8.58 s
/// - 22 - 17.1 s
/// - 23 - 34.3 s
/// - 24 - 68.7 s
/// - 25 - 137 s
/// - 26 - 275 s
/// - 27 - 550 s
/// - 28 - 1100 s
/// - 29 - 2200 s
/// - 30 - 4400 s
/// - 31 - 8800 s
///
/// # Panics
///
/// Panics if a timeout higher than 31 is given (the field is 5 bits wide).
pub fn set_timeout(&mut self, timeout: u8) -> &mut Self {
    if self.qp_type == ffi::ibv_qp_type::IBV_QPT_RC {
        // 5-bit field: values above 31 are unrepresentable. Validate eagerly, consistent
        // with `set_retry_count`/`set_rnr_retry`, instead of failing later in modify_qp.
        assert!(timeout <= 31);
        self.timeout = Some(timeout);
    }
    self
}
/// Sets the total number of times that the new `QueuePair` will try to resend the packets
/// before reporting an error because the remote side doesn't answer in the primary path.
///
/// This 3 bit value defaults to 6.
/// Valid only for RC QPs.
///
/// # Panics
///
/// Panics if a count higher than 7 is given.
pub fn set_retry_count(&mut self, count: u8) -> &mut Self {
    // Retry counts are only meaningful for reliable (RC) QPs.
    if self.qp_type != ffi::ibv_qp_type::IBV_QPT_RC {
        return self;
    }
    assert!(count <= 7);
    self.retry_count = Some(count);
    self
}
/// Sets the total number of times that the new `QueuePair` will try to resend the packets when
/// an RNR NACK was sent by the remote QP before reporting an error.
///
/// This 3 bit value defaults to 6. The value 7 is special and specify to retry sending the
/// message indefinitely when a RNR Nack is being sent by remote side.
/// Valid only for RC QPs.
///
/// # Panics
///
/// Panics if a limit higher than 7 is given.
pub fn set_rnr_retry(&mut self, n: u8) -> &mut Self {
    // RNR retries are only meaningful for reliable (RC) QPs.
    if self.qp_type != ffi::ibv_qp_type::IBV_QPT_RC {
        return self;
    }
    assert!(n <= 7);
    self.rnr_retry = Some(n);
    self
}
/// Set the number of outstanding RDMA reads & atomic operations on the destination Queue Pair.
///
/// This defaults to 1.
/// Valid only for RC QPs.