Skip to content

Commit 45829cb

Browse files
dstaay-fbmeta-codesync[bot]
authored andcommitted
Lazy NIC device init + integration of NIC autodetection. (#1444)
Summary: Pull Request resolved: #1444 Rewrites RdmaManager internals wrt: Before - we select a given RDMA-compliant device on (on init via config) and use that for all network traffic. Now - RdmaManager is a general resource manager for a given process. For a given memory virtual addr, we will select the closest RDMA device (pci-distance), and setup the relevant QPs, pd, on the fly. So a given manager could be sending on traffic on "N" rdma devices. To make this work, we need to add a 'device_name' field to RdmaBuffer, so its a combination of other_id + other_device; introducing a many to many connection setup between two rdma managers. Reviewed By: casteryh Differential Revision: D83918488 fbshipit-source-id: f8b7e5cd1abe29e08afbcc526ab02053c9fa12f5
1 parent 8e27c74 commit 45829cb

File tree

10 files changed

+716
-287
lines changed

10 files changed

+716
-287
lines changed

monarch_rdma/examples/cuda_ping_pong/src/cuda_ping_pong.rs

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -468,7 +468,12 @@ impl Handler<PerformPingPong> for CudaRdmaActor {
468468
}
469469
let qp = self
470470
.rdma_manager
471-
.request_queue_pair(cx, remote_buffer.owner.clone())
471+
.request_queue_pair(
472+
cx,
473+
remote_buffer.owner.clone(),
474+
local_buffer.device_name.clone(),
475+
remote_buffer.device_name.clone(),
476+
)
472477
.await?;
473478

474479
unsafe {

monarch_rdma/src/device_selection.rs

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,31 @@ pub fn select_optimal_rdma_device(device_hint: Option<&str>) -> Option<RdmaDevic
439439
}
440440
}
441441

442+
/// Creates a mapping from CUDA PCI addresses to optimal RDMA devices
443+
///
444+
/// This function discovers all available CUDA devices and determines the best
445+
/// RDMA device for each one using the device selection algorithm.
446+
///
447+
/// # Returns
448+
///
449+
/// * `HashMap<String, RdmaDevice>` - Map from CUDA PCI address to optimal RDMA device
450+
pub fn create_cuda_to_rdma_mapping() -> HashMap<String, RdmaDevice> {
451+
let mut mapping = HashMap::new();
452+
453+
// Try to discover CUDA devices (GPU 0-8 should be sufficient for most systems)
454+
for gpu_idx in 0..8 {
455+
let gpu_idx_str = gpu_idx.to_string();
456+
if let Some(cuda_pci_addr) = get_cuda_pci_address(&gpu_idx_str) {
457+
let cuda_hint = format!("cuda:{}", gpu_idx);
458+
if let Some(rdma_device) = select_optimal_rdma_device(Some(&cuda_hint)) {
459+
mapping.insert(cuda_pci_addr, rdma_device);
460+
}
461+
}
462+
}
463+
464+
mapping
465+
}
466+
442467
/// Resolves RDMA device using auto-detection logic when needed
443468
///
444469
/// This function applies auto-detection for default devices, but otherwise

monarch_rdma/src/rdma_components.rs

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ pub struct RdmaBuffer {
101101
pub rkey: u32,
102102
pub addr: usize,
103103
pub size: usize,
104+
pub device_name: String,
104105
}
105106

106107
impl RdmaBuffer {
@@ -128,10 +129,18 @@ impl RdmaBuffer {
128129
remote.owner.actor_id(),
129130
remote,
130131
);
131-
let remote_owner = remote.owner.clone(); // Clone before the move!
132+
let remote_owner = remote.owner.clone();
133+
134+
let local_device = self.device_name.clone();
135+
let remote_device = remote.device_name.clone();
132136
let mut qp = self
133137
.owner
134-
.request_queue_pair_deprecated(client, remote_owner.clone())
138+
.request_queue_pair_deprecated(
139+
client,
140+
remote_owner.clone(),
141+
local_device.clone(),
142+
remote_device.clone(),
143+
)
135144
.await?;
136145

137146
qp.put(self.clone(), remote)?;
@@ -141,7 +150,7 @@ impl RdmaBuffer {
141150

142151
// Release the queue pair back to the actor
143152
self.owner
144-
.release_queue_pair_deprecated(client, remote_owner, qp)
153+
.release_queue_pair_deprecated(client, remote_owner, local_device, remote_device, qp)
145154
.await?;
146155

147156
result
@@ -173,9 +182,19 @@ impl RdmaBuffer {
173182
remote,
174183
);
175184
let remote_owner = remote.owner.clone(); // Clone before the move!
185+
186+
// Extract device name from buffer, fallback to a default if not present
187+
let local_device = self.device_name.clone();
188+
let remote_device = remote.device_name.clone();
189+
176190
let mut qp = self
177191
.owner
178-
.request_queue_pair_deprecated(client, remote_owner.clone())
192+
.request_queue_pair_deprecated(
193+
client,
194+
remote_owner.clone(),
195+
local_device.clone(),
196+
remote_device.clone(),
197+
)
179198
.await?;
180199
qp.get(self.clone(), remote)?;
181200
let result = self
@@ -184,7 +203,7 @@ impl RdmaBuffer {
184203

185204
// Release the queue pair back to the actor
186205
self.owner
187-
.release_queue_pair_deprecated(client, remote_owner, qp)
206+
.release_queue_pair_deprecated(client, remote_owner, local_device, remote_device, qp)
188207
.await?;
189208

190209
result?;
@@ -877,6 +896,7 @@ impl RdmaQueuePair {
877896
if (wqe_cnt as u64) < (self.send_wqe_idx - self.send_db_idx) {
878897
return Err(anyhow::anyhow!("Overflow of WQE, possible data loss"));
879898
}
899+
self.apply_first_op_delay(self.send_db_idx);
880900
while self.send_db_idx < self.send_wqe_idx {
881901
let offset = (self.send_db_idx % wqe_cnt as u64) * stride as u64;
882902
let src_ptr = (base_ptr as *mut u8).wrapping_add(offset as usize);

0 commit comments

Comments
 (0)