Skip to content

Commit 1c57e04

Browse files
dstaay-fbmeta-codesync[bot]
authored and committed
Introduce small hardware delay before first send/write. (#1400)
Summary: Pull Request resolved: #1400 Testing on different internal infrastructure showed that, even once the hardware is in the Ready-To-Send (RTS) state, it needs some time to settle. Looking at NCCL and other communication libraries, the suggested latency is around 2 ms, so this change makes sure that at least 2 ms have elapsed since software recorded RTS before the first transaction is issued; this seems to resolve the issue. Test command: buck run //monarch/python/tests:rdma_load_test -- --device cpu --operation write --iterations 10 --size 512 Reviewed By: casteryh Differential Revision: D83710723 fbshipit-source-id: 72d7648c37731fbdd7247d0e21a70316ea6e3549
1 parent 7b11852 commit 1c57e04

File tree

2 files changed

+41
-0
lines changed

2 files changed

+41
-0
lines changed

monarch_rdma/src/ibverbs_primitives.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,9 @@ pub struct IbverbsConfig {
129129
pub psn: u32,
130130
/// `use_gpu_direct` - Whether to enable GPU Direct RDMA support on init.
131131
pub use_gpu_direct: bool,
132+
/// `hw_init_delay_ms` - The minimum time in milliseconds that must elapse after the queue pair reaches RTS before the first operation is issued.
133+
/// This is used to allow the hardware to settle before starting the first transmission.
134+
pub hw_init_delay_ms: u64,
132135
}
133136

134137
/// Default RDMA parameters below are based on common values from rdma-core examples
@@ -155,6 +158,7 @@ impl Default for IbverbsConfig {
155158
pkey_index: 0,
156159
psn: rand::random::<u32>() & 0xffffff,
157160
use_gpu_direct: false, // nv_peermem enabled for cuda
161+
hw_init_delay_ms: 2,
158162
}
159163
}
160164
}

monarch_rdma/src/rdma_components.rs

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ use std::ffi::CStr;
4747
use std::fs;
4848
use std::io::Error;
4949
use std::result::Result;
50+
use std::thread::sleep;
5051
use std::time::Duration;
5152

5253
use hyperactor::ActorRef;
@@ -445,9 +446,35 @@ pub struct RdmaQueuePair {
445446
pub recv_wqe_idx: u64,
446447
pub recv_db_idx: u64,
447448
pub recv_cq_idx: u64,
449+
rts_timestamp: u64,
448450
}
449451

450452
impl RdmaQueuePair {
453+
/// Applies hardware initialization delay if this is the first operation since RTS.
454+
///
455+
/// This ensures the hardware has sufficient time to settle after reaching
456+
/// Ready-to-Send state before the first actual operation.
457+
fn apply_first_op_delay(&self, wr_id: u64) {
458+
if wr_id == 0 {
459+
assert!(
460+
self.rts_timestamp != u64::MAX,
461+
"First operation attempted before queue pair reached RTS state! Call connect() first."
462+
);
463+
let current_nanos = RealClock
464+
.system_time_now()
465+
.duration_since(std::time::UNIX_EPOCH)
466+
.unwrap()
467+
.as_nanos() as u64;
468+
let elapsed_nanos = current_nanos - self.rts_timestamp;
469+
let elapsed = Duration::from_nanos(elapsed_nanos);
470+
let init_delay = Duration::from_millis(self.config.hw_init_delay_ms);
471+
if elapsed < init_delay {
472+
let remaining_delay = init_delay - elapsed;
473+
sleep(remaining_delay);
474+
}
475+
}
476+
}
477+
451478
/// Creates a new RdmaQueuePair from a given RdmaDomain.
452479
///
453480
/// This function initializes a new Queue Pair (QP) and associated Completion Queue (CQ)
@@ -539,6 +566,7 @@ impl RdmaQueuePair {
539566
send_db_idx: 0,
540567
send_wqe_idx: 0,
541568
send_cq_idx: 0,
569+
rts_timestamp: u64::MAX,
542570
})
543571
}
544572
}
@@ -747,6 +775,13 @@ impl RdmaQueuePair {
747775
qp
748776
);
749777

778+
// Record RTS time now that the queue pair is ready to send
779+
self.rts_timestamp = RealClock
780+
.system_time_now()
781+
.duration_since(std::time::UNIX_EPOCH)
782+
.unwrap()
783+
.as_nanos() as u64;
784+
750785
Ok(())
751786
}
752787
}
@@ -1039,6 +1074,8 @@ impl RdmaQueuePair {
10391074
|| op_type == RdmaOperation::Read
10401075
|| op_type == RdmaOperation::WriteWithImm
10411076
{
1077+
// Apply hardware initialization delay if this is the first operation
1078+
self.apply_first_op_delay(wr_id);
10421079
let send_flags = if signaled {
10431080
rdmaxcel_sys::ibv_send_flags::IBV_SEND_SIGNALED.0
10441081
} else {

0 commit comments

Comments
 (0)