Skip to content

Commit 667531b

Browse files
thomasywang and facebook-github-bot
authored and committed
Distance based latency (#858)
Summary: Pull Request resolved: #858. Now that the simnet has awareness of which compute resource each ProcId maps to, when messages are being sent we can simply look at the sender and destination ProcIds and compute the distance the message is being sent in order to determine the latency. Latency is randomly sampled from a beta distribution where the min and max for each distance are configured. Implementation details (follow along with the numbers in comments): 1. In the previous diff, when Procs were allocated, their coordinates (region, dc, zone, rack, host, gpu) were registered to the Simnet. 2. When SimTx posts a message, we can safely assume that it is a MessageEnvelope. MessageEnvelopes contain information about the sender and receiver, so we can determine which ProcIds the message is being sent between, which in turn means we can identify which coordinates they are being sent between. 3. We determine the distance between 2 coordinates by identifying the most major dimension in which they differ. 4. We create a struct called LatencyConfig which holds a distribution for sampling, as well as minimum and maximum values for each distance. 5. We use the identified distance to get a sample for what the latency should be for that send. 6. We pass in that latency to the MessageDeliveryEvent to use as its duration. 7. The old network configuration, which was an all-to-all map of edges with latencies between nodes, has been removed along with all related structs. 8. Unit tests have been refactored such that when we need a particular message to be sent with a particular latency, we register the ProcIds with the appropriate coordinates and configure the inter-distance latency. test_allocator_registers_resources in alloc/sim.rs demonstrates that when we allocate a ProcMesh using the sim allocator, our Procs are registered as compute resources and the latencies are computed based on distance. Differential Revision: D80141665
1 parent 8b2a36f commit 667531b

File tree

5 files changed

+326
-366
lines changed

5 files changed

+326
-366
lines changed

hyperactor/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ nix = { version = "0.30.1", features = ["dir", "event", "hostname", "inotify", "
4242
opentelemetry = "0.29"
4343
paste = "1.0.14"
4444
rand = { version = "0.8", features = ["small_rng"] }
45+
rand_distr = "0.4"
4546
regex = "1.11.1"
4647
rustls-pemfile = "1.0.0"
4748
serde = { version = "1.0.219", features = ["derive", "rc"] }

hyperactor/src/channel/sim.rs

Lines changed: 137 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,13 @@
1010
// SimRx contains a way to receive messages.
1111

1212
//! Local simulated channel implementation.
13+
use std::any::Any;
1314
// send leads to add to network.
1415
use std::marker::PhantomData;
1516
use std::sync::Arc;
1617

1718
use dashmap::DashMap;
1819
use regex::Regex;
19-
use tokio::sync::Mutex;
2020

2121
use super::*;
2222
use crate::channel;
@@ -25,12 +25,9 @@ use crate::clock::RealClock;
2525
use crate::clock::SimClock;
2626
use crate::data::Serialized;
2727
use crate::mailbox::MessageEnvelope;
28-
use crate::simnet;
2928
use crate::simnet::Dispatcher;
3029
use crate::simnet::Event;
3130
use crate::simnet::ScheduledEvent;
32-
use crate::simnet::SimNetConfig;
33-
use crate::simnet::SimNetEdge;
3431
use crate::simnet::SimNetError;
3532
use crate::simnet::simnet_handle;
3633

@@ -126,20 +123,18 @@ impl fmt::Display for SimAddr {
126123
/// Message Event that can be passed around in the simnet.
127124
#[derive(Debug)]
128125
pub(crate) struct MessageDeliveryEvent {
129-
src_addr: Option<ChannelAddr>,
130126
dest_addr: ChannelAddr,
131127
data: Serialized,
132-
duration_ms: u64,
128+
latency: u64,
133129
}
134130

135131
impl MessageDeliveryEvent {
136132
/// Creates a new MessageDeliveryEvent.
137-
pub fn new(src_addr: Option<ChannelAddr>, dest_addr: ChannelAddr, data: Serialized) -> Self {
133+
pub fn new(dest_addr: ChannelAddr, data: Serialized, latency: u64) -> Self {
138134
Self {
139-
src_addr,
140135
dest_addr,
141136
data,
142-
duration_ms: 100,
137+
latency,
143138
}
144139
}
145140
}
@@ -149,42 +144,17 @@ impl Event for MessageDeliveryEvent {
149144
async fn handle(&mut self) -> Result<(), SimNetError> {
150145
// Send the message to the correct receiver.
151146
SENDER
152-
.send(
153-
self.src_addr.clone(),
154-
self.dest_addr.clone(),
155-
self.data.clone(),
156-
)
147+
.send(self.dest_addr.clone(), self.data.clone())
157148
.await?;
158149
Ok(())
159150
}
160151

161152
fn duration_ms(&self) -> u64 {
162-
self.duration_ms
153+
self.latency
163154
}
164155

165156
fn summary(&self) -> String {
166-
format!(
167-
"Sending message from {} to {}",
168-
self.src_addr
169-
.as_ref()
170-
.map_or("unknown".to_string(), |addr| addr.to_string()),
171-
self.dest_addr.clone()
172-
)
173-
}
174-
175-
async fn read_simnet_config(&mut self, topology: &Arc<Mutex<SimNetConfig>>) {
176-
if let Some(src_addr) = &self.src_addr {
177-
let edge = SimNetEdge {
178-
src: src_addr.clone(),
179-
dst: self.dest_addr.clone(),
180-
};
181-
self.duration_ms = topology
182-
.lock()
183-
.await
184-
.topology
185-
.get(&edge)
186-
.map_or_else(|| 1, |v| v.latency.as_millis() as u64);
187-
}
157+
format!("Sending message to {}", self.dest_addr.clone())
188158
}
189159
}
190160

@@ -194,12 +164,6 @@ pub async fn bind(addr: ChannelAddr) -> anyhow::Result<(), SimNetError> {
194164
simnet_handle()?.bind(addr)
195165
}
196166

197-
/// Update the configuration for simnet.
198-
pub async fn update_config(config: simnet::NetworkConfig) -> anyhow::Result<(), SimNetError> {
199-
// Only update network config for now, will add host config in the future.
200-
simnet_handle()?.update_network_config(config).await
201-
}
202-
203167
/// Returns a simulated channel address that is bound to "any" channel address.
204168
pub(crate) fn any(transport: ChannelTransport) -> ChannelAddr {
205169
ChannelAddr::Sim(SimAddr {
@@ -274,12 +238,7 @@ fn create_egress_sender(
274238

275239
#[async_trait]
276240
impl Dispatcher<ChannelAddr> for SimDispatcher {
277-
async fn send(
278-
&self,
279-
_src_addr: Option<ChannelAddr>,
280-
addr: ChannelAddr,
281-
data: Serialized,
282-
) -> Result<(), SimNetError> {
241+
async fn send(&self, addr: ChannelAddr, data: Serialized) -> Result<(), SimNetError> {
283242
self.dispatchers
284243
.get(&addr)
285244
.ok_or_else(|| {
@@ -318,27 +277,34 @@ pub(crate) struct SimRx<M: RemoteMessage> {
318277
}
319278

320279
#[async_trait]
321-
impl<M: RemoteMessage> Tx<M> for SimTx<M> {
280+
impl<M: RemoteMessage + Any> Tx<M> for SimTx<M> {
322281
fn try_post(&self, message: M, _return_handle: oneshot::Sender<M>) -> Result<(), SendError<M>> {
323282
let data = match Serialized::serialize(&message) {
324283
Ok(data) => data,
325284
Err(err) => return Err(SendError(err.into(), message)),
326285
};
286+
287+
let envelope = (&message as &dyn Any)
288+
.downcast_ref::<MessageEnvelope>()
289+
.expect("RemoteMessage should always be a MessageEnvelope");
290+
291+
let (sender, dest) = (envelope.sender().clone(), envelope.dest().0.clone());
292+
327293
match simnet_handle() {
328-
Ok(handle) => match &self.src_addr {
329-
Some(_) if self.client => handle.send_scheduled_event(ScheduledEvent {
330-
event: Box::new(MessageDeliveryEvent::new(
331-
self.src_addr.clone(),
332-
self.dst_addr.clone(),
333-
data,
334-
)),
335-
time: SimClock.millis_since_start(RealClock.now()),
336-
}),
337-
_ => handle.send_event(Box::new(MessageDeliveryEvent::new(
338-
self.src_addr.clone(),
294+
Ok(handle) => {
295+
let event = Box::new(MessageDeliveryEvent::new(
339296
self.dst_addr.clone(),
340297
data,
341-
))),
298+
handle.sample_latency(sender.proc_id(), dest.proc_id()),
299+
));
300+
301+
match &self.src_addr {
302+
Some(_) if self.client => handle.send_scheduled_event(ScheduledEvent {
303+
event,
304+
time: SimClock.millis_since_start(RealClock.now()),
305+
}),
306+
_ => handle.send_event(event),
307+
}
342308
}
343309
.map_err(|err: SimNetError| SendError(ChannelError::from(err), message)),
344310
Err(err) => Err(SendError(ChannelError::from(err), message)),
@@ -410,19 +376,27 @@ impl<M: RemoteMessage> Rx<M> for SimRx<M> {
410376
mod tests {
411377
use std::iter::zip;
412378

379+
use ndslice::extent;
380+
413381
use super::*;
382+
use crate::PortId;
383+
use crate::attrs::Attrs;
414384
use crate::clock::Clock;
415385
use crate::clock::RealClock;
416386
use crate::clock::SimClock;
417-
use crate::simnet::NetworkConfig;
387+
use crate::id;
388+
use crate::simnet;
389+
use crate::simnet::LatencyConfig;
418390
use crate::simnet::start;
391+
use crate::simnet::start_with_config;
419392

420393
#[tokio::test]
421394
async fn test_sim_basic() {
422395
let dst_ok = vec!["tcp:[::1]:1234", "tcp:127.0.0.1:8080", "local:123"];
423396
let srcs_ok = vec!["tcp:[::2]:1234", "tcp:127.0.0.2:8080", "local:124"];
424397

425398
start();
399+
let handle = simnet_handle().unwrap();
426400

427401
// TODO: New NodeAdd event should do this for you..
428402
for addr in dst_ok.iter().chain(srcs_ok.iter()) {
@@ -439,10 +413,24 @@ mod tests {
439413
)
440414
.unwrap();
441415

442-
let (_, mut rx) = sim::serve::<u64>(dst_addr.clone()).unwrap();
443-
let tx = sim::dial::<u64>(dst_addr).unwrap();
444-
tx.try_post(123, oneshot::channel().0).unwrap();
445-
assert_eq!(rx.recv().await.unwrap(), 123);
416+
let (_, mut rx) = sim::serve::<MessageEnvelope>(dst_addr.clone()).unwrap();
417+
let tx = sim::dial::<MessageEnvelope>(dst_addr).unwrap();
418+
let data = Serialized::serialize(&456).unwrap();
419+
let sender = id!(world[0].hello);
420+
let dest = id!(world[1].hello);
421+
let ext = extent!(region = 1, dc = 1, rack = 4, host = 4, gpu = 8);
422+
handle.register_proc(
423+
sender.proc_id().clone(),
424+
ext.point(vec![0, 0, 0, 0, 0]).unwrap(),
425+
);
426+
handle.register_proc(
427+
dest.proc_id().clone(),
428+
ext.point(vec![0, 0, 0, 1, 0]).unwrap(),
429+
);
430+
431+
let msg = MessageEnvelope::new(sender, PortId(dest, 0), data.clone(), Attrs::new());
432+
tx.try_post(msg, oneshot::channel().0).unwrap();
433+
assert_eq!(*rx.recv().await.unwrap().data(), data);
446434
}
447435

448436
let records = sim::simnet_handle().unwrap().close().await.unwrap();
@@ -481,30 +469,47 @@ mod tests {
481469

482470
#[tokio::test]
483471
async fn test_realtime_frontier() {
484-
start();
485-
486472
tokio::time::pause();
473+
// 100 milliseconds of latency
474+
start_with_config(LatencyConfig {
475+
inter_host: (100, 100),
476+
..Default::default()
477+
});
478+
487479
let sim_addr = SimAddr::new("unix:@dst".parse::<ChannelAddr>().unwrap()).unwrap();
488480
let sim_addr_with_src = SimAddr::new_with_src(
489481
"unix:@src".parse::<ChannelAddr>().unwrap(),
490482
"unix:@dst".parse::<ChannelAddr>().unwrap(),
491483
)
492484
.unwrap();
493-
let (_, mut rx) = sim::serve::<()>(sim_addr.clone()).unwrap();
494-
let tx = sim::dial::<()>(sim_addr_with_src).unwrap();
495-
let simnet_config_yaml = r#"
496-
edges:
497-
- src: unix:@src
498-
dst: unix:@dst
499-
metadata:
500-
latency: 100
501-
"#;
502-
update_config(NetworkConfig::from_yaml(simnet_config_yaml).unwrap())
503-
.await
504-
.unwrap();
485+
let (_, mut rx) = sim::serve::<MessageEnvelope>(sim_addr.clone()).unwrap();
486+
let tx = sim::dial::<MessageEnvelope>(sim_addr_with_src).unwrap();
487+
488+
let controller = id!(world[0].controller);
489+
let dest = id!(world[1].dest);
490+
let handle = simnet::simnet_handle().unwrap();
491+
492+
let ext = extent!(region = 1, dc = 1, zone = 1, rack = 4, host = 4, gpu = 8);
493+
handle.register_proc(
494+
controller.proc_id().clone(),
495+
ext.point(vec![0, 0, 0, 0, 0, 0]).unwrap(),
496+
);
497+
handle.register_proc(
498+
dest.proc_id().clone(),
499+
ext.point(vec![0, 0, 0, 0, 1, 0]).unwrap(),
500+
);
505501

506502
// This message will be delivered at simulator time = 100 seconds
507-
tx.try_post((), oneshot::channel().0).unwrap();
503+
tx.try_post(
504+
MessageEnvelope::new(
505+
controller,
506+
PortId(dest, 0),
507+
Serialized::serialize(&456).unwrap(),
508+
Attrs::new(),
509+
),
510+
oneshot::channel().0,
511+
)
512+
.unwrap();
508513
{
509514
// Allow simnet to run
510515
tokio::task::yield_now().await;
@@ -524,41 +529,74 @@ mod tests {
524529
#[tokio::test]
525530
async fn test_client_message_scheduled_realtime() {
526531
tokio::time::pause();
527-
start();
532+
// 1 second of latency
533+
start_with_config(LatencyConfig {
534+
inter_host: (1000, 1000),
535+
..Default::default()
536+
});
537+
528538
let controller_to_dst = SimAddr::new_with_src(
529539
"unix:@controller".parse::<ChannelAddr>().unwrap(),
530540
"unix:@dst".parse::<ChannelAddr>().unwrap(),
531541
)
532542
.unwrap();
533-
let controller_tx = sim::dial::<()>(controller_to_dst.clone()).unwrap();
543+
544+
let controller_tx = sim::dial::<MessageEnvelope>(controller_to_dst.clone()).unwrap();
534545

535546
let client_to_dst = SimAddr::new_with_client_src(
536547
"unix:@client".parse::<ChannelAddr>().unwrap(),
537548
"unix:@dst".parse::<ChannelAddr>().unwrap(),
538549
)
539550
.unwrap();
540-
let client_tx = sim::dial::<()>(client_to_dst).unwrap();
541-
542-
// 1 second of latency
543-
let simnet_config_yaml = r#"
544-
edges:
545-
- src: unix:@controller
546-
dst: unix:@dst
547-
metadata:
548-
latency: 1
549-
"#;
550-
update_config(NetworkConfig::from_yaml(simnet_config_yaml).unwrap())
551-
.await
552-
.unwrap();
551+
let client_tx = sim::dial::<MessageEnvelope>(client_to_dst).unwrap();
552+
553+
let controller = id!(world[0].controller);
554+
let dest = id!(world[1].dest);
555+
let client = id!(world[2].client);
556+
557+
let handle = simnet::simnet_handle().unwrap();
558+
let ext = extent!(region = 1, dc = 1, zone = 1, rack = 4, host = 4, gpu = 8);
559+
handle.register_proc(
560+
controller.proc_id().clone(),
561+
ext.point(vec![0, 0, 0, 0, 0, 0]).unwrap(),
562+
);
563+
handle.register_proc(
564+
client.proc_id().clone(),
565+
ext.point(vec![0, 0, 0, 0, 0, 0]).unwrap(),
566+
);
567+
handle.register_proc(
568+
dest.proc_id().clone(),
569+
ext.point(vec![0, 0, 0, 0, 1, 0]).unwrap(),
570+
);
553571

554572
assert_eq!(SimClock.millis_since_start(RealClock.now()), 0);
555573
// Fast forward real time to 5 seconds
556574
tokio::time::advance(tokio::time::Duration::from_secs(5)).await;
557575
{
558576
// Send client message
559-
client_tx.try_post((), oneshot::channel().0).unwrap();
577+
client_tx
578+
.try_post(
579+
MessageEnvelope::new(
580+
client.clone(),
581+
PortId(dest.clone(), 0),
582+
Serialized::serialize(&456).unwrap(),
583+
Attrs::new(),
584+
),
585+
oneshot::channel().0,
586+
)
587+
.unwrap();
560588
// Send system message
561-
controller_tx.try_post((), oneshot::channel().0).unwrap();
589+
controller_tx
590+
.try_post(
591+
MessageEnvelope::new(
592+
controller.clone(),
593+
PortId(dest.clone(), 0),
594+
Serialized::serialize(&456).unwrap(),
595+
Attrs::new(),
596+
),
597+
oneshot::channel().0,
598+
)
599+
.unwrap();
562600
// Allow some time for simnet to run
563601
RealClock.sleep(tokio::time::Duration::from_secs(1)).await;
564602
}

0 commit comments

Comments
 (0)