Skip to content

Commit ea2f50e

Browse files
pablorfb-meta and facebook-github-bot
authored and committed
Variable message size cast to actor mesh (#763)
Summary:
Pull Request resolved: #763
X-link: #763

Benchmark how long it takes to send a message of size X to an actor mesh of 1 and 10 actors on local transport.

The benchmark shows that throughput scales with message size from 10 KB up to 1 MB. Beyond that, the system seems to have reached its maximum throughput (bottleneck TBD):
- ~500 MiB/s for 1 actor
- ~2 GiB/s for 10 actors

| Benchmark | Time [Min, Median, Max] | Throughput [Min, Median, Max] | Throughput Change % |
|--------------------------|----------------------------------|-----------------------------------------|--------------------|
| actors/1/size/10kb | [1.1249 ms, 1.1324 ms, 1.1401 ms] | [8.3649 MiB/s, 8.4220 MiB/s, 8.4779 MiB/s] | N/A |
| actors/1/size/100kb | [1.1926 ms, 1.1984 ms, 1.2043 ms] | [79.187 MiB/s, 79.577 MiB/s, 79.969 MiB/s] | +845.3% |
| actors/1/size/1mb | [1.7636 ms, 1.7798 ms, 1.7979 ms] | [530.44 MiB/s, 535.84 MiB/s, 540.77 MiB/s] | +573.6% |
| actors/1/size/10mb | [18.867 ms, 19.286 ms, 19.640 ms] | [485.57 MiB/s, 494.50 MiB/s, 505.48 MiB/s] | -7.7% |
| actors/1/size/100mb | [180.03 ms, 183.91 ms, 187.25 ms] | [509.29 MiB/s, 518.56 MiB/s, 529.73 MiB/s] | +4.9% |
| actors/1/size/1gb | [1.7496 s, 1.7891 s, 1.8314 s] | [520.73 MiB/s, 533.05 MiB/s, 545.10 MiB/s] | +2.8% |
| actors/10/size/10kb | [1.2557 ms, 1.2662 ms, 1.2758 ms] | [74.751 MiB/s, 75.318 MiB/s, 75.948 MiB/s] | N/A |
| actors/10/size/100kb | [1.3862 ms, 1.4058 ms, 1.4244 ms] | [669.55 MiB/s, 678.38 MiB/s, 688.00 MiB/s] | +801.1% |
| actors/10/size/1mb | [3.3235 ms, 3.3544 ms, 3.3854 ms] | [2.7510 GiB/s, 2.7764 GiB/s, 2.8022 GiB/s] | +309.3% |
| actors/10/size/10mb | [41.557 ms, 42.532 ms, 43.215 ms] | [2.1551 GiB/s, 2.1897 GiB/s, 2.2411 GiB/s] | -21.1% |
| actors/10/size/100mb | [427.53 ms, 443.07 ms, 460.93 ms] | [2.0205 GiB/s, 2.1020 GiB/s, 2.1784 GiB/s] | -4.0% |
| actors/10/size/1gb | [4.4195 s, 4.6372 s, 4.8398 s] | [1.9243 GiB/s, 2.0084 GiB/s, 2.1073 GiB/s] | -4.5% |

Comparison to the raw channel benchmark (https://fburl.com/code/gyryo1mc) seems to indicate that there is a bottleneck on the actor cast path capping throughput.

| Benchmark | Time [Min, Median, Max] | Throughput [Min, Median, Max] | Throughput Change % |
|------------------------------|------------------------------------|-----------------------------------------|---------------------|
| send_receive/local/10000 | [511.62 ns, 514.23 ns, 517.90 ns] | [17.983 GiB/s, 18.111 GiB/s, 18.203 GiB/s] | N/A |
| send_receive/local/1000000 | [131.12 µs, 132.97 µs, 134.57 µs] | [6.9205 GiB/s, 7.0040 GiB/s, 7.1029 GiB/s] | -61.3% |
| send_receive/local/10000000 | [3.0627 ms, 3.0889 ms, 3.1061 ms] | [2.9984 GiB/s, 3.0150 GiB/s, 3.0408 GiB/s] | -56.9% |
| send_receive/local/100000000 | [27.317 ms, 28.661 ms, 29.659 ms] | [3.1400 GiB/s, 3.2494 GiB/s, 3.4094 GiB/s] | +7.8% |
| send_receive/local/1000000000 | [244.17 ms, 247.15 ms, 250.02 ms] | [3.7250 GiB/s, 3.7682 GiB/s, 3.8142 GiB/s] | +16.0% |

Reviewed By: pzhan9
Differential Revision: D79577855
fbshipit-source-id: 32f9a0c7fc1ee67b8fceaa0d6ef50211c4044694
1 parent 67fc122 commit ea2f50e

File tree

2 files changed

+103
-10
lines changed

2 files changed

+103
-10
lines changed

hyperactor_mesh/benches/bench_actor.rs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -36,14 +36,17 @@ pub struct BenchMessage {
3636
BenchMessage { cast = true },
3737
],
3838
)]
39-
pub struct BenchActor {}
39+
pub struct BenchActor {
40+
processing_time: Duration,
41+
}
4042

4143
#[async_trait]
4244
impl Actor for BenchActor {
43-
type Params = ();
44-
45-
async fn new(_: Self::Params) -> Result<Self, anyhow::Error> {
46-
Ok(Self {})
45+
type Params = Duration;
46+
async fn new(params: Duration) -> Result<Self, anyhow::Error> {
47+
Ok(Self {
48+
processing_time: params,
49+
})
4750
}
4851
}
4952

@@ -55,7 +58,7 @@ impl Handler<BenchMessage> for BenchActor {
5558
msg: BenchMessage,
5659
) -> Result<(), anyhow::Error> {
5760
hyperactor::clock::ClockKind::default()
58-
.sleep(Duration::from_millis(100))
61+
.sleep(self.processing_time.clone())
5962
.await;
6063

6164
let _ = msg.reply.send(ctx, msg.step);

hyperactor_mesh/benches/main.rs

Lines changed: 94 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use std::time::Instant;
1010

1111
use criterion::BenchmarkId;
1212
use criterion::Criterion;
13+
use criterion::Throughput;
1314
use criterion::criterion_group;
1415
use criterion::criterion_main;
1516
use hyperactor_mesh::ProcMesh;
@@ -21,6 +22,8 @@ use hyperactor_mesh::alloc::LocalAllocator;
2122
use hyperactor_mesh::extent;
2223
use hyperactor_mesh::selection::dsl::all;
2324
use hyperactor_mesh::selection::dsl::true_;
25+
use hyperactor_mesh::shape;
26+
use tokio::time::Duration;
2427

2528
mod bench_actor;
2629
use bench_actor::BenchActor;
@@ -48,16 +51,18 @@ fn bench_actor_scaling(c: &mut Criterion) {
4851
.unwrap();
4952

5053
let proc_mesh = ProcMesh::allocate(alloc).await.unwrap();
51-
let trainer_mesh: RootActorMesh<BenchActor> =
52-
proc_mesh.spawn("trainer", &()).await.unwrap();
54+
let actor_mesh: RootActorMesh<BenchActor> = proc_mesh
55+
.spawn("bench", &(Duration::from_millis(0)))
56+
.await
57+
.unwrap();
5358
let client = proc_mesh.client();
5459

5560
let start = Instant::now();
5661
for i in 0..iters {
5762
let (tx, mut rx) = client.open_port();
5863
let payload = vec![0u8; message_size];
5964

60-
trainer_mesh
65+
actor_mesh
6166
.cast(
6267
all(true_()),
6368
BenchMessage {
@@ -83,5 +88,90 @@ fn bench_actor_scaling(c: &mut Criterion) {
8388
group.finish();
8489
}
8590

86-
criterion_group!(benches, bench_actor_scaling);
91+
/// Formats a byte count as a compact label using decimal units (`B`, `KB`, `MB`, `GB`).
///
/// The label is used as part of a benchmark id, so it must be lossless: two different
/// sizes must never produce the same label. The largest unit that divides `size`
/// exactly is chosen; a size that is not a whole multiple of any larger unit is
/// printed in the next smaller unit that represents it exactly (ultimately bytes).
///
/// For example `10_000` becomes `"10KB"`, `1_000_000_000` becomes `"1GB"`, and
/// `1_500` becomes `"1500B"` rather than a truncated `"1KB"`.
fn format_size(size: usize) -> String {
    // Ordered from largest to smallest so the first exact match is the most compact.
    const UNITS: [(usize, &str); 3] = [
        (1_000_000_000, "GB"),
        (1_000_000, "MB"),
        (1_000, "KB"),
    ];

    UNITS
        .iter()
        // `size >= unit` keeps zero from matching a unit, so it is reported in bytes.
        .find(|&&(unit, _)| size >= unit && size % unit == 0)
        .map_or_else(
            || format!("{}B", size),
            |&(unit, suffix)| format!("{}{}", size / unit, suffix),
        )
}
102+
103+
// Benchmark how long it takes to send a message of size X to an actor mesh of 10 actors
104+
fn bench_actor_mesh_message_sizes(c: &mut Criterion) {
105+
let mut group = c.benchmark_group("actor_mesh_message_sizes");
106+
group.sample_size(10);
107+
let actor_counts = vec![1, 10];
108+
let message_sizes: Vec<usize> = vec![
109+
10_000,
110+
100_000,
111+
1_000_000,
112+
10_000_000,
113+
100_000_000,
114+
1_000_000_000,
115+
];
116+
117+
for message_size in message_sizes {
118+
for &actor_count in &actor_counts {
119+
group.throughput(Throughput::Bytes((message_size * actor_count) as u64));
120+
group.sampling_mode(criterion::SamplingMode::Flat);
121+
group.sample_size(10);
122+
group.bench_function(
123+
format!("actors/{}/size/{}", actor_count, format_size(message_size)),
124+
|b| {
125+
let mut b = b.to_async(Runtime::new().unwrap());
126+
b.iter_custom(|iters| async move {
127+
let alloc = LocalAllocator
128+
.allocate(AllocSpec {
129+
extent: extent!(gpus = actor_count),
130+
constraints: Default::default(),
131+
})
132+
.await
133+
.unwrap();
134+
135+
let proc_mesh = ProcMesh::allocate(alloc).await.unwrap();
136+
let actor_mesh: RootActorMesh<BenchActor> = proc_mesh
137+
.spawn("bench", &(Duration::from_millis(0)))
138+
.await
139+
.unwrap();
140+
141+
let client = proc_mesh.client();
142+
143+
let start = Instant::now();
144+
for i in 0..iters {
145+
let (tx, mut rx) = client.open_port();
146+
let payload = vec![0u8; message_size];
147+
148+
actor_mesh
149+
.cast(
150+
all(true_()),
151+
BenchMessage {
152+
step: i as usize,
153+
reply: tx.bind(),
154+
payload,
155+
},
156+
)
157+
.unwrap();
158+
159+
let mut msg_rcv = 0;
160+
while msg_rcv < actor_count {
161+
let _ = rx.recv().await.unwrap();
162+
msg_rcv += 1;
163+
}
164+
}
165+
166+
start.elapsed()
167+
});
168+
},
169+
);
170+
}
171+
}
172+
173+
group.finish();
174+
}
175+
176+
criterion_group!(benches, bench_actor_scaling, bench_actor_mesh_message_sizes);
87177
criterion_main!(benches);

0 commit comments

Comments
 (0)