Skip to content

Commit 7f363d0

Browse files
pablorfb-metafacebook-github-bot
authored and committed
Python actor throughput (meta-pytorch#830)
Summary: For an unknown reason (needs deeper dive), small payloads consistently perform better in Python compared to Rust (D79577855), eventhough the messaging mechanism is powered by port handles and receivers on both implementations. Once payload size increases, Rust outperforms Python (as expected) Additionally, Python cannot reliable cast 1Gb of data during the bechmakr without throwing a bunch of errors, so it is excluded from benchmark for now. | Benchmark | Time [Min, Median, Max] (ms) | Python Throughput [MiB/s] | Rust Throughput [MiB/s] | Throughput Change % | |---------------------|------------------------------|--------------------------|---------------------------|-------------------------------| | hosts/1/size/10kb | [0, 0, 4] | 17 | 8.4220 | +101.8% | | hosts/1/size/100kb | [0, 0, 4] | 140 | 79.577 | +75.9% | | hosts/1/size/1mb | [1, 2, 9] | 433 | 535.84 | -19.2% | | hosts/1/size/10mb | [18, 20, 33] | 466 | 494.50 | -5.8% | | hosts/1/size/100mb | [202, 223, 318] | 443 | 518.56 | -14.6% | | hosts/10/size/10kb | [1, 1, 59] | 67 | 75.318 | -11.1% | | hosts/10/size/100kb | [1, 1, 59] | 537 | 678.38 | -20.9% | | hosts/10/size/1mb | [4, 6, 70] | 1451 | 2843.14 | -49.0% | | hosts/10/size/10mb | [52, 61, 145] | 1575 | 2243.11 | -29.8% | | hosts/10/size/100mb | [677, 720, 905] | 1353 | 2152.97 | -37.1% | Reviewed By: pzhan9 Differential Revision: D80100828
1 parent a9cd5a2 commit 7f363d0

File tree

1 file changed

+67
-7
lines changed

1 file changed

+67
-7
lines changed

python/benches/actor_mesh_benchmark.py

Lines changed: 67 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import time
1616
from typing import Any, Dict
1717

18+
import humanfriendly
19+
1820
from monarch.actor import Actor, endpoint, proc_mesh
1921

2022
from windtunnel.benchmarks.python_benchmark_runner.benchmark import (
@@ -29,15 +31,18 @@
2931

3032
class SleepActor(Actor):
3133
@endpoint
32-
async def sleep(self, sleep_secs: float, _: bytes) -> None:
34+
async def sleep(self, sleep_secs: float, _: bytes) -> int:
3335
await asyncio.sleep(sleep_secs)
3436

37+
return 1
38+
3539

3640
async def run_actor_scaling_benchmark(
3741
actor_mesh: Any,
42+
actor_count: int,
3843
message_size: int,
39-
duration_seconds: int = 10,
40-
sleep_secs: float = 0.1,
44+
duration_seconds: int,
45+
sleep_secs: float,
4146
) -> Dict[str, float]:
4247
"""
4348
Run a benchmark with a specific number of actors and message size.
@@ -57,16 +62,19 @@ async def run_actor_scaling_benchmark(
5762

5863
while time.time() - start_benchmark < duration_seconds:
5964
start_time = time.time()
60-
await actor_mesh.sleep.call(sleep_secs, payload)
65+
val_mesh = await actor_mesh.sleep.call(sleep_secs, payload)
6166
elapsed_time = time.time() - start_time
6267
times.append(elapsed_time)
68+
69+
val = sum([val[1] for val in val_mesh.items()])
70+
assert val == actor_count, f"Expected {actor_count} responses, got {val}"
6371
iteration_count += 1
6472

6573
if iteration_count == 0:
6674
raise ValueError("No iterations completed")
6775

6876
times_ms = [t * 1000 for t in times]
69-
avg_time_ms = sum(times_ms) / iteration_count
77+
avg_time_ms = sum(times_ms) / (iteration_count * 1.0)
7078
sorted_times = sorted(times_ms)
7179
median_time_ms = (
7280
sorted_times[iteration_count // 2]
@@ -82,7 +90,8 @@ async def run_actor_scaling_benchmark(
8290
"median_time_ms": median_time_ms,
8391
"min_time_ms": min(times_ms),
8492
"max_time_ms": max(times_ms),
85-
"throughput_mbps": (message_size * 8) / (avg_time_ms / 1000) / 1_000_000,
93+
"throughput_mBps": (message_size * actor_count * (1000.0 / avg_time_ms))
94+
/ 1_000_000,
8695
"iterations": iteration_count,
8796
}
8897

@@ -106,7 +115,11 @@ async def bench_actor_scaling(counters: UserCounters) -> None:
106115
await asyncio.sleep(1)
107116

108117
stats = await run_actor_scaling_benchmark(
109-
actor_mesh, message_size, duration_seconds, sleep_secs=0.1
118+
actor_mesh,
119+
host_count * 8,
120+
message_size,
121+
duration_seconds,
122+
sleep_secs=0.1,
110123
)
111124
await mesh.stop()
112125

@@ -121,5 +134,52 @@ async def bench_actor_scaling(counters: UserCounters) -> None:
121134
)
122135

123136

137+
@register_benchmark(FILE_PATH, use_counters=True)
138+
async def bench_message_scaling(counters: UserCounters) -> None:
139+
"""
140+
Benchmark how long it takes to process messages of different sizes on different numbers of actors.
141+
Reports average, median, min, max times, throughput in Mbps, and number of iterations completed.
142+
"""
143+
gpu_counts = [1, 10]
144+
KB = 1000
145+
MB = 1000 * KB
146+
message_sizes = [10 * KB, 100 * KB, 1 * MB, 10 * MB, 100 * MB]
147+
duration_seconds = 5
148+
149+
for gpus in gpu_counts:
150+
for message_size in message_sizes:
151+
if gpus >= 20 and message_size >= 100 * MB:
152+
continue
153+
print(f"Testing host_count: {gpus}, message_size: {message_size}")
154+
mesh = await proc_mesh(gpus=gpus)
155+
await mesh.logging_option(stream_to_client=True, aggregate_window_sec=None)
156+
actor_mesh = await mesh.spawn("actor", SleepActor)
157+
# Allow Actor init to finish
158+
await asyncio.sleep(1)
159+
160+
stats = await run_actor_scaling_benchmark(
161+
actor_mesh,
162+
gpus,
163+
message_size,
164+
duration_seconds,
165+
sleep_secs=0.0,
166+
)
167+
await mesh.stop()
168+
169+
size = humanfriendly.format_size(message_size)
170+
counters[f"hosts_{gpus}_size_{size}_median_ms"] = UserMetric(
171+
value=int(stats["median_time_ms"])
172+
)
173+
counters[f"hosts_{gpus}_size_{size}_min_ms"] = UserMetric(
174+
value=int(stats["min_time_ms"])
175+
)
176+
counters[f"hosts_{gpus}_size_{size}_max_ms"] = UserMetric(
177+
value=int(stats["max_time_ms"])
178+
)
179+
counters[f"hosts_{gpus}_size_{size}_throughput_mBps"] = UserMetric(
180+
value=int(stats["throughput_mBps"])
181+
)
182+
183+
124184
if __name__ == "__main__":
125185
asyncio.run(main())

0 commit comments

Comments
 (0)