Skip to content

Commit 7f363d0

Browse files
pablorfb-metafacebook-github-bot
authored and committed
Python actor throughput (meta-pytorch#830)
Summary: For an unknown reason (needs deeper dive), small payloads consistently perform better in Python compared to Rust (D79577855), eventhough the messaging mechanism is powered by port handles and receivers on both implementations. Once payload size increases, Rust outperforms Python (as expected) Additionally, Python cannot reliable cast 1Gb of data during the bechmakr without throwing a bunch of errors, so it is excluded from benchmark for now. | Benchmark | Time [Min, Median, Max] (ms) | Python Throughput [MiB/s] | Rust Throughput [MiB/s] | Throughput Change % | |---------------------|------------------------------|--------------------------|---------------------------|-------------------------------| | hosts/1/size/10kb | [0, 0, 4] | 17 | 8.4220 | +101.8% | | hosts/1/size/100kb | [0, 0, 4] | 140 | 79.577 | +75.9% | | hosts/1/size/1mb | [1, 2, 9] | 433 | 535.84 | -19.2% | | hosts/1/size/10mb | [18, 20, 33] | 466 | 494.50 | -5.8% | | hosts/1/size/100mb | [202, 223, 318] | 443 | 518.56 | -14.6% | | hosts/10/size/10kb | [1, 1, 59] | 67 | 75.318 | -11.1% | | hosts/10/size/100kb | [1, 1, 59] | 537 | 678.38 | -20.9% | | hosts/10/size/1mb | [4, 6, 70] | 1451 | 2843.14 | -49.0% | | hosts/10/size/10mb | [52, 61, 145] | 1575 | 2243.11 | -29.8% | | hosts/10/size/100mb | [677, 720, 905] | 1353 | 2152.97 | -37.1% | Reviewed By: pzhan9 Differential Revision: D80100828
1 parent a9cd5a2 commit 7f363d0

File tree

1 file changed

+67
-7
lines changed

1 file changed

+67
-7
lines changed

python/benches/actor_mesh_benchmark.py

Lines changed: 67 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515
import time
1616
from typing import Any, Dict
1717

18+
import humanfriendly
19+
1820
from monarch.actor import Actor, endpoint, proc_mesh
1921

2022
from windtunnel.benchmarks.python_benchmark_runner.benchmark import (
@@ -29,15 +31,18 @@
2931

3032
class SleepActor(Actor):
3133
@endpoint
32-
async def sleep(self, sleep_secs: float, _: bytes) -> None:
34+
async def sleep(self, sleep_secs: float, _: bytes) -> int:
3335
await asyncio.sleep(sleep_secs)
3436

37+
return 1
38+
3539

3640
async def run_actor_scaling_benchmark(
3741
actor_mesh: Any,
42+
actor_count: int,
3843
message_size: int,
39-
duration_seconds: int = 10,
40-
sleep_secs: float = 0.1,
44+
duration_seconds: int,
45+
sleep_secs: float,
4146
) -> Dict[str, float]:
4247
"""
4348
Run a benchmark with a specific number of actors and message size.
@@ -57,16 +62,19 @@ async def run_actor_scaling_benchmark(
5762

5863
while time.time() - start_benchmark < duration_seconds:
5964
start_time = time.time()
60-
await actor_mesh.sleep.call(sleep_secs, payload)
65+
val_mesh = await actor_mesh.sleep.call(sleep_secs, payload)
6166
elapsed_time = time.time() - start_time
6267
times.append(elapsed_time)
68+
69+
val = sum([val[1] for val in val_mesh.items()])
70+
assert val == actor_count, f"Expected {actor_count} responses, got {val}"
6371
iteration_count += 1
6472

6573
if iteration_count == 0:
6674
raise ValueError("No iterations completed")
6775

6876
times_ms = [t * 1000 for t in times]
69-
avg_time_ms = sum(times_ms) / iteration_count
77+
avg_time_ms = sum(times_ms) / (iteration_count * 1.0)
7078
sorted_times = sorted(times_ms)
7179
median_time_ms = (
7280
sorted_times[iteration_count // 2]
@@ -82,7 +90,8 @@ async def run_actor_scaling_benchmark(
8290
"median_time_ms": median_time_ms,
8391
"min_time_ms": min(times_ms),
8492
"max_time_ms": max(times_ms),
85-
"throughput_mbps": (message_size * 8) / (avg_time_ms / 1000) / 1_000_000,
93+
"throughput_mBps": (message_size * actor_count * (1000.0 / avg_time_ms))
94+
/ 1_000_000,
8695
"iterations": iteration_count,
8796
}
8897

@@ -106,7 +115,11 @@ async def bench_actor_scaling(counters: UserCounters) -> None:
106115
await asyncio.sleep(1)
107116

108117
stats = await run_actor_scaling_benchmark(
109-
actor_mesh, message_size, duration_seconds, sleep_secs=0.1
118+
actor_mesh,
119+
host_count * 8,
120+
message_size,
121+
duration_seconds,
122+
sleep_secs=0.1,
110123
)
111124
await mesh.stop()
112125

@@ -121,5 +134,52 @@ async def bench_actor_scaling(counters: UserCounters) -> None:
121134
)
122135

123136

137+
@register_benchmark(FILE_PATH, use_counters=True)
138+
async def bench_message_scaling(counters: UserCounters) -> None:
139+
"""
140+
Benchmark how long it takes to process messages of different sizes on different numbers of actors.
141+
Reports average, median, min, max times, throughput in Mbps, and number of iterations completed.
142+
"""
143+
gpu_counts = [1, 10]
144+
KB = 1000
145+
MB = 1000 * KB
146+
message_sizes = [10 * KB, 100 * KB, 1 * MB, 10 * MB, 100 * MB]
147+
duration_seconds = 5
148+
149+
for gpus in gpu_counts:
150+
for message_size in message_sizes:
151+
if gpus >= 20 and message_size >= 100 * MB:
152+
continue
153+
print(f"Testing host_count: {gpus}, message_size: {message_size}")
154+
mesh = await proc_mesh(gpus=gpus)
155+
await mesh.logging_option(stream_to_client=True, aggregate_window_sec=None)
156+
actor_mesh = await mesh.spawn("actor", SleepActor)
157+
# Allow Actor init to finish
158+
await asyncio.sleep(1)
159+
160+
stats = await run_actor_scaling_benchmark(
161+
actor_mesh,
162+
gpus,
163+
message_size,
164+
duration_seconds,
165+
sleep_secs=0.0,
166+
)
167+
await mesh.stop()
168+
169+
size = humanfriendly.format_size(message_size)
170+
counters[f"hosts_{gpus}_size_{size}_median_ms"] = UserMetric(
171+
value=int(stats["median_time_ms"])
172+
)
173+
counters[f"hosts_{gpus}_size_{size}_min_ms"] = UserMetric(
174+
value=int(stats["min_time_ms"])
175+
)
176+
counters[f"hosts_{gpus}_size_{size}_max_ms"] = UserMetric(
177+
value=int(stats["max_time_ms"])
178+
)
179+
counters[f"hosts_{gpus}_size_{size}_throughput_mBps"] = UserMetric(
180+
value=int(stats["throughput_mBps"])
181+
)
182+
183+
124184
if __name__ == "__main__":
125185
asyncio.run(main())

0 commit comments

Comments
 (0)