
Commit 3ac28bf

SSYernar authored and facebook-github-bot committed
Added CPU runtime benchmarking and expanded BenchmarkResult to store CPU runtimes (#3206)
Summary:
Pull Request resolved: #3206

* Added a feature to benchmark CPU runtimes alongside GPU measurements. (`time.process_time_ns()` measures the sum of system and user CPU time of the current process in nanoseconds, excluding time spent sleeping or waiting for I/O, which makes it well suited to measuring pure CPU computation time.)
* Expanded the `BenchmarkResult` class to store both device measurements.
* Adapted files that import `BenchmarkResult` to ensure compatibility.

Now CPU and GPU runtimes can be compared without running and analyzing the PyTorch Profiler. `BenchmarkResult` helps users detect whether a module/function/operator is CPU-bound or GPU-bound.

Example metrics for FBGEMM operators:

| Operator                           | CPU Runtime | GPU Runtime | GPU Memory |
|------------------------------------|-------------|-------------|------------|
| **[fallback] pytorch generic**     | 5.41 ms     | 2.66 ms     | 1.01 GB    |
| **[Prod] KeyedTensor.regroup_dup** | 2.13 ms     | 2.48 ms     | 1.01 GB    |
| **[Module] KTRegroupAsDict_dup**   | 0.14 ms     | 0.75 ms     | 1.01 GB    |
| **[2 Ops] permute_multi_embs_dup** | 0.88 ms     | 1.44 ms     | 1.01 GB    |
| **[1 Op] KT_regroup_dup**          | 0.99 ms     | 1.54 ms     | 1.01 GB    |

We can see that `[fallback] pytorch generic` is CPU-bound.

Reviewed By: aliafzal

Differential Revision: D78503319

fbshipit-source-id: d038bcb424880b258f6f765ca0c632c34d1ccc7b
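To make the `time.process_time_ns()` claim concrete, here is a minimal, self-contained sketch (not part of the commit) showing that the process-time clock advances only while the process is actually computing, not while it sleeps:

```python
import time

# Wall clock vs. active CPU clock: process_time_ns() counts user+system CPU time
# of the current process and excludes sleep / I/O waits.
wall_start_ns = time.perf_counter_ns()
cpu_start_ns = time.process_time_ns()

time.sleep(0.5)                         # idle: wall clock advances, CPU clock barely moves
_ = sum(i * i for i in range(10**6))    # busy loop: both clocks advance

wall_ms = (time.perf_counter_ns() - wall_start_ns) / 1e6
cpu_ms = (time.process_time_ns() - cpu_start_ns) / 1e6
print(f"wall: {wall_ms:.1f} ms | active CPU: {cpu_ms:.1f} ms")  # CPU time omits the ~500 ms sleep
```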
1 parent f288fa0 commit 3ac28bf

File tree

3 files changed (+87, -37 lines)

torchrec/distributed/benchmark/benchmark_utils.py

Lines changed: 82 additions & 32 deletions
@@ -135,26 +135,43 @@ def __str__(self) -> str:
 class BenchmarkResult:
     "Class for holding results of benchmark runs"
     short_name: str
-    elapsed_time: torch.Tensor  # milliseconds
+    gpu_elapsed_time: torch.Tensor  # milliseconds
+    cpu_elapsed_time: torch.Tensor  # milliseconds
     mem_stats: List[MemoryStats]  # memory stats per rank
     rank: int = -1
 
     def __str__(self) -> str:
-        runtime = f"Runtime (P90): {self.runtime_percentile(90):.2f} ms"
+        gpu_runtime = (
+            f"GPU Runtime (P90): {self.runtime_percentile(90, device='gpu'):.2f} ms"
+        )
+        cpu_runtime = (
+            f"CPU Runtime (P90): {self.runtime_percentile(90, device='cpu'):.2f} ms"
+        )
         if len(self.mem_stats) == 0:
-            return f"{self.short_name: <{35}} | {runtime}"
+            return f"{self.short_name: <{35}} | {gpu_runtime} | {cpu_runtime}"
         mem_alloc = (
             f"Peak Memory alloc (P90): {self.max_mem_alloc_percentile(90)/1000:.2f} GB"
         )
         mem_reserved = f"Peak Memory reserved (P90): {self.max_mem_reserved_percentile(90)/1000:.2f} GB"
-        malloc_retries = f"Malloc retries (P50/P90/P100): {self.mem_retries(50) } / {self.mem_retries(90)} / {self.mem_retries(100)}"
-        return f"{self.short_name: <{35}} | {malloc_retries} | {runtime} | {mem_alloc} | {mem_reserved}"
+        malloc_retries = f"Malloc retries (P50/P90/P100): {self.mem_retries(50)} / {self.mem_retries(90)} / {self.mem_retries(100)}"
+        return f"{self.short_name: <{35}} | {malloc_retries} | {gpu_runtime} | {cpu_runtime} | {mem_alloc} | {mem_reserved}"
 
     def runtime_percentile(
-        self, percentile: int = 50, interpolation: str = "nearest"
+        self,
+        percentile: int = 50,
+        interpolation: str = "nearest",
+        device: str = "gpu",
     ) -> torch.Tensor:
+        """Return the runtime percentile for the requested timer.
+
+        Args:
+            percentile: Percentile to compute.
+            interpolation: See ``torch.quantile``.
+            device: 'gpu' for CUDA event timings, 'cpu' for active CPU timings.
+        """
+        timings = self.gpu_elapsed_time if device == "gpu" else self.cpu_elapsed_time
         return torch.quantile(
-            self.elapsed_time,
+            timings,
             percentile / 100.0,
             interpolation=interpolation,
         )
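The new `device` argument lets callers query either timer from the same result object. A hypothetical usage sketch (not part of this commit), assuming `BenchmarkResult` and `MemoryStats` can be imported from `torchrec.distributed.benchmark.benchmark_utils` and filled with made-up timings:

```python
import torch

# Assumed import path; both classes live in (or are re-exported by) benchmark_utils.
from torchrec.distributed.benchmark.benchmark_utils import BenchmarkResult, MemoryStats

result = BenchmarkResult(
    short_name="my_op",                               # hypothetical operator name
    gpu_elapsed_time=torch.tensor([2.7, 2.6, 2.65]),  # milliseconds per iteration
    cpu_elapsed_time=torch.tensor([5.5, 5.4, 5.3]),   # milliseconds per iteration
    mem_stats=[MemoryStats(0, 0, 0, 0)],
)

gpu_p90 = result.runtime_percentile(90, device="gpu")
cpu_p90 = result.runtime_percentile(90, device="cpu")
if cpu_p90 > gpu_p90:
    print(f"{result.short_name} looks CPU-bound ({cpu_p90:.2f} ms CPU vs {gpu_p90:.2f} ms GPU)")
print(result)  # __str__ now reports both GPU and CPU P90 runtimes
```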
@@ -409,17 +426,26 @@ def write_report(
     num_requests: int,
 ) -> None:
     for benchmark_res in benchmark_results:
-        avg_dur_s = benchmark_res.elapsed_time.mean().item() * 1e-3  # time in seconds
-        std_dur_s = benchmark_res.elapsed_time.std().item() * 1e-3  # time in seconds
+        # GPU statistics
+        avg_dur_s_gpu = benchmark_res.gpu_elapsed_time.mean().item() * 1e-3  # sec
+        std_dur_s_gpu = benchmark_res.gpu_elapsed_time.std().item() * 1e-3  # sec
+
+        # CPU statistics
+        avg_dur_s_cpu = benchmark_res.cpu_elapsed_time.mean().item() * 1e-3  # sec
+        std_dur_s_cpu = benchmark_res.cpu_elapsed_time.std().item() * 1e-3  # sec
 
-        qps = int(num_requests / avg_dur_s)
+        qps_gpu = int(num_requests / avg_dur_s_gpu)
 
         mem_str = ""
         for memory_stats in benchmark_res.mem_stats:
             mem_str += f"{memory_stats}\n"
 
-        report_str += f"{benchmark_res.short_name:40} Avg QPS:{qps:10} Avg Duration: {int(1000*avg_dur_s):5}"
-        report_str += f"ms Standard Dev Duration: {(1000*std_dur_s):.2f}ms\n"
+        report_str += (
+            f"{benchmark_res.short_name:40} "
+            f"Avg QPS(GPU):{qps_gpu:10} "
+            f"GPU Avg: {int(1000*avg_dur_s_gpu):5}ms ±{(1000*std_dur_s_gpu):.2f}ms "
+            f"CPU Avg: {int(1000*avg_dur_s_cpu):5}ms ±{(1000*std_dur_s_cpu):.2f}ms\n"
+        )
         report_str += f"\tMemory Allocated Per Rank:\n\t{mem_str}\n"
 
     with open(report_file, "w") as f:
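For reference, the report line above boils down to simple arithmetic over the per-iteration tensors. A small illustration with invented numbers (variable names mirror the diff; values and the operator name are made up):

```python
import torch

num_requests = 8192                          # hypothetical request count per iteration
gpu_ms = torch.tensor([2.70, 2.60, 2.65])    # per-iteration GPU time, milliseconds
cpu_ms = torch.tensor([5.50, 5.40, 5.30])    # per-iteration active CPU time, milliseconds

avg_dur_s_gpu = gpu_ms.mean().item() * 1e-3  # 0.00265 s
std_dur_s_gpu = gpu_ms.std().item() * 1e-3
avg_dur_s_cpu = cpu_ms.mean().item() * 1e-3
std_dur_s_cpu = cpu_ms.std().item() * 1e-3
qps_gpu = int(num_requests / avg_dur_s_gpu)  # ~3.09 million requests/s

print(
    f"{'my_op':40} Avg QPS(GPU):{qps_gpu:10} "
    f"GPU Avg: {int(1000 * avg_dur_s_gpu):5}ms ±{1000 * std_dur_s_gpu:.2f}ms "
    f"CPU Avg: {int(1000 * avg_dur_s_cpu):5}ms ±{1000 * std_dur_s_cpu:.2f}ms"
)
```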
@@ -731,44 +757,63 @@ def _run_benchmark_core(
         if reset_accumulated_memory_stats:
             torch.cuda.reset_accumulated_memory_stats(rank)
 
-    # Optional allocator warm-up to create fragmentation similar to production
-    if pre_gpu_load and device_type == "cuda":
-        _tmp = torch.rand(16384, 16384, device="cuda")
-        for _ in range(pre_gpu_load):
-            _tmp = _tmp * torch.rand(16384, 16384, device="cuda")
+        # Optional allocator warm-up to create fragmentation similar to production
+        if pre_gpu_load:
+            _tmp = torch.rand(16384, 16384, device="cuda")
+            for _ in range(pre_gpu_load):
+                _tmp = _tmp * torch.rand(16384, 16384, device="cuda")
 
-    # Timing loop
+    # Timings
     start_events, end_events, times = [], [], []
+
     if device_type == "cuda":
         start_events = [
             torch.cuda.Event(enable_timing=True) for _ in range(num_benchmarks)
         ]
         end_events = [
             torch.cuda.Event(enable_timing=True) for _ in range(num_benchmarks)
         ]
+        # Capture per-iteration active CPU cycles (excludes time the thread is truly idle/asleep) using `process_time_ns`.
+        cpu_times_active_ns: List[int] = []
+
         for i in range(num_benchmarks):
+            # Ensure that outstanding GPU work from the previous iteration has
+            # finished so that we do not attribute its wait time to the next
+            # CPU measurement.
+            if i > 0:
+                torch.cuda.synchronize(rank if rank >= 0 else 0)
+
             start_events[i].record()
+            cpu_start_active_ns = time.process_time_ns()
+
             run_iter_fn()
+
+            cpu_end_active_ns = time.process_time_ns()
             end_events[i].record()
-    else:
-        times = timeit.repeat(run_iter_fn, number=1, repeat=num_benchmarks)
+            cpu_times_active_ns.append(cpu_end_active_ns - cpu_start_active_ns)
 
-    # Make sure all kernels are finished before reading timers / stats
-    if device_type == "cuda":
+        # Convert to milliseconds and drop the first iteration
+        cpu_elapsed_time = torch.tensor(
+            [t / 1e6 for t in cpu_times_active_ns[1:]], dtype=torch.float
+        )
+
+        # Make sure all kernels are finished before reading timers / stats
         if rank == -1:
             for di in range(world_size):
                 torch.cuda.synchronize(di)
         else:
             torch.cuda.synchronize(rank)
 
-    # First Benchmark Run for Eager Mode produces outlier
-    # Start counting after first as workaround for standard deviation
-    if device_type == "cuda":
-        elapsed_time = torch.tensor(
+        gpu_elapsed_time = torch.tensor(
             [s.elapsed_time(e) for s, e in zip(start_events[1:], end_events[1:])]
         )
     else:
-        elapsed_time = torch.tensor(times) * 1e3  # convert seconds ➜ milliseconds
+        # For CPU-only benchmarks we fall back to wall-clock timing via ``timeit``.
+        times = timeit.repeat(run_iter_fn, number=1, repeat=num_benchmarks)
+        cpu_elapsed_time = torch.tensor(times) * 1e3  # convert to ms
+
+        # mirror CPU timings for overall consistency
+        gpu_elapsed_time = cpu_elapsed_time.clone()
 
     # Memory statistics collection
     mem_stats: List[MemoryStats] = []
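Distilled outside the torchrec harness, the measurement pattern added above looks roughly like the sketch below. `time_fn` is a hypothetical helper, not part of the commit, and it requires a CUDA device:

```python
import time
from typing import Callable, List, Tuple

import torch


def time_fn(fn: Callable[[], object], num_benchmarks: int = 10) -> Tuple[torch.Tensor, torch.Tensor]:
    start_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_benchmarks)]
    end_events = [torch.cuda.Event(enable_timing=True) for _ in range(num_benchmarks)]
    cpu_times_ns: List[int] = []

    for i in range(num_benchmarks):
        if i > 0:
            # Drain the previous iteration so its GPU tail is not charged to the next CPU sample.
            torch.cuda.synchronize()
        start_events[i].record()
        cpu_start = time.process_time_ns()
        fn()
        cpu_end = time.process_time_ns()
        end_events[i].record()
        cpu_times_ns.append(cpu_end - cpu_start)

    torch.cuda.synchronize()
    # Drop the first (warm-up) iteration, as the diff does.
    gpu_ms = torch.tensor(
        [s.elapsed_time(e) for s, e in zip(start_events[1:], end_events[1:])]
    )
    cpu_ms = torch.tensor([t / 1e6 for t in cpu_times_ns[1:]], dtype=torch.float)
    return gpu_ms, cpu_ms


if torch.cuda.is_available():
    x = torch.rand(4096, 4096, device="cuda")
    gpu_ms, cpu_ms = time_fn(lambda: x @ x)
    print(f"GPU mean: {gpu_ms.mean():.2f} ms | active CPU mean: {cpu_ms.mean():.2f} ms")
```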
@@ -820,7 +865,11 @@ def _trace_handler(prof: torch.profiler.profile) -> None:
         torch.cuda.synchronize(rank)
 
     return BenchmarkResult(
-        short_name=name, elapsed_time=elapsed_time, mem_stats=mem_stats, rank=rank
+        short_name=name,
+        gpu_elapsed_time=gpu_elapsed_time,
+        cpu_elapsed_time=cpu_elapsed_time,
+        mem_stats=mem_stats,
+        rank=rank,
     )
 
 
@@ -1095,10 +1144,11 @@ def setUp() -> None:
         assert 0 == p.exitcode
 
     total_benchmark_res = BenchmarkResult(
-        benchmark_res_per_rank[0].short_name,
-        benchmark_res_per_rank[0].elapsed_time,
-        [MemoryStats(rank, 0, 0, 0) for rank in range(world_size)],
-        0,
+        short_name=benchmark_res_per_rank[0].short_name,
+        gpu_elapsed_time=benchmark_res_per_rank[0].gpu_elapsed_time,
+        cpu_elapsed_time=benchmark_res_per_rank[0].cpu_elapsed_time,
+        mem_stats=[MemoryStats(rank, 0, 0, 0) for rank in range(world_size)],
+        rank=0,
     )
 
     for res in benchmark_res_per_rank:

torchrec/sparse/tests/jagged_tensor_benchmark.py

Lines changed: 3 additions & 4 deletions
@@ -107,14 +107,13 @@ def wrapped_func(
     )
     result = BenchmarkResult(
         short_name=name,
-        elapsed_time=torch.tensor(times) * 1e3,
+        gpu_elapsed_time=torch.tensor(times) * 1e3,
+        cpu_elapsed_time=torch.tensor(times) * 1e3,
         mem_stats=[MemoryStats(0, 0, 0, 0)],
     )
 
-    mem_alloc = f"Memory alloc (P90): {result.max_mem_alloc_percentile(90):5.1f}"
-    mem_reserved = f"Memory alloc (P90): {result.max_mem_reserved_percentile(90):5.1f}"
     print(
-        f" {name : <{30}} | B: {batch_size : <{8}} | F: {feature_count : <{8}} | device: {device_type : <{8}} | Runtime (P90): {result.runtime_percentile(90):5.2f} ms | {mem_alloc} | {mem_reserved}"
+        f"B: {batch_size : <{8}} | F: {feature_count : <{8}} | device: {device_type : <{8}} | {result}"
     )
 

torchrec/sparse/tests/keyed_jagged_tensor_benchmark_lib.py

Lines changed: 2 additions & 1 deletion
@@ -227,7 +227,8 @@ def benchmark_kjt(
 
     result = BenchmarkResult(
         short_name=f"{test_name}-{transform_type.name}",
-        elapsed_time=torch.tensor(times),
+        gpu_elapsed_time=torch.tensor(times),
+        cpu_elapsed_time=torch.tensor(times),
         mem_stats=[MemoryStats(0, 0, 0, 0)],
     )
