
Commit fc56fb0

SSYernar authored and facebook-github-bot committed
Stack pipeline benchmark per-rank-results into a single BenchmarkResult (#3258)
Summary:
Pull Request resolved: #3258

Stack pipeline benchmark per-rank results into a single BenchmarkResult to measure overall performance across GPUs. This will cause an 'intentional' **2x regression** for allocated-memory metrics, since we now accumulate total memory stats across the two GPUs instead of measuring only one. After the change lands, I will **re-register** the ServiceLab task with the new baseline metrics to avoid future false regression warnings.

Reviewed By: aliafzal

Differential Revision: D79537357

fbshipit-source-id: 8b2e850532189ff9db584e2c19e1fca55f3963a9
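To make the expected shift concrete, here is a minimal sketch of the intentional 2x effect (illustrative numbers and variable names only, not taken from this diff): with world_size = 2 the stacked result carries one memory measurement per rank, so a downstream metric that sums allocated memory across those entries now counts both GPUs instead of one.

# Hypothetical illustration, assuming roughly 8 GB allocated per GPU.
per_rank_allocated_mb = [8_192, 8_200]      # one entry per rank after stacking
old_metric = per_rank_allocated_mb[0]       # previous baseline: a single rank's allocation
new_metric = sum(per_rank_allocated_mb)     # new baseline: roughly 2x the old value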
1 parent 091ec6b commit fc56fb0

File tree

1 file changed: +26, -2 lines changed


torchrec/distributed/benchmark/benchmark_train_pipeline.py

Lines changed: 26 additions & 2 deletions
@@ -41,9 +41,11 @@
     benchmark_func,
     BenchmarkResult,
     cmd_conf,
+    CPUMemoryStats,
     generate_planner,
     generate_sharded_model_and_optimizer,
     generate_tables,
+    GPUMemoryStats,
 )
 from torchrec.distributed.comm import get_local_size
 from torchrec.distributed.embedding_types import EmbeddingComputeKernel
@@ -255,15 +257,15 @@ def run_pipeline(
     table_config: EmbeddingTablesConfig,
     pipeline_config: PipelineConfig,
     model_config: BaseModelConfig,
-) -> List[BenchmarkResult]:
+) -> BenchmarkResult:
 
     tables, weighted_tables = generate_tables(
         num_unweighted_features=table_config.num_unweighted_features,
         num_weighted_features=table_config.num_weighted_features,
         embedding_feature_dim=table_config.embedding_feature_dim,
     )
 
-    return run_multi_process_func(
+    benchmark_res_per_rank = run_multi_process_func(
         func=runner,
         world_size=run_option.world_size,
         tables=tables,
@@ -273,6 +275,28 @@ def run_pipeline(
         pipeline_config=pipeline_config,
     )
 
+    # Combine results from all ranks into a single BenchmarkResult
+    # Use timing data from rank 0, combine memory stats from all ranks
+    world_size = run_option.world_size
+
+    total_benchmark_res = BenchmarkResult(
+        short_name=benchmark_res_per_rank[0].short_name,
+        gpu_elapsed_time=benchmark_res_per_rank[0].gpu_elapsed_time,
+        cpu_elapsed_time=benchmark_res_per_rank[0].cpu_elapsed_time,
+        gpu_mem_stats=[GPUMemoryStats(rank, 0, 0, 0) for rank in range(world_size)],
+        cpu_mem_stats=[CPUMemoryStats(rank, 0) for rank in range(world_size)],
+        rank=0,
+    )
+
+    for res in benchmark_res_per_rank:
+        # Each rank's BenchmarkResult contains 1 GPU and 1 CPU memory measurement
+        if len(res.gpu_mem_stats) > 0:
+            total_benchmark_res.gpu_mem_stats[res.rank] = res.gpu_mem_stats[0]
+        if len(res.cpu_mem_stats) > 0:
+            total_benchmark_res.cpu_mem_stats[res.rank] = res.cpu_mem_stats[0]
+
+    return total_benchmark_res
+
 
 def runner(
     rank: int,

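For callers of run_pipeline, the visible change is the return type: a single BenchmarkResult instead of List[BenchmarkResult]. A rough usage sketch under that assumption follows; the caller code is illustrative, and the run_option, table_config, pipeline_config, and model_config objects are assumed to be constructed elsewhere.

# Before: run_pipeline returned List[BenchmarkResult], one entry per rank,
# so callers indexed by rank, e.g. results[0].gpu_elapsed_time.
#
# After: one stacked BenchmarkResult; timing comes from rank 0, while
# gpu_mem_stats / cpu_mem_stats hold one entry per rank (index == rank).
result = run_pipeline(
    run_option=run_option,
    table_config=table_config,
    pipeline_config=pipeline_config,
    model_config=model_config,
)
rank0_gpu_time = result.gpu_elapsed_time
per_rank_gpu_mem = result.gpu_mem_stats  # len(per_rank_gpu_mem) == run_option.world_size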