
Commit fc56fb0

SSYernar authored and facebook-github-bot committed
Stack pipeline benchmark per-rank-results into a single BenchmarkResult (#3258)
Summary:
Pull Request resolved: #3258

Stack pipeline benchmark per-rank results into a single BenchmarkResult to measure overall performance across GPUs. This will cause an 'intentional' **2x regression** for allocated-memory metrics, since we now accumulate total memory stats across the two GPUs instead of measuring only one. After the change lands, I will **re-register** the ServiceLab task with the new baseline metrics to avoid future false regression warnings.

Reviewed By: aliafzal

Differential Revision: D79537357

fbshipit-source-id: 8b2e850532189ff9db584e2c19e1fca55f3963a9
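To make the expected shift concrete, here is a minimal sketch of the intentional 2x effect (illustrative numbers and variable names only, not taken from this diff): with world_size = 2 the stacked result carries one memory measurement per rank, so a downstream metric that sums allocated memory across those entries now counts both GPUs instead of one.

# Hypothetical illustration, assuming roughly 8 GB allocated per GPU.
per_rank_allocated_mb = [8_192, 8_200]      # one entry per rank after stacking
old_metric = per_rank_allocated_mb[0]       # previous baseline: a single rank's allocation
new_metric = sum(per_rank_allocated_mb)     # new baseline: roughly 2x the old value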
1 parent 091ec6b commit fc56fb0

File tree

1 file changed: +26, -2 lines changed


torchrec/distributed/benchmark/benchmark_train_pipeline.py

Lines changed: 26 additions & 2 deletions
@@ -41,9 +41,11 @@
     benchmark_func,
     BenchmarkResult,
     cmd_conf,
+    CPUMemoryStats,
     generate_planner,
     generate_sharded_model_and_optimizer,
     generate_tables,
+    GPUMemoryStats,
 )
 from torchrec.distributed.comm import get_local_size
 from torchrec.distributed.embedding_types import EmbeddingComputeKernel
@@ -255,15 +257,15 @@ def run_pipeline(
     table_config: EmbeddingTablesConfig,
     pipeline_config: PipelineConfig,
     model_config: BaseModelConfig,
-) -> List[BenchmarkResult]:
+) -> BenchmarkResult:
 
     tables, weighted_tables = generate_tables(
         num_unweighted_features=table_config.num_unweighted_features,
         num_weighted_features=table_config.num_weighted_features,
         embedding_feature_dim=table_config.embedding_feature_dim,
     )
 
-    return run_multi_process_func(
+    benchmark_res_per_rank = run_multi_process_func(
         func=runner,
         world_size=run_option.world_size,
         tables=tables,
@@ -273,6 +275,28 @@ def run_pipeline(
         pipeline_config=pipeline_config,
     )
 
+    # Combine results from all ranks into a single BenchmarkResult
+    # Use timing data from rank 0, combine memory stats from all ranks
+    world_size = run_option.world_size
+
+    total_benchmark_res = BenchmarkResult(
+        short_name=benchmark_res_per_rank[0].short_name,
+        gpu_elapsed_time=benchmark_res_per_rank[0].gpu_elapsed_time,
+        cpu_elapsed_time=benchmark_res_per_rank[0].cpu_elapsed_time,
+        gpu_mem_stats=[GPUMemoryStats(rank, 0, 0, 0) for rank in range(world_size)],
+        cpu_mem_stats=[CPUMemoryStats(rank, 0) for rank in range(world_size)],
+        rank=0,
+    )
+
+    for res in benchmark_res_per_rank:
+        # Each rank's BenchmarkResult contains 1 GPU and 1 CPU memory measurement
+        if len(res.gpu_mem_stats) > 0:
+            total_benchmark_res.gpu_mem_stats[res.rank] = res.gpu_mem_stats[0]
+        if len(res.cpu_mem_stats) > 0:
+            total_benchmark_res.cpu_mem_stats[res.rank] = res.cpu_mem_stats[0]
+
+    return total_benchmark_res
+
 
 def runner(
     rank: int,

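For callers of run_pipeline, the visible change is the return type: a single BenchmarkResult instead of List[BenchmarkResult]. A rough usage sketch under that assumption follows; the caller code is illustrative, and the run_option, table_config, pipeline_config, and model_config objects are assumed to be constructed elsewhere.

# Before: run_pipeline returned List[BenchmarkResult], one entry per rank,
# so callers indexed by rank, e.g. results[0].gpu_elapsed_time.
#
# After: one stacked BenchmarkResult; timing comes from rank 0, while
# gpu_mem_stats / cpu_mem_stats hold one entry per rank (index == rank).
result = run_pipeline(
    run_option=run_option,
    table_config=table_config,
    pipeline_config=pipeline_config,
    model_config=model_config,
)
rank0_gpu_time = result.gpu_elapsed_time
per_rank_gpu_mem = result.gpu_mem_stats  # len(per_rank_gpu_mem) == run_option.world_size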