feat(dcp): add benchmark capabilities for DCP (#259)

matthieu-d4r · IsaevIlya · commit 547d30e1747d · 2024-11-22T09:11:33.000Z
Create a dedicated `dcp` Python package within `s3torchbenchmarking`, to
run benchmarks against fsspec. Use Pandas for result metrics.
diff --git a/s3torchbenchmarking/conf/dcp.yaml b/s3torchbenchmarking/conf/dcp.yaml
@@ -0,0 +1,16 @@
+s3:
+  region: ???
+  uri: ???
+epochs: 4
+path: ./nvme/ # only used when `checkpoint.storage` contains `disk`, ignored for `s3`
+
+# https://hydra.cc/docs/tutorials/basic/running_your_app/multi-run/#sweeper
+hydra:
+  mode: MULTIRUN
+  sweeper:
+    params:
+      +model: vit-base,T0_3B
+      +backend: nccl,gloo # nccl == GPU, gloo == CPU
+      +world_size: 1,2,4,8 # == total number of workers to use
+      +thread_count: 1,2,4,8
+      +checkpoint.storage: disk,s3
diff --git a/s3torchbenchmarking/pyproject.toml b/s3torchbenchmarking/pyproject.toml
@@ -27,9 +27,18 @@ dependencies = [
     "click",
     "omegaconf",
     "accelerate",
+    "pandas",
 ]
-optional-dependencies = { test = ["pytest"] }
-scripts = { s3torch-benchmark = "s3torchbenchmarking.benchmark:run_experiment", s3torch-datagen = "s3torchbenchmarking.datagen:synthesize_dataset" }
+
+[project.optional-dependencies]
+test = [
+    "pytest"
+]
+
+[project.scripts]
+s3torch-benchmark = "s3torchbenchmarking.benchmark:run_experiment"
+s3torch-datagen = "s3torchbenchmarking.datagen:synthesize_dataset"
+s3torch-benchmark-dcp = "s3torchbenchmarking.dcp.benchmark:run_benchmark"
 
 [tool.setuptools.packages]
 # Pure Python packages/modules and configuration files
diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/dcp/README.md b/s3torchbenchmarking/src/s3torchbenchmarking/dcp/README.md
@@ -0,0 +1,155 @@
+## PyTorch's Distributed Checkpoint (DCP) benchmarks
+
+The `dcp` Python package holds all the logic to execute benchmarks for [PyTorch's Distributed Checkpointing][DCP]
+feature against the `s3torchconnector` library.
+
+### Purpose
+
+These benchmarks are designed to:
+
+1. Test the "save" mechanism of PyTorch DCP (`torch.distributed.checkpoint.save`);
+2. Compare the performance of the s3torchconnector library against other libraries and local storage;
+3. Measure throughput (in MiB/s) and save times (in seconds).
+
+### Usage
+
+> [!IMPORTANT]
+> The benchmarks are designed to be run on an EC2 instance.
+
+Install the `s3torchbenchmarking` package with `pip` (see the [root README](../../../README.md) for instructions); once
+installed, the DCP benchmarks can be run with:
+
+```shell
+$ s3torch-benchmark-dcp -cd conf -cn dcp
+```
+
+The command must be executed from the package's root, where it can read from the `conf/` directory; it will create a
+`./multirun/` directory (at the location of execution), and store all benchmark results there.
+
+> [!WARNING]
+> When saving on local disk, consider clearing the `path` specified in your config between runs to prevent disk space
+> issues.
+
+#### Potential caveats
+
+If you encounter the following error during installation:
+
+```
+TypeError: canonicalize_version() got an unexpected keyword argument 'strip_trailing_zero'
+```
+
+Run this command to resolve it:
+
+```shell
+$ pip install "setuptools<71"
+```
+
+### Configuration
+
+The benchmark runs can be customized using the [`dcp.yaml`](../../../conf/dcp.yaml) file. This section outlines the key
+configuration options and their impacts.
+
+#### Configuration Requirements
+
+All keys in the `dcp.yaml` file must be defined for a run to execute successfully.
+
+#### Key Configuration Options
+
+`epochs`
+
+- Specifies the number of iterations for "saving" a model's checkpoint.
+- Note: This does not affect model training, as no actual training occurs in these benchmarks.
+
+`path`
+
+- Designates the local directory where checkpoints are written when `checkpoint.storage` is `disk` (ignored for `s3`).
+- If the specified directory doesn't exist, it will be created automatically.
+- For optimal performance using an SSD filesystem, refer to the [`prepare_nvme.sh`](../../../utils/prepare_nvme.sh)
+  script.
+
+`hydra.sweeper.params`
+
+This section allows for multiple benchmark configurations:
+
+- The benchmark will run sequential jobs for each combination of the specified parameters.
+- Available options include:
+    - `+model`: Choose from pre-trained models listed in [`models.py`](models.py).
+    - `+backend`: Select `nccl`, `gloo`, or both.
+    - `+world_size`: Defines the number of workers.
+    - `+thread_count`: Defines the number of threads to use for saving the checkpoints.
+    - `+checkpoint.storage`: Choose `s3`, `disk`, or both.
+
+#### Example Configuration
+
+```yaml
+s3:
+  region: eu-west-1
+  uri: s3://my-bucket
+epochs: 3
+path: ./nvme/
+
+hydra:
+  mode: MULTIRUN
+  sweeper:
+    params:
+      +model: vit-base,T0_3B
+      +backend: nccl,gloo
+      +world_size: 2,4
+      +thread_count: 1
+      +checkpoint.storage: s3,disk
+```
+
+This configuration will run benchmarks for all combinations of the specified models, backends, world sizes, thread
+counts, and storage options, totaling 16 (2×2×2×1×2) different benchmark scenarios.
+
+### Important notes
+
+- The benchmarks may take some time to complete, depending on the hardware and network configuration.
+- For optimal results, it is recommended to run the benchmarks on a dedicated EC2 instance without other
+  resource-intensive processes.
+- Ensure the specified S3 bucket exists in the given region and the EC2 user/role has read+write permissions.
+
+### Results
+
+Benchmark results are organized as follows:
+
+```shell
+multirun/
+└── YYYY-MM-DD
+    └── HH-MM-SS
+        ├── 0
+        │   ├── benchmark.log
+        │   └── results_vit-base_nccl_2_2_s3.json
+        ├── 1
+        │   ├── benchmark.log
+        │   └── results_vit-base_nccl_2_2_disk.json
+        ├── 2
+        │   ├── benchmark.log
+        │   └── results_vit-base_nccl_4_2_s3.json
+        ├── 3
+        │   ├── benchmark.log
+        │   └── results_vit-base_nccl_4_2_disk.json
+        └── multirun.yaml
+```
+
+Each run creates a timestamped subdirectory. The `./multirun/` directory is managed by [Hydra](https://hydra.cc/).
+
+Result file names reflect the parameter combinations, e.g.,
+
+```
++model: vit-base
++backend: nccl
++world_size: 2
++thread_count: 1
++checkpoint.storage: s3
+```
+
+will produce the file `results_vit-base_nccl_2_1_s3.json` (respecting parameters declaration order).
+
+### References
+
+- https://pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html
+- https://pytorch.org/docs/stable/elastic/run.html
+- https://pytorch.org/tutorials/intermediate/ddp_tutorial.html
+
+[DCP]: https://pytorch.org/docs/stable/distributed.checkpoint.html
diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/dcp/__init__.py b/s3torchbenchmarking/src/s3torchbenchmarking/dcp/__init__.py
@@ -0,0 +1,2 @@
+#  Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#  // SPDX-License-Identifier: BSD
diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/dcp/benchmark.py b/s3torchbenchmarking/src/s3torchbenchmarking/dcp/benchmark.py
@@ -0,0 +1,127 @@
+#  Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#  // SPDX-License-Identifier: BSD
+
+import logging
+import os
+import random
+import string
+from multiprocessing.queues import Queue
+from pathlib import Path
+from time import perf_counter
+from typing import List
+
+import hydra
+import torch
+import torch.distributed as dist
+import torch.distributed.checkpoint as dcp
+from omegaconf import DictConfig
+from torch import multiprocessing as mp
+from torch.distributed.checkpoint import FileSystemWriter
+from torch.nn import Module
+from torch.nn.parallel import DistributedDataParallel
+
+from s3torchconnector.dcp import S3StorageWriter
+from .constants import Timestamps
+from .models import get_benchmark_model
+from .results import save_results
+from ..benchmark_utils import ResourceMonitor
+
+logger = logging.getLogger(__name__)
+
+
+@hydra.main(version_base=None, config_path=".", config_name="config")
+def run_benchmark(cfg: DictConfig):
+    """DCP benchmark entry point."""
+    benchmark_model = get_benchmark_model(cfg.model)
+
+    # For every run, use a randomized suffix (for either local disk or S3).
+    suffix = "".join(random.choices(string.ascii_letters, k=7))
+    storage_writer = get_writer(cfg, suffix)
+
+    manager = mp.Manager()
+    corrected_save_timestamps: Queue[Timestamps] = manager.Queue()
+    processing_timestamps: List[Timestamps] = []
+
+    with ResourceMonitor() as monitor:
+        for epoch in range(cfg.epochs):
+            logger.info("Executing epoch #%i / %i...", epoch + 1, cfg.epochs)
+            begin_mp = perf_counter()
+            mp.spawn(
+                run,
+                (cfg, benchmark_model.model, storage_writer, corrected_save_timestamps),
+                nprocs=cfg.world_size,
+                join=True,
+            )
+            end_mp = perf_counter()
+            processing_timestamps.append((begin_mp, end_mp))
+
+    # Dump the multiprocessing Queue's content into a list.
+    collector: List[Timestamps] = []
+    while not corrected_save_timestamps.empty():
+        collector.append(corrected_save_timestamps.get())
+
+    save_results(
+        cfg,
+        benchmark_model,
+        corrected_save_timestamps=collector,
+        processing_timestamps=processing_timestamps,
+    )
+
+
+def get_writer(cfg: DictConfig, suffix: str) -> FileSystemWriter:
+    """Instantiate a checkpoint writer based on the input config."""
+    if cfg.checkpoint.storage == "disk":
+        local_path = Path(cfg.path) / suffix
+        logger.info("Saving checkpoint to %s (local disk)...", local_path)
+        return dcp.FileSystemWriter(local_path, thread_count=cfg.thread_count)
+    elif cfg.checkpoint.storage == "s3":
+        uri = build_checkpoint_uri(cfg.s3.uri, suffix)
+        logger.info("Saving checkpoint to %s (S3)...", uri)
+        return S3StorageWriter(cfg.s3.region, uri, thread_count=cfg.thread_count)
+    raise ValueError(f"Storage writer {cfg.checkpoint.storage} not supported")
+
+
+def build_checkpoint_uri(s3_uri: str, suffix: str) -> str:
+    suffix = suffix.lstrip("/")
+    return s3_uri + suffix if s3_uri.endswith("/") else s3_uri + "/" + suffix
+
+
+def setup(backend: str, world_size: int, rank: int) -> None:
+    os.environ["MASTER_ADDR"] = "localhost"
+    os.environ["MASTER_PORT"] = "12355"
+    dist.init_process_group(backend, world_size=world_size, rank=rank)
+
+
+# FIXME: configure logging in subprocess accordingly
+def run(
+    rank: int,  # needs to be passed first (provided by `multiprocessing.spawn` automatically)
+    cfg: DictConfig,
+    model: Module,
+    storage_writer: FileSystemWriter,
+    save_timestamps: Queue,
+) -> None:
+    """Execute the actual code for checkpoint saving.
+
+    This function is meant to be executed in subprocesses."""
+    begin_process = perf_counter()
+
+    setup(cfg.backend, world_size=cfg.world_size, rank=rank)
+    if cfg.backend == "nccl":
+        device_id = rank % torch.cuda.device_count()
+        torch.cuda.set_device(device_id)
+    else:
+        device_id = rank % torch.cpu.device_count()
+        torch.cpu.set_device(device_id)
+
+    model.to(device_id)
+    model = DistributedDataParallel(model, device_ids=[device_id])
+    state_dict = model.state_dict()
+
+    begin_save = perf_counter()
+    dcp.save(state_dict, storage_writer=storage_writer)
+    end_save = perf_counter()
+
+    # Record the save times excluding the influence of the process setup and model loading to device.
+    save_timestamps.put((begin_process, end_save - (begin_save - begin_process)))
+
+    dist.destroy_process_group()
diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/dcp/constants.py b/s3torchbenchmarking/src/s3torchbenchmarking/dcp/constants.py
@@ -0,0 +1,3 @@
+from typing import Tuple
+
+# A (begin, end) pair of `time.perf_counter()` readings, in seconds.
+Timestamps = Tuple[float, float]
diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/dcp/models.py b/s3torchbenchmarking/src/s3torchbenchmarking/dcp/models.py
@@ -0,0 +1,63 @@
+#  Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#  // SPDX-License-Identifier: BSD
+
+from functools import cached_property
+from typing import Callable
+
+from torch.nn import Module
+from transformers import AutoModelForSeq2SeqLM, ViTModel, CLIPModel
+
+
+class BenchmarkModel:
+    """Utility class around a :class:`torch.nn.Module`, with an additional metadata layer."""
+
+    def __init__(self, loader: Callable, name: str):
+        self._loader = loader
+        self._name = name
+
+    @property
+    def name(self) -> str:
+        return self._name
+
+    @cached_property
+    def model(self) -> Module:
+        return self._loader(self._name)
+
+    @cached_property
+    def size(self) -> float:
+        """Compute a model's size (in MiB).
+
+        Sourced from https://discuss.pytorch.org/t/finding-model-size/130275/2.
+        """
+        param_size = 0
+        for param in self.model.parameters():
+            param_size += param.nelement() * param.element_size()
+        buffer_size = 0
+        for buffer in self.model.buffers():
+            buffer_size += buffer.nelement() * buffer.element_size()
+        return (param_size + buffer_size) / 1024**2
+
+
+# Registry of the models available for benchmarking, looked up by `get_benchmark_model`. Each entry pairs a
+# `from_pretrained` loader with the identifier passed to it; nothing is loaded until `.model` is first accessed.
+# NOTE: keys below are later used to construct a filename, so make sure they do not contain characters that will not
+# play well with filesystems (e.g., '/').
+SIZE_TO_MODEL = {
+    # ~350 MB model
+    "vit-base": BenchmarkModel(
+        ViTModel.from_pretrained, "google/vit-base-patch16-224-in21k"
+    ),
+    # ~1.7 GB model
+    "clip-vit": BenchmarkModel(
+        CLIPModel.from_pretrained, "openai/clip-vit-large-patch14"
+    ),
+    # ~12 GB model
+    "T0_3B": BenchmarkModel(AutoModelForSeq2SeqLM.from_pretrained, "bigscience/T0_3B"),
+    # ~45 GB model
+    "T0pp": BenchmarkModel(AutoModelForSeq2SeqLM.from_pretrained, "bigscience/T0pp"),
+}
+
+
+def get_benchmark_model(name: str) -> BenchmarkModel:
+    """Select a model for benchmarking."""
+    if name not in SIZE_TO_MODEL:
+        raise ValueError(f'Name "{name}" is unexpected')
+    return SIZE_TO_MODEL[name]
diff --git a/s3torchbenchmarking/src/s3torchbenchmarking/dcp/results.py b/s3torchbenchmarking/src/s3torchbenchmarking/dcp/results.py
diff --git a/s3torchconnector/src/s3torchconnector/dcp/s3_file_system.py b/s3torchconnector/src/s3torchconnector/dcp/s3_file_system.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.`
	`2`	`+# // SPDX-License-Identifier: BSD`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+from typing import Tuple`
	`2`	`+`
	`3`	`+Timestamps = Tuple[float, float]`