Commit b6b9593

Feature/dcp-partitioning (#327)

Enhance the Distributed Checkpointing (DCP) feature by implementing prefix support, enabling the distribution of checkpoints across multiple prefixes based on worker rank.

1 parent a1702df commit b6b9593

File tree: 8 files changed, +684 −18 lines


README.md

Lines changed: 128 additions & 0 deletions
@@ -171,6 +171,134 @@ DCP.load(
model.load_state_dict(model_state_dict)
```

## S3 Prefix Strategies for Distributed Checkpointing

S3StorageWriter supports several prefix strategies for organizing checkpoints in S3 buckets.
These strategies are designed to prevent throttling (503 Slow Down errors) in high-throughput scenarios
by following the S3 key naming best practices outlined in
[Best practices design patterns: optimizing Amazon S3 performance](https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html).
When many distributed training processes write checkpoints simultaneously, the prefix strategies help distribute
the load across multiple S3 partitions.

### Available Strategies

#### 1. RoundRobinPrefixStrategy

Distributes checkpoints across the specified prefixes in round-robin fashion, ideal for balancing data across multiple storage locations.

```python
from s3torchconnector.dcp import RoundRobinPrefixStrategy, S3StorageWriter

model = torchvision.models.resnet18()

# Initialize with multiple prefixes and optional epoch tracking
strategy = RoundRobinPrefixStrategy(
    user_prefixes=["shard1", "shard2", "shard3"],
    epoch_num=5  # Optional: for checkpoint versioning
)

writer = S3StorageWriter(
    region=REGION,
    path="CHECKPOINT_URI",
    prefix_strategy=strategy
)

# Save checkpoint
DCP.save(
    state_dict=model.state_dict(),
    storage_writer=writer
)
```

Output Structure:
```
CHECKPOINT_URI
├── shard1/
│   └── epoch_5/
│       ├── __0_0.distcp
│       ├── __3_0.distcp
│       └── ...
├── shard2/
│   └── epoch_5/
│       ├── __1_0.distcp
│       ├── __4_0.distcp
│       └── ...
└── shard3/
    └── epoch_5/
        ├── __2_0.distcp
        ├── __5_0.distcp
        └── ...
```
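A checkpoint saved with a prefix strategy can be loaded back without one, as the stateful example later in this commit does; a short sketch reusing the names above:

```python
from s3torchconnector.dcp import S3StorageReader

# No prefix strategy is needed on the read path; the reader resolves
# the shards from the checkpoint metadata under the same URI.
reader = S3StorageReader(region=REGION, path="CHECKPOINT_URI")
model_state_dict = model.state_dict()
DCP.load(
    state_dict=model_state_dict,
    storage_reader=reader,
)
model.load_state_dict(model_state_dict)
```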
#### 2. BinaryPrefixStrategy

Generates binary (base-2) prefixes for optimal partitioning in distributed environments.

```python
from s3torchconnector.dcp import BinaryPrefixStrategy

strategy = BinaryPrefixStrategy(
    epoch_num=1,       # Optional: for checkpoint versioning
    min_prefix_len=10  # Optional: minimum prefix length
)
```

Output Structure:
```
s3://my-bucket/checkpoints/
├── 0000000000/
│   └── epoch_1/
│       └── __0_0.distcp
├── 1000000000/
│   └── epoch_1/
│       └── __1_0.distcp
├── 0100000000/
│   └── epoch_1/
│       └── __2_0.distcp
└── ...
```
#### 3. HexPrefixStrategy

Uses hexadecimal (base-16) prefixes for a balance of partitioning efficiency and readability.

```python
from s3torchconnector.dcp import HexPrefixStrategy

strategy = HexPrefixStrategy(
    epoch_num=1,      # Optional: for checkpoint versioning
    min_prefix_len=4  # Optional: minimum prefix length
)
```

Output Structure:
```
s3://my-bucket/checkpoints/
├── 0000/
│   └── epoch_1/
│       └── __0_0.distcp
├── 1000/
│   └── epoch_1/
│       └── __1_0.distcp
...
├── f000/
│   └── epoch_1/
│       └── __15_0.distcp
└── ...
```
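Judging from the output trees above, the numeric strategies appear to encode the writer's rank in the chosen base with the digit order reversed and zero-padded to `min_prefix_len`, so the leading character varies fastest across ranks — exactly the property the S3 performance guide rewards. A minimal sketch of that idea, not the library's actual implementation:

```python
def numeric_prefix(rank: int, base: int, min_len: int) -> str:
    """Illustrative only: encode `rank` in `base`, least-significant digit
    first, zero-padded so leading characters differ across ranks."""
    digits = "0123456789abcdef"
    out = []
    while rank:
        out.append(digits[rank % base])
        rank //= base
    return "".join(out).ljust(min_len, "0")

# Matches the trees above:
assert numeric_prefix(2, base=2, min_len=10) == "0100000000"
assert numeric_prefix(15, base=16, min_len=4) == "f000"
```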
### Creating Custom Strategies

You can implement custom prefix strategies by extending the S3PrefixStrategyBase class:

```python
from s3torchconnector.dcp import S3PrefixStrategyBase


class CustomPrefixStrategy(S3PrefixStrategyBase):
    def __init__(self, custom_param):
        super().__init__()
        self.custom_param = custom_param

    def generate_prefix(self, rank: int) -> str:
        return f"custom_{self.custom_param}/{rank}/"
```
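A custom strategy plugs into S3StorageWriter exactly like the built-in ones, via the `prefix_strategy` argument added in this commit; a short usage sketch (region and URI are placeholders):

```python
writer = S3StorageWriter(
    region="us-east-1",                  # placeholder region
    path="s3://my-bucket/checkpoints/",  # placeholder checkpoint URI
    prefix_strategy=CustomPrefixStrategy("experiment_a"),
)
```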
## Parallel/Distributed Training

Amazon S3 Connector for PyTorch provides support for parallel and distributed training with PyTorch,

examples/dcp/stateful_example.py

Lines changed: 38 additions & 3 deletions
@@ -17,7 +17,9 @@
 from torch.distributed.device_mesh import init_device_mesh
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from s3torchconnector import S3ClientConfig
 from s3torchconnector.dcp import S3StorageWriter, S3StorageReader
+from s3torchconnector.dcp.s3_prefix_strategy import RoundRobinPrefixStrategy


 class Model(torch.nn.Module):
@@ -98,6 +100,10 @@ def _setup(rank, world_size):
     torch.cuda.set_device(rank)


+def _cleanup():
+    dist.destroy_process_group()
+
+
 def _train_initial_model(device, rank, world_size):
     print(f"Train initial model on rank:{rank}")
     model, optim = _init_model(device, world_size)
@@ -126,8 +132,34 @@ def run(rank, world_size, region, s3_uri, device="cuda"):
     model, optim = _train_initial_model(device, rank, world_size)

     print(f"Saving checkpoint on rank:{rank}")
-    # initialize S3StorageWriter with region and bucket name, before passing to dcp.save as writer
-    storage_writer = S3StorageWriter(region, s3_uri)
+    # S3ClientConfig configuration for optimized data transfer to S3
+    s3config = S3ClientConfig(
+        # Set the size of each part in a multipart upload to 16 MB
+        # (16 * 1024 * 1024 bytes), a reasonable default for large file transfers.
+        part_size=16 * 1024 * 1024,
+        # Target a throughput of 600 Gbps for data transfer, suitable for
+        # high-bandwidth environments (P5/trn1 instances) and large model transfers.
+        throughput_target_gbps=600,
+        # Maximum number of retry attempts for failed operations; helps handle
+        # transient network issues or S3 throttling.
+        max_attempts=20,
+    )
+
+    # RoundRobinPrefixStrategy distributes checkpoint data across multiple
+    # prefixes in round-robin fashion.
+    strategy = RoundRobinPrefixStrategy(
+        # Prefix strings used in rotation for storing checkpoint shards. Each
+        # prefix is a separate "path" in S3 where checkpoint data will be
+        # stored; using multiple prefixes lowers the TPS per prefix.
+        user_prefixes=["0000000000", "1000000000", "0100000000", "1100000000"],
+        # Optional integer for versioning checkpoints across training epochs;
+        # if provided, the epoch number is appended to the prefix paths.
+        epoch_num=5,
+    )
+    # initialize S3StorageWriter with region, bucket name, and s3config, before
+    # passing it to dcp.save as the writer
+    storage_writer = S3StorageWriter(
+        region=region, path=s3_uri, s3client_config=s3config, prefix_strategy=strategy
+    )
     dcp.save(
         state_dict={"model": model, "optimizer": optim},
         storage_writer=storage_writer,
@@ -139,13 +171,16 @@ def run(rank, world_size, region, s3_uri, device="cuda"):
     )
     print(f"Load previously saved checkpoint on rank:{rank}")
     # initialize S3StorageReader with region and bucket name, before passing to dcp.load as reader
-    storage_reader = S3StorageReader(region, s3_uri)
+    storage_reader = S3StorageReader(
+        region=region, path=s3_uri, s3client_config=s3config
+    )
     dcp.load(
         state_dict={"model": modified_model, "optimizer": modified_optim},
         storage_reader=storage_reader,
     )
     _continue_training_loaded_model(modified_model, modified_optim, model, rank)
     print(f"Quitting on rank:{rank}")
+    _cleanup()


 if __name__ == "__main__":
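For reference, distributed examples like this one are typically launched with `torch.multiprocessing.spawn`; a minimal, hypothetical launcher (the region and URI are placeholders, and the example's actual `__main__` block may differ):

```python
import torch
import torch.multiprocessing as mp

if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    # spawn calls run(rank, *args) once per process, so rank is supplied implicitly
    mp.spawn(
        run,
        args=(world_size, "us-east-1", "s3://my-bucket/checkpoints/"),
        nprocs=world_size,
        join=True,
    )
```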

s3torchconnector/src/s3torchconnector/dcp/__init__.py

Lines changed: 12 additions & 0 deletions
@@ -2,9 +2,21 @@
 # // SPDX-License-Identifier: BSD

 from .s3_file_system import S3FileSystem, S3StorageReader, S3StorageWriter
+from .s3_prefix_strategy import (
+    S3PrefixStrategyBase,
+    DefaultPrefixStrategy,
+    NumericPrefixStrategy,
+    BinaryPrefixStrategy,
+    HexPrefixStrategy,
+)

 __all__ = [
     "S3FileSystem",
     "S3StorageReader",
     "S3StorageWriter",
+    "S3PrefixStrategyBase",
+    "DefaultPrefixStrategy",
+    "NumericPrefixStrategy",
+    "BinaryPrefixStrategy",
+    "HexPrefixStrategy",
 ]

s3torchconnector/src/s3torchconnector/dcp/s3_file_system.py

Lines changed: 37 additions & 3 deletions
@@ -8,6 +8,7 @@
 from contextlib import contextmanager
 from pathlib import Path
 from typing import Generator, Union, Optional
+from typing import List

 from s3torchconnectorclient._mountpoint_s3_client import S3Exception
 from tenacity import (
@@ -28,6 +29,7 @@
 from s3torchconnector._s3client import S3Client
 from s3torchconnector._s3dataset_common import parse_s3_uri
 from .. import S3ClientConfig
+from .s3_prefix_strategy import S3PrefixStrategyBase, DefaultPrefixStrategy
 from .._user_agent import UserAgent

 logger = logging.getLogger(__name__)
@@ -43,11 +45,11 @@ def __init__(
         self._path: Union[str, os.PathLike] = ""
         user_agent = UserAgent(["dcp", torch.__version__])
         self._client = (
-            s3_client
-            if s3_client is not None
-            else S3Client(
+            S3Client(
                 region=region, user_agent=user_agent, s3client_config=s3client_config
             )
+            if s3_client is None
+            else s3_client
         )

     @contextmanager
@@ -227,12 +229,25 @@ def _escape_path(string):
     return "/".join(parts)


+from torch.distributed.checkpoint.planner import SavePlan
+import dataclasses
+from dataclasses import dataclass
+
+
+@dataclass
+class StorageMetadata:
+    """Metadata for S3 storage prefix."""
+
+    prefix: str
+
+
 class S3StorageWriter(FileSystemWriter):
     def __init__(
         self,
         region: str,
         path: str,
         s3client_config: Optional[S3ClientConfig] = None,
+        prefix_strategy: Optional[S3PrefixStrategyBase] = None,
         **kwargs,
     ) -> None:
         """
@@ -241,6 +256,7 @@ def __init__(
         Args:
             region (str): The AWS region for S3.
             path (str): The S3 URI to write checkpoints to.
+            prefix_strategy: Strategy for generating S3 prefixes.
             kwargs (dict): Keyword arguments to pass to the parent :class:`FileSystemWriter`.
         """
         super().__init__(
@@ -250,6 +266,24 @@ def __init__(
         )
         self.fs = S3FileSystem(region, s3client_config=s3client_config)  # type: ignore
         self.path = self.fs.init_path(path)
+        self.prefix_strategy = prefix_strategy or DefaultPrefixStrategy()
+
+    def prepare_global_plan(self, plans: List[SavePlan]) -> List[SavePlan]:
+        """
+        Prepare save plans with S3-specific storage metadata.
+
+        Args:
+            plans: List of save plans to be processed.
+
+        Returns:
+            Modified save plans with S3 storage metadata.
+        """
+        return [
+            dataclasses.replace(
+                plan, storage_data=StorageMetadata(self.prefix_strategy(idx))
+            )
+            for idx, plan in enumerate(plans)
+        ]

     @classmethod
     def validate_checkpoint_id(cls, checkpoint_id: Union[str, os.PathLike]) -> bool:
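The call `self.prefix_strategy(idx)` in `prepare_global_plan` suggests the strategy base class is callable and delegates to the `generate_prefix(rank)` method seen in the README's custom-strategy example. A plausible sketch of that contract — an assumption about the interface, not the shipped implementation:

```python
from abc import ABC, abstractmethod


class S3PrefixStrategyBase(ABC):
    """Sketch of the inferred contract: callable, mapping a rank to a prefix."""

    def __call__(self, rank: int) -> str:
        # prepare_global_plan() invokes the strategy with the plan index,
        # i.e. the writer's rank in the coordinated save.
        return self.generate_prefix(rank)

    @abstractmethod
    def generate_prefix(self, rank: int) -> str:
        """Return the S3 prefix under which this rank's shard is written."""
```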
