Skip to content

Commit 223c9e9

Browse files
nagpalar and KeitaW
authored
Support FSDP2 for FSDP test case (#951)
* FSDP2: updated files from the sample repo * updated sbatch files with conda activate command * Refactor FSDP training configuration - Downgrade PyTorch from 2.9.1 to 2.7.0 - Split Dockerfile into multi-stage build (Base and HTPO stages) - Remove venv activation from Slurm training scripts - Reverting Kubernetes template configuration * fixing typo HTPO to HPTO * Docker Target does not accept upercase so changed it Lowercase base or hpto and 2.7.0 is not supported so switching it to 2.6.0 for FSDP2 and Hyperpod Elastic agent * Update PyTorch and related package versions Let's come back to this in a separate PR. For now, let me merge the PR before we address #959 --------- Co-authored-by: Keita Watanabe <mlkeita@amazon.com>
1 parent 46710bd commit 223c9e9

13 files changed

+155
-143
lines changed
Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
FROM public.ecr.aws/hpc-cloud/nccl-tests:latest
1+
FROM public.ecr.aws/hpc-cloud/nccl-tests:latest AS base
22

33
RUN apt update && apt install -y nvtop
44

@@ -7,6 +7,10 @@ RUN ln -s /usr/bin/python3 /usr/bin/python
77

88
COPY src/ /fsdp/
99
RUN --mount=type=cache,target=/root/.cache/pip pip install -r /fsdp/requirements.txt
10-
RUN pip install hyperpod-elastic-agent
1110

12-
WORKDIR /fsdp
11+
WORKDIR /fsdp
12+
13+
# HyperPod-specific image with elastic agent
14+
FROM base AS hpto
15+
16+
RUN pip install hyperpod-elastic-agent

3.test_cases/pytorch/FSDP/kubernetes/fsdp.yaml-template

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ spec:
66
elasticPolicy:
77
rdzvBackend: c10d
88
minReplicas: 1
9-
maxReplicas: 64
9+
maxReplicas: $MAX_NODES
1010
maxRestarts: 100
1111
metrics:
1212
- type: Resource
@@ -17,7 +17,7 @@ spec:
1717
averageUtilization: 90
1818
pytorchReplicaSpecs:
1919
Worker:
20-
replicas: $NUM_NODES
20+
replicas: $MAX_NODES
2121
restartPolicy: OnFailure
2222
template:
2323
metadata:
@@ -31,6 +31,9 @@ spec:
3131
- name: local
3232
hostPath:
3333
path: /mnt/k8s-disks/0
34+
- name: checkpoints
35+
persistentVolumeClaim:
36+
claimName: fsx-claim
3437
#nodeSelector:
3538
# node.kubernetes.io/instance-type: "${INSTANCE_TYPE}"
3639
containers:
@@ -78,16 +81,20 @@ spec:
7881
value: "1"
7982
- name: HF_TOKEN
8083
value: "${HF_TOKEN}"
81-
#- name: TORCH_DIST_INIT_BARRIER
82-
# value: "1"
84+
- name: TORCH_DIST_INIT_BARRIER
85+
value: "1"
86+
- name: GLOO_SOCKET_IFNAME
87+
value: "eth0"
88+
- name: NCCL_SOCKET_TIMEOUT_MS
89+
value: "600000"
8390
#- name: NCCL_IGNORE_DISABLED_P2P
8491
# value: "1"
8592
#- name: NCCL_NVLS_ENABLE
8693
# value: "0"
8794
command:
8895
- /usr/local/bin/torchrun
8996
- --nproc_per_node=$GPU_PER_NODE
90-
- --nnodes=$NUM_NODES
97+
- --nnodes=$MIN_NODES:$MAX_NODES
9198
- /fsdp/train.py
9299
- --max_context_width=4096
93100
- --num_key_value_heads=32
@@ -113,3 +120,5 @@ spec:
113120
mountPath: /dev/shm
114121
- name: local
115122
mountPath: /local
123+
- name: checkpoints
124+
mountPath: /checkpoints

3.test_cases/pytorch/FSDP/slurm/llama2_13b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/llama2_70b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/llama2_7b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/llama3_1_70b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/llama3_1_8b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/llama3_2_1b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/llama3_2_3b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,4 +110,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
110110
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
111111
AUTO_RESUME="--auto-resume=1"
112112
fi
113-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
113+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

3.test_cases/pytorch/FSDP/slurm/mathstral_7b-training.sbatch

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,4 +132,4 @@ if [ -d "/opt/sagemaker_cluster" ]; then
132132
echo "Detected Hyperpod cluster.. enabling --auto-resume=1"
133133
AUTO_RESUME="--auto-resume=1"
134134
fi
135-
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"
135+
srun ${AUTO_RESUME} -l "${ARGS[@]}" ${TORCHRUN} "${TORCHRUN_ARGS[@]}" $TRAIN_SCRIPT "${TRAINING_ARGS[@]}"

0 commit comments

Comments
 (0)