[BUG] IndexError occurs when saving checkpoint under ZeRO Stage 1/2 #7650

@sdjasj

Description

Describe the bug
When using the Adam optimizer with ZeRO Stage 1/2 under the following configuration, calling model_engine.save_checkpoint(ckpt_dir, tag=tag) raises an IndexError:

{
  "train_batch_size": 32,
  "train_micro_batch_size_per_gpu": 8,
  "optimizer": {
    "type": "Adam",
    "params": {
      "torch_adam": true
    }
  },
  "zero_optimization": {
    "stage": 1,
    "elastic_checkpoint": true
  }
}

If "torch_adam" is set to false, or if "elastic_checkpoint" in zero_optimization is set to false, the error does not occur.

To Reproduce
The following test.py reproduces the issue.

import os
import random

import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
from torch.utils.data import Dataset

import deepspeed


def set_seed(seed: int):
    # Fix all RNG seeds so the run is deterministic.
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)


class RandomDataset(Dataset):
    def __init__(self, num_samples=800, input_dim=32, num_classes=10):
        self.num_samples = num_samples
        self.input_dim = input_dim
        self.num_classes = num_classes

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        x = torch.randn(self.input_dim)
        y = torch.randint(0, self.num_classes, (1,)).item()
        return x, y


class RandomNet(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        layers = []
        dim = input_dim
        for _ in range(10):
            next_dim = 16
            layers.append(nn.Linear(dim, next_dim))
            layers.append(nn.ReLU())
            dim = next_dim
        layers.append(nn.Linear(dim, output_dim))
        self.net = nn.Sequential(*layers)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x, labels=None):
        logits = self.net(x)
        if labels is not None:
            return self.criterion(logits, labels)
        return logits

ds_config = {
    "train_batch_size": 32,
    "train_micro_batch_size_per_gpu": 8,
    "optimizer": {
        "type": "Adam",
        "params": {
        "torch_adam": True
        }
    },
    "zero_optimization": {
        "stage": 1,
        "elastic_checkpoint": True
    }
}

def main():
    input_dim, output_dim = 32, 10
    trainset = RandomDataset(num_samples=32 * 20, input_dim=input_dim, num_classes=output_dim)
    model = RandomNet(input_dim, output_dim)
    ckpt_dir = "./ckpt_dir"
    os.makedirs(ckpt_dir, exist_ok=True)
    try:
        model_engine, _, trainloader, _ = deepspeed.initialize(
            model=model,
            model_parameters=model.parameters(),
            config=ds_config,
            training_data=trainset,
        )
        rank = dist.get_rank() if dist.is_initialized() else 0
        is_rank0 = (rank == 0)
        for step, batch in enumerate(trainloader):
            x, y = batch
            x = x.to(model_engine.device)
            y = y.to(model_engine.device)

            loss = model_engine(x, labels=y)
            model_engine.backward(loss)
            model_engine.step()

            if is_rank0:
                print(f"Step {step} | Loss = {loss.item():.4f}")
            if step == 10:
                # Saving a checkpoint here is what triggers the IndexError.
                tag = f"step_{step}"
                if is_rank0:
                    print(f"\nSaving checkpoint at step {step} -> tag={tag}\n")
                model_engine.save_checkpoint(ckpt_dir, tag=tag)

    finally:
        if dist.is_initialized():
            dist.destroy_process_group()



if __name__ == "__main__":
    main()

Running

deepspeed --num_gpus=1 test.py

produces the following error log.

Step 0 | Loss = 2.2303
Step 1 | Loss = 2.2425
Step 2 | Loss = 2.3195
Step 3 | Loss = 2.2785
Step 4 | Loss = 2.3427
Step 5 | Loss = 2.3232
Step 6 | Loss = 2.3719
Step 7 | Loss = 2.3106
Step 8 | Loss = 2.2907
Step 9 | Loss = 2.2786
Step 10 | Loss = 2.2415

Saving checkpoint at step 10 -> tag=step_10

[rank0]: Traceback (most recent call last):
[rank0]:   File "/home/yanzhen/distributed_test/deepspeed-test/bug2.py", line 114, in <module>
[rank0]:     main()
[rank0]:   File "/home/yanzhen/distributed_test/deepspeed-test/bug2.py", line 105, in main
[rank0]:     model_engine.save_checkpoint(ckpt_dir, tag=tag)
[rank0]:   File "/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3505, in save_checkpoint
[rank0]:     self._save_zero_checkpoint(save_dir, tag)
[rank0]:   File "/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3888, in _save_zero_checkpoint
[rank0]:     zero_sd = dict(optimizer_state_dict=self.optimizer.state_dict(), ds_config=self.config, ds_version=version)
[rank0]:   File "/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2335, in state_dict
[rank0]:     state_dict[BASE_OPTIMIZER_STATE] = self._get_base_optimizer_state()
[rank0]:   File "/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2312, in _get_base_optimizer_state
[rank0]:     lean_optimizer_state = self._get_state_without_padding(self.optimizer.state[p], self.groups_padding[i])
[rank0]:   File "/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2300, in _get_state_without_padding
[rank0]:     lean_state[key] = value[:lean_length]
[rank0]: IndexError: slice() cannot be applied to a 0-dim tensor.
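
The failing line is lean_state[key] = value[:lean_length] in _get_state_without_padding. A plausible but unconfirmed explanation: with "torch_adam": true DeepSpeed wraps torch.optim.Adam, and recent PyTorch releases store Adam's per-parameter "step" counter as a 0-dim tensor, which cannot be sliced. The following standalone snippet (pure PyTorch, no DeepSpeed) shows that behavior:

import torch

# torch.optim.Adam in torch 2.x keeps state["step"] as a 0-dim scalar tensor.
p = torch.nn.Parameter(torch.randn(4))
opt = torch.optim.Adam([p])
p.grad = torch.randn(4)
opt.step()

step = opt.state[p]["step"]
print(step.dim())  # 0 -- a scalar tensor

step[:2]  # IndexError: slice() cannot be applied to a 0-dim tensor.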

Expected behavior
An IndexError should not occur when saving a checkpoint.
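
If the 0-dim "step" tensor is indeed the cause, a guard around the slice would avoid it. The helper below is only a sketch of that idea (the name _lean_state_value is made up here, not DeepSpeed code):

import torch

def _lean_state_value(value, lean_length):
    # Hypothetical guard mirroring the slice in _get_state_without_padding:
    # 0-dim tensors (e.g. Adam's "step") cannot be sliced, so pass them through.
    if torch.is_tensor(value) and value.dim() == 0:
        return value
    return value[:lean_length]

print(_lean_state_value(torch.arange(6.0), 4))   # tensor([0., 1., 2., 3.])
print(_lean_state_value(torch.tensor(11.0), 4))  # tensor(11.)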

ds_report output

--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
      runtime if needed. Op compatibility means that your system
      meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
async_io ............... [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
cpu_lion ............... [NO] ....... [OKAY]
dc ..................... [NO] ....... [OKAY]
 [WARNING]  Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
evoformer_attn ......... [NO] ....... [NO]
fp_quantizer ........... [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
fused_lion ............. [NO] ....... [OKAY]
gds .................... [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
inference_core_ops ..... [NO] ....... [OKAY]
cutlass_ops ............ [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
ragged_device_ops ...... [NO] ....... [OKAY]
ragged_ops ............. [NO] ....... [OKAY]
random_ltd ............. [NO] ....... [OKAY]
 [WARNING]  sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.5
 [WARNING]  using untested triton version (3.1.0), only 1.0.0 is known to be compatible
sparse_attn ............ [NO] ....... [NO]
spatial_inference ...... [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/torch']
torch version .................... 2.5.1
deepspeed install path ........... ['/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/deepspeed']
deepspeed info ................... 0.18.0+7af561c2, 7af561c2, master
torch cuda version ............... 12.1
torch hip version ................ None
nvcc version ..................... 12.4
deepspeed wheel compiled w. ...... torch 2.5, cuda 12.1
shared memory (/dev/shm) size .... 503.83 GB

System info:

  • Ubuntu 22.04
  • one machine with 4x RTX 4090s
  • Python 3.10.18

Launcher context
deepspeed


Labels: bug, training
