Describe the bug
When using the Adam optimizer together with ZeRO Stage 1/2 under the following configuration, calling
model_engine.save_checkpoint(ckpt_dir, tag=tag) will raise an IndexError during execution:
{
  "train_batch_size": 32,
  "train_micro_batch_size_per_gpu": 8,
  "optimizer": {
    "type": "Adam",
    "params": {
      "torch_adam": true
    }
  },
  "zero_optimization": {
    "stage": 1,
    "elastic_checkpoint": true
  }
}

If "torch_adam" is set to false, or if "elastic_checkpoint" in zero_optimization is set to false, the error does not occur.
To Reproduce
The following code, test.py, reproduces the issue:
import torch
import torch.nn as nn
import deepspeed
import torch.distributed as dist
from torch.utils.data import Dataset
import os


def set_seed(seed: int):
    import os
    import random
    import numpy as np
    import torch

    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


set_seed(42)


class RandomDataset(Dataset):
    def __init__(self, num_samples=800, input_dim=32, num_classes=10):
        self.num_samples = num_samples
        self.input_dim = input_dim
        self.num_classes = num_classes

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        x = torch.randn(self.input_dim)
        y = torch.randint(0, self.num_classes, (1,)).item()
        return x, y


class RandomNet(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        layers = []
        dim = input_dim
        for _ in range(10):
            next_dim = 16
            layers.append(nn.Linear(dim, next_dim))
            layers.append(nn.ReLU())
            dim = next_dim
        layers.append(nn.Linear(dim, output_dim))
        self.net = nn.Sequential(*layers)
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x, labels=None):
        logits = self.net(x)
        if labels is not None:
            return self.criterion(logits, labels)
        return logits


ds_config = {
    "train_batch_size": 32,
    "train_micro_batch_size_per_gpu": 8,
    "optimizer": {
        "type": "Adam",
        "params": {
            "torch_adam": True
        }
    },
    "zero_optimization": {
        "stage": 1,
        "elastic_checkpoint": True
    }
}


def main():
    input_dim, output_dim = 32, 10
    trainset = RandomDataset(num_samples=32 * 20, input_dim=input_dim, num_classes=output_dim)
    model = RandomNet(input_dim, output_dim)
    ckpt_dir = "./ckpt_dir"
    os.makedirs(ckpt_dir, exist_ok=True)
    try:
        model_engine, _, trainloader, _ = deepspeed.initialize(
            model=model,
            model_parameters=model.parameters(),
            config=ds_config,
            training_data=trainset,
        )
        rank = dist.get_rank() if dist.is_initialized() else 0
        is_rank0 = (rank == 0)
        for step, batch in enumerate(trainloader):
            x, y = batch
            x = x.to(model_engine.device)
            y = y.to(model_engine.device)
            loss = model_engine(x, labels=y)
            model_engine.backward(loss)
            model_engine.step()
            if is_rank0:
                print(f"Step {step} | Loss = {loss.item():.4f}")
            if step == 10:
                tag = f"step_{step}"
                if is_rank0:
                    print(f"\nSaving checkpoint at step {step} -> tag={tag}\n")
                model_engine.save_checkpoint(ckpt_dir, tag=tag)
    finally:
        if dist.is_initialized():
            dist.destroy_process_group()


if __name__ == "__main__":
    main()
Running
deepspeed --num_gpus=1 test.py
produces the following error log.
Step 0 | Loss = 2.2303
Step 1 | Loss = 2.2425
Step 2 | Loss = 2.3195
Step 3 | Loss = 2.2785
Step 4 | Loss = 2.3427
Step 5 | Loss = 2.3232
Step 6 | Loss = 2.3719
Step 7 | Loss = 2.3106
Step 8 | Loss = 2.2907
Step 9 | Loss = 2.2786
Step 10 | Loss = 2.2415
Saving checkpoint at step 10 -> tag=step_10
[rank0]: Traceback (most recent call last):
[rank0]: File "/home/yanzhen/distributed_test/deepspeed-test/bug2.py", line 114, in <module>
[rank0]: main()
[rank0]: File "/home/yanzhen/distributed_test/deepspeed-test/bug2.py", line 105, in main
[rank0]: model_engine.save_checkpoint(ckpt_dir, tag=tag)
[rank0]: File "/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3505, in save_checkpoint
[rank0]: self._save_zero_checkpoint(save_dir, tag)
[rank0]: File "/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/deepspeed/runtime/engine.py", line 3888, in _save_zero_checkpoint
[rank0]: zero_sd = dict(optimizer_state_dict=self.optimizer.state_dict(), ds_config=self.config, ds_version=version)
[rank0]: File "/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2335, in state_dict
[rank0]: state_dict[BASE_OPTIMIZER_STATE] = self._get_base_optimizer_state()
[rank0]: File "/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2312, in _get_base_optimizer_state
[rank0]: lean_optimizer_state = self._get_state_without_padding(self.optimizer.state[p], self.groups_padding[i])
[rank0]: File "/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2300, in _get_state_without_padding
[rank0]: lean_state[key] = value[:lean_length]
[rank0]: IndexError: slice() cannot be applied to a 0-dim tensor.
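The failing line slices every optimizer state value to strip ZeRO padding (lean_state[key] = value[:lean_length]). My best guess at the cause, which I have not verified against the DeepSpeed source: with "torch_adam": true, recent versions of torch.optim.Adam store the "step" state as a 0-dim tensor, and a 0-dim tensor cannot be sliced, whereas DeepSpeed's FusedAdam keeps step as a plain Python int. A minimal standalone illustration of that failure mode (not DeepSpeed code):

import torch

step = torch.tensor(11.0)   # 0-dim, like torch.optim.Adam's state["step"]
exp_avg = torch.randn(100)  # 1-dim states slice fine
print(exp_avg[:90].shape)   # torch.Size([90])
try:
    step[:90]
except IndexError as e:
    print(e)                # slice() cannot be applied to a 0-dim tensor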
Expected behavior
An IndexError should not occur when saving a checkpoint.
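A possible guard, sketched under the assumption that 0-dim scalar states (such as Adam's "step") carry no ZeRO padding and can be passed through unchanged. This is an illustration of the idea only, not the actual DeepSpeed implementation:

import torch

def get_state_without_padding_sketch(state, lean_length):
    # Hypothetical variant of _get_state_without_padding: only slice
    # values with at least one dimension; keep 0-dim scalars as-is.
    lean_state = {}
    for key, value in state.items():
        if torch.is_tensor(value) and value.dim() == 0:
            lean_state[key] = value
        else:
            lean_state[key] = value[:lean_length]
    return lean_state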
ds_report output
--------------------------------------------------
DeepSpeed C++/CUDA extension op report
--------------------------------------------------
NOTE: Ops not installed will be just-in-time (JIT) compiled at
runtime if needed. Op compatibility means that your system
meet the required dependencies to JIT install the op.
--------------------------------------------------
JIT compiled ops requires ninja
ninja .................. [OKAY]
--------------------------------------------------
op name ................ installed .. compatible
--------------------------------------------------
async_io ............... [NO] ....... [OKAY]
fused_adam ............. [NO] ....... [OKAY]
cpu_adam ............... [NO] ....... [OKAY]
cpu_adagrad ............ [NO] ....... [OKAY]
cpu_lion ............... [NO] ....... [OKAY]
dc ..................... [NO] ....... [OKAY]
[WARNING] Please specify the CUTLASS repo directory as environment variable $CUTLASS_PATH
evoformer_attn ......... [NO] ....... [NO]
fp_quantizer ........... [NO] ....... [OKAY]
fused_lamb ............. [NO] ....... [OKAY]
fused_lion ............. [NO] ....... [OKAY]
gds .................... [NO] ....... [OKAY]
transformer_inference .. [NO] ....... [OKAY]
inference_core_ops ..... [NO] ....... [OKAY]
cutlass_ops ............ [NO] ....... [OKAY]
quantizer .............. [NO] ....... [OKAY]
ragged_device_ops ...... [NO] ....... [OKAY]
ragged_ops ............. [NO] ....... [OKAY]
random_ltd ............. [NO] ....... [OKAY]
[WARNING] sparse_attn requires a torch version >= 1.5 and < 2.0 but detected 2.5
[WARNING] using untested triton version (3.1.0), only 1.0.0 is known to be compatible
sparse_attn ............ [NO] ....... [NO]
spatial_inference ...... [NO] ....... [OKAY]
transformer ............ [NO] ....... [OKAY]
stochastic_transformer . [NO] ....... [OKAY]
utils .................. [NO] ....... [OKAY]
--------------------------------------------------
DeepSpeed general environment info:
torch install path ............... ['/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/torch']
torch version .................... 2.5.1
deepspeed install path ........... ['/home/yanzhen/miniconda3/envs/deepspeed/lib/python3.10/site-packages/deepspeed']
deepspeed info ................... 0.18.0+7af561c2, 7af561c2, master
torch cuda version ............... 12.1
torch hip version ................ None
nvcc version ..................... 12.4
deepspeed wheel compiled w. ...... torch 2.5, cuda 12.1
shared memory (/dev/shm) size .... 503.83 GB
System info (please complete the following information):
- Ubuntu 22.04
- one machine with 4x RTX 4090s (the reproduction uses a single GPU)
- Python 3.10.18
Launcher context
deepspeed