Bug description
I was able to fine-tune an 8B LLM using the Hugging Face training framework with PEFT + DeepSpeed stage 2 under fp16 (mixed-precision training). Recently I wanted to migrate my codebase to Lightning following our team's decision. However, I could not get the code to work due to an OOM issue, even though the settings on both sides are nearly identical.
Here's the code:
lightning-deepspeed.zip
Update: with lightning <= 2.2.0 the memory usage is fine; once lightning >= 2.2.0, the OOM error happens.
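For reference, the ds_config.json used by the Lightning run is only inside the attached zip. Below is a minimal sketch of what a matching ZeRO stage 2 + fp16 config could look like; the exact values in my file may differ, so treat it purely as an assumed example. DeepSpeedStrategy also accepts the config as a Python dict, which is how it is written here.

# Assumed minimal ZeRO stage 2 + fp16 DeepSpeed config
# (the real ds_config.json in the zip may differ in the exact values).
DS_CONFIG = {
    "zero_optimization": {
        "stage": 2,
        "contiguous_gradients": True,
        "overlap_comm": True,
        "allgather_bucket_size": 2e8,
        "reduce_bucket_size": 2e8,
    },
    "fp16": {"enabled": True, "loss_scale": 0, "initial_scale_power": 16},
    "train_micro_batch_size_per_gpu": 2,
    "gradient_accumulation_steps": 1,
    "gradient_clipping": 1.0,
}
# strategy = DeepSpeedStrategy(config=DS_CONFIG)  # instead of the JSON path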
lightning module
import lightning as L
import torch
import os
from pathlib import Path
from transformers import AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, PeftModel
from lightning.pytorch.callbacks import Callback
from typing import Optional

LORA_CONFIG = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=['q_proj', 'k_proj', 'v_proj'],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    use_dora=False,
)


class BoringModule(L.LightningModule):
    def __init__(self,
                 model_name: str,
                 precision=torch.float16,
                 peft_cfg: LoraConfig = None,
                 token: str = None,
                 is_deepspeed_enabled: bool = True,
                 ):
        super().__init__()
        self.model_name = model_name
        self.precision = precision
        self.token = token
        self.peft_cfg = peft_cfg
        self.model = None
        self.deepspeed = is_deepspeed_enabled

    def configure_model(self):
        if self.model is not None:
            return
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            device_map={"": torch.cuda.current_device()},
            trust_remote_code=True,
            token=self.token,
        )
        self.model.gradient_checkpointing_enable()
        self.model = get_peft_model(self.model, self.peft_cfg)

    def configure_optimizers(self):
        if self.deepspeed:
            from deepspeed.ops.adam import FusedAdam
            optimizer = FusedAdam(self.model.parameters(), lr=2e-4)
        else:
            optimizer = torch.optim.AdamW(self.model.parameters(), lr=2e-4)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.1)
        return [optimizer], [scheduler]

    def forward(self, input_ids, attention_mask, label):
        return self.model(input_ids=input_ids, attention_mask=attention_mask, labels=label, use_cache=False)

    def training_step(self, batch, batch_idx):
        output = self.forward(batch["input_ids"], batch["attention_mask"], batch["labels"])
        loss = output.loss
        self.log_dict({"train_loss": loss}, on_step=True, sync_dist=True)
        return loss

    def validation_step(self, batch, batch_idx):
        output = self.forward(batch["input_ids"], batch["attention_mask"], batch["labels"])
        loss = output.loss
        self.log_dict({"val_loss": loss}, on_step=True, sync_dist=True)
        return loss


class PeftCheckpoint(Callback):
    def __init__(self, dirpath: Optional[str] = None):
        super().__init__()
        self.dirpath = dirpath
        self.ckpt_dir = None
        self.current_ckpt = {}

    def on_validation_start(self, trainer: L.Trainer, pl_module: L.LightningModule) -> None:
        current_step = trainer.global_step
        if current_step != 0:
            if not trainer.default_root_dir and not self.dirpath:
                output_dir = os.getcwd()
            elif not self.dirpath or not trainer.default_root_dir:
                output_dir = self.dirpath if self.dirpath else trainer.default_root_dir
            else:
                raise ValueError("Got an output path from both the trainer and the callback; please provide the path from only one of them")
            self.ckpt_dir = os.path.join(output_dir, f"checkpoint-{current_step}")
            if not os.path.exists(self.ckpt_dir):
                Path(self.ckpt_dir).mkdir(parents=True, exist_ok=True)
            self.current_ckpt["dir"] = self.ckpt_dir

    def on_validation_end(self, trainer: L.Trainer, pl_module: L.LightningModule) -> None:
        if isinstance(pl_module.model, PeftModel) and self.ckpt_dir:
            pl_module.model.save_pretrained(self.ckpt_dir)
lightning training pipeline
import lightning as L
from transformers import DataCollatorForSeq2Seq, AutoTokenizer
from pl_modules import BoringModule, LORA_CONFIG, PeftCheckpoint
from datasets import load_dataset
from torch.utils.data import DataLoader
from lightning.pytorch.strategies import DeepSpeedStrategy


def main():
    model_name = "meta-llama/Meta-Llama-3-8B"
    token = None
    # load data and keep necessary columns
    data = load_dataset("json",
                        data_files={"train": "./train_data.json",
                                    "val": "./val_data.json"},
                        split=["train[:100]", "val[:100]"])
    train_data, val_data = data[0], data[1]
    # init pl module
    peft_llm = BoringModule(model_name=model_name,
                            is_deepspeed_enabled=True,
                            peft_cfg=LORA_CONFIG,
                            token=token,
                            )
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=token, padding_side="left", max_length=8192)
    # put them in the dataloaders
    data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True)
    train_dataloader = DataLoader(train_data, batch_size=2, collate_fn=data_collator, num_workers=8)
    val_dataloader = DataLoader(val_data, batch_size=2, collate_fn=data_collator, num_workers=8)
    # init trainer and set the args
    peft_ckpt = PeftCheckpoint()
    trainer = L.Trainer(default_root_dir="./codetest",
                        accelerator="cuda",
                        callbacks=[peft_ckpt],
                        log_every_n_steps=5,
                        val_check_interval=5,
                        devices=2,
                        max_epochs=1,
                        precision="16-mixed",
                        num_sanity_val_steps=0,
                        enable_checkpointing=True,
                        strategy=DeepSpeedStrategy(config="./ds_config.json"),
                        )
    trainer.fit(model=peft_llm,
                train_dataloaders=train_dataloader,
                val_dataloaders=val_dataloader)


if __name__ == "__main__":
    main()
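Not part of the reproducer, but a small diagnostic I would add when comparing memory against the Hugging Face run: a callback that prints per-rank CUDA allocation. The callback name and print format below are mine, not from the scripts above.

import torch
from lightning.pytorch.callbacks import Callback


class CudaMemoryLogger(Callback):
    """Diagnostic-only: print per-rank CUDA memory at a few points during training."""

    def _report(self, trainer, tag):
        dev = torch.cuda.current_device()
        alloc = torch.cuda.memory_allocated(dev) / 2**30
        peak = torch.cuda.max_memory_allocated(dev) / 2**30
        print(f"[rank {trainer.global_rank}] {tag}: allocated={alloc:.2f} GiB, peak={peak:.2f} GiB")

    def on_train_start(self, trainer, pl_module):
        self._report(trainer, "train start")

    def on_train_batch_end(self, trainer, pl_module, outputs, batch, batch_idx):
        if batch_idx % 5 == 0:
            self._report(trainer, f"after batch {batch_idx}")


# usage: trainer = L.Trainer(..., callbacks=[peft_ckpt, CudaMemoryLogger()])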
huggingface training pipeline
import torch
from transformers import (AutoModelForCausalLM, DataCollatorForSeq2Seq, Trainer,
                          TrainingArguments, AutoTokenizer, HfArgumentParser)
from peft import get_peft_model
from pl_modules import LORA_CONFIG
from datasets import load_dataset

MODEL = "meta-llama/Meta-Llama-3-8B"
TOKEN = None


def main():
    parser = HfArgumentParser(TrainingArguments)
    training_args = parser.parse_args_into_dataclasses()[0]
    # load model and tokenizer
    model = AutoModelForCausalLM.from_pretrained(MODEL, token=TOKEN, torch_dtype=torch.float16,
                                                 trust_remote_code=True,
                                                 device_map={"": torch.cuda.current_device()})
    if training_args.gradient_checkpointing:
        training_args.gradient_checkpointing_kwargs = {"use_reentrant": False}
        model.config.use_cache = False
    peft_model = get_peft_model(model, LORA_CONFIG)
    tokenizer = AutoTokenizer.from_pretrained(MODEL, token=TOKEN, max_length=8192, padding_side="left")
    # load data
    data = load_dataset("json",
                        data_files={"train": "./train_data.json",
                                    "val": "./val_data.json"},
                        split=["train[:100]", "val[:100]"])
    train_data, val_data = data[0], data[1]
    data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True)
    # init trainer
    trainer = Trainer(model=peft_model,
                      args=training_args,
                      train_dataset=train_data,
                      eval_dataset=val_data,
                      data_collator=data_collator,
                      compute_metrics=None,
                      )
    trainer.train()


if __name__ == "__main__":
    main()
command
- lightning
python pipeline.py > codetest.log 2>&1
- huggingface
deepspeed --num_gpus=2 hf-pipeline.py --output_dir ./hf_codetest --num_train_epochs 1 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --label_names labels --learning_rate 2e-4 --optim adamw_torch --lr_scheduler_type constant_with_warmup --fp16 True --evaluation_strategy steps --logging_steps 10 --save_steps 10 --eval_steps 10 --gradient_checkpointing True --gradient_accumulation_steps 1 --report_to none --deepspeed ./ds_config_hf.json > hf_codetest.log 2>&1
- If the code has trouble saving the checkpoint, modify trainer.py at line 2401 to
logs["grad_norm"] = grad_norm.item()
refer to this issue
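For completeness, the ds_config_hf.json passed to the Hugging Face run above is also only in the attached zip. With the HF Trainer the usual pattern is to let TrainingArguments fill in the batch size, accumulation, and fp16 fields via "auto"; the snippet below is an assumed sketch of such a stage 2 config, not necessarily my exact file.

import json

# Assumed HF-side ZeRO stage 2 config; "auto" defers to TrainingArguments.
ds_config_hf = {
    "zero_optimization": {"stage": 2, "overlap_comm": True, "contiguous_gradients": True},
    "fp16": {"enabled": "auto"},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
}

with open("ds_config_hf.json", "w") as f:
    json.dump(ds_config_hf, f, indent=2)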
I've seen several issues discussing the problem of using Hugging Face models in the Lightning framework, and I also tried some of the suggestions; however, none of them worked. :(
#17878 -> conflict about the device setting
#17043 -> properly loading the model in the configure_model hook should be fine
There are also some issues about using ZeRO stage 3 with HF pretrained models; I'm not listing all of them here since I'm trying to use ZeRO stage 2, which should be less complicated.
The weird part I observe during Lightning training is that it starts with 4 processes even though I only have two GPUs. When I use the Hugging Face Trainer, training starts with only 2 processes, which makes sense, and the GPU utilization is balanced.
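(The process counts and per-GPU memory come from watching nvidia-smi while training was running; the commands below are for reference only, not from the original logs:)
watch -n 1 nvidia-smi
ps -ef | grep pipeline.py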
What version are you seeing the problem on?
v2.4
How to reproduce the bug
1. Download the scripts and install the requirements (an example install command is sketched below)
2. Use the commands above to start training
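The exact pinned requirements are in the attached zip; assuming a fresh virtual environment, something along these lines should cover the imports used above (lightning and torch versions taken from the environment section, the rest unpinned):
pip install "lightning==2.4.0" "torch==2.2.1" transformers peft datasets deepspeed accelerate ninja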
Error messages and logs
lightning log
/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1150: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[2024-09-24 18:24:45,506] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
initializing deepspeed distributed: GLOBAL_RANK: 0, MEMBER: 1/2
/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1150: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
[2024-09-24 18:24:52,443] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
initializing deepspeed distributed: GLOBAL_RANK: 1, MEMBER: 2/2
Enabling DeepSpeed FP16. Model parameters and inputs will be cast to `float16`.
/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1150: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1150: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
current process device: 0
current process: 136236
current process: 0
current process: 0
Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]current process device: 1
current process: 136372
current process: 1
current process: 1
Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]
Loading checkpoint shards: 25%|██▌ | 1/4 [00:01<00:04, 1.57s/it]
Loading checkpoint shards: 25%|██▌ | 1/4 [00:01<00:05, 1.78s/it]
Loading checkpoint shards: 50%|█████ | 2/4 [00:03<00:03, 1.56s/it]
Loading checkpoint shards: 50%|█████ | 2/4 [00:03<00:03, 1.83s/it]
Loading checkpoint shards: 75%|███████▌ | 3/4 [00:04<00:01, 1.59s/it]
Loading checkpoint shards: 75%|███████▌ | 3/4 [00:05<00:01, 1.81s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00, 1.26s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00, 1.38s/it]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Using /home/ubuntu/.cache/torch_extensions/py312_cu121 as PyTorch extensions root...
Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00, 1.32s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00, 1.50s/it]
Detected CUDA files, patching ldflags
Emitting ninja build file /home/ubuntu/.cache/torch_extensions/py312_cu121/fused_adam/build.ninja...
Building extension module fused_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Current model device: cuda:0
Current max memory: {0: '13522MB', 1: '13288MB'}
ninja: no work to do.
Loading extension module fused_adam...
LOCAL_RANK: 1 - CUDA_VISIBLE_DEVICES: [0,1]
Using /home/ubuntu/.cache/torch_extensions/py312_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/ubuntu/.cache/torch_extensions/py312_cu121/fused_adam/build.ninja...
Building extension module fused_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Current model device: cuda:1
Current max memory: {0: '14330MB', 1: '13216MB'}
ninja: no work to do.
Loading extension module fused_adam...
| Name | Type | Params | Mode
--------------------------------------------
0 | model | PeftModel | 8.1 B | train
--------------------------------------------
37.7 M Trainable params
8.0 B Non-trainable params
8.1 B Total params
32,272.040 Total estimated model params size (MB)
866 Modules in train mode
454 Modules in eval mode
Time to load fused_adam op: 0.11289787292480469 seconds
Time to load fused_adam op: 0.11739063262939453 seconds
Training: | | 0/? [00:00<?, ?it/s]
Training: 0%| | 0/25 [00:00<?, ?it/s]
Epoch 0: 0%| | 0/25 [00:00<?, ?it/s] Current model device: cuda:0
Current gpu usage: 16412909056
==================================================Current model dtype: {torch.float16}==================================================
Currently using cache: False
Traceback (most recent call last):
File "/home/ubuntu/lightning-llm/pipline.py", line 88, in <module>
main()
File "/home/ubuntu/lightning-llm/pipline.py", line 83, in main
trainer.fit(model=peft_llm,
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 538, in fit
call._call_and_handle_interrupt(
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/call.py", line 46, in _call_and_handle_interrupt
return trainer.strategy.launcher.launch(trainer_fn, *args, trainer=trainer, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/strategies/launchers/subprocess_script.py", line 105, in launch
return function(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 574, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 981, in _run
results = self._run_stage()
^^^^^^^^^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/trainer.py", line 1025, in _run_stage
self.fit_loop.run()
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/fit_loop.py", line 205, in run
self.advance()
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/fit_loop.py", line 363, in advance
self.epoch_loop.run(self._data_fetcher)
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 140, in run
self.advance(data_fetcher)
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/training_epoch_loop.py", line 250, in advance
batch_output = self.automatic_optimization.run(trainer.optimizers[0], batch_idx, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 190, in run
self._optimizer_step(batch_idx, closure)
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 268, in _optimizer_step
call._call_lightning_module_hook(
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/call.py", line 167, in _call_lightning_module_hook
output = fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/core/module.py", line 1306, in optimizer_step
optimizer.step(closure=optimizer_closure)
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/core/optimizer.py", line 153, in step
step_output = self._strategy.optimizer_step(self._optimizer, closure, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/strategies/ddp.py", line 270, in optimizer_step
optimizer_output = super().optimizer_step(optimizer, closure, model, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/strategies/strategy.py", line 238, in optimizer_step
return self.precision_plugin.optimizer_step(optimizer, model=model, closure=closure, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/plugins/precision/deepspeed.py", line 129, in optimizer_step
closure_result = closure()
^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 144, in __call__
self._result = self.closure(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 138, in closure
self._backward_fn(step_output.closure_loss)
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/loops/optimization/automatic.py", line 239, in backward_fn
call._call_strategy_hook(self.trainer, "backward", loss, optimizer)
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/trainer/call.py", line 319, in _call_strategy_hook
output = fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/strategies/strategy.py", line 212, in backward
self.precision_plugin.backward(closure_loss, self.lightning_module, optimizer, *args, **kwargs)
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/lightning/pytorch/plugins/precision/deepspeed.py", line 117, in backward
deepspeed_engine.backward(tensor, *args, **kwargs)
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/deepspeed/utils/nvtx.py", line 18, in wrapped_fn
ret_val = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/deepspeed/runtime/engine.py", line 2020, in backward
self.optimizer.backward(loss, retain_graph=retain_graph)
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/deepspeed/runtime/zero/stage_1_and_2.py", line 2064, in backward
self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/deepspeed/runtime/fp16/loss_scaler.py", line 63, in backward
scaled_loss.backward(retain_graph=retain_graph)
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/torch/_tensor.py", line 522, in backward
torch.autograd.backward(
File "/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/torch/autograd/__init__.py", line 266, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 948.00 MiB. GPU 0 has a total capacity of 31.74 GiB of which 474.12 MiB is free. Including non-PyTorch memory, this process has 30.97 GiB memory in use. Process 136372 has 306.00 MiB memory in use. Of the allocated memory 29.07 GiB is allocated by PyTorch, and 1.27 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)
huggingface trainer log
[2024-09-24 18:45:17,208] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-09-24 18:45:20,256] [WARNING] [runner.py:212:fetch_hostfile] Unable to find hostfile, will proceed with training with local resources only.
[2024-09-24 18:45:20,256] [INFO] [runner.py:585:main] cmd = /home/ubuntu/lightning-llm/.venv/bin/python -u -m deepspeed.launcher.launch --world_info=eyJsb2NhbGhvc3QiOiBbMCwgMV19 --master_addr=127.0.0.1 --master_port=29500 --enable_each_rank_log=None hf-pipeline.py --output_dir ./hf_codetest --num_train_epochs 1 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --label_names labels --learning_rate 2e-4 --optim adamw_torch --lr_scheduler_type constant_with_warmup --fp16 True --evaluation_strategy steps --logging_steps 10 --save_steps 10 --eval_steps 10 --gradient_checkpointing True --gradient_accumulation_steps 1 --report_to none --deepspeed /home/ubuntu/lightning-llm/ds_config_hf.json
[2024-09-24 18:45:21,510] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-09-24 18:45:24,502] [INFO] [launch.py:146:main] WORLD INFO DICT: {'localhost': [0, 1]}
[2024-09-24 18:45:24,502] [INFO] [launch.py:152:main] nnodes=1, num_local_procs=2, node_rank=0
[2024-09-24 18:45:24,502] [INFO] [launch.py:163:main] global_rank_mapping=defaultdict(<class 'list'>, {'localhost': [0, 1]})
[2024-09-24 18:45:24,502] [INFO] [launch.py:164:main] dist_world_size=2
[2024-09-24 18:45:24,502] [INFO] [launch.py:168:main] Setting CUDA_VISIBLE_DEVICES=0,1
[2024-09-24 18:45:24,503] [INFO] [launch.py:256:main] process 144709 spawned with command: ['/home/ubuntu/lightning-llm/.venv/bin/python', '-u', 'hf-pipeline.py', '--local_rank=0', '--output_dir', './hf_codetest', '--num_train_epochs', '1', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '2', '--label_names', 'labels', '--learning_rate', '2e-4', '--optim', 'adamw_torch', '--lr_scheduler_type', 'constant_with_warmup', '--fp16', 'True', '--evaluation_strategy', 'steps', '--logging_steps', '10', '--save_steps', '10', '--eval_steps', '10', '--gradient_checkpointing', 'True', '--gradient_accumulation_steps', '1', '--report_to', 'none', '--deepspeed', '/home/ubuntu/lightning-llm/ds_config_hf.json']
[2024-09-24 18:45:24,503] [INFO] [launch.py:256:main] process 144710 spawned with command: ['/home/ubuntu/lightning-llm/.venv/bin/python', '-u', 'hf-pipeline.py', '--local_rank=1', '--output_dir', './hf_codetest', '--num_train_epochs', '1', '--per_device_train_batch_size', '2', '--per_device_eval_batch_size', '2', '--label_names', 'labels', '--learning_rate', '2e-4', '--optim', 'adamw_torch', '--lr_scheduler_type', 'constant_with_warmup', '--fp16', 'True', '--evaluation_strategy', 'steps', '--logging_steps', '10', '--save_steps', '10', '--eval_steps', '10', '--gradient_checkpointing', 'True', '--gradient_accumulation_steps', '1', '--report_to', 'none', '--deepspeed', '/home/ubuntu/lightning-llm/ds_config_hf.json']
[2024-09-24 18:45:28,915] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-09-24 18:45:29,317] [INFO] [real_accelerator.py:203:get_accelerator] Setting ds_accelerator to cuda (auto detect)
[2024-09-24 18:45:29,724] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-09-24 18:45:30,142] [INFO] [comm.py:652:init_distributed] cdb=None
[2024-09-24 18:45:30,142] [INFO] [comm.py:683:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl
Number of GPUs: 2
/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1150: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
Number of GPUs: 2
/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/huggingface_hub/file_download.py:1150: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]
Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]
Loading checkpoint shards: 25%|██▌ | 1/4 [00:02<00:07, 2.48s/it]
Loading checkpoint shards: 25%|██▌ | 1/4 [00:02<00:07, 2.38s/it]
Loading checkpoint shards: 50%|█████ | 2/4 [00:05<00:05, 2.54s/it]
Loading checkpoint shards: 50%|█████ | 2/4 [00:04<00:04, 2.49s/it]
Loading checkpoint shards: 75%|███████▌ | 3/4 [00:07<00:02, 2.45s/it]
Loading checkpoint shards: 75%|███████▌ | 3/4 [00:07<00:02, 2.48s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00, 1.79s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00, 2.05s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00, 1.76s/it]
Loading checkpoint shards: 100%|██████████| 4/4 [00:08<00:00, 2.02s/it]
Model On Deivce: cuda:0
trainable params: 37,748,736 || all params: 8,068,009,984 || trainable%: 0.4679
Model On Deivce: cuda:1
trainable params: 37,748,736 || all params: 8,068,009,984 || trainable%: 0.4679
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using /home/ubuntu/.cache/torch_extensions/py312_cu121 as PyTorch extensions root...
Using /home/ubuntu/.cache/torch_extensions/py312_cu121 as PyTorch extensions root...
Detected CUDA files, patching ldflags
Emitting ninja build file /home/ubuntu/.cache/torch_extensions/py312_cu121/fused_adam/build.ninja...
Building extension module fused_adam...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
ninja: no work to do.
Loading extension module fused_adam...
Time to load fused_adam op: 0.10865139961242676 seconds
Loading extension module fused_adam...
Time to load fused_adam op: 0.20218896865844727 seconds
0%| | 0/25 [00:00<?, ?it/s]
4%|▍ | 1/25 [00:01<00:41, 1.71s/it]
8%|▊ | 2/25 [00:03<00:34, 1.51s/it]
12%|█▏ | 3/25 [00:04<00:33, 1.51s/it]
16%|█▌ | 4/25 [00:06<00:31, 1.51s/it]
20%|██ | 5/25 [00:07<00:29, 1.46s/it]
24%|██▍ | 6/25 [00:09<00:29, 1.54s/it]
28%|██▊ | 7/25 [00:10<00:26, 1.48s/it]
32%|███▏ | 8/25 [00:12<00:25, 1.48s/it]
36%|███▌ | 9/25 [00:13<00:25, 1.61s/it]
40%|████ | 10/25 [00:15<00:24, 1.61s/it]
{'loss': 0.7134, 'grad_norm': 2.3449432849884033, 'learning_rate': 0.0002, 'epoch': 0.4}
40%|████ | 10/25 [00:15<00:24, 1.61s/it]
0%| | 0/25 [00:00<?, ?it/s]
8%|▊ | 2/25 [00:00<00:10, 2.17it/s]
12%|█▏ | 3/25 [00:01<00:08, 2.68it/s]
16%|█▌ | 4/25 [00:01<00:09, 2.31it/s]
20%|██ | 5/25 [00:02<00:08, 2.33it/s]
24%|██▍ | 6/25 [00:02<00:08, 2.32it/s]
28%|██▊ | 7/25 [00:03<00:08, 2.14it/s]
32%|███▏ | 8/25 [00:03<00:08, 1.95it/s]
36%|███▌ | 9/25 [00:04<00:07, 2.03it/s]
40%|████ | 10/25 [00:04<00:07, 1.92it/s]
44%|████▍ | 11/25 [00:05<00:07, 1.93it/s]
48%|████▊ | 12/25 [00:05<00:06, 1.94it/s]
52%|█████▏ | 13/25 [00:06<00:06, 1.89it/s]
56%|█████▌ | 14/25 [00:06<00:05, 1.93it/s]
60%|██████ | 15/25 [00:07<00:04, 2.02it/s]
64%|██████▍ | 16/25 [00:07<00:04, 1.97it/s]
68%|██████▊ | 17/25 [00:08<00:04, 1.96it/s]
72%|███████▏ | 18/25 [00:08<00:03, 2.01it/s]
76%|███████▌ | 19/25 [00:09<00:02, 2.01it/s]
80%|████████ | 20/25 [00:09<00:02, 1.97it/s]
84%|████████▍ | 21/25 [00:10<00:02, 1.96it/s]
88%|████████▊ | 22/25 [00:10<00:01, 2.00it/s]
92%|█████████▏| 23/25 [00:11<00:01, 1.96it/s]
96%|█████████▌| 24/25 [00:11<00:00, 1.93it/s]
100%|██████████| 25/25 [00:12<00:00, 1.96it/s]
{'eval_loss': 0.6632572412490845, 'eval_runtime': 12.8212, 'eval_samples_per_second': 7.8, 'eval_steps_per_second': 1.95, 'epoch': 0.4}
40%|████ | 10/25 [00:28<00:24, 1.61s/it]
100%|██████████| 25/25 [00:12<00:00, 1.96it/s]
Checkpoint destination directory ./hf_codetest/checkpoint-10 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./hf_codetest/checkpoint-10 already exists and is non-empty. Saving will proceed but saved results may be invalid.
/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/peft/utils/other.py:611: Unable to fetch remote file due to the following error 401 Client Error. (Request ID: Root=1-66f29885-40c0234c5a09e86a059e7c87;5507b633-c539-4a31-a8aa-f2cb5ab2efd1)
Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.
/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/peft/utils/save_and_load.py:195: Could not find a config file in meta-llama/Meta-Llama-3-8B - will assume that the vocabulary was not modified.
44%|████▍ | 11/25 [00:50<02:45, 11.79s/it]
48%|████▊ | 12/25 [00:51<01:52, 8.68s/it]
52%|█████▏ | 13/25 [00:53<01:18, 6.51s/it]
56%|█████▌ | 14/25 [00:55<00:55, 5.02s/it]
60%|██████ | 15/25 [00:56<00:39, 3.98s/it]
64%|██████▍ | 16/25 [00:58<00:28, 3.20s/it]
68%|██████▊ | 17/25 [00:59<00:21, 2.63s/it]
72%|███████▏ | 18/25 [01:00<00:16, 2.29s/it]
76%|███████▌ | 19/25 [01:02<00:12, 2.05s/it]
80%|████████ | 20/25 [01:03<00:09, 1.85s/it]
{'loss': 0.6377, 'grad_norm': 1.6386961936950684, 'learning_rate': 0.0002, 'epoch': 0.8}
80%|████████ | 20/25 [01:03<00:09, 1.85s/it]
0%| | 0/25 [00:00<?, ?it/s]
8%|▊ | 2/25 [00:00<00:06, 3.50it/s]
12%|█▏ | 3/25 [00:01<00:08, 2.64it/s]
16%|█▌ | 4/25 [00:01<00:09, 2.28it/s]
20%|██ | 5/25 [00:02<00:08, 2.30it/s]
24%|██▍ | 6/25 [00:02<00:08, 2.31it/s]
28%|██▊ | 7/25 [00:03<00:08, 2.14it/s]
32%|███▏ | 8/25 [00:03<00:08, 2.01it/s]
36%|███▌ | 9/25 [00:04<00:07, 2.01it/s]
40%|████ | 10/25 [00:04<00:07, 1.91it/s]
44%|████▍ | 11/25 [00:05<00:07, 1.92it/s]
48%|████▊ | 12/25 [00:05<00:06, 1.94it/s]
52%|█████▏ | 13/25 [00:06<00:06, 1.89it/s]
56%|█████▌ | 14/25 [00:06<00:05, 1.93it/s]
60%|██████ | 15/25 [00:07<00:04, 2.01it/s]
64%|██████▍ | 16/25 [00:07<00:04, 1.97it/s]
68%|██████▊ | 17/25 [00:08<00:04, 1.96it/s]
72%|███████▏ | 18/25 [00:08<00:03, 2.01it/s]
76%|███████▌ | 19/25 [00:09<00:02, 2.03it/s]
80%|████████ | 20/25 [00:09<00:02, 1.98it/s]
84%|████████▍ | 21/25 [00:10<00:02, 1.97it/s]
88%|████████▊ | 22/25 [00:10<00:01, 2.01it/s]
92%|█████████▏| 23/25 [00:11<00:01, 1.97it/s]
96%|█████████▌| 24/25 [00:11<00:00, 1.94it/s]
100%|██████████| 25/25 [00:12<00:00, 1.96it/s]
{'eval_loss': 0.5724084377288818, 'eval_runtime': 12.712, 'eval_samples_per_second': 7.867, 'eval_steps_per_second': 1.967, 'epoch': 0.8}
80%|████████ | 20/25 [01:16<00:09, 1.85s/it]
100%|██████████| 25/25 [00:12<00:00, 1.96it/s]
Checkpoint destination directory ./hf_codetest/checkpoint-20 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory ./hf_codetest/checkpoint-20 already exists and is non-empty. Saving will proceed but saved results may be invalid.
/home/ubuntu/lightning-llm/.venv/lib/python3.12/site-packages/peft/utils/other.py:611: Unable to fetch remote file due to the following error 401 Client Error. (Request ID: Root=1-66f298b4-7c01819b14b2e1095f2b7456;0557752d-5a87-44e6-950b-e9f5077bef1b)
Cannot access gated repo for url https://huggingface.co/meta-llama/Meta-Llama-3-8B/resolve/main/config.json.
Access to model meta-llama/Meta-Llama-3-8B is restricted. You must have access to it and be authenticated to access it. Please log in. - silently ignoring the lookup for the file config.json in meta-llama/Meta-Llama-3-8B.
84%|████████▍ | 21/25 [01:36<00:45, 11.29s/it]
88%|████████▊ | 22/25 [01:38<00:25, 8.45s/it]
92%|█████████▏| 23/25 [01:40<00:12, 6.37s/it]
96%|█████████▌| 24/25 [01:41<00:04, 4.93s/it]
100%|██████████| 25/25 [01:43<00:00, 3.99s/it]
{'train_runtime': 103.6883, 'train_samples_per_second': 0.964, 'train_steps_per_second': 0.241, 'train_loss': 0.6445709228515625, 'epoch': 1.0}
100%|██████████| 25/25 [01:43<00:00, 3.99s/it]
100%|██████████| 25/25 [01:43<00:00, 4.15s/it]
[2024-09-24 18:47:30,528] [INFO] [launch.py:351:main] Process 144710 exits successfully.
[2024-09-24 18:47:31,529] [INFO] [launch.py:351:main] Process 144709 exits successfully.
Environment
Current environment
- PyTorch Lightning Version: 2.4.0
- PyTorch Version: 2.2.1
- Python version: 3.12.3
- OS (e.g., Linux): Ubuntu 24.04
- CUDA/cuDNN version: 12.0
- GPU models and configuration:
- How you installed Lightning (`conda`, `pip`, source): pip
More info
Hardware information: NVIDIA Corporation GV100GL [Tesla V100 SXM2 32GB] x 2
cc @lantiga