
Commit 4de4e31

[example] update llama example (#5626)

* [plugin] support dp inside for hybrid parallel
* [example] update llama benchmark
* [example] update llama benchmark
* [example] update llama readme
* [example] update llama readme
1 parent 862fbaa commit 4de4e31

File tree: 8 files changed, +72 −783 lines changed

colossalai/booster/plugin/gemini_plugin.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -424,6 +424,7 @@ def __init__(
         )
         self.extra_dp_group = self.pg_mesh.get_group_along_axis(DP_AXIS) if self.extra_dp_size > 1 else None
         self.tp_group = self.pg_mesh.get_group_along_axis(TP_AXIS) if self.tp_size > 1 else None
+        self.dp_size = self.zero_size * self.extra_dp_size

         self.shard_config = ShardConfig(
             tensor_parallel_process_group=self.tp_group,
```
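
The single added line changes what `GeminiPlugin` reports as its data-parallel size: it is now the product of the ZeRO replica count and the extra data-parallel group. A minimal sketch of the arithmetic; the numbers and helper variables below are illustrative assumptions, not values taken from the commit:

```python
# Illustrative arithmetic only; the sizes are assumptions for the sketch.
# With 16 ranks, tensor parallel size 2 and an extra data-parallel dimension
# of 2, the ZeRO group spans the remaining ranks.
world_size = 16
tp_size = 2
extra_dp_size = 2
zero_size = world_size // (tp_size * extra_dp_size)  # -> 4

# After this commit the plugin exposes dp_size as zero_size * extra_dp_size,
# so components that partition work per data-parallel replica (for example
# samplers or schedulers) see 8 replicas rather than 4.
dp_size = zero_size * extra_dp_size
assert dp_size == 8
```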

colossalai/booster/plugin/hybrid_parallel_plugin.py

Lines changed: 18 additions & 10 deletions
```diff
@@ -34,7 +34,6 @@

 from .pp_plugin_base import PipelinePluginBase

-DP_AXIS, PP_AXIS, TP_AXIS, SP_AXIS = 0, 1, 2, 3
 SUPPORT_SP_MODE = ["split_gather", "ring", "all_to_all"]

 PRECISION_TORCH_TYPE = {"fp16": torch.float16, "fp32": torch.float32, "bf16": torch.bfloat16}
@@ -987,6 +986,7 @@ def __init__(
         gradient_checkpoint_config: Optional[GradientCheckpointConfig] = None,
         enable_metadata_cache: bool = True,
         make_vocab_size_divisible_by: int = 64,
+        dp_outside: bool = True,
     ) -> None:
         super().__init__()
         assert (
@@ -1034,7 +1034,12 @@ def __init__(
         self.enable_flash_attention = enable_flash_attention
         self.enable_jit_fused = enable_jit_fused
         self.enable_sequence_parallelism = enable_sequence_parallelism
-        self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.tp_size, self.sp_size)
+        if dp_outside:
+            self.dp_axis, self.pp_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3
+            self.pg_mesh = ProcessGroupMesh(self.dp_size, self.pp_size, self.tp_size, self.sp_size)
+        else:
+            self.pp_axis, self.dp_axis, self.tp_axis, self.sp_axis = 0, 1, 2, 3
+            self.pg_mesh = ProcessGroupMesh(self.pp_size, self.dp_size, self.tp_size, self.sp_size)
         self.stage_manager = None
         self.schedule = None
         self.custom_policy = custom_policy
@@ -1048,7 +1053,7 @@ def __init__(
             assert self.zero_stage <= 1, "zero stage must be 0 or 1 when using pipeline parallelism"
             self.stage_manager = PipelineStageManager(
                 self.pg_mesh,
-                pipeline_axis=PP_AXIS,
+                pipeline_axis=self.pp_axis,
                 enable_interleave=pp_style == "interleaved",
                 num_model_chunks=num_model_chunks,
             )
@@ -1072,13 +1077,13 @@ def __init__(
             else:
                 raise NotImplementedError()

-        self.tp_group = self.pg_mesh.get_group_along_axis(TP_AXIS)
-        self.dp_group = self.pg_mesh.get_group_along_axis(DP_AXIS)
-        self.pp_group = self.pg_mesh.get_group_along_axis(PP_AXIS)
+        self.tp_group = self.pg_mesh.get_group_along_axis(self.tp_axis)
+        self.dp_group = self.pg_mesh.get_group_along_axis(self.dp_axis)
+        self.pp_group = self.pg_mesh.get_group_along_axis(self.pp_axis)
         if self.enable_sequence_parallelism and self.sequence_parallelism_mode in ["split_gather", "ring"]:
-            self.sp_group = self.pg_mesh.get_group_along_axis(TP_AXIS)
+            self.sp_group = self.pg_mesh.get_group_along_axis(self.tp_axis)
         else:
-            self.sp_group = self.pg_mesh.get_group_along_axis(SP_AXIS)
+            self.sp_group = self.pg_mesh.get_group_along_axis(self.sp_axis)

         self.shard_config = ShardConfig(
             tensor_parallel_process_group=self.tp_group,
@@ -1169,7 +1174,7 @@ def configure(
             and self.sequence_parallelism_mode == "all_to_all"
         )
         if self.enable_sequence_parallelism and self.sequence_parallelism_mode == "all_to_all":
-            dp_group = self.pg_mesh.create_group_along_axis([DP_AXIS, SP_AXIS])
+            dp_group = self.pg_mesh.create_group_along_axis([self.dp_axis, self.sp_axis])
         else:
             dp_group = self.dp_group
         model = HybridParallelModule(
@@ -1317,7 +1322,10 @@ def prepare_dataloader(
         _kwargs = kwargs.copy()
         distributed_sampler_cls = distributed_sampler_cls or DistributedSampler
         sampler = distributed_sampler_cls(
-            dataset, num_replicas=self.pg_mesh.size(DP_AXIS), rank=self.pg_mesh.coordinate(DP_AXIS), shuffle=shuffle
+            dataset,
+            num_replicas=self.pg_mesh.size(self.dp_axis),
+            rank=self.pg_mesh.coordinate(self.dp_axis),
+            shuffle=shuffle,
         )

         # Deterministic dataloader
```
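
The behavioural change above is the new `dp_outside` flag: it decides whether the data-parallel or the pipeline-parallel dimension is the outermost axis of the 4-D `ProcessGroupMesh`, and all axis lookups (process groups, dataloader sampler) now go through `self.dp_axis`/`self.pp_axis`/`self.tp_axis`/`self.sp_axis` instead of the removed module-level constants. A minimal usage sketch; the launcher call, world size, and parallel sizes are illustrative assumptions, and building the groups requires an actual distributed launch:

```python
# Sketch only: assumes 8 ranks started via a distributed launcher, e.g.
#   colossalai run --nproc_per_node 8 this_script.py
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin

colossalai.launch_from_torch(config={})  # empty config dict, per the commit-era API

# Default (dp_outside=True): mesh axes are (dp, pp, tp, sp); with tp=2 and pp=2
# on 8 ranks the derived dp size is 2 and data parallelism is the outermost axis.
plugin = HybridParallelPlugin(tp_size=2, pp_size=2, dp_outside=True)

# New option (dp_outside=False): mesh axes become (pp, dp, tp, sp), i.e. pipeline
# parallelism wraps data parallelism. Dataloaders prepared through the plugin
# still shard by the data-parallel axis, now resolved via self.dp_axis.
plugin_pp_outer = HybridParallelPlugin(tp_size=2, pp_size=2, dp_outside=False)

booster = Booster(plugin=plugin)
```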

examples/language/llama2/README.md

Lines changed: 3 additions & 114 deletions
````diff
@@ -1,4 +1,4 @@
-# Pretraining LLaMA-1/2: best practices for building LLaMA-1/2-like base models
+# Pretraining LLaMA-1/2/3: best practices for building LLaMA-1/2/3-like base models

 ### LLaMA2
 <p align="center">
@@ -16,38 +16,10 @@
 - 65-billion-parameter large model pretraining accelerated by 38%
 [[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining)

-## Dataset
-
-Different from the original LLaMA, we use [RedPajama](https://www.together.xyz/blog/redpajama) dataset, which is a reproduction of the LLaMA training dataset containing over 1.2 trillion tokens. The full dataset is ~5TB unzipped on disk and ~3TB to download compressed.
-
-A smaller, more consumable random sample can be downloaded through [Hugging Face](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T). If you just want to try out the pretraining script, you can use a 1B-token sample subset of RedPajama, which is available at [Hugging Face](https://huggingface.co/datasets/togethercomputer/RedPajama-Data-1T-Sample).
-
-RedPajama-Data-1T consists of seven data slices:
-
-|               | RedPajama    | LLaMA         |
-|---------------|--------------|---------------|
-| CommonCrawl   | 878 billion  | 852 billion   |
-| C4            | 175 billion  | 190 billion   |
-| Github        | 59 billion   | 100 billion   |
-| Books         | 26 billion   | 25 billion    |
-| ArXiv         | 28 billion   | 33 billion    |
-| Wikipedia     | 24 billion   | 25 billion    |
-| StackExchange | 20 billion   | 27 billion    |
-| Total         | 1.2 trillion | 1.25 trillion |
-
-## Training
-
-We follow the hyperparameter settings from the original LLaMA paper. We use AdamW with $beta1=0.9$ and $beta2=0.95$. We use a cosine learning rate schedule, such that the final learning rate is equal to 10% of the maximal learning rate. We use a weight decay of 0.1 and gradient clipping of 1.0. We use 2,000 warmup steps.
-
-| params | learning rate | batch size |
-|--------|---------------|------------|
-| 6.7B   | 3.0e-4        | 4M         |
-| 13.0B  | 3.0e-4        | 4M         |
-| 32.5B  | 1.5e-4        | 4M         |
-| 65.2B  | 1.5e-4        | 4M         |
-
 ## Usage

+> ⚠ This example only has benchmarking script. For training/finetuning, please refer to the [applications/Colossal-LLaMA](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA).
+
 ### 1. Installation

 Please install the latest ColossalAI from source.
@@ -62,52 +34,6 @@ Then install other dependencies.
 pip install -r requirements.txt
 ```

-Additionally, we recommend you to use torch 1.13.1. We've tested our code on torch 1.13.1 and found it's compatible with our code and flash attention.
-
-### 2. Download the dataset
-
-The dataset can be automatically downloaded by using `huggingface/datasets`. You can specify the dataset path by `-d` or `--dataset`. The default dataset is `togethercomputer/RedPajama-Data-1T-Sample`.
-
-### 3. Command line arguments
-
-Yon can use colossalai run to launch multi-nodes training:
-```bash
-colossalai run --nproc_per_node YOUR_GPU_PER_NODE --hostfile YOUR_HOST_FILE \
-pretrain.py --OTHER_CONFIGURATIONS
-```
-
-Here is a sample hostfile:
-
-```text
-hostname1
-hostname2
-hostname3
-hostname4
-```
-
-Make sure master node can access all nodes (including itself) by ssh without password.
-
-Here is details about CLI arguments:
-
-- Model configuration: `-c`, `--config`. `7b`, `13b`, `30b` and `65b` are supported for LLaMA-1, `7b`, `13b`, and `70b` are supported for LLaMA-2.
-- Booster plugin: `-p`, `--plugin`. `gemini`, `gemini_auto`, `zero2`, `hybrid_parallel` and `zero2_cpu` are supported. For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins).
-- Dataset path: `-d`, `--dataset`. The default dataset is `togethercomputer/RedPajama-Data-1T-Sample`. It support any dataset from `datasets` with the same data format as RedPajama.
-- Number of epochs: `-e`, `--num_epochs`. The default value is 1.
-- Local batch size: `-b`, `--batch_size`. Batch size per GPU. The default value is 2.
-- Learning rate: `--lr`. The default value is 3e-4.
-- Weight decay: `-w`, `--weight_decay`. The default value is 0.1.
-- Warmup steps: `-s`, `--warmup_steps`. The default value is 2000.
-- Gradient checkpointing: `-g`, `--gradient_checkpoint`. The default value is `False`. This saves memory at the cost of speed. You'd better enable this option when training with a large batch size.
-- Max length: `-l`, `--max_length`. The default value is 4096.
-- Mixed precision: `-x`, `--mixed_precision`. The default value is "fp16". "fp16" and "bf16" are supported.
-- Save interval: `-i`, `--save_interval`. The interval (steps) of saving checkpoints. The default value is 1000.
-- Checkpoint directory: `-o`, `--save_dir`. The directory path to save checkpoints. The default value is `checkpoint`.
-- Checkpoint to load: `-f`, `--load`. The checkpoint path to load. The default value is `None`.
-- Gradient clipping: `--gradient_clipping`. The default value is 1.0.
-- Tensorboard log directory: `-t`, `--tensorboard_dir`. The directory path to save tensorboard logs. The default value is `tb_logs`.
-- Flash attention: `-a`, `--flash_attention`. If you want to use flash attention, you must install `flash-attn`. The default value is `False`. This is helpful to accelerate training while saving memory. We recommend you always use flash attention.
-
-
 ### 4. Shell Script Examples

 For your convenience, we provide some shell scripts to run benchmark with various configurations.
@@ -193,40 +119,3 @@ If you run the above command successfully, you will get the following results:
 year={2023}
 }
 ```
-
-
-# Fine-tune Llama2
-
-We also provide a example to fine-tune llama2 in `finetune.py`,
-
-Make sure master node can access all nodes (including itself) by ssh without password.
-
-Here is details about CLI arguments:
-
-- Pretrained checkpoint path: `--model_path`, the path of your model checkpoint, it can be your local directory or a Hugging Face tag.
-- Booster plugin: `-p`, `--plugin`. `gemini`, `gemini_auto`, `zero2`, `hybrid_parallel` and `zero2_cpu` are supported. For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins).
-- Dataset path: `-d`, `--dataset`. The default dataset is `yizhongw/self_instruct`. It support any dataset from `datasets` with the same data format as `yizhongw/self_instruct`.
-- task name: `--task_name`, the task to fine-tune, it's also related to the target of loading dataset, The default value is `super_natural_instructions`.
-- Number of epochs: `-e`, `--num_epochs`. The default value is 1.
-- Local batch size: `-b`, `--batch_size`. Batch size per GPU. The default value is 2.
-- Learning rate: `--lr`. The default value is 3e-4.
-- Weight decay: `-w`, `--weight_decay`. The default value is 0.1.
-- Gradient checkpointing: `-g`, `--gradient_checkpoint`. The default value is `False`. This saves memory at the cost of speed. You'd better enable this option when training with a large batch size.
-- Max length: `-l`, `--max_length`. The default value is 4096.
-- Mixed precision: `-x`, `--mixed_precision`. The default value is "fp16". "fp16" and "bf16" are supported.
-- Save interval: `-i`, `--save_interval`. The interval (steps) of saving checkpoints. The default value is 1000.
-- Checkpoint directory: `-o`, `--save_dir`. The directory path to save checkpoints. The default value is `checkpoint`.
-- Checkpoint to load: `-f`, `--load`. The checkpoint path to load. The default value is `None`.
-- Gradient clipping: `--gradient_clipping`. The default value is 1.0.
-- Tensorboard log directory: `-t`, `--tensorboard_dir`. The directory path to save tensorboard logs. The default value is `tb_logs`.
-- Flash attention: `-a`, `--flash_attention`. If you want to use flash attention, you must install `flash-attn`. The default value is `False`. This is helpful to accelerate training while saving memory. We recommend you always use flash attention.
-
-
-```shell
-torchrun --standalone --nproc_per_node 8 finetune.py \
-    --plugin "hybrid_parallel" \
-    --dataset "yizhongw/self_instruct" \
-    --model_path "/path/llama" \
-    --task_name "super_natural_instructions" \
-    --save_dir "/path/output"
-```
````

examples/language/llama2/attn.py

Lines changed: 0 additions & 1 deletion
This file was deleted.
