
Commit 9f824ef

simplify the example configs to be more minimal and less daunting (axolotl-ai-cloud#2486) [skip ci]
* simplify the example configs to be more minimal and less daunting
* drop empty s2_attention from example yamls
1 parent dd66fb1 commit 9f824ef

File tree: 101 files changed, +14 −1140 lines

Some content is hidden: large commits have some files collapsed by default, so only a subset of the 101 changed files appears below.

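Every file in this commit follows the same pattern: keys that were left empty or set to their defaults (fp16:, debug:, deepspeed:, local_rank:, xformers_attention:, fsdp:, fsdp_config:, and the empty s2_attention:) are deleted, so the examples keep only the options that actually matter. As a rough sketch of what one of the trimmed LoRA examples ends up looking like (values taken from the retained context lines in the diffs below; the base_model, adapter, and datasets entries are placeholders, not the exact file contents):

base_model: codellama/CodeLlama-7b-hf   # placeholder, not copied from the diff
adapter: lora                           # assumed for a LoRA example
datasets:
  - path: some/dataset                  # placeholder dataset
    type: alpaca

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false
gradient_checkpointing: true
flash_attention: true
logging_steps: 1

warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"

A config in this shape is typically launched with axolotl's CLI, e.g. accelerate launch -m axolotl.cli.train examples/code-llama/7b/lora.yml (check the project README for the current entry point).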

examples/cerebras/btlm-ft.yml

Lines changed: 0 additions & 10 deletions
@@ -8,9 +8,6 @@ tokenizer_type: GPT2Tokenizer
 trust_remote_code: true
 tokenizer_use_fast: true
 tokenizer_legacy: true
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
@@ -34,7 +31,6 @@ lora_alpha:
 lora_dropout:
 lora_target_modules:
 lora_target_linear:
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -58,16 +54,12 @@ learning_rate: 0.000085
 train_on_inputs: true
 group_by_length: false
 bf16: auto
-fp16:
 tf32: true

 gradient_checkpointing: false
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1

-xformers_attention:
 flash_attention: true
 sdp_attention:
 flash_optimum:
@@ -80,8 +72,6 @@ evals_per_epoch: 4
 saves_per_epoch: 1
 save_total_limit:

-debug:
-deepspeed:
 weight_decay: 0.1
 special_tokens:
   pad_token: "<|endoftext|>"
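Most of the deleted lines are keys with no value at all. In YAML an empty value parses to null, so entries such as fp16: or debug: never set anything; dropping them only turns an explicit null into an absent key, which the config loader is assumed to treat the same way. A small illustration of the equivalent spellings:

fp16:         # empty value, loads as null (None in Python)
debug: null   # explicit null, same result
deepspeed: ~  # YAML shorthand for null, same result
# Omitting the key entirely leaves it unset, so the trainer falls back to
# its default, which is assumed to match the old explicit-null behaviour.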

examples/cerebras/qlora.yml

Lines changed: 0 additions & 10 deletions
@@ -22,7 +22,6 @@ lora_target_modules:
   - c_attn
   - c_proj
 lora_target_linear:
-lora_fan_in_fan_out:
 wandb_project:
 wandb_entity:
 wandb_watch:
@@ -36,15 +35,10 @@ optimizer: paged_adamw_8bit
 torchdistx_path:
 lr_scheduler: cosine
 learning_rate: 0.0002
-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: true
 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
 xformers_attention: true
 flash_attention:
@@ -53,10 +47,6 @@ gptq_model_v1:
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.1
-fsdp:
-fsdp_config:
 special_tokens:
   pad_token: "<|endoftext|>"

examples/code-llama/13b/lora.yml

Lines changed: 0 additions & 12 deletions
@@ -26,7 +26,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -41,29 +40,18 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"

examples/code-llama/13b/qlora.yml

Lines changed: 0 additions & 12 deletions
@@ -26,9 +26,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -43,28 +41,18 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"

examples/code-llama/34b/lora.yml

Lines changed: 0 additions & 12 deletions
@@ -26,7 +26,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -41,29 +40,18 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"

examples/code-llama/34b/qlora.yml

Lines changed: 0 additions & 12 deletions
@@ -26,9 +26,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -43,28 +41,18 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"

examples/code-llama/7b/lora.yml

Lines changed: 0 additions & 12 deletions
@@ -26,7 +26,6 @@ lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -41,29 +40,18 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true
-s2_attention:

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"

examples/code-llama/7b/qlora.yml

Lines changed: 0 additions & 12 deletions
@@ -26,9 +26,7 @@ pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules:
 lora_target_linear: true
-lora_fan_in_fan_out:

 wandb_project:
 wandb_entity:
@@ -43,28 +41,18 @@ optimizer: paged_adamw_32bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:
   bos_token: "<s>"
   eos_token: "</s>"

examples/cohere/command-r-7b-qlora.yml

Lines changed: 0 additions & 12 deletions
@@ -44,28 +44,16 @@ optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: true

 gradient_checkpointing: true
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_ratio: 0.1
 evals_per_epoch:
-eval_table_size:
-eval_max_new_tokens: 128
 saves_per_epoch: 1
-debug:
-deepspeed:
 weight_decay: 0.0
-fsdp:
-fsdp_config:
 special_tokens:

examples/dbrx/16bit-lora.yaml

Lines changed: 1 addition & 10 deletions
@@ -3,9 +3,6 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name

 trust_remote_code: true
-
-load_in_8bit: false
-load_in_4bit: false
 strict: false

 datasets:
@@ -48,26 +45,20 @@ optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002

-train_on_inputs: false
-group_by_length: false
 bf16: auto
-fp16:
 tf32: false

 gradient_checkpointing: false # don't use with fsdp_activation_checkpointing
 gradient_checkpointing_kwargs:
   use_reentrant: false
-early_stopping_patience:
 resume_from_checkpoint:
-local_rank:
 logging_steps: 1
-xformers_attention:
 flash_attention: true

 warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
-debug:
+
 weight_decay: 0.0
 fsdp:
   - full_shard
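The dbrx example keeps its FSDP settings, and the retained comment on gradient_checkpointing ("don't use with fsdp_activation_checkpointing") points at the FSDP config handling activation checkpointing instead. A hypothetical sketch of that block, using key names from axolotl's fsdp_config options rather than the exact contents of 16bit-lora.yaml:

gradient_checkpointing: false         # checkpointing is delegated to FSDP below
fsdp:
  - full_shard
fsdp_config:
  fsdp_activation_checkpointing: true   # assumed; replaces gradient_checkpointing under FSDP
  fsdp_offload_params: false            # assumed default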
