---
# SFT Algorithm Configuration
# Training-loop settings for the supervised fine-tuning run.
sft:
  ## total number of steps to train will equal
  ## min((max_num_epochs * len(train_dataloader)), max_num_steps)
  max_num_epochs: 1
  max_num_steps: 60

  # Validation settings.
  # NOTE(review): presumably val_period is counted in training steps and
  # val_batches caps the number of batches per validation pass; confirm
  # against the consuming trainer.
  val_period: 10
  val_batches: 8
  val_global_batch_size: 32
  val_micro_batch_size: 1
  val_at_start: true

  seed: 42
# Checkpoint saving settings.
checkpointing:
  enabled: true
  checkpoint_dir: "results/sft"
  ## set metric_name to null to save most recent k checkpoints
  metric_name: "val_loss"
  higher_is_better: false
  keep_top_k: 3
  save_period: 10
  checkpoint_must_save_by: null
# Policy (model) configuration: model/tokenizer identity, batch sizes,
# parallelism backends (dtensor_cfg / megatron_cfg) and optimizer settings.
# Other sections refer back to these values through ${policy.*} interpolations.
policy:
  model_name: "meta-llama/Llama-3.2-1B"
  tokenizer:
    ## specify name if you'd like to use a tokenizer different from the model's default
    name: ${policy.model_name}
    # chat_template can be a Jinja template string or path to a .jinja file
    chat_template: "{% for message in messages %}{%- if message['role'] == 'system' %}{{'Context: ' + message['content'].strip()}}{%- elif message['role'] == 'user' %}{{' Question: ' + message['content'].strip() + ' Answer:'}}{%- elif message['role'] == 'assistant' %}{{' ' + message['content'].strip()}}{%- endif %}{% endfor %}"
  train_global_batch_size: 32
  train_micro_batch_size: 1
  max_total_sequence_length: 1024
  precision: "bfloat16"

  dtensor_cfg:
    enabled: true
    cpu_offload: false
    sequence_parallel: false
    activation_checkpointing: false
    tensor_parallel_size: 1
    context_parallel_size: 1
    custom_parallel_plan: null

  dynamic_batching:
    enabled: false
    # token budget per micro-batch = max_total_sequence_length * train_micro_batch_size
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    sequence_length_round: 64

  sequence_packing:
    enabled: false
    # token budget per micro-batch = max_total_sequence_length * train_micro_batch_size
    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
    algorithm: "modified_first_fit_decreasing"
    sequence_length_round: 64

  # makes the training sequence length divisible by the tensor parallel size
  # this is useful for sequence parallel training
  make_sequence_length_divisible_by: ${policy.dtensor_cfg.tensor_parallel_size}
  max_grad_norm: 1.0

  optimizer:
    name: "torch.optim.AdamW"
    kwargs:
      lr: 5.0e-6
      weight_decay: 0.1
      betas: [0.9, 0.98]
      # exponent literals keep an explicit mantissa dot so that YAML 1.1
      # loaders (e.g. PyYAML) resolve them as floats rather than strings
      eps: 1.0e-5
      # when using Dtensor, we need to set foreach
      # and fused to false
      foreach: false
      fused: false

  ## ignored since enabled=false, but needed for testing purposes
  megatron_cfg:
    enabled: false
    empty_unused_memory_level: 1
    activation_checkpointing: false
    tensor_model_parallel_size: 1
    expert_tensor_parallel_size: 1
    expert_model_parallel_size: 1
    pipeline_model_parallel_size: 1
    context_parallel_size: 1
    pipeline_dtype: ${policy.precision}
    num_layers_in_first_pipeline_stage: null
    num_layers_in_last_pipeline_stage: null
    sequence_parallel: false
    freeze_moe_router: false
    moe_router_dtype: null
    moe_router_load_balancing_type: "aux_loss"
    moe_router_bias_update_rate: 1.0e-3
    moe_permute_fusion: false
    # gives ~20% training perf speedup with sequence packing
    apply_rope_fusion: true

    optimizer:
      optimizer: "adam"
      lr: 5.0e-6
      min_lr: 4.9999e-6
      weight_decay: 0.1
      bf16: false
      fp16: false
      params_dtype: "float32"

      # adam
      adam_beta1: 0.9
      adam_beta2: 0.98
      adam_eps: 1.0e-5

      # sgd
      sgd_momentum: 0.9

      # distributed optimizer
      use_distributed_optimizer: true
      use_precision_aware_optimizer: true

      # reuses the policy-level gradient clipping threshold
      clip_grad: ${policy.max_grad_norm}

    scheduler:
      # start and end weight decay both mirror the optimizer's weight_decay
      start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      weight_decay_incr_style: "constant"
      lr_decay_style: "constant"
      lr_decay_iters: 1000
      lr_warmup_iters: 50
      lr_warmup_init: 4.9999e-6

    distributed_data_parallel_config:
      grad_reduce_in_fp32: false
      overlap_grad_reduce: true
      overlap_param_gather: true
      average_in_collective: true
      data_parallel_sharding_strategy: "optim_grads_params"
      use_custom_fsdp: false
# Dataset and data-loading configuration.
data:
  # input length limit follows the policy's total sequence length
  max_input_seq_length: ${policy.max_total_sequence_length}
  add_bos: true
  add_eos: true
  add_generation_prompt: false
  shuffle: true
  num_workers: 1

  dataset_name: "squad"
  # You can use custom response datasets for training and validation. For example:
  # data:
  #   dataset_name: ResponseDataset
  #   train_data_path: <PathToTrainingDataset>  # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace)
  #   val_data_path: <PathToValidationDataset>
  #   input_key: <QuestionKey>, default is "input"
  #   output_key: <AnswerKey>, default is "output"
  #   train_split: <TrainSplit>, default is None  # used for HuggingFace datasets
  #   val_split: <ValSplit>, default is None  # used for HuggingFace datasets
  # See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/sft.md#datasets for more details.

  ## unused with squad dataset
  prompt_file: null
  split: null
  output_key: null
  seed: null

  ## OpenAI format specific configs
  # train_data_path: "/path/to/train.jsonl"  # Path to training data
  # val_data_path: "/path/to/val.jsonl"  # Path to validation data
  # chat_key: "messages"  # Key for messages in the data
  # system_key: null  # Key for system message (optional)
  # system_prompt: null  # Default system prompt (optional)
  # tool_key: "tools"  # Key for tools in the data
  # use_preserving_dataset: false  # If true, uses PreservingDataset to preserve heterogeneous schemas (e.g., tool calls with varying argument structures)
# Logging configuration: which backends are enabled plus per-backend settings.
logger:
  log_dir: "logs"  # Base directory for all logs
  wandb_enabled: true  # Make sure you do a `wandb login [Your API key]` before running
  tensorboard_enabled: true
  mlflow_enabled: false
  swanlab_enabled: false  # Disable SwanLab logging
  monitor_gpus: true  # If true, will monitor GPU usage and log to wandb and/or tensorboard
  # Run and directory names below embed the dataset name via ${data.dataset_name}.
  wandb:
    project: "sft-dev"
    name: "sft-dev-${data.dataset_name}"
  tensorboard:
    log_dir: "tb_logs-sft-dev-${data.dataset_name}"
  mlflow:
    experiment_name: "sft-dev"
    run_name: "sft-dev-${data.dataset_name}"
  gpu_monitoring:
    collection_interval: 10  # How often to collect GPU usage metrics (in seconds)
    flush_interval: 10  # How often to flush GPU usage metrics to the loggers (in seconds)
# Compute resources for the run: GPUs per node and number of nodes.
cluster:
  gpus_per_node: 1
  num_nodes: 1