apps/sft/deepseek_v3.yaml: 70 changes (70 additions, 0 deletions)
@@ -0,0 +1,70 @@
# >>> python -m apps.sft.main --config apps/sft/deepseek_v3.yaml


# TODO: required by torchtitan
# https://github.com/pytorch/torchtitan/blob/2f1c814da071cc8ad165d00be6f9c1a66f8e1cce/torchtitan/distributed/utils.py#L265
comm:
  trace_buf_size: 0

model_name: "deepseek-ai/DeepSeek-V3.1-Base"

model:
  name: deepseek_v3
  flavor: 16B
  hf_assets_path: hf://${model_name}

processes:
  procs: 8
  with_gpus: true

optimizer:
  name: AdamW
  lr: 1e-5
  eps: 1e-8

lr_scheduler:
  warmup_steps: 200

training:
  local_batch_size: 1
  seq_len: 2048
  max_norm: 1.0
  steps: 1000
  compile: false
  dataset: "c4"

parallelism:
  data_parallel_replicate_degree: 1
  data_parallel_shard_degree: -1
  tensor_parallel_degree: 1
Comment on lines +38 to +39
Based on a conversation with @tianyu-l, this should work for 32 nodes.

Suggested change:
-  data_parallel_shard_degree: -1
-  tensor_parallel_degree: 1
+  data_parallel_shard_degree: -1
+  tensor_parallel_degree: 8

It may also be worth looking at some of the runs here for reference.

Not sure of the latest status of ETP in titan, but it'd be worth confirming that as well.
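
For context, a minimal sketch of how the suggested degrees would compose on 32 nodes, assuming 8 GPUs per node (256 ranks total; the per-node GPU count is an assumption, not stated in the comment). In torchtitan, `data_parallel_shard_degree: -1` means the FSDP shard degree is inferred from whatever portion of the world size is left after the other degrees:

```yaml
# Hypothetical layout: 32 nodes x 8 GPUs = 256 ranks (node/GPU counts assumed).
parallelism:
  data_parallel_replicate_degree: 1
  data_parallel_shard_degree: -1   # inferred: 256 / (TP = 8) = 32 FSDP shards
  tensor_parallel_degree: 8
```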

  pipeline_parallel_degree: 1
  context_parallel_degree: 1
  expert_parallel_degree: 1
  disable_loss_parallel: false

checkpoint:
  enable: true
  folder: ./checkpoint # The folder to save checkpoints to.
  initial_load_path: hf://${model_name} # The path to load the initial checkpoint from. Ignored if `folder` exists.
  initial_load_in_hf: true # If true, interpret initial_load_path as a HuggingFace model repo
  last_save_in_hf: true
  interval: 500
  async_mode: "disabled"

activation_checkpoint:
  mode: selective
  selective_ac_option: op

metric_logging:
  wandb:
    project: sft-training
    group: sft_exp_${oc.env:USER}
    logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce

# profiling:
#   enable_profiling: false

# metrics:
#   log_freq: 10
#   enable_tensorboard: true
#   save_tb_folder: "tb"