Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions torchtitan/experiments/autoparallel/local_map_deepseek_v3/args.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,3 +47,31 @@ def get_sample_config() -> DeepSeekV3ModelArgs:
v_head_dim=128,
mscale=0.70,
)


def get_16b_sdpa_config() -> DeepSeekV3ModelArgs:
    """Return the hard-coded ``DeepSeekV3ModelArgs`` preset for "16b_sdpa".

    Takes no arguments; every field below is a fixed literal. The MoE
    sub-config is built first and then attached as ``moe_args``.
    """
    # Mixture-of-experts settings: 64 experts, 2 shared experts, top-6
    # routing with softmax scoring. ``mesh`` is left unset (None) here.
    moe = _MoEArgs(
        num_experts=64,
        num_shared_experts=2,
        top_k=6,
        score_func="softmax",
        route_norm=False,
        score_before_experts=False,
        mesh=None,
    )

    preset = DeepSeekV3ModelArgs(
        vocab_size=102400,
        max_seq_len=4096,
        dim=2048,
        inter_dim=10944,
        moe_inter_dim=1408,
        n_layers=27,
        n_dense_layers=1,
        n_heads=16,
        moe_args=moe,
        q_lora_rank=0,
        kv_lora_rank=512,
        qk_nope_head_dim=128,
        qk_rope_head_dim=64,
        v_head_dim=128,
        mscale=0.70,
    )
    return preset
18 changes: 11 additions & 7 deletions torchtitan/experiments/autoparallel/local_map_deepseek_v3/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@

# Need to share same base class with torchtitan models
class DeepSeekV3Model(_DeepSeekV3Model, BaseModel):
    """DeepSeek-V3 model that also derives from ``BaseModel``.

    The class combines ``_DeepSeekV3Model`` with ``BaseModel`` so that it
    shares a base class with the other torchtitan models.
    """

    def __init__(self, config: DeepSeekV3ModelArgs):
        """Initialize the model from ``config``.

        Args:
            config: Model arguments forwarded unchanged to
                ``_DeepSeekV3Model.__init__``.
        """
        # Call _DeepSeekV3Model.__init__, which calls nn.Module.__init__.
        # BaseModel.__init__ is deliberately NOT called separately because:
        #   1. nn.Module.__init__() is already called by
        #      _DeepSeekV3Model.__init__.
        #   2. Calling BaseModel.__init__ afterwards would reset all module
        #      state (nn.Module.__init__ clears _modules, _parameters, etc.).
        _DeepSeekV3Model.__init__(self, config)

    def verify_module_protocol(self) -> None:
        """Intentionally do nothing (the protocol check is skipped)."""
        # Autoparallel submodules are standard nn.Modules,
        # not torchtitan Module instances — skip the check.
        pass


# Wire Configurable pattern: build() calls DeepSeekV3Model(config=...).
# The args class is pointed at its model class here, at module import time,
# by setting the `_owner` attribute.
# NOTE(review): a reviewer suggested defining DeepSeekV3ModelArgs inside
# DeepSeekV3Model instead, which would make this manual wiring unnecessary
# — confirm before relying on this assignment long-term.
DeepSeekV3ModelArgs._owner = DeepSeekV3Model
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The "proper" way is to define DeepSeekV3ModelArgs inside DeepSeekV3Model; then you don't need this wiring.

Loading