Commit a1964bc

[NVIDIA#9643][fix] AutoDeploy: fix nano sharding config (NVIDIA#9668)
Signed-off-by: Lucas Liebenwein <[email protected]>
1 parent d9fba85 commit a1964bc

2 files changed, 20 insertions(+), 37 deletions(-)
examples/auto_deploy/nano_v3.yaml

Lines changed: 20 additions & 1 deletion
@@ -13,8 +13,27 @@ kv_cache_config:
   enable_block_reuse: false
 transforms:
   detect_sharding:
-    sharding_source: ['factory', 'heuristic']
     sharding_dims: ['ep', 'bmm']
+    manual_config:
+      head_dim: 128
+      tp_plan:
+        # mamba SSM layer
+        "in_proj": "mamba"
+        "out_proj": "rowwise"
+        # attention layer
+        "q_proj": "colwise"
+        "k_proj": "colwise"
+        "v_proj": "colwise"
+        "o_proj": "rowwise"
+        # NOTE: consider not sharding shared experts and/or
+        # latent projections at all, keeping them replicated.
+        # To do so, comment out the corresponding entries.
+        # moe layer: SHARED experts
+        "up_proj": "colwise"
+        "down_proj": "rowwise"
+        # MoLE: latent projections: simple shard
+        "fc1_latent_proj": "gather"
+        "fc2_latent_proj": "gather"
   multi_stream_moe:
     stage: compile
     enabled: true
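
Each tp_plan entry above keys a projection-module name to a sharding style ("colwise", "rowwise", "gather", or "mamba"). For illustration only, a minimal Python sketch of how such a suffix-keyed plan could be resolved against fully qualified module names follows; the resolve_tp_style helper and the sample module names are assumptions made for the sketch, not AutoDeploy's actual sharding code.

from typing import Optional

# Suffix-keyed plan mirroring manual_config.tp_plan from nano_v3.yaml above.
NANO_V3_TP_PLAN = {
    "in_proj": "mamba",
    "out_proj": "rowwise",
    "q_proj": "colwise",
    "k_proj": "colwise",
    "v_proj": "colwise",
    "o_proj": "rowwise",
    "up_proj": "colwise",
    "down_proj": "rowwise",
    "fc1_latent_proj": "gather",
    "fc2_latent_proj": "gather",
}


def resolve_tp_style(module_name: str, tp_plan: dict) -> Optional[str]:
    # Match the last dotted component of the module name against the plan keys;
    # None means "no entry", i.e. leave the weight replicated.
    leaf = module_name.rsplit(".", 1)[-1]
    return tp_plan.get(leaf)


if __name__ == "__main__":
    # Hypothetical module names, only to demonstrate the lookup behavior.
    for name in (
        "backbone.layers.3.mixer.in_proj",
        "backbone.layers.7.self_attn.q_proj",
        "backbone.layers.9.mlp.some_other_proj",
    ):
        print(name, "->", resolve_tp_style(name, NANO_V3_TP_PLAN))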

tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py

Lines changed: 0 additions & 36 deletions
@@ -190,42 +190,6 @@ def get_model_from_config_patched(config, **kwargs):
 # TODO: figure out how this can be incorporated into the export patch system
 AutoModelForCausalLM.from_config = get_model_from_config_patched

-# _config_from_pretrained_original = AutoConfig.from_pretrained
-# _nemotron_h_base_model_tp_plan = {
-#     # mamba SSM layer
-#     "in_proj": "mamba",
-#     "out_proj": "rowwise",
-#     # attention layer
-#     "q_proj": "colwise",
-#     "k_proj": "colwise",
-#     "v_proj": "colwise",
-#     "o_proj": "rowwise",
-#     # NOTE: consider not sharding shared experts and/or
-#     # latent projections at all, keeping them replicated.
-#     # To do so, comment out the corresponding entries.
-#     # moe layer: SHARED experts
-#     "up_proj": "colwise",
-#     "down_proj": "rowwise",
-#     # MoLE: latent projections: simple shard
-#     "fc1_latent_proj": "gather",
-#     "fc2_latent_proj": "gather",
-# }
-
-
-# def get_config_from_pretrained_patched(*args, **kwargs):
-#     ret = _config_from_pretrained_original(*args, **kwargs)
-#     config = ret[0] if isinstance(ret, tuple) else ret
-#     # heuristic to check if it's a NemotronH MoE Model
-#     model_type = getattr(config, "model_type", None)
-#     num_moe_layers = getattr(config, "layers_block_type", []).count("moe")
-#     if model_type == "nemotron_h" and num_moe_layers > 0:
-#         config.base_model_tp_plan = _nemotron_h_base_model_tp_plan
-#     return (config, *ret[1:]) if isinstance(ret, tuple) else config
-
-
-# # TODO: figure out how this can be incorporated into the export patch system
-# AutoConfig.from_pretrained = get_config_from_pretrained_patched
-
 # TODO: figure out how this can be incorporated into the export patch system
 # Only patch if the module isn't available
 _mamba_ssm_module = "mamba_ssm"
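
The removed block was a commented-out draft that would have attached the same tp_plan by patching AutoConfig.from_pretrained, detecting NemotronH MoE checkpoints with a heuristic on the Hugging Face config; that plan now lives in nano_v3.yaml's detect_sharding.manual_config instead. Below is a standalone sketch of that detection heuristic, exercised on hypothetical minimal config objects rather than real checkpoints.

from types import SimpleNamespace


def is_nemotron_h_moe(config) -> bool:
    # Heuristic from the removed draft: a NemotronH MoE model is identified by its
    # model_type plus at least one "moe" entry in layers_block_type.
    model_type = getattr(config, "model_type", None)
    num_moe_layers = getattr(config, "layers_block_type", []).count("moe")
    return model_type == "nemotron_h" and num_moe_layers > 0


if __name__ == "__main__":
    # Hypothetical minimal configs, only to exercise the check.
    moe_cfg = SimpleNamespace(
        model_type="nemotron_h",
        layers_block_type=["mamba", "attention", "moe"],
    )
    dense_cfg = SimpleNamespace(
        model_type="nemotron_h",
        layers_block_type=["mamba", "attention", "mlp"],
    )
    print(is_nemotron_h_moe(moe_cfg))    # True
    print(is_nemotron_h_moe(dense_cfg))  # False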
