Commit a1964bc

[NVIDIA#9643][fix] AutoDeploy: fix nano sharding config (NVIDIA#9668)
Signed-off-by: Lucas Liebenwein <[email protected]>
1 parent d9fba85 commit a1964bc

2 files changed, 20 insertions(+), 37 deletions(-)
examples/auto_deploy/nano_v3.yaml

Lines changed: 20 additions & 1 deletion
@@ -13,8 +13,27 @@ kv_cache_config:
   enable_block_reuse: false
 transforms:
   detect_sharding:
-    sharding_source: ['factory', 'heuristic']
     sharding_dims: ['ep', 'bmm']
+    manual_config:
+      head_dim: 128
+      tp_plan:
+        # mamba SSM layer
+        "in_proj": "mamba"
+        "out_proj": "rowwise"
+        # attention layer
+        "q_proj": "colwise"
+        "k_proj": "colwise"
+        "v_proj": "colwise"
+        "o_proj": "rowwise"
+        # NOTE: consider not sharding shared experts and/or
+        # latent projections at all, keeping them replicated.
+        # To do so, comment out the corresponding entries.
+        # moe layer: SHARED experts
+        "up_proj": "colwise"
+        "down_proj": "rowwise"
+        # MoLE: latent projections: simple shard
+        "fc1_latent_proj": "gather"
+        "fc2_latent_proj": "gather"
   multi_stream_moe:
     stage: compile
     enabled: true
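
Each tp_plan entry above keys a projection-module name to a sharding style ("colwise", "rowwise", "gather", or "mamba"). For illustration only, a minimal Python sketch of how such a suffix-keyed plan could be resolved against fully qualified module names follows; the resolve_tp_style helper and the sample module names are assumptions made for the sketch, not AutoDeploy's actual sharding code.

from typing import Optional

# Suffix-keyed plan mirroring manual_config.tp_plan from nano_v3.yaml above.
NANO_V3_TP_PLAN = {
    "in_proj": "mamba",
    "out_proj": "rowwise",
    "q_proj": "colwise",
    "k_proj": "colwise",
    "v_proj": "colwise",
    "o_proj": "rowwise",
    "up_proj": "colwise",
    "down_proj": "rowwise",
    "fc1_latent_proj": "gather",
    "fc2_latent_proj": "gather",
}


def resolve_tp_style(module_name: str, tp_plan: dict) -> Optional[str]:
    # Match the last dotted component of the module name against the plan keys;
    # None means "no entry", i.e. leave the weight replicated.
    leaf = module_name.rsplit(".", 1)[-1]
    return tp_plan.get(leaf)


if __name__ == "__main__":
    # Hypothetical module names, only to demonstrate the lookup behavior.
    for name in (
        "backbone.layers.3.mixer.in_proj",
        "backbone.layers.7.self_attn.q_proj",
        "backbone.layers.9.mlp.some_other_proj",
    ):
        print(name, "->", resolve_tp_style(name, NANO_V3_TP_PLAN))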

tensorrt_llm/_torch/auto_deploy/models/patches/nemotron_h.py

Lines changed: 0 additions & 36 deletions
@@ -190,42 +190,6 @@ def get_model_from_config_patched(config, **kwargs):
 # TODO: figure out how this can be incorporated into the export patch system
 AutoModelForCausalLM.from_config = get_model_from_config_patched

-# _config_from_pretrained_original = AutoConfig.from_pretrained
-# _nemotron_h_base_model_tp_plan = {
-#     # mamba SSM layer
-#     "in_proj": "mamba",
-#     "out_proj": "rowwise",
-#     # attention layer
-#     "q_proj": "colwise",
-#     "k_proj": "colwise",
-#     "v_proj": "colwise",
-#     "o_proj": "rowwise",
-#     # NOTE: consider not sharding shared experts and/or
-#     # latent projections at all, keeping them replicated.
-#     # To do so, comment out the corresponding entries.
-#     # moe layer: SHARED experts
-#     "up_proj": "colwise",
-#     "down_proj": "rowwise",
-#     # MoLE: latent projections: simple shard
-#     "fc1_latent_proj": "gather",
-#     "fc2_latent_proj": "gather",
-# }
-
-
-# def get_config_from_pretrained_patched(*args, **kwargs):
-#     ret = _config_from_pretrained_original(*args, **kwargs)
-#     config = ret[0] if isinstance(ret, tuple) else ret
-#     # heuristic to check if it's a NemotronH MoE Model
-#     model_type = getattr(config, "model_type", None)
-#     num_moe_layers = getattr(config, "layers_block_type", []).count("moe")
-#     if model_type == "nemotron_h" and num_moe_layers > 0:
-#         config.base_model_tp_plan = _nemotron_h_base_model_tp_plan
-#     return (config, *ret[1:]) if isinstance(ret, tuple) else config
-
-
-# # TODO: figure out how this can be incorporated into the export patch system
-# AutoConfig.from_pretrained = get_config_from_pretrained_patched
-
 # TODO: figure out how this can be incorporated into the export patch system
 # Only patch if the module isn't available
 _mamba_ssm_module = "mamba_ssm"
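
The removed block was a commented-out draft that would have attached the same tp_plan by patching AutoConfig.from_pretrained, detecting NemotronH MoE checkpoints with a heuristic on the Hugging Face config; that plan now lives in nano_v3.yaml's detect_sharding.manual_config instead. Below is a standalone sketch of that detection heuristic, exercised on hypothetical minimal config objects rather than real checkpoints.

from types import SimpleNamespace


def is_nemotron_h_moe(config) -> bool:
    # Heuristic from the removed draft: a NemotronH MoE model is identified by its
    # model_type plus at least one "moe" entry in layers_block_type.
    model_type = getattr(config, "model_type", None)
    num_moe_layers = getattr(config, "layers_block_type", []).count("moe")
    return model_type == "nemotron_h" and num_moe_layers > 0


if __name__ == "__main__":
    # Hypothetical minimal configs, only to exercise the check.
    moe_cfg = SimpleNamespace(
        model_type="nemotron_h",
        layers_block_type=["mamba", "attention", "moe"],
    )
    dense_cfg = SimpleNamespace(
        model_type="nemotron_h",
        layers_block_type=["mamba", "attention", "mlp"],
    )
    print(is_nemotron_h_moe(moe_cfg))    # True
    print(is_nemotron_h_moe(dense_cfg))  # False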
