Skip to content

Commit 2d5eadf

Browse files
authored
[None][fix] fix TP support for DeepSeek-V3.2 on hopper (#9484)
Signed-off-by: Fanrong Li <23290157+lfr-0531@users.noreply.github.com>
1 parent 51bf716 commit 2d5eadf

File tree

3 files changed: 8 additions and 3 deletions.

tensorrt_llm/_torch/modules/attention.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2060,9 +2060,10 @@ def forward_sparse_mla_kvcache_bf16(
20602060

20612061
# [seq, num_heads, kv_lora_rank], account for padding
20622062
attn_out_latent = attn_out_latent[:, :self.num_heads_tp, :]
2063-
# TODO: seems we need .contiguous() here when padding enabled before pass to bmm?
20642063
attn_out_latent = attn_out_latent.view(
20652064
[-1, self.num_heads_tp, self.kv_lora_rank])
2065+
if self.num_heads_tp != padding:
2066+
attn_out_latent = attn_out_latent.contiguous()
20662067

20672068
assert (attn_out_latent.shape[0] == q.shape[0]
20682069
and attn_out_latent.shape[1] == self.num_heads_tp)

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2409,8 +2409,12 @@ class TestDeepSeekV32(LlmapiAccuracyTestHarness):
24092409
(8, 1, 8, 1, False, True, True, True, 24, "_DEFAULT"),
24102410
(8, 1, 8, 0, True, True, True, True, 24, "_DEFAULT"),
24112411
(8, 1, 8, 3, False, False, True, True, 1, "TRTLLM"),
2412+
(8, 1, 8, 3, False, False, True, True, 1, "_DEFAULT"),
24122413
],
2413-
ids=["baseline", "baseline_mtp1", "baseline_fp8kv", "latency"])
2414+
ids=[
2415+
"baseline", "baseline_mtp1", "baseline_fp8kv", "latency",
2416+
"latency_default"
2417+
])
24142418
def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
24152419
attention_dp, cuda_graph, overlap_scheduler,
24162420
max_batch_size, moe_backend):

tests/integration/test_lists/test-db/l0_dgx_h200.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,7 @@ l0_dgx_h200:
1818
# - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] # OOM
1919
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] # 1h
2020
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline]
21-
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency]
21+
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency_default]
2222
- accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
2323
- accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
2424
- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True]

0 commit comments

Comments (0)