
Commit 4931c5e

[None][feat] update deepgemm to the DeepGEMM/nv_dev branch (#9898)
Signed-off-by: Fanrong Li <23290157+lfr-0531@users.noreply.github.com>
1 parent d272f1a · commit 4931c5e

File tree (5 files changed: +19 -27 lines)

- 3rdparty/CMakeLists.txt
- cpp/tensorrt_llm/deep_gemm/CMakeLists.txt
- examples/models/core/deepseek_v3/README.md
- tensorrt_llm/_torch/attention_backend/sparse/dsa.py
- tests/integration/defs/accuracy/test_llm_api_pytorch.py

3rdparty/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -38,8 +38,8 @@ FetchContent_Declare(
 
 FetchContent_Declare(
   deepgemm
-  GIT_REPOSITORY https://github.com/ruoqianguo/DeepGEMM
-  GIT_TAG 6cb8161516302550785d9af924d2778afef1f3f6 # swapab_sm100 branch
+  GIT_REPOSITORY https://github.com/deepseek-ai/DeepGEMM
+  GIT_TAG 4ff3f54d9b7ed3129e4f36f9871232ea7ecab86b # nv_dev branch
   GIT_SUBMODULES_RECURSE
   ON
   SOURCE_SUBDIR
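If you want to inspect the pinned nv_dev sources outside of the CMake build, a minimal sketch follows; the repository URL and commit hash come from the diff above, while the clone location and workflow are illustrative and not part of this change.

```python
# Sketch: fetch the same DeepGEMM commit that FetchContent pins above.
import subprocess

REPO = "https://github.com/deepseek-ai/DeepGEMM"
COMMIT = "4ff3f54d9b7ed3129e4f36f9871232ea7ecab86b"  # nv_dev branch

subprocess.run(["git", "clone", REPO, "DeepGEMM"], check=True)
subprocess.run(["git", "checkout", COMMIT], cwd="DeepGEMM", check=True)
# Mirror GIT_SUBMODULES_RECURSE ON from the CMake declaration.
subprocess.run(["git", "submodule", "update", "--init", "--recursive"],
               cwd="DeepGEMM", check=True)
```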

cpp/tensorrt_llm/deep_gemm/CMakeLists.txt

Lines changed: 7 additions & 1 deletion
@@ -38,7 +38,13 @@ foreach(SOURCE_FILE ${DEEP_GEMM_ALL_FILES})
   if(FILE_EXT STREQUAL ".py")
     # Read file content and replace module imports for Python files
     file(READ ${SOURCE_FILE} _content)
-    string(REPLACE "deep_gemm_cpp" "tensorrt_llm.deep_gemm_cpp_tllm" _content
+    string(REPLACE "from . import _C" "import tensorrt_llm.deep_gemm_cpp_tllm"
+           _content "${_content}")
+    string(REPLACE ".._C" "tensorrt_llm.deep_gemm_cpp_tllm" _content
+           "${_content}")
+    string(REPLACE "._C" "tensorrt_llm.deep_gemm_cpp_tllm" _content
+           "${_content}")
+    string(REPLACE "_C." "tensorrt_llm.deep_gemm_cpp_tllm." _content
            "${_content}")
 
     # Add adaptation header
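For reference, the net effect of these replacements can be reproduced with a small Python sketch. The sample source lines below are hypothetical; only the search/replace pairs and their order are taken from the CMake rules above.

```python
# Minimal sketch of the import rewrite the CMake rules above perform when
# vendoring the DeepGEMM Python sources.
REPLACEMENTS = [
    ("from . import _C", "import tensorrt_llm.deep_gemm_cpp_tllm"),
    (".._C", "tensorrt_llm.deep_gemm_cpp_tllm"),
    ("._C", "tensorrt_llm.deep_gemm_cpp_tllm"),
    ("_C.", "tensorrt_llm.deep_gemm_cpp_tllm."),
]


def rewrite(content: str) -> str:
    """Apply the replacements in the same order as the string(REPLACE ...) calls."""
    for old, new in REPLACEMENTS:
        content = content.replace(old, new)
    return content


sample = "from . import _C\nruntime = _C.init()\n"  # hypothetical source lines
print(rewrite(sample))
# import tensorrt_llm.deep_gemm_cpp_tllm
# runtime = tensorrt_llm.deep_gemm_cpp_tllm.init()
```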

examples/models/core/deepseek_v3/README.md

Lines changed: 1 addition & 2 deletions
@@ -90,7 +90,6 @@ To quickly run DeepSeek-V3, [examples/llm-api/quickstart_advanced.py](../llm-api
 cd examples/llm-api
 python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --tp_size 8
 ```
-Please include `--tokens_per_block 64` when running DeepSeek-V3.2-Exp, as this model uses the deep_gemm.fp8_paged_mqa_logits kernel, which requires a KV cache block size of 64.
 
 The model will be run by PyTorch backend and generate outputs like:
 ```
@@ -108,7 +107,7 @@ cd examples/llm-api
 python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_max_draft_len N
 ```
 
-`N` is the number of MTP modules. When `N` is equal to `0`, which means that MTP is not used (default). When `N` is greater than `0`, which means that `N` MTP modules are enabled. In the current implementation, the weight of each MTP module is shared. Please include `--tokens_per_block 64` when running DeepSeek-V3.2-Exp.
+`N` is the number of MTP modules. When `N` is equal to `0`, which means that MTP is not used (default). When `N` is greater than `0`, which means that `N` MTP modules are enabled. In the current implementation, the weight of each MTP module is shared.
 
 #### Relaxed acceptance
 **NOTE: This feature can only be used for DeepSeek R1.**

tensorrt_llm/_torch/attention_backend/sparse/dsa.py

Lines changed: 5 additions & 14 deletions
@@ -785,7 +785,6 @@ def on_update_kv_lens(self):
         # After changing the kv_lens/kv_lens_cuda, we may need to update other metadatas.
         # Especially for the changes in the _preprocess_inputs() of model_engine.py.
         if self.num_generations > 0:
-            tokens_per_block = self.kv_cache_manager.indexer_k_cache_tokens_per_block
             torch.cumsum(
                 self.kv_lens_cuda[self.num_contexts:self.
                                   num_seqs],  # num_contexts should be 0
@@ -800,7 +799,7 @@ def on_update_kv_lens(self):
                 out=self.gen_cached_token_indptr[1:self.num_generations + 1])
             scheduler_metadata_buffer = get_paged_mqa_logits_metadata(
                 self.kv_lens_cuda[self.num_contexts:self.num_seqs],
-                tokens_per_block, self.num_sms)
+                self.kv_cache_manager.tokens_per_block, self.num_sms)
             self.scheduler_metadata_buffer.copy_(scheduler_metadata_buffer,
                                                  non_blocking=True)
         if self.use_expanded_buffers_for_mtp:
@@ -827,7 +826,6 @@ def on_update_kv_lens(self):
 
     def update_for_spec_dec(self):
         super().update_for_spec_dec()
-        self.kv_cache_manager.indexer_k_cache_tokens_per_block
         # host
         self.max_ctx_kv_len = 0
         self.num_ctx_cached_tokens = 0
@@ -1030,7 +1028,7 @@ def prepare(metadata: DSAtrtllmAttentionMetadata):
         request_ids = metadata.request_ids
         seq_lens = metadata.seq_lens
         head_dim = metadata.kv_cache_manager.index_head_dim
-        tokens_per_block = metadata.kv_cache_manager.indexer_k_cache_tokens_per_block
+        tokens_per_block = metadata.kv_cache_manager.tokens_per_block
         quant_block_size = metadata.kv_cache_manager.quant_block_size
         cached_tokens = metadata.kv_cache_params.num_cached_tokens_per_seq
         total_tokens = seq_lens.sum().item()
@@ -1750,9 +1748,6 @@ def __init__(
     ) -> None:
         self.quant_block_size = 128
         self.index_head_dim = sparse_attn_config.index_head_dim
-        # Use a fixed tokens_per_block for indexer k cache due to DG kernel constraints
-        self.indexer_k_cache_tokens_per_block = 64
-        assert self.indexer_k_cache_tokens_per_block == tokens_per_block, "tokens_per_block must be set to 64 for DeepSeek v3.2"
 
         super().__init__(
             kv_cache_config,
@@ -1778,7 +1773,7 @@ def __init__(
         self.num_blocks = self.blocks_in_primary_pool
 
         # Indexer K cache pool for DSA attention
-        # Shape: [num_blocks, self.indexer_k_cache_tokens_per_block * (index_head_dim + scale_size)]
+        # Shape: [num_blocks, self.tokens_per_block * (index_head_dim + scale_size)]
         # Non-interleaved layout: [fp8_tok0 | fp8_tok1 | ... | scale_tok0 | scale_tok1 | ...]
         # Store FP8-quantized k values from the indexer
         self.indexer_k_cache_pool_per_layer = [
@@ -1805,9 +1800,7 @@ def get_cache_size_per_token(model_config: ModelConfig, mapping: Mapping,
         config = model_config.pretrained_config
         sparse_attn_config = model_config.sparse_attention_config
         index_head_dim = sparse_attn_config.index_head_dim
-        tokens_per_block = kwargs['tokens_per_block']
         quant_block_size = 128
-        indexer_k_cache_tokens_per_block = 64
 
         # get kv cache dtype bytes
         mem_per_token = 2
@@ -1827,17 +1820,15 @@
         # 1 for K, others for indexer K cache
         head_dim_factor = (index_head_dim +
                            index_head_dim // quant_block_size * 4) / head_dim
-        tokens_per_block_factor = indexer_k_cache_tokens_per_block / tokens_per_block
-        kv_factor = 1 + head_dim_factor * tokens_per_block_factor
+        kv_factor = 1 + head_dim_factor
         mem_per_token *= kv_factor
         return mem_per_token
 
     def get_cache_bytes_per_token(self):
         # self.kv_factor for K, others for indexer K cache
         head_dim_factor = (self.index_head_dim + self.index_head_dim //
                            self.quant_block_size * 4) / self.head_dim
-        tokens_per_block_factor = self.indexer_k_cache_tokens_per_block / self.tokens_per_block
-        kv_factor = self.kv_factor + head_dim_factor * tokens_per_block_factor
+        kv_factor = self.kv_factor + head_dim_factor
         cache_size_per_token = math.ceil(
             kv_factor * sum(self.num_kv_heads_per_layer) * self.head_dim)
 
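For intuition, a small worked example of the simplified factor: because the indexer K cache now uses the manager's own `tokens_per_block`, the old `tokens_per_block_factor` is identically 1 and drops out, leaving `kv_factor = 1 + head_dim_factor`. The head dimensions below are illustrative assumptions, not values read from this commit; only the formula mirrors the code above.

```python
# Worked example of the simplified cache-size factor.
index_head_dim = 128    # assumed indexer head dim
quant_block_size = 128  # as in the code above
head_dim = 576          # assumed KV head dim
mem_per_token = 2       # kv cache dtype bytes, as in the code above

head_dim_factor = (index_head_dim +
                   index_head_dim // quant_block_size * 4) / head_dim
kv_factor = 1 + head_dim_factor  # was: 1 + head_dim_factor * (64 / tokens_per_block)
mem_per_token *= kv_factor
print(f"{head_dim_factor:.4f} {mem_per_token:.4f}")  # 0.2292 2.4583
```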

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 4 additions & 8 deletions
@@ -2633,14 +2633,12 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
         if get_sm_version() == 100 or get_sm_version() == 103:
             moe_backend = "DEEPGEMM" if moe_backend == "_DEFAULT" else moe_backend
             moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
-            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
-                                            tokens_per_block=64)
+            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         else:
             if moe_backend != "_DEFAULT":
                 pytest.skip("Not supported MoE backend!")
             moe_config = MoeConfig()
-            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
-                                            tokens_per_block=64)
+            kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
 
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2711,8 +2709,7 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
             pytest.skip(f"{moe_backend} backend does not support SM 120 or 121")
 
         moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
-                                        tokens_per_block=64)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         cuda_graph_config = CudaGraphConfig(
             enable_padding=True,
             max_batch_size=max_batch_size) if cuda_graph else None
@@ -2775,8 +2772,7 @@ def test_nvfp4_multi_gpus_chunked_prefill(self, tp_size, pp_size, ep_size,
             pytest.skip(f"{moe_backend} backend does not support SM 120 or 121")
 
         moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7,
-                                        tokens_per_block=64)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
         cuda_graph_config = CudaGraphConfig(
             enable_padding=True,
             max_batch_size=max_batch_size) if cuda_graph else None
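A minimal sketch of what the simplified test configuration amounts to: with the updated DeepGEMM kernels, the KV cache block size no longer has to be pinned to 64 for DeepSeek-V3.2, so the tests fall back to the default `tokens_per_block`. Only the `KvCacheConfig` arguments are taken from the diff; the import path is an assumption based on the public LLM API.

```python
# Sketch only: the tests above now construct KvCacheConfig without tokens_per_block,
# relying on the default block size instead of forcing 64.
from tensorrt_llm.llmapi import KvCacheConfig  # import path is an assumption

kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
```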
