
Commit 400f1c1

e2e eval works with gsm8k
[12/08/2025-06:05:20] [TRT-LLM] [I] lm-eval gsm8k results (scores normalized to range 0~100):
|Tasks|Version|     Filter     |n-shot|  Metric   |   | Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|------:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |63.6088|±  |1.3253|
|     |       |strict-match    |     5|exact_match|↑  |63.6088|±  |1.3253|
[12/08/2025-06:05:20] [TRT-LLM] [I] lm-eval gsm8k average accuracy: 63.61
[12/08/2025-06:05:20] [TRT-LLM] [I] Hypothesis testing report:
===========================================================
= ACCURACY HYPOTHESIS TESTING
===========================================================
Alpha (Type I: False Positive): 0.050
Beta (Type II: False Negative): 0.200
Sigma (Standard deviation): 50.000
Higher is better: True
Theta (Minimum detectable effect): 4.841
Reference accuracy: 64.740
Threshold: 61.537
===========================================================
Evaluated accuracy: 63.609
===========================================================
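
For context on how the pass/fail verdict above is derived: the threshold follows a standard one-sided z-test power calculation. The sketch below reproduces the printed Theta and Threshold values from the Alpha/Beta/Sigma in the log; it is an illustration, not the harness's actual code, and the GSM8K sample count of 1319 is an assumption.

    from scipy.stats import norm

    # Values printed in the hypothesis-testing report above.
    alpha, beta, sigma = 0.05, 0.20, 50.0
    reference_accuracy = 64.740
    num_samples = 1319  # assumption: size of the GSM8K test split

    scale = sigma * (2.0 / num_samples) ** 0.5
    z_alpha, z_beta = norm.ppf(1 - alpha), norm.ppf(1 - beta)

    theta = (z_alpha + z_beta) * scale                # minimum detectable effect, ~4.841
    threshold = reference_accuracy - z_alpha * scale  # pass threshold, ~61.537

    evaluated_accuracy = 63.609
    assert evaluated_accuracy >= threshold  # 63.609 >= 61.537, so the gsm8k check passes
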
1 parent 4e1a7d7 commit 400f1c1

File tree

6 files changed: +21 -19 lines changed


tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 11 additions & 10 deletions
@@ -1129,7 +1129,8 @@ def forward(
         **kwargs,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         # Print only for the first layer and first decode iteration.
-        print_condition = self.layer_idx == 0 and len(position_ids) == 1 and position_ids[0][0].item() == 52
+        # print_condition = self.layer_idx == 0 and len(position_ids) == 1 and position_ids[0][0].item() == 52
+        print_condition = False
         if print_condition:
             print(f"[DeepseekV3DecoderLayer::forward][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}]: BEFORE INPUT LAYERNORM hidden_states: {hidden_states.shape} \n {hidden_states}")
             save_tensor(hidden_states, "before_input_layernorm", self.mapping.rank, self.mapping.cp_rank, self.mapping.tp_rank)
@@ -1617,15 +1618,15 @@ def forward(
         return_context_logits: bool = False,
         **kwargs,
     ) -> torch.Tensor:
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] input_ids: {input_ids}")
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] position_ids: {position_ids}")
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] helix_is_inactive_rank: {attn_metadata.helix_is_inactive_rank}")
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] kv_cache_params.num_cached_tokens_per_seq: {attn_metadata.kv_cache_params.num_cached_tokens_per_seq}")
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] kv_lens_cuda: {attn_metadata.kv_lens_cuda}")
-        assert attn_metadata.kv_cache_manager.tokens_per_block == 32
-        block_ids_per_seq = attn_metadata.kv_cache_manager.get_batch_cache_indices(attn_metadata.request_ids)
-        for request_id, block_ids in zip(attn_metadata.request_ids, block_ids_per_seq):
-            print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] request_id: {request_id}, block_ids: {block_ids}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] input_ids: {input_ids}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] position_ids: {position_ids}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] helix_is_inactive_rank: {attn_metadata.helix_is_inactive_rank}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] kv_cache_params.num_cached_tokens_per_seq: {attn_metadata.kv_cache_params.num_cached_tokens_per_seq}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] kv_lens_cuda: {attn_metadata.kv_lens_cuda}")
+        # assert attn_metadata.kv_cache_manager.tokens_per_block == 32
+        # block_ids_per_seq = attn_metadata.kv_cache_manager.get_batch_cache_indices(attn_metadata.request_ids)
+        # for request_id, block_ids in zip(attn_metadata.request_ids, block_ids_per_seq):
+        #     print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] request_id: {request_id}, block_ids: {block_ids}")
         return super().forward(attn_metadata=attn_metadata,
                                input_ids=input_ids,
                                position_ids=position_ids,
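
Note on the pattern above (it recurs in modules/attention.py below): the commit silences the Helix debug prints by hard-coding `print_condition = False` and commenting out the print calls. An alternative, shown only as a sketch here, is to gate such diagnostics behind an environment variable so they can be toggled per run without editing model code; `TLLM_HELIX_DEBUG` is an illustrative name, not an existing TensorRT-LLM setting.

    import os

    # Hypothetical opt-in flag; off by default so hot paths stay print-free.
    HELIX_DEBUG = os.environ.get("TLLM_HELIX_DEBUG", "0") == "1"

    def helix_debug_print(*args, **kwargs):
        # No-op unless TLLM_HELIX_DEBUG=1 is exported before launching.
        if HELIX_DEBUG:
            print(*args, **kwargs)

With a helper like this, the commented-out diagnostics in this diff could be restored as helix_debug_print(...) calls and re-enabled by exporting the flag instead of editing the source.
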

tensorrt_llm/_torch/modules/attention.py

Lines changed: 2 additions & 1 deletion
@@ -2163,7 +2163,8 @@ def forward(
                                               latent_cache_gen=latent_cache_gen)

         # Print only for the first layer and first decode iteration.
-        print_condition = self.layer_idx == 0 and len(position_ids) == 1 and position_ids[0][0].item() == 52
+        # print_condition = self.layer_idx == 0 and len(position_ids) == 1 and position_ids[0][0].item() == 52
+        print_condition = False
         if print_condition:
             print(f"[MLA::forward][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}][tp_rank {self.mapping.tp_rank}]: BEFORE O_PROJ attn_output: {attn_output.shape} \n {attn_output} \n weight.shape: {self.o_proj.weight.shape} weight.tp_rank: {self.o_proj.tp_rank} weight.tp_size: {self.o_proj.tp_size} \n {self.o_proj.weight}")
             save_tensor_mla(attn_output, "before_o_proj", self.mapping.rank, self.mapping.cp_rank, self.mapping.tp_rank)

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 2 additions & 2 deletions
@@ -684,8 +684,8 @@ def _merge_helix_requests(self, new_requests: list[RequestQueueItem],
                 input_ids_this_rank = input_ids_this_rank[:-padding_len]
                 position_ids_this_rank = position_ids_this_rank[:-padding_len]

-            print(f"[ExecutorRequestQueue::_merge_helix_requests][rank {self.dist.rank}][cp_rank {curr_cp_rank}]: input_ids_this_rank: {input_ids_this_rank}")
-            print(f"[ExecutorRequestQueue::_merge_helix_requests][rank {self.dist.rank}][cp_rank {curr_cp_rank}]: position_ids_this_rank: {position_ids_this_rank}")
+            # print(f"[ExecutorRequestQueue::_merge_helix_requests][rank {self.dist.rank}][cp_rank {curr_cp_rank}]: input_ids_this_rank: {input_ids_this_rank}")
+            # print(f"[ExecutorRequestQueue::_merge_helix_requests][rank {self.dist.rank}][cp_rank {curr_cp_rank}]: position_ids_this_rank: {position_ids_this_rank}")

             req = executor_request_to_llm_request(
                 req_id=req_item.id,

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 1 addition & 1 deletion
@@ -1905,7 +1905,7 @@ def _prepare_disagg_gen_transmission_complete(self, scheduled_batch):

         for req in scheduled_batch.generation_requests:
             if req.is_disagg_generation_transmission_complete:
-                print(f"[PyExecutor::_prepare_disagg_gen_transmission_complete][rank {self.dist.rank}][cp_rank {self.dist.cp_rank}]: TRANSMISSION COMPLETE for request ID: {req.py_request_id}")
+                # print(f"[PyExecutor::_prepare_disagg_gen_transmission_complete][rank {self.dist.rank}][cp_rank {self.dist.cp_rank}]: TRANSMISSION COMPLETE for request ID: {req.py_request_id}")
                 req.state = LlmRequestState.GENERATION_IN_PROGRESS
                 req.context_current_position = req.prompt_len
                 req.decoding_iter = 1

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 2 additions & 2 deletions
@@ -474,9 +474,9 @@ def prepare_resources(self, scheduled_batch: ScheduledRequests):
                     req.py_helix_is_inactive_rank = True
                 # Skip allocating KV cache at decode for inactive helix ranks.
                 if req.py_helix_is_inactive_rank:
-                    print(f"[ResourceManager::prepare_resources][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}] Skipping KV allocation for request {req.py_request_id}.")
+                    # print(f"[ResourceManager::prepare_resources][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}] Skipping KV allocation for request {req.py_request_id}.")
                     continue
-                print(f"[ResourceManager::prepare_resources][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}] Adding KV allocation for request {req.py_request_id}.")
+                # print(f"[ResourceManager::prepare_resources][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}] Adding KV allocation for request {req.py_request_id}.")
                 self.impl.add_token(req.py_request_id)
                 for _ in range(get_draft_token_length(req)):
                     self.impl.add_token(req.py_request_id)

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 3 additions & 3 deletions
@@ -907,10 +907,10 @@ def test_auto_dtype_with_helix(self, gen_pp, gen_tp, gen_cp):
         with launch_disaggregated_llm(disaggregated_server_config,
                                       ctx_server_config, gen_server_config,
                                       self.MODEL_PATH) as llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            # task = GSM8K(self.MODEL_NAME)
+            # task = MMLU(self.MODEL_NAME)
             # task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)

     @pytest.mark.skip_less_device(2)
     @pytest.mark.skip_less_device_memory(60000)
