
Commit ef0fab1

e2e eval works with gsm8k
[12/08/2025-06:05:20] [TRT-LLM] [I] lm-eval gsm8k results (scores normalized to range 0~100):

|Tasks|Version|     Filter     |n-shot|  Metric   |   | Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|------:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |63.6088|±  |1.3253|
|     |       |strict-match    |     5|exact_match|↑  |63.6088|±  |1.3253|

[12/08/2025-06:05:20] [TRT-LLM] [I] lm-eval gsm8k average accuracy: 63.61
[12/08/2025-06:05:20] [TRT-LLM] [I] Hypothesis testing report:
===========================================================
= ACCURACY HYPOTHESIS TESTING
===========================================================
Alpha (Type I: False Positive): 0.050
Beta (Type II: False Negative): 0.200
Sigma (Standard deviation): 50.000
Higher is better: True
Theta (Minimum detectable effect): 4.841
Reference accuracy: 64.740
Threshold: 61.537
===========================================================
Evaluated accuracy: 63.609
===========================================================
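For reference, the theta and threshold above can be reproduced with the minimal sketch below, assuming the report follows a standard one-sided two-sample z-test over the GSM8K test set (1319 samples). The formulas here are inferred from the printed values, not copied from the TRT-LLM accuracy harness, so treat them as an illustration only.

# Sketch (assumption): recompute theta and threshold from alpha, beta, sigma
# and the sample count, using a one-sided two-sample z-test. Not the harness code.
from math import sqrt
from scipy.stats import norm

alpha, beta, sigma = 0.05, 0.20, 50.0   # values from the report above
reference_accuracy = 64.740
num_samples = 1319                      # GSM8K test-set size

z_alpha = norm.ppf(1 - alpha)           # ~1.645
z_beta = norm.ppf(1 - beta)             # ~0.842
scale = sigma * sqrt(2.0 / num_samples) # standard error of the accuracy difference

theta = (z_alpha + z_beta) * scale                 # minimum detectable effect, ~4.84
threshold = reference_accuracy - z_alpha * scale   # pass/fail cutoff, ~61.54

print(f"theta={theta:.3f}, threshold={threshold:.3f}")
# The evaluated accuracy (63.609) is above the threshold, so the run passes.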
1 parent 0fbebf6 commit ef0fab1

6 files changed: +21 -19 lines changed


tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 11 additions & 10 deletions
@@ -1131,7 +1131,8 @@ def forward(
         **kwargs,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         # Print only for the first layer and first decode iteration.
-        print_condition = self.layer_idx == 0 and len(position_ids) == 1 and position_ids[0][0].item() == 52
+        # print_condition = self.layer_idx == 0 and len(position_ids) == 1 and position_ids[0][0].item() == 52
+        print_condition = False
         if print_condition:
             print(f"[DeepseekV3DecoderLayer::forward][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}]: BEFORE INPUT LAYERNORM hidden_states: {hidden_states.shape} \n {hidden_states}")
             save_tensor(hidden_states, "before_input_layernorm", self.mapping.rank, self.mapping.cp_rank, self.mapping.tp_rank)
@@ -1624,15 +1625,15 @@ def forward(
         return_context_logits: bool = False,
         **kwargs,
     ) -> torch.Tensor:
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] input_ids: {input_ids}")
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] position_ids: {position_ids}")
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] helix_is_inactive_rank: {attn_metadata.helix_is_inactive_rank}")
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] kv_cache_params.num_cached_tokens_per_seq: {attn_metadata.kv_cache_params.num_cached_tokens_per_seq}")
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] kv_lens_cuda: {attn_metadata.kv_lens_cuda}")
-        assert attn_metadata.kv_cache_manager.tokens_per_block == 32
-        block_ids_per_seq = attn_metadata.kv_cache_manager.get_batch_cache_indices(attn_metadata.request_ids)
-        for request_id, block_ids in zip(attn_metadata.request_ids, block_ids_per_seq):
-            print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] request_id: {request_id}, block_ids: {block_ids}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] input_ids: {input_ids}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] position_ids: {position_ids}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] helix_is_inactive_rank: {attn_metadata.helix_is_inactive_rank}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] kv_cache_params.num_cached_tokens_per_seq: {attn_metadata.kv_cache_params.num_cached_tokens_per_seq}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] kv_lens_cuda: {attn_metadata.kv_lens_cuda}")
+        # assert attn_metadata.kv_cache_manager.tokens_per_block == 32
+        # block_ids_per_seq = attn_metadata.kv_cache_manager.get_batch_cache_indices(attn_metadata.request_ids)
+        # for request_id, block_ids in zip(attn_metadata.request_ids, block_ids_per_seq):
+        #     print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] request_id: {request_id}, block_ids: {block_ids}")
         return super().forward(attn_metadata=attn_metadata,
                                input_ids=input_ids,
                                position_ids=position_ids,

tensorrt_llm/_torch/modules/attention.py

Lines changed: 2 additions & 1 deletion
@@ -2191,7 +2191,8 @@ def forward(
             latent_cache_gen=latent_cache_gen)

         # Print only for the first layer and first decode iteration.
-        print_condition = self.layer_idx == 0 and len(position_ids) == 1 and position_ids[0][0].item() == 52
+        # print_condition = self.layer_idx == 0 and len(position_ids) == 1 and position_ids[0][0].item() == 52
+        print_condition = False
         if print_condition:
             print(f"[MLA::forward][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}][tp_rank {self.mapping.tp_rank}]: BEFORE O_PROJ attn_output: {attn_output.shape} \n {attn_output} \n weight.shape: {self.o_proj.weight.shape} weight.tp_rank: {self.o_proj.tp_rank} weight.tp_size: {self.o_proj.tp_size} \n {self.o_proj.weight}")
             save_tensor_mla(attn_output, "before_o_proj", self.mapping.rank, self.mapping.cp_rank, self.mapping.tp_rank)

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 2 additions & 2 deletions
@@ -684,8 +684,8 @@ def _merge_helix_requests(self, new_requests: list[RequestQueueItem],
                     input_ids_this_rank = input_ids_this_rank[:-padding_len]
                     position_ids_this_rank = position_ids_this_rank[:-padding_len]

-                print(f"[ExecutorRequestQueue::_merge_helix_requests][rank {self.dist.rank}][cp_rank {curr_cp_rank}]: input_ids_this_rank: {input_ids_this_rank}")
-                print(f"[ExecutorRequestQueue::_merge_helix_requests][rank {self.dist.rank}][cp_rank {curr_cp_rank}]: position_ids_this_rank: {position_ids_this_rank}")
+                # print(f"[ExecutorRequestQueue::_merge_helix_requests][rank {self.dist.rank}][cp_rank {curr_cp_rank}]: input_ids_this_rank: {input_ids_this_rank}")
+                # print(f"[ExecutorRequestQueue::_merge_helix_requests][rank {self.dist.rank}][cp_rank {curr_cp_rank}]: position_ids_this_rank: {position_ids_this_rank}")

                 req = executor_request_to_llm_request(
                     req_id=req_item.id,

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 1 addition & 1 deletion
@@ -2030,7 +2030,7 @@ def _prepare_disagg_gen_transmission_complete(self, scheduled_batch):

         for req in scheduled_batch.generation_requests:
             if req.is_disagg_generation_transmission_complete:
-                print(f"[PyExecutor::_prepare_disagg_gen_transmission_complete][rank {self.dist.rank}][cp_rank {self.dist.cp_rank}]: TRANSMISSION COMPLETE for request ID: {req.py_request_id}")
+                # print(f"[PyExecutor::_prepare_disagg_gen_transmission_complete][rank {self.dist.rank}][cp_rank {self.dist.cp_rank}]: TRANSMISSION COMPLETE for request ID: {req.py_request_id}")
                 req.state = LlmRequestState.GENERATION_IN_PROGRESS
                 req.context_current_position = req.prompt_len
                 req.decoding_iter = 1

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 2 additions & 2 deletions
@@ -474,9 +474,9 @@ def prepare_resources(self, scheduled_batch: ScheduledRequests):
                     req.py_helix_is_inactive_rank = True
                 # Skip allocating KV cache at decode for inactive helix ranks.
                 if req.py_helix_is_inactive_rank:
-                    print(f"[ResourceManager::prepare_resources][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}] Skipping KV allocation for request {req.py_request_id}.")
+                    # print(f"[ResourceManager::prepare_resources][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}] Skipping KV allocation for request {req.py_request_id}.")
                     continue
-                print(f"[ResourceManager::prepare_resources][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}] Adding KV allocation for request {req.py_request_id}.")
+                # print(f"[ResourceManager::prepare_resources][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}] Adding KV allocation for request {req.py_request_id}.")
                 self.impl.add_token(req.py_request_id)
                 for _ in range(get_draft_token_length(req)):
                     self.impl.add_token(req.py_request_id)

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 3 additions & 3 deletions
@@ -892,10 +892,10 @@ def test_auto_dtype_with_helix(self, gen_pp, gen_tp, gen_cp):
         with launch_disaggregated_llm(disaggregated_server_config,
                                       ctx_server_config, gen_server_config,
                                       self.MODEL_PATH) as llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            # task = GSM8K(self.MODEL_NAME)
+            # task = MMLU(self.MODEL_NAME)
             # task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)

     @pytest.mark.skip_less_device(2)
     @pytest.mark.skip_less_device_memory(60000)
