
Commit ef0fab1

e2e eval works with gsm8k
[12/08/2025-06:05:20] [TRT-LLM] [I] lm-eval gsm8k results (scores normalized to range 0~100):

|Tasks|Version|     Filter     |n-shot|  Metric   |   | Value |   |Stderr|
|-----|------:|----------------|-----:|-----------|---|------:|---|-----:|
|gsm8k|      3|flexible-extract|     5|exact_match|↑  |63.6088|±  |1.3253|
|     |       |strict-match    |     5|exact_match|↑  |63.6088|±  |1.3253|

[12/08/2025-06:05:20] [TRT-LLM] [I] lm-eval gsm8k average accuracy: 63.61
[12/08/2025-06:05:20] [TRT-LLM] [I] Hypothesis testing report:
===========================================================
= ACCURACY HYPOTHESIS TESTING
===========================================================
Alpha (Type I: False Positive): 0.050
Beta (Type II: False Negative): 0.200
Sigma (Standard deviation): 50.000
Higher is better: True
Theta (Minimum detectable effect): 4.841
Reference accuracy: 64.740
Threshold: 61.537
===========================================================
Evaluated accuracy: 63.609
===========================================================
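For reference, the theta and threshold above can be reproduced with the minimal sketch below, assuming the report follows a standard one-sided two-sample z-test over the GSM8K test set (1319 samples). The formulas here are inferred from the printed values, not copied from the TRT-LLM accuracy harness, so treat them as an illustration only.

# Sketch (assumption): recompute theta and threshold from alpha, beta, sigma
# and the sample count, using a one-sided two-sample z-test. Not the harness code.
from math import sqrt
from scipy.stats import norm

alpha, beta, sigma = 0.05, 0.20, 50.0   # values from the report above
reference_accuracy = 64.740
num_samples = 1319                      # GSM8K test-set size

z_alpha = norm.ppf(1 - alpha)           # ~1.645
z_beta = norm.ppf(1 - beta)             # ~0.842
scale = sigma * sqrt(2.0 / num_samples) # standard error of the accuracy difference

theta = (z_alpha + z_beta) * scale                 # minimum detectable effect, ~4.84
threshold = reference_accuracy - z_alpha * scale   # pass/fail cutoff, ~61.54

print(f"theta={theta:.3f}, threshold={threshold:.3f}")
# The evaluated accuracy (63.609) is above the threshold, so the run passes.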
1 parent 0fbebf6 commit ef0fab1

6 files changed: +21 -19 lines changed


tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 11 additions & 10 deletions
@@ -1131,7 +1131,8 @@ def forward(
         **kwargs,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         # Print only for the first layer and first decode iteration.
-        print_condition = self.layer_idx == 0 and len(position_ids) == 1 and position_ids[0][0].item() == 52
+        # print_condition = self.layer_idx == 0 and len(position_ids) == 1 and position_ids[0][0].item() == 52
+        print_condition = False
         if print_condition:
             print(f"[DeepseekV3DecoderLayer::forward][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}]: BEFORE INPUT LAYERNORM hidden_states: {hidden_states.shape} \n {hidden_states}")
             save_tensor(hidden_states, "before_input_layernorm", self.mapping.rank, self.mapping.cp_rank, self.mapping.tp_rank)
@@ -1624,15 +1625,15 @@ def forward(
         return_context_logits: bool = False,
         **kwargs,
     ) -> torch.Tensor:
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] input_ids: {input_ids}")
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] position_ids: {position_ids}")
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] helix_is_inactive_rank: {attn_metadata.helix_is_inactive_rank}")
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] kv_cache_params.num_cached_tokens_per_seq: {attn_metadata.kv_cache_params.num_cached_tokens_per_seq}")
-        print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] kv_lens_cuda: {attn_metadata.kv_lens_cuda}")
-        assert attn_metadata.kv_cache_manager.tokens_per_block == 32
-        block_ids_per_seq = attn_metadata.kv_cache_manager.get_batch_cache_indices(attn_metadata.request_ids)
-        for request_id, block_ids in zip(attn_metadata.request_ids, block_ids_per_seq):
-            print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] request_id: {request_id}, block_ids: {block_ids}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] input_ids: {input_ids}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] position_ids: {position_ids}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] helix_is_inactive_rank: {attn_metadata.helix_is_inactive_rank}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] kv_cache_params.num_cached_tokens_per_seq: {attn_metadata.kv_cache_params.num_cached_tokens_per_seq}")
+        # print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] kv_lens_cuda: {attn_metadata.kv_lens_cuda}")
+        # assert attn_metadata.kv_cache_manager.tokens_per_block == 32
+        # block_ids_per_seq = attn_metadata.kv_cache_manager.get_batch_cache_indices(attn_metadata.request_ids)
+        # for request_id, block_ids in zip(attn_metadata.request_ids, block_ids_per_seq):
+        #     print(f"[DeepseekV3ForCausalLM::forward][rank {self.model_config.mapping.rank}][cp_rank {self.model_config.mapping.cp_rank}] request_id: {request_id}, block_ids: {block_ids}")
         return super().forward(attn_metadata=attn_metadata,
                                input_ids=input_ids,
                                position_ids=position_ids,

tensorrt_llm/_torch/modules/attention.py

Lines changed: 2 additions & 1 deletion
@@ -2191,7 +2191,8 @@ def forward(
             latent_cache_gen=latent_cache_gen)

         # Print only for the first layer and first decode iteration.
-        print_condition = self.layer_idx == 0 and len(position_ids) == 1 and position_ids[0][0].item() == 52
+        # print_condition = self.layer_idx == 0 and len(position_ids) == 1 and position_ids[0][0].item() == 52
+        print_condition = False
         if print_condition:
             print(f"[MLA::forward][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}][tp_rank {self.mapping.tp_rank}]: BEFORE O_PROJ attn_output: {attn_output.shape} \n {attn_output} \n weight.shape: {self.o_proj.weight.shape} weight.tp_rank: {self.o_proj.tp_rank} weight.tp_size: {self.o_proj.tp_size} \n {self.o_proj.weight}")
             save_tensor_mla(attn_output, "before_o_proj", self.mapping.rank, self.mapping.cp_rank, self.mapping.tp_rank)

tensorrt_llm/_torch/pyexecutor/executor_request_queue.py

Lines changed: 2 additions & 2 deletions
@@ -684,8 +684,8 @@ def _merge_helix_requests(self, new_requests: list[RequestQueueItem],
                     input_ids_this_rank = input_ids_this_rank[:-padding_len]
                     position_ids_this_rank = position_ids_this_rank[:-padding_len]

-                print(f"[ExecutorRequestQueue::_merge_helix_requests][rank {self.dist.rank}][cp_rank {curr_cp_rank}]: input_ids_this_rank: {input_ids_this_rank}")
-                print(f"[ExecutorRequestQueue::_merge_helix_requests][rank {self.dist.rank}][cp_rank {curr_cp_rank}]: position_ids_this_rank: {position_ids_this_rank}")
+                # print(f"[ExecutorRequestQueue::_merge_helix_requests][rank {self.dist.rank}][cp_rank {curr_cp_rank}]: input_ids_this_rank: {input_ids_this_rank}")
+                # print(f"[ExecutorRequestQueue::_merge_helix_requests][rank {self.dist.rank}][cp_rank {curr_cp_rank}]: position_ids_this_rank: {position_ids_this_rank}")

                 req = executor_request_to_llm_request(
                     req_id=req_item.id,

tensorrt_llm/_torch/pyexecutor/py_executor.py

Lines changed: 1 addition & 1 deletion
@@ -2030,7 +2030,7 @@ def _prepare_disagg_gen_transmission_complete(self, scheduled_batch):

         for req in scheduled_batch.generation_requests:
             if req.is_disagg_generation_transmission_complete:
-                print(f"[PyExecutor::_prepare_disagg_gen_transmission_complete][rank {self.dist.rank}][cp_rank {self.dist.cp_rank}]: TRANSMISSION COMPLETE for request ID: {req.py_request_id}")
+                # print(f"[PyExecutor::_prepare_disagg_gen_transmission_complete][rank {self.dist.rank}][cp_rank {self.dist.cp_rank}]: TRANSMISSION COMPLETE for request ID: {req.py_request_id}")
                 req.state = LlmRequestState.GENERATION_IN_PROGRESS
                 req.context_current_position = req.prompt_len
                 req.decoding_iter = 1

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 2 additions & 2 deletions
@@ -474,9 +474,9 @@ def prepare_resources(self, scheduled_batch: ScheduledRequests):
                     req.py_helix_is_inactive_rank = True
                 # Skip allocating KV cache at decode for inactive helix ranks.
                 if req.py_helix_is_inactive_rank:
-                    print(f"[ResourceManager::prepare_resources][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}] Skipping KV allocation for request {req.py_request_id}.")
+                    # print(f"[ResourceManager::prepare_resources][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}] Skipping KV allocation for request {req.py_request_id}.")
                     continue
-                print(f"[ResourceManager::prepare_resources][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}] Adding KV allocation for request {req.py_request_id}.")
+                # print(f"[ResourceManager::prepare_resources][rank {self.mapping.rank}][cp_rank {self.mapping.cp_rank}] Adding KV allocation for request {req.py_request_id}.")
                 self.impl.add_token(req.py_request_id)
                 for _ in range(get_draft_token_length(req)):
                     self.impl.add_token(req.py_request_id)

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 3 additions & 3 deletions
@@ -892,10 +892,10 @@ def test_auto_dtype_with_helix(self, gen_pp, gen_tp, gen_cp):
         with launch_disaggregated_llm(disaggregated_server_config,
                                       ctx_server_config, gen_server_config,
                                       self.MODEL_PATH) as llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-            # task = GSM8K(self.MODEL_NAME)
+            # task = MMLU(self.MODEL_NAME)
             # task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)

     @pytest.mark.skip_less_device(2)
     @pytest.mark.skip_less_device_memory(60000)
