Add StarCoder2 integration accuracy tests and apply CodeRabbit cleanup suggestions

yibinl-nvidia · yibinl-nvidia · commit 7d135ba30cb4 · 2025-11-13T21:43:31.000Z
Signed-off-by: Yibin Li <109242046+yibinl-nvidia@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/models/modeling_starcoder2.py b/tensorrt_llm/_torch/models/modeling_starcoder2.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 from typing import Optional
 
 import torch
@@ -264,13 +279,16 @@ def __init__(
             vocab_size=model_config.pretrained_config.vocab_size,
         )
 
-    def load_weights(self, weights, weight_mapper=None, skip_modules=[]):
+    def load_weights(self, weights, weight_mapper=None, skip_modules=None):
         """
         Load weights with custom mapping for StarCoder2.
 
         StarCoder2 uses GPT-2 style MLP naming (c_fc, c_proj)
         while our MLP module expects (up_proj, down_proj).
         """
+        if skip_modules is None:
+            skip_modules = []
+
         # Map HuggingFace StarCoder2 weight names to TensorRT-LLM names
         params_map = {
             r"(.*?)\.mlp\.c_fc\.(.*)": r"\1.mlp.up_proj.\2",
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -4054,3 +4054,49 @@ def test_auto_dtype(self):
                           extra_evaluator_kwargs=dict(
                               apply_chat_template=True,
                               chat_template_kwargs=chat_template_kwargs))
+
+
+class TestStarcoder2_3B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "bigcode/starcoder2-3b"
+    MODEL_PATH = f"{llm_models_root()}/starcoder2-3b/"
+
+    @skip_pre_hopper
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH,
+                 attn_backend="TRTLLM",
+                 cuda_graph_config=None,
+                 max_batch_size=128,
+                 max_seq_len=4096) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+class TestStarcoder2_7B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "bigcode/starcoder2-7b"
+    MODEL_PATH = f"{llm_models_root()}/starcoder2-7b/"
+
+    @skip_pre_hopper
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH,
+                 attn_backend="TRTLLM",
+                 cuda_graph_config=None,
+                 max_batch_size=128,
+                 max_seq_len=4096) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
+class TestStarcoder2_15B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "bigcode/starcoder2-15b"
+    MODEL_PATH = f"{llm_models_root()}/starcoder2-15b/"
+
+    @skip_pre_hopper
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH,
+                 attn_backend="TRTLLM",
+                 cuda_graph_config=None,
+                 max_batch_size=128,
+                 max_seq_len=4096) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt
@@ -382,6 +382,10 @@ accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-c
 accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4
 accuracy/test_llm_api_pytorch.py::TestCodestral_22B_V01::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
+accuracy/test_llm_api_pytorch.py::TestStarcoder2_3B::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestStarcoder2_7B::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestStarcoder2_15B::test_auto_dtype
+
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
 accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -259,6 +259,9 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_eagle3[llguidance-eagle3_one_model=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_ngram[xgrammar]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_ngram[llguidance]
+  - accuracy/test_llm_api_pytorch.py::TestStarcoder2_3B::test_auto_dtype
+  - accuracy/test_llm_api_pytorch.py::TestStarcoder2_7B::test_auto_dtype
+  - accuracy/test_llm_api_pytorch.py::TestStarcoder2_15B::test_auto_dtype
   - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
   - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
   - test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
diff --git a/tests/unittest/_torch/modeling/test_modeling_starcoder2.py b/tests/unittest/_torch/modeling/test_modeling_starcoder2.py
@@ -14,7 +14,6 @@
 from tensorrt_llm._torch.metadata import KVCacheParams
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.models.modeling_starcoder2 import Starcoder2ForCausalLM
-from tensorrt_llm._torch.pyexecutor.cuda_graph_runner import CUDAGraphRunner
 from tensorrt_llm._torch.pyexecutor.resource_manager import KVCacheManager
 from tensorrt_llm.bindings.executor import KvCacheConfig
 from tensorrt_llm.mapping import Mapping
@@ -114,7 +113,7 @@ def get_kv_cache_manager(
         elif dtype == torch.bfloat16:
             kv_cache_dtype = tensorrt_llm.bindings.DataType.BF16
         else:
-            raise ValueError("Invalid dtype")
+            raise ValueError(f"Invalid dtype: {dtype}")
 
         mapping = Mapping(world_size=1, tp_size=1, rank=0)
         kv_cache_config = KvCacheConfig(
@@ -160,7 +159,7 @@ def test_starcoder2_sanity(self):
 
         input_ids = torch.tensor(
             [100, 200, 300, 400, 500, 600, 700, 800],
-            dtype=torch.int,
+            dtype=torch.long,
             device=device,
         )
 
@@ -188,7 +187,7 @@ def test_starcoder2_sanity(self):
 
         metadata_cls = get_attention_backend(model_config.attn_backend).Metadata
         attn_metadata = metadata_cls(
-            seq_lens=torch.tensor(sequence_lengths, dtype=torch.int),
+            seq_lens=torch.tensor(sequence_lengths, dtype=torch.long),
             num_contexts=len(context_sequence_lengths),
             kv_cache_params=KVCacheParams(
                 use_cache=True,
@@ -302,7 +301,7 @@ def test_starcoder2_allclose_to_hf(self, scenario: Scenario) -> None:
         # Context phase (no CUDA graphs for prefill)
         input_ids = torch.tensor(
             [100, 200, 300, 400, 500, 600, 700, 800],
-            dtype=torch.int32,
+            dtype=torch.long,
             device=device,
         )
         num_cached_tokens_per_seq = [0]
@@ -312,7 +311,7 @@ def test_starcoder2_allclose_to_hf(self, scenario: Scenario) -> None:
         kv_cache_manager.add_dummy_requests(request_ids, token_nums)
 
         attn_metadata = metadata_cls(
-            seq_lens=torch.tensor([input_ids.size(-1)], dtype=torch.int),
+            seq_lens=torch.tensor([input_ids.size(-1)], dtype=torch.long),
             num_contexts=1,
             kv_cache_params=KVCacheParams(
                 use_cache=True,
@@ -325,7 +324,7 @@ def test_starcoder2_allclose_to_hf(self, scenario: Scenario) -> None:
             prompt_lens=prompt_lens,
         )
 
-        position_ids = [torch.arange(0, input_ids.size(-1), dtype=torch.int32)]
+        position_ids = [torch.arange(0, input_ids.size(-1), dtype=torch.long)]
         position_ids = torch.cat(position_ids).unsqueeze(0).cuda()
 
         with torch.inference_mode():
@@ -343,11 +342,11 @@ def test_starcoder2_allclose_to_hf(self, scenario: Scenario) -> None:
             torch.testing.assert_close(logits, ref.logits[:, -1].float(), atol=0.4, rtol=0.4)
 
         # Generation phase (optionally with CUDA graphs)
-        gen_input_ids = torch.tensor([900], dtype=torch.int32, device=device)
+        gen_input_ids = torch.tensor([900], dtype=torch.long, device=device)
         num_cached_tokens_per_seq = [input_ids.size(-1)]
 
         attn_metadata = metadata_cls(
-            seq_lens=torch.tensor([gen_input_ids.size(-1)], dtype=torch.int),
+            seq_lens=torch.tensor([gen_input_ids.size(-1)], dtype=torch.long),
             num_contexts=0,
             kv_cache_params=KVCacheParams(
                 use_cache=True,
@@ -362,18 +361,17 @@ def test_starcoder2_allclose_to_hf(self, scenario: Scenario) -> None:
 
         gen_position_ids = [
             torch.arange(
-                input_ids.size(-1), input_ids.size(-1) + gen_input_ids.size(-1), dtype=torch.int32
+                input_ids.size(-1), input_ids.size(-1) + gen_input_ids.size(-1), dtype=torch.long
             )
         ]
         gen_position_ids = torch.cat(gen_position_ids).unsqueeze(0).cuda()
 
         # Setup CUDA graph runner if requested
         graph_runner = None
         if use_cuda_graph:
-            from _torch.helpers import create_mock_engine
+            from _torch.helpers import create_mock_cuda_graph_runner
 
-            mock_engine = create_mock_engine(1)
-            graph_runner = CUDAGraphRunner(mock_engine)
+            graph_runner = create_mock_cuda_graph_runner(1)
             attn_metadata = attn_metadata.create_cuda_graph_metadata(1)
 
         # Run generation phase
@@ -476,7 +474,7 @@ def test_starcoder2_generated_tokens_match_hf(self, scenario: Scenario) -> None:
         # Encode test prompt
         input_ids = torch.tensor(
             tokenizer.encode(test_prompt),
-            dtype=torch.int32,
+            dtype=torch.long,
             device=device,
         )
 
@@ -508,7 +506,7 @@ def test_starcoder2_generated_tokens_match_hf(self, scenario: Scenario) -> None:
         kv_cache_manager.add_dummy_requests(request_ids, token_nums)
 
         attn_metadata = metadata_cls(
-            seq_lens=torch.tensor([input_ids.size(-1)], dtype=torch.int),
+            seq_lens=torch.tensor([input_ids.size(-1)], dtype=torch.long),
             num_contexts=1,
             kv_cache_params=KVCacheParams(
                 use_cache=True,
@@ -522,7 +520,7 @@ def test_starcoder2_generated_tokens_match_hf(self, scenario: Scenario) -> None:
         )
 
         position_ids = torch.arange(
-            0, input_ids.size(-1), dtype=torch.int32, device=device
+            0, input_ids.size(-1), dtype=torch.long, device=device
         ).unsqueeze(0)
 
         with torch.inference_mode():
@@ -540,10 +538,10 @@ def test_starcoder2_generated_tokens_match_hf(self, scenario: Scenario) -> None:
 
         # Generation phase - generate remaining tokens
         for step in range(1, max_new_tokens):
-            gen_input_ids = torch.tensor([next_token_id], dtype=torch.int32, device=device)
+            gen_input_ids = torch.tensor([next_token_id], dtype=torch.long, device=device)
 
             attn_metadata = metadata_cls(
-                seq_lens=torch.tensor([1], dtype=torch.int),
+                seq_lens=torch.tensor([1], dtype=torch.long),
                 num_contexts=0,
                 kv_cache_params=KVCacheParams(
                     use_cache=True,
@@ -557,7 +555,7 @@ def test_starcoder2_generated_tokens_match_hf(self, scenario: Scenario) -> None:
             )
 
             gen_position_ids = torch.arange(
-                num_cached_tokens, num_cached_tokens + 1, dtype=torch.int32, device=device
+                num_cached_tokens, num_cached_tokens + 1, dtype=torch.long, device=device
             ).unsqueeze(0)
 
             with torch.inference_mode():