Fix TRTLLM API: drop PyTorchConfig and forward backend options to LLM via kwargs (#301)

meatybobby · web-flow · commit 159e06182584 · 2025-08-07T17:20:30.000Z
diff --git a/nemo_deploy/nlp/trtllm_api_deployable.py b/nemo_deploy/nlp/trtllm_api_deployable.py
@@ -38,7 +38,6 @@
 
 try:
     from tensorrt_llm import SamplingParams
-    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
     from tensorrt_llm.llmapi.llm import LLM, TokenizerBase
 
     HAVE_TENSORRT_LLM = True
@@ -90,9 +89,6 @@ def __init__(
         if not HAVE_TRITON:
             raise ImportError(MISSING_TRITON_MSG)
 
-        config_args = {k: kwargs.pop(k) for k in PyTorchConfig.__annotations__.keys() & kwargs.keys()}
-        pytorch_config = PyTorchConfig(**config_args)
-
         self.model = LLM(
             model=hf_model_id_path,
             tokenizer=hf_model_id_path if tokenizer is None else tokenizer,
@@ -104,7 +100,6 @@ def __init__(
             max_num_tokens=max_num_tokens,
             backend=backend,
             dtype=dtype,
-            pytorch_backend_config=pytorch_config,
             **kwargs,
         )
 
diff --git a/scripts/deploy/nlp/deploy_trtllm_api_triton.py b/scripts/deploy/nlp/deploy_trtllm_api_triton.py
@@ -48,9 +48,8 @@ def get_args():
     )
     parser.add_argument("-dt", "--dtype", default="auto", type=str, help="Model data type")
     parser.add_argument("-ab", "--attn_backend", default="TRTLLM", type=str, help="Attention kernel backend")
-    parser.add_argument("-eos", "--enable_overlap_scheduler", action="store_true", help="Enable overlap scheduler")
+    parser.add_argument("-dos", "--disable_overlap_scheduler", action="store_true", help="Disable overlap scheduler")
     parser.add_argument("-ecp", "--enable_chunked_prefill", action="store_true", help="Enable chunked prefill")
-    parser.add_argument("-ucg", "--use_cuda_graph", action="store_true", help="Use CUDA graph")
     parser.add_argument("-dm", "--debug_mode", action="store_true", help="Enable debug mode")
     args = parser.parse_args()
     return args
@@ -79,9 +78,8 @@ def trtllm_deploy():
         max_num_tokens=args.max_num_tokens,
         dtype=args.dtype,
         attn_backend=args.attn_backend,
-        enable_overlap_scheduler=args.enable_overlap_scheduler,
+        disable_overlap_scheduler=args.disable_overlap_scheduler,
         enable_chunked_prefill=args.enable_chunked_prefill,
-        use_cuda_graph=args.use_cuda_graph,
     )
 
     try:
diff --git a/tests/unit_tests/deploy/test_trtllm_api_deployable.py b/tests/unit_tests/deploy/test_trtllm_api_deployable.py
@@ -37,13 +37,6 @@ def mock_sampling_params():
         yield mock
 
 
-@pytest.fixture
-def mock_pytorch_config():
-    with patch("nemo_deploy.nlp.trtllm_api_deployable.PyTorchConfig") as mock:
-        mock.__annotations__ = {}
-        yield mock
-
-
 try:
     import tensorrt_llm  # noqa: F401
 
@@ -55,7 +48,7 @@ def mock_pytorch_config():
 @pytest.mark.skipif(not HAVE_TENSORRT_LLM, reason="TensorRT-LLM is not installed")
 @pytest.mark.run_only_on("GPU")
 class TestTensorRTLLMAPIDeployable:
-    def test_initialization_with_defaults(self, mock_pytorch_config):
+    def test_initialization_with_defaults(self):
         from nemo_deploy.nlp.trtllm_api_deployable import TensorRTLLMAPIDeployable
 
         with patch("nemo_deploy.nlp.trtllm_api_deployable.LLM") as mock_llm_class:
@@ -67,7 +60,7 @@ def test_initialization_with_defaults(self, mock_pytorch_config):
             assert deployer.model == mock_llm_instance
             mock_llm_class.assert_called_once()
 
-    def test_initialization_with_custom_params(self, mock_pytorch_config):
+    def test_initialization_with_custom_params(self):
         from nemo_deploy.nlp.trtllm_api_deployable import TensorRTLLMAPIDeployable
 
         with patch("nemo_deploy.nlp.trtllm_api_deployable.LLM") as mock_llm_class:
@@ -109,7 +102,7 @@ def test_generate_without_model(self):
             with pytest.raises(RuntimeError, match="Model is not initialized"):
                 deployer.generate(prompts=["test prompt"])
 
-    def test_generate_with_model(self, mock_llm, mock_sampling_params, mock_pytorch_config):
+    def test_generate_with_model(self, mock_llm, mock_sampling_params):
         from nemo_deploy.nlp.trtllm_api_deployable import TensorRTLLMAPIDeployable
 
         with patch("nemo_deploy.nlp.trtllm_api_deployable.LLM") as mock_llm_class:
@@ -122,7 +115,7 @@ def test_generate_with_model(self, mock_llm, mock_sampling_params, mock_pytorch_
             mock_llm.generate.assert_called_once()
             mock_sampling_params.assert_called_once()
 
-    def test_generate_with_parameters(self, mock_llm, mock_sampling_params, mock_pytorch_config):
+    def test_generate_with_parameters(self, mock_llm, mock_sampling_params):
         from nemo_deploy.nlp.trtllm_api_deployable import TensorRTLLMAPIDeployable
 
         with patch("nemo_deploy.nlp.trtllm_api_deployable.LLM") as mock_llm_class:
@@ -135,7 +128,7 @@ def test_generate_with_parameters(self, mock_llm, mock_sampling_params, mock_pyt
             mock_llm.generate.assert_called_once()
             mock_sampling_params.assert_called_once_with(max_tokens=100, temperature=0.8, top_k=50, top_p=0.95)
 
-    def test_triton_input_output_config(self, mock_pytorch_config):
+    def test_triton_input_output_config(self):
         from nemo_deploy.nlp.trtllm_api_deployable import TensorRTLLMAPIDeployable
 
         with patch("nemo_deploy.nlp.trtllm_api_deployable.LLM"):