# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
from pathlib import Path
from typing import List, Optional, Union

import numpy as np
from pytriton.decorators import batch, first_value
from pytriton.model_config import Tensor
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi.llm import LLM, TokenizerBase
from transformers import PreTrainedTokenizerBase

from nemo_deploy import ITritonDeployable
from nemo_deploy.utils import cast_output, str_ndarray2list

LOGGER = logging.getLogger("NeMo")


class TensorRTLLMAPIDeployable(ITritonDeployable):
    """A Triton inference server compatible wrapper for the TensorRT-LLM LLM API.

    This class provides a standardized interface for deploying TensorRT-LLM LLM API
    models on a Triton inference server. It handles model loading, inference, and
    deployment configuration.

    Args:
        hf_model_id_path (str): Path to the HuggingFace model or model identifier.
            Can be a local path or a model ID from the HuggingFace Hub.
        tokenizer (Optional[Union[str, Path, TokenizerBase, PreTrainedTokenizerBase]]):
            Path to the tokenizer or a tokenizer instance. Defaults to the model path.
        tensor_parallel_size (int): Tensor parallelism size. Defaults to 1.
        pipeline_parallel_size (int): Pipeline parallelism size. Defaults to 1.
        moe_expert_parallel_size (int): MoE expert parallelism size. Defaults to -1.
        moe_tensor_parallel_size (int): MoE tensor parallelism size. Defaults to -1.
        max_batch_size (int): Maximum batch size. Defaults to 8.
        max_num_tokens (int): Maximum total tokens across all sequences in a batch. Defaults to 8192.
        backend (str): Backend to use for TensorRT-LLM. Defaults to "pytorch".
        dtype (str): Model data type. Defaults to "auto".
        **kwargs: Additional keyword arguments. Keys that match ``PyTorchConfig``
            fields are routed to the PyTorch backend config; the rest are passed
            through to the underlying ``LLM`` constructor.
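
    Example:
        A minimal construction sketch; the model ID and parallel size below
        are illustrative, not defaults::

            deployable = TensorRTLLMAPIDeployable(
                hf_model_id_path="meta-llama/Llama-3.1-8B-Instruct",
                tensor_parallel_size=2,
            )
            texts = deployable.generate(["Hello!"], max_length=32)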
    """

    def __init__(
        self,
        hf_model_id_path: str,
        tokenizer: Optional[Union[str, Path, TokenizerBase, PreTrainedTokenizerBase]] = None,
        tensor_parallel_size: int = 1,
        pipeline_parallel_size: int = 1,
        moe_expert_parallel_size: int = -1,
        moe_tensor_parallel_size: int = -1,
        max_batch_size: int = 8,
        max_num_tokens: int = 8192,
        backend: str = "pytorch",
        dtype: str = "auto",
        **kwargs,
    ):
        # Route any kwargs that name PyTorchConfig fields into the backend
        # config; whatever remains is forwarded to the LLM constructor.
        config_args = {k: kwargs.pop(k) for k in PyTorchConfig.__annotations__.keys() & kwargs.keys()}
        pytorch_config = PyTorchConfig(**config_args)

        self.model = LLM(
            model=hf_model_id_path,
            tokenizer=hf_model_id_path if tokenizer is None else tokenizer,
            tensor_parallel_size=tensor_parallel_size,
            pipeline_parallel_size=pipeline_parallel_size,
            moe_expert_parallel_size=moe_expert_parallel_size,
            moe_tensor_parallel_size=moe_tensor_parallel_size,
            max_batch_size=max_batch_size,
            max_num_tokens=max_num_tokens,
            backend=backend,
            dtype=dtype,
            pytorch_backend_config=pytorch_config,
            **kwargs,
        )

    def generate(
        self,
        prompts: List[str],
        max_length: int = 256,
        temperature: Optional[float] = None,
        top_k: Optional[int] = None,
        top_p: Optional[float] = None,
        **kwargs,
    ) -> List[str]:
        """Generate text for the provided input prompts.

        This method runs the input prompts through the loaded model and
        generates text according to the specified sampling parameters.

        Args:
            prompts: List of input prompts.
            max_length: Maximum number of tokens to generate. Defaults to 256.
            temperature: Sampling temperature. Defaults to None.
            top_k: Number of highest-probability tokens to consider. Defaults to None.
            top_p: Cumulative probability threshold for token sampling. Defaults to None.
            **kwargs: Additional keyword arguments forwarded to ``SamplingParams``.

        Returns:
            List[str]: A list of generated texts, one for each input prompt.

        Raises:
            RuntimeError: If the model is not initialized.
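
        Example:
            An illustrative call (parameter values are examples, not defaults)::

                texts = deployable.generate(
                    prompts=["What is TensorRT-LLM?"],
                    max_length=64,
                    temperature=0.7,
                    top_p=0.9,
                )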
        """
        if not self.model:
            raise RuntimeError("Model is not initialized")

        sampling_params = SamplingParams(
            max_tokens=max_length,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            **kwargs,
        )

        outputs = self.model.generate(
            inputs=prompts,
            sampling_params=sampling_params,
        )

        return [output.outputs[0].text for output in outputs]

    @property
    def get_triton_input(self):
        """Triton input tensor specs: prompts plus optional sampling controls."""
        inputs = (
            Tensor(name="prompts", shape=(-1,), dtype=bytes),
            Tensor(name="max_length", shape=(-1,), dtype=np.int_, optional=True),
            Tensor(name="max_batch_size", shape=(-1,), dtype=np.int_, optional=True),
            Tensor(name="top_k", shape=(-1,), dtype=np.int_, optional=True),
            Tensor(name="top_p", shape=(-1,), dtype=np.single, optional=True),
            Tensor(name="temperature", shape=(-1,), dtype=np.single, optional=True),
        )
        return inputs

    @property
    def get_triton_output(self):
        """Triton output tensor spec: one generated sentence per input prompt."""
        return (Tensor(name="sentences", shape=(-1,), dtype=bytes),)

    @batch
    @first_value("temperature", "top_k", "top_p", "max_length")
    def triton_infer_fn(self, **inputs: np.ndarray):
        """Triton-compatible inference function.

        Decodes the batched ``prompts`` tensor, unwraps the optional scalar
        sampling controls, delegates to :meth:`generate`, and casts the
        generated texts to the ``sentences`` output tensor.
        """
        prompts = str_ndarray2list(inputs.pop("prompts"))
        temperature = inputs.pop("temperature", None)
        top_k = inputs.pop("top_k", None)
        top_p = inputs.pop("top_p", None)
        max_length = inputs.pop("max_length", 256)

        output = self.generate(
            prompts=prompts,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            max_length=max_length,
        )

        return {"sentences": cast_output(output, np.bytes_)}
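

# A minimal serving sketch, kept as a comment so importing this module has no
# side effects. The model path and bound model name are illustrative
# assumptions; in practice this class is typically bound to Triton through
# NeMo deploy entrypoints rather than by hand:
#
#     from pytriton.triton import Triton
#
#     deployable = TensorRTLLMAPIDeployable(hf_model_id_path="gpt2")
#     with Triton() as triton:
#         triton.bind(
#             model_name="trtllm_model",
#             infer_func=deployable.triton_infer_fn,
#             inputs=deployable.get_triton_input,
#             outputs=deployable.get_triton_output,
#         )
#         triton.serve()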