NVIDIA-NeMo
diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
Lines changed: 40 additions & 15 deletions
diff --git a/nemo_deploy/nlp/trtllm_api_deployable.py b/nemo_deploy/nlp/trtllm_api_deployable.py
Lines changed: 30 additions & 6 deletions
diff --git a/tests/functional_tests/L2_NeMo_2_Export_Deploy_Query_Ray.sh b/tests/functional_tests/L2_NeMo_2_Export_Deploy_Query_Ray.sh
Lines changed: 0 additions & 23 deletions
diff --git a/tests/functional_tests/L2_NeMo_2_Export_Deploy_Query_TRTLLM_Ray.sh b/tests/functional_tests/L2_NeMo_2_Export_Deploy_Query_TRTLLM_Ray.sh
Lines changed: 0 additions & 25 deletions
diff --git a/tests/functional_tests/L2_NeMo_2_Export_In_Framework.sh b/tests/functional_tests/L2_NeMo_2_Export_In_Framework.sh
Lines changed: 0 additions & 34 deletions
diff --git a/tests/functional_tests/L2_NeMo_2_Export_TRT_LLM.sh b/tests/functional_tests/L2_NeMo_2_Export_TRT_LLM.sh
Lines changed: 0 additions & 26 deletions
diff --git a/tests/functional_tests/tests_inframework/test_export.py b/tests/functional_tests/tests_inframework/test_export.py
Lines changed: 85 additions & 0 deletions
@@ -143,15 +143,15 @@ jobs:
         run: |
           echo "Running CI tests"
 
-  cicd-unit-tests:
+  cicd-unit-tests-trtllm:
     strategy:
       fail-fast: false
       matrix:
         include:
-          - script: L0_Unit_Tests_GPU_Export_Deploy
+          - script: L0_Unit_Tests_GPU
             runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
             timeout: 30
-          - script: L0_Unit_Tests_CPU_Export_Deploy
+          - script: L0_Unit_Tests_CPU
             runner: linux-amd64-cpu16
             cpu-only: true
     needs: [pre-flight, cicd-wait-in-queue]
@@ -175,6 +175,38 @@ jobs:
           PAT: ${{ secrets.PAT }}
           inference-framework: trtllm
 
+  cicd-unit-tests-vllm:  # L0 unit-test matrix executed through the test template with inference-framework: vllm
+    strategy:
+      fail-fast: false  # a failing matrix entry does not cancel the other one
+      matrix:
+        include:
+          - script: L0_Unit_Tests_GPU
+            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
+            timeout: 30  # forwarded to the template below; entries without a timeout fall back to 10
+          - script: L0_Unit_Tests_CPU
+            runner: linux-amd64-cpu16
+            cpu-only: true  # forwarded to the template below; entries without it fall back to false
+    needs: [pre-flight, cicd-wait-in-queue]
+    runs-on: ${{ matrix.runner }}
+    name: ${{ matrix.script }}  # NOTE(review): scripts match cicd-unit-tests-trtllm; confirm check names stay distinct
+    environment: nemo-ci
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: main
+        uses: ./.github/actions/test-template  # repository-local composite action; inputs are passed via `with`
+        with:
+          script: ${{ matrix.script }}
+          timeout: ${{ matrix.timeout || 10 }}  # default applies to the CPU entry, which sets no timeout
+          is_unit_test: "true"
+          cpu-only: ${{ matrix.cpu-only || false }}  # default applies to the GPU entry, which sets no cpu-only
+          has-azure-credentials: "true"
+          azure-client-id: ${{ secrets.AZURE_CLIENT_ID }}
+          azure-tenant-id: ${{ secrets.AZURE_TENANT_ID }}
+          azure-subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
+          PAT: ${{ secrets.PAT }}
+          inference-framework: vllm  # presumably selects the vLLM test environment in the template; verify there
+
   cicd-e2e-tests-trtllm:
     strategy:
       fail-fast: false
@@ -183,23 +215,15 @@ jobs:
           - script: L2_Launch_TRTLLM
             runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
             timeout: 60
-          - script: L2_NeMo_2_Export_Deploy_Query_Ray
-            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
-          - script: L2_NeMo_2_Export_Deploy_Query_TRTLLM_Ray
-            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
           - script: L2_ONNX_TRT_LLM_Embedding_Export
             runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
           - script: L2_ONNX_TRT_LLM_Embedding_Export_INT8
             runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
-          - script: L2_NeMo_2_Export_TRT_LLM
-            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
-          - script: L2_NeMo_2_Export_In_Framework
-            runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
           - script: L2_NeMo_2_Export_Qnemo_TRT_LLM
             runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
           - script: L2_TRTLLM_API_Deploy_Query
             runner: linux-amd64-gpu-rtxa6000-latest-2-nemo
-    needs: [cicd-unit-tests]
+    needs: [cicd-unit-tests-trtllm]
     runs-on: ${{ matrix.runner }}
     name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
     environment: nemo-ci
@@ -221,7 +245,7 @@ jobs:
           inference-framework: trtllm
 
   cicd-e2e-tests-vllm:
-    needs: [cicd-unit-tests]
+    needs: [cicd-unit-tests-vllm]
     runs-on: linux-amd64-gpu-rtxa6000-latest-2-nemo
     name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
     environment: nemo-ci
@@ -243,7 +267,7 @@ jobs:
           inference-framework: vllm
 
   cicd-e2e-tests-inframework:
-    needs: [cicd-unit-tests]
+    needs: [cicd-unit-tests-trtllm, cicd-unit-tests-vllm]
     runs-on: linux-amd64-gpu-rtxa6000-latest-2-nemo
     name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
     environment: nemo-ci
@@ -267,7 +291,8 @@ jobs:
   Nemo_CICD_Test:
     needs:
       - pre-flight
-      - cicd-unit-tests
+      - cicd-unit-tests-trtllm
+      - cicd-unit-tests-vllm
       - cicd-e2e-tests-trtllm
       - cicd-e2e-tests-vllm
     if: always()
 
@@ -17,15 +17,33 @@
 from typing import List, Optional, Union
 
 import numpy as np
-from pytriton.decorators import batch, first_value
-from pytriton.model_config import Tensor
-from tensorrt_llm import SamplingParams
-from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-from tensorrt_llm.llmapi.llm import LLM, TokenizerBase
 from transformers import PreTrainedTokenizerBase
 
 from nemo_deploy import ITritonDeployable
 from nemo_deploy.utils import cast_output, str_ndarray2list
+from nemo_export_deploy_common.import_utils import MISSING_TENSORRT_LLM_MSG, MISSING_TRITON_MSG, null_decorator
+
+try:
+    from pytriton.decorators import batch, first_value
+    from pytriton.model_config import Tensor
+
+    HAVE_TRITON = True
+except ImportError:
+    from unittest.mock import MagicMock
+
+    Tensor = MagicMock()
+    batch = null_decorator
+    first_value = null_decorator
+    HAVE_TRITON = False
+
+try:
+    from tensorrt_llm import SamplingParams
+    from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
+    from tensorrt_llm.llmapi.llm import LLM, TokenizerBase
+
+    HAVE_TENSORRT_LLM = True
+except ImportError:
+    HAVE_TENSORRT_LLM = False
 
 LOGGER = logging.getLogger("NeMo")
 
@@ -55,7 +73,7 @@ class TensorRTLLMAPIDeployable(ITritonDeployable):
     def __init__(
         self,
         hf_model_id_path: str,
-        tokenizer: Optional[Union[str, Path, TokenizerBase, PreTrainedTokenizerBase]] = None,
+        tokenizer: Optional[Union[str, Path, "TokenizerBase", PreTrainedTokenizerBase]] = None,
         tensor_parallel_size: int = 1,
         pipeline_parallel_size: int = 1,
         moe_expert_parallel_size: int = -1,
@@ -66,6 +84,12 @@ def __init__(
         dtype: str = "auto",
         **kwargs,
     ):
+        if not HAVE_TENSORRT_LLM:
+            raise ImportError(MISSING_TENSORRT_LLM_MSG)
+
+        if not HAVE_TRITON:
+            raise ImportError(MISSING_TRITON_MSG)
+
         config_args = {k: kwargs.pop(k) for k in PyTorchConfig.__annotations__.keys() & kwargs.keys()}
         pytorch_config = PyTorchConfig(**config_args)
 
 
@@ -0,0 +1,85 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import shutil
+import subprocess
+import tempfile
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+class TestInFrameworkExport:
+    """Functional test: run the HF import helper for Llama-3.2-1B, then the in-framework export script."""
+
+    _COVERAGE_RUN = ["coverage", "run", "--data-file=/workspace/.coverage", "--source=/workspace/", "--parallel-mode"]
+
+    @classmethod
+    def setup_class(cls):
+        """Create a scratch directory and write the imported checkpoint to `<testdir>/nemo2_ckpt`."""
+        cls.testdir = tempfile.mkdtemp()
+        logger.info(f"Test directory: {cls.testdir}")
+        try:
+            subprocess.run(
+                [
+                    *cls._COVERAGE_RUN,
+                    "tests/functional_tests/utils/test_hf_import.py",
+                    "--hf_model",
+                    "meta-llama/Llama-3.2-1B",
+                    "--output_path",
+                    f"{cls.testdir}/nemo2_ckpt",
+                    "--config",
+                    "Llama32Config1B",
+                ],
+                check=True,
+            )
+        except BaseException:  # pytest skips teardown_class after a setup_class failure; clean up here
+            shutil.rmtree(cls.testdir, ignore_errors=True)
+            raise
+
+    @classmethod
+    def teardown_class(cls):
+        """Delete the scratch directory created by setup_class."""
+        logger.info(f"Removing test directory: {cls.testdir}")
+        shutil.rmtree(cls.testdir)
+
+    def test_inframework_export(self):
+        """Run run_nemo_export.py on the imported checkpoint; a non-zero exit code fails the test."""
+        subprocess.run(
+            [
+                *self._COVERAGE_RUN,
+                "tests/functional_tests/utils/run_nemo_export.py",
+                "--model_name",
+                "test",
+                "--model_type",
+                "llama",
+                "--checkpoint_dir",
+                f"{self.testdir}/nemo2_ckpt",
+                "--min_tps",
+                "1",
+                "--in_framework",
+                "True",
+                "--test_deployment",
+                "True",
+                "--run_accuracy",
+                "True",
+                "--test_data_path",
+                "tests/functional_tests/data/lambada.json",
+                "--accuracy_threshold",
+                "0.0",
+                "--debug",
+            ],
+            check=True,
+        )