 
 from tensorrt_llm import MultimodalEncoder
 from tensorrt_llm.inputs import default_multimodal_input_loader
-from tensorrt_llm.llmapi import KvCacheConfig
+from tensorrt_llm.llmapi import CacheTransceiverConfig, KvCacheConfig
 from tensorrt_llm.llmapi.llm import LLM, SamplingParams
 
 test_data_root = Path(
@@ -41,7 +41,8 @@ def multimodal_model_config():
 @pytest.mark.parametrize("model_key", [
     "llava-v1.6-mistral-7b-hf",
 ])
-def test_single_image_chat(model_key, multimodal_model_config):
+@pytest.mark.parametrize("pd_disagg", [False, True])
+def test_single_image_chat(model_key, pd_disagg, multimodal_model_config):
     """Test processing single image using encoder (pass mm_embeddings) + LLM API.
 
     This test verifies that encoder (pass mm_embeddings) + LLM API produces identical
@@ -59,7 +60,7 @@ def test_single_image_chat(model_key, multimodal_model_config):
 
     # Test configuration
     max_tokens = 64
-    free_gpu_memory_fraction = 0.6
+    free_gpu_memory_fraction = 0.6 if not pd_disagg else 0.2
     max_batch_size = 1
 
     # Test data - OpenAI chat completion format
@@ -76,10 +77,26 @@ def test_single_image_chat(model_key, multimodal_model_config):
     # Process multimodal data using encoder (pass mm_embeddings)
     encoder = MultimodalEncoder(model=encoder_model_dir,
                                 max_batch_size=max_batch_size)
+
+    cache_transceiver_cfg = CacheTransceiverConfig(
+        backend="DEFAULT") if pd_disagg else None
+
+    disable_overlap_scheduler = pd_disagg
+
     llm = LLM(model=encoder_model_dir,
               backend='pytorch',
               kv_cache_config=kv_cache_config,
-              trust_remote_code=True)
+              trust_remote_code=True,
+              cache_transceiver_config=cache_transceiver_cfg,
+              disable_overlap_scheduler=disable_overlap_scheduler)
+
+    llm_decode = None
+    if pd_disagg:
+        llm_decode = LLM(model=encoder_model_dir,
+                         backend='pytorch',
+                         kv_cache_config=kv_cache_config,
+                         trust_remote_code=True,
+                         cache_transceiver_config=cache_transceiver_cfg)
 
     # Load model configuration
     config_path = os.path.join(llm._hf_model_dir, 'config.json')
@@ -122,10 +139,23 @@ def test_single_image_chat(model_key, multimodal_model_config):
     ep_disaggregated_params = encoder_outputs[0].disaggregated_params
 
     assert ep_disaggregated_params is not None, "Encoder output disaggregated params is None"
-    ep_disaggregated_params.request_type = "context_and_generation"
+    ep_disaggregated_params.request_type = "context_and_generation" if not pd_disagg else "context_only"
     outputs = llm.generate(inputs,
                            sampling_params=sampling_params,
                            disaggregated_params=ep_disaggregated_params)
+
+    if pd_disagg:
+        # Generation using llm_decode
+        assert len(outputs) == 1
+        pd_disaggregated_params = outputs[0].disaggregated_params
+        pd_disaggregated_params.request_type = "generation_only"
+        sampling_params = SamplingParams(max_tokens=max_tokens)
+
+        outputs = llm_decode.generate(
+            inputs,
+            sampling_params=sampling_params,
+            disaggregated_params=pd_disaggregated_params)
+
     # Validate outputs
     assert len(outputs) == len(
         prompts), f"Expected {len(prompts)} outputs, got {len(outputs)}"
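
For reference, a condensed sketch of the prefill/decode disaggregation flow that the new pd_disagg=True path exercises. It reuses the names from the test above; `inputs`, `sampling_params`, and `kv_cache_config` are assumed to be prepared as in the test, and the `encoder.generate(...)` call is an assumption, since the line producing `encoder_outputs` is outside this diff.

# Sketch only (not part of the diff): prefill/decode disaggregation with a multimodal encoder.
# `encoder_model_dir`, `inputs`, `sampling_params`, and `kv_cache_config` are assumed to be
# set up as in the test above; encoder.generate() is also an assumption.
from tensorrt_llm import MultimodalEncoder
from tensorrt_llm.llmapi import CacheTransceiverConfig
from tensorrt_llm.llmapi.llm import LLM, SamplingParams

cache_cfg = CacheTransceiverConfig(backend="DEFAULT")
encoder = MultimodalEncoder(model=encoder_model_dir, max_batch_size=1)

# Two LLM instances share one GPU in this test, hence the reduced
# free_gpu_memory_fraction (0.2) when pd_disagg is enabled.
prefill_llm = LLM(model=encoder_model_dir, backend='pytorch',
                  kv_cache_config=kv_cache_config, trust_remote_code=True,
                  cache_transceiver_config=cache_cfg,
                  disable_overlap_scheduler=True)
decode_llm = LLM(model=encoder_model_dir, backend='pytorch',
                 kv_cache_config=kv_cache_config, trust_remote_code=True,
                 cache_transceiver_config=cache_cfg)

# Encoder produces the multimodal embeddings plus the handoff metadata.
ep_params = encoder.generate(inputs)[0].disaggregated_params

# Context-only pass: prefill the KV cache on the first instance.
ep_params.request_type = "context_only"
ctx_outputs = prefill_llm.generate(inputs, sampling_params=sampling_params,
                                   disaggregated_params=ep_params)

# Generation-only pass: the second instance picks up the KV cache and decodes.
gen_params = ctx_outputs[0].disaggregated_params
gen_params.request_type = "generation_only"
final_outputs = decode_llm.generate(inputs, sampling_params=sampling_params,
                                    disaggregated_params=gen_params)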