[TRTLLM-9522][fix] restore trtllm-serve mm_embedding_serve (#9669)

ixlmar · web-flow · commit 744f0eff1b92 · 2025-12-03T19:27:11.000-08:00
diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py
@@ -216,12 +216,14 @@ def launch_mm_encoder_server(
     metadata_server_cfg: Optional[MetadataServerConfig] = None,
 ):
     model = encoder_args["model"]
+    encoder_args.pop("build_config")
     mm_encoder = MultimodalEncoder(**encoder_args)
 
     server = OpenAIServer(llm=mm_encoder,
                           model=model,
                           server_role=ServerRole.MM_ENCODER,
-                          metadata_server_cfg=metadata_server_cfg)
+                          metadata_server_cfg=metadata_server_cfg,
+                          tool_parser=None)
     asyncio.run(server(host, port))
 
 
diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml
@@ -50,6 +50,7 @@ l0_a10:
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_simple_llama[True-True-TinyLlama-1.1B-Chat-v1.0]
   - test_e2e.py::test_openai_chat_guided_decoding
   - test_e2e.py::test_openai_chat_multimodal_example ISOLATION
+  - test_e2e.py::test_openai_mmencoder_example
   - test_e2e.py::test_openai_perf_metrics
   - test_e2e.py::test_openai_prometheus
   - test_e2e.py::test_openai_lora