[TRTLLM-7964][infra] Set nixl to default cache transceiver backend (NVIDIA#7926)

bo-nv · dominicshanshan · commit e33024c75a25 · 2025-11-02T19:11:57.000-08:00
Signed-off-by: Bo Deng &lt;deemod@nvidia.com&gt;
diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp
@@ -89,7 +89,7 @@ std::unique_ptr<BaseCacheTransceiver> CacheTransceiverFactory::createCacheTransc
         }
         else
         {
-            backendType = executor::CacheTransceiverConfig::BackendType::UCX;
+            backendType = executor::CacheTransceiverConfig::BackendType::NIXL;
         }
     }
     cacheTransceiverConfig.value().setBackendType(backendType);
diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi
@@ -93,15 +93,6 @@ COPY docker/common/install_triton.sh \
 
 RUN bash ./install_triton.sh && rm install_triton.sh
 
-# Install UCX first
-RUN bash ./install_ucx.sh && rm install_ucx.sh
-
-# Install NIXL
-RUN bash ./install_nixl.sh && rm install_nixl.sh
-
-# Install etcd
-RUN bash ./install_etcd.sh && rm install_etcd.sh
-
 FROM ${DEVEL_IMAGE} AS wheel
 WORKDIR /src/tensorrt_llm
 COPY benchmarks benchmarks
diff --git a/docs/source/features/disagg-serving.md b/docs/source/features/disagg-serving.md
@@ -106,7 +106,7 @@ cache_transceiver_config:
   max_tokens_in_buffer: <int>
 ```
 
-`backend` specifies the communication backend for transferring the kvCache, valid options include `DEFAULT`,`UCX`, `NIXL`, and `MPI`, the default backend is UCX.
+`backend` specifies the communication backend for transferring the kvCache, valid options include `DEFAULT`,`UCX`, `NIXL`, and `MPI`, the default backend is NIXL.
 
 `max_tokens_in_buffer` defines the buffer size for kvCache transfers, it is recommended to set this value greater than or equal to the maximum ISL (Input Sequence Length) of all requests for optimal performance.
 
diff --git a/docs/source/installation/linux.md b/docs/source/installation/linux.md
@@ -17,6 +17,9 @@
    pip3 install torch==2.9.0 torchvision --index-url https://download.pytorch.org/whl/cu130
 
    sudo apt-get -y install libopenmpi-dev
+   
+   # Optional step: Only required for disagg-serving
+   sudo apt-get -y install libzmq3-dev
    ```
 
    ```{tip}
diff --git a/examples/disaggregated/README.md b/examples/disaggregated/README.md
@@ -12,7 +12,7 @@ The `trtllm-serve` command supports the `extra-llm-config.yaml` parameter. In th
 
 ```yaml
 cache_transceiver_config:
-  # KV cache transmission backend. Valid options include `DEFAULT` (i.e., UCX), `UCX`, `NIXL`.
+  # KV cache transmission backend. Valid options include `DEFAULT` (i.e., NIXL), `UCX`, `NIXL`.
   backend: <str>
   # KV cache buffer size. Set it ≥ the maximum ISL (Input Sequence Length) for best performance.
   max_tokens_in_buffer: <int>
diff --git a/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py b/tensorrt_llm/_torch/pyexecutor/kv_cache_transceiver.py
@@ -38,10 +38,10 @@ def create_kv_cache_transceiver(
 
     if cache_transceiver_config.backend == BackendTypeCpp.DEFAULT:
         # When cache_transceiver_config.backend is not set, fallback to env_vars settings
-        # UCX is the default backend
-        cache_transceiver_config.backend = BackendTypeCpp.UCX
+        # NIXL is the default backend
+        cache_transceiver_config.backend = BackendTypeCpp.NIXL
         # Ordered by priority
-        env_vars = [("TRTLLM_USE_NIXL_KVCACHE", BackendTypeCpp.NIXL),
+        env_vars = [("TRTLLM_USE_UCX_KVCACHE", BackendTypeCpp.UCX),
                     ("TRTLLM_USE_MPI_KVCACHE", BackendTypeCpp.MPI)]
         for env_var, be_type in env_vars:
             if getenv(env_var) == "1":
diff --git a/tests/integration/defs/disaggregated/test_disaggregated.py b/tests/integration/defs/disaggregated/test_disaggregated.py
@@ -430,6 +430,9 @@ def run_disaggregated_test(example_dir,
             config_file
         ]
     else:
+        pytest.skip(
+            "https://nvbugs/5584607 Ray orchestrator is not supported with NIXL(DEFAULT) cache transceiver backend."
+        )
         with open(config_file, 'r') as f:
             config = yaml.safe_load(f)
 

Original file line number	Diff line number	Diff line change
`@@ -89,7 +89,7 @@ std::unique_ptr<BaseCacheTransceiver> CacheTransceiverFactory::createCacheTransc`
`89`	`89`	`}`
`90`	`90`	`else`
`91`	`91`	`{`
`92`		`- backendType = executor::CacheTransceiverConfig::BackendType::UCX;`
	`92`	`+ backendType = executor::CacheTransceiverConfig::BackendType::NIXL;`
`93`	`93`	`}`
`94`	`94`	`}`
`95`	`95`	`cacheTransceiverConfig.value().setBackendType(backendType);`
Original file line number	Diff line number	Diff line change
`@@ -430,6 +430,9 @@ def run_disaggregated_test(example_dir,`
`430`	`430`	`config_file`
`431`	`431`	`]`
`432`	`432`	`else:`
	`433`	`+ pytest.skip(`
	`434`	`+ "https://nvbugs/5584607 Ray orchestrator is not supported with NIXL(DEFAULT) cache transceiver backend."`
	`435`	`+ )`
`433`	`436`	`with open(config_file, 'r') as f:`
`434`	`437`	`config = yaml.safe_load(f)`
`435`	`438`