NVIDIA
diff --git a/README.md b/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/docs/source/features/feature-combination-matrix.md b/docs/source/features/feature-combination-matrix.md
Lines changed: 21 additions & 17 deletions
diff --git a/docs/source/models/supported-models.md b/docs/source/models/supported-models.md
Lines changed: 2 additions & 1 deletion
diff --git a/examples/constraints.txt b/examples/constraints.txt
Lines changed: 1 addition & 1 deletion
diff --git a/examples/layer_wise_benchmarks/run.py b/examples/layer_wise_benchmarks/run.py
Lines changed: 4 additions & 1 deletion
diff --git a/examples/models/core/mistral_large_3/README.md b/examples/models/core/mistral_large_3/README.md
Lines changed: 2 additions & 1 deletion
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
Lines changed: 1 addition & 1 deletion
diff --git a/security_scanning/docs/poetry.lock b/security_scanning/docs/poetry.lock
Lines changed: 3 additions & 3 deletions
@@ -10,7 +10,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.<
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-13.0.0-green)](https://developer.nvidia.com/cuda-downloads)
 [![torch](https://img.shields.io/badge/torch-2.9.0-green)](https://pytorch.org)
-[![version](https://img.shields.io/badge/release-1.2.0rc7-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
+[![version](https://img.shields.io/badge/release-1.2.0rc8-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)
 
 [Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](https://nvidia.github.io/TensorRT-LLM/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
 
@@ -1,19 +1,23 @@
 # Feature Combination Matrix
 
-| Feature                    | Overlap Scheduler | CUDA Graph | Attention Data Parallelism | Disaggregated Serving | Chunked Prefill | MTP      | EAGLE-3(One Model Engine) | EAGLE-3(Two Model Engine) | Torch Sampler | TLLM C++ Sampler | KV Cache Reuse | Slide Window Attention | Logits Post Processor | Guided Decoding | LoRA |
-| -------------------------- | ----------------- | ---------- | -------------------------- | --------------------- | --------------- | -------- | ------------------------- | ------------------------- | ------------- | ---------------- | -------------- | ---------------------- | --------------------- | --------------- | ---- |
-| Overlap Scheduler          | ---               |            |                            |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |      |
-| CUDA Graph                 | Yes               | ---        |                            |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |      |
-| Attention Data Parallelism | Yes               | Yes        | ---                        |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |      |
-| Disaggregated Serving      | Yes               | Yes        | Yes                        | ---                   |                 |          |                           |                           |               |                  |                |                        |                       |                 |      |
-| Chunked Prefill            | Yes               | Yes        | Yes                        | Yes                   | ---             |          |                           |                           |               |                  |                |                        |                       |                 |      |
-| MTP                        | Yes               | Yes        | Yes                        | Yes                   | Yes             | ---      |                           |                           |               |                  |                |                        |                       |                 |      |
-| EAGLE-3(One Model Engine)  | Yes               | Yes        | Yes                        | Yes                   | Yes             | No       | ---                       |                           |               |                  |                |                        |                       |                 |      |
-| EAGLE-3(Two Model Engine)  | Yes               | Yes        | Yes                        | Yes                   | Yes             | No       | No                        | ---                       |               |                  |                |                        |                       |                 |      |
-| Torch Sampler              | Yes               | Yes        | Yes                        | Yes                   | Yes             | Yes      | Yes                       | Yes                       | ---           |                  |                |                        |                       |                 |      |
-| TLLM C++ Sampler           | Yes               | Yes        | Yes                        | Yes                   | Yes             | No       | No                        | No                        | No            | ---              |                |                        |                       |                 |      |
-| KV Cache Reuse             | Yes               | Yes        | Yes                        | Yes                   | Yes             | Yes      | Yes                       | Yes                       | Yes           | Yes              | ---            |                        |                       |                 |      |
-| Slide Window Attention     | Yes               | Yes        | Yes                        | Yes                   | Yes             | Yes      | Yes                       | Yes                       | Yes           | Yes              | Yes            | ---                    |                       |                 |      |
-| Logits Post Processor      | Yes               | Yes        | Yes                        | No                    | Yes             | No       | No                        | No                        | Yes           | Yes              | Yes            | Yes                    | ---                   |                 |      |
-| Guided Decoding            | Yes               | Yes        | Yes                        | Yes                   | Yes             | Yes      | Yes                       | Yes                       | Yes           | Yes              | Yes            | Yes                    | Yes                   | ---             |      |
-| LoRA                       | Yes               | No         | Untested                   | Untested              | Untested        | Untested | Untested                  | Untested                  | Yes           | Yes              | Yes            | Yes                    | Yes                   | Untested        | ---  |
+| Feature                    | Overlap Scheduler | CUDA Graph | Tensor Parallelism | Pipeline Parallelism | Expert Parallelism | Helix Parallelism | Attention Data Parallelism | Disaggregated Serving | Chunked Prefill | MTP      | EAGLE-3(One Model Engine) | EAGLE-3(Two Model Engine) | Torch Sampler | TLLM C++ Sampler | KV Cache Reuse | Slide Window Attention | Logits Post Processor | Guided Decoding | LoRA     |
+| -------------------------- | ----------------- | ---------- | ------------------ | -------------------- | ------------------ | ----------------- | -------------------------- | --------------------- | --------------- | -------- | ------------------------- | ------------------------- | ------------- | ---------------- | -------------- | ---------------------- | --------------------- | --------------- | -------- |
+| Overlap Scheduler          | ---               |            |                    |                      |                    |                   |                            |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |          |
+| CUDA Graph                 | Yes               | ---        |                    |                      |                    |                   |                            |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |          |
+| Tensor Parallelism         | Yes               | Yes        | ---                |                      |                    |                   |                            |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |          |
+| Pipeline Parallelism       | Yes               | Yes        | Yes                | ---                  |                    |                   |                            |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |          |
+| Expert Parallelism         | Yes               | Yes        | Yes                | Yes                  | ---                |                   |                            |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |          |
+| Helix Parallelism          | Untested          | Yes        | Yes                | Yes                  | Yes                | ---               |                            |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |          |
+| Attention Data Parallelism | Yes               | Yes        | Yes                | Yes                  | Yes                | Known issues      | ---                        |                       |                 |          |                           |                           |               |                  |                |                        |                       |                 |          |
+| Disaggregated Serving      | Yes               | Yes        | Yes                | Yes                  | Yes                | Yes               | Yes                        | ---                   |                 |          |                           |                           |               |                  |                |                        |                       |                 |          |
+| Chunked Prefill            | Yes               | Yes        | Yes                | Untested             | Yes                | Yes               | Yes                        | Yes                   | ---             |          |                           |                           |               |                  |                |                        |                       |                 |          |
+| MTP                        | Yes               | Yes        | Yes                | No                   | Yes                | No                | Yes                        | Yes                   | Yes             | ---      |                           |                           |               |                  |                |                        |                       |                 |          |
+| EAGLE-3(One Model Engine)  | Yes               | Yes        | Yes                | No                   | Yes                | No                | Yes                        | Yes                   | Yes             | No       | ---                       |                           |               |                  |                |                        |                       |                 |          |
+| EAGLE-3(Two Model Engine)  | Yes               | Yes        | Yes                | No                   | Yes                | No                | Yes                        | Yes                   | Yes             | No       | No                        | ---                       |               |                  |                |                        |                       |                 |          |
+| Torch Sampler              | Yes               | Yes        | Yes                | Yes                  | Yes                | Yes               | Yes                        | Yes                   | Yes             | Yes      | Yes                       | Yes                       | ---           |                  |                |                        |                       |                 |          |
+| TLLM C++ Sampler           | Yes               | Yes        | Yes                | Yes                  | Yes                | Yes               | Yes                        | Yes                   | Yes             | No       | No                        | No                        | No            | ---              |                |                        |                       |                 |          |
+| KV Cache Reuse             | Yes               | Yes        | Yes                | Yes                  | Yes                | Yes               | Yes                        | Yes                   | Yes             | Yes      | Yes                       | Yes                       | Yes           | Yes              | ---            |                        |                       |                 |          |
+| Slide Window Attention     | Yes               | Yes        | Yes                | Yes                  | Yes                | Untested          | Yes                        | Yes                   | Yes             | Yes      | Yes                       | Yes                       | Yes           | Yes              | Yes            | ---                    |                       |                 |          |
+| Logits Post Processor      | Yes               | Yes        | Yes                | Yes                  | Yes                | Yes               | Yes                        | No                    | Yes             | No       | No                        | No                        | Yes           | Yes              | Yes            | Yes                    | ---                   |                 |          |
+| Guided Decoding            | Yes               | Yes        | Yes                | Yes                  | Yes                | Yes               | Yes                        | Yes                   | Yes             | Yes      | Yes                       | Yes                       | Yes           | Yes              | Yes            | Yes                    | Yes                   | ---             |          |
+| LoRA                       | Yes               | No         | Yes                | Yes                  | Untested           | Untested          | Untested                   | Untested              | Yes             | Untested | Untested                  | Untested                  | Yes           | Yes              | Yes            | Yes                    | Yes                   | Untested        | ---      |
@@ -40,10 +40,11 @@ Note: Support for other models may vary. Features marked "N/A" are not applicabl
 | `Qwen3MoeForCausalLM`            | Yes               | Yes        | Yes                        | Yes                   | Yes             | No  | Yes                       | Yes                       | Yes           | Yes              | Yes            | N/A                      | Yes                   | Yes             |
 | `Qwen3NextForCausalLM`           | Yes                | Yes        | No                         | Untested                    | Yes              | No  | No                        | No                        | Yes            | Yes               | No             | No                       | Untested                    | Untested              |
 | `Llama4ForConditionalGeneration` | Yes               | Yes        | Yes                        | Yes                   | Yes             | No  | Yes                       | Yes                       | Yes           | Yes              | Untested       | N/A                      | Yes                   | Yes             |
-| `GptOssForCausalLM`            | Yes              | Yes         | Yes                        | Yes                   | No             | No   | Yes                       | No                        | Yes           | Yes              | No             | N/A                      | Yes                    | Yes             |
+| `GptOssForCausalLM`            | Yes              | Yes         | Yes                        | Yes                   | Yes             | No   | Yes                       | Yes [^3]                   | Yes           | Yes              | Yes             | N/A                      | Yes                    | Yes             |
 
 [^1]: Chunked Prefill for MLA can only be enabled on SM100/SM103.
 [^2]: KV cache reuse for MLA can only be enabled on SM90/SM100/SM103 and in BF16/FP8 KV cache dtype.
+[^3]: The overlap scheduler is not supported when EAGLE-3(Two Model Engine) is used with GPT-OSS.
 
 
 # Multimodal Feature Support Matrix (PyTorch Backend)
 
@@ -1,3 +1,3 @@
-tensorrt_llm==1.2.0rc7
+tensorrt_llm==1.2.0rc8
 evaluate~=0.4.1
 rouge_score~=0.1.2
@@ -10,10 +10,11 @@
 import yaml
 
 from tensorrt_llm._torch.autotuner import AutoTuner, autotune
+from tensorrt_llm._torch.distributed import MPIDist, TorchDist
 from tensorrt_llm._torch.modules.fused_moe.fused_moe_cutlass import CutlassFusedMoE
 from tensorrt_llm._torch.modules.fused_moe.interface import AlltoallMethodType
 from tensorrt_llm._torch.modules.multi_stream_utils import with_multi_stream
-from tensorrt_llm._utils import local_mpi_rank, mpi_rank, mpi_world_size
+from tensorrt_llm._utils import local_mpi_rank, mpi_disabled, mpi_rank, mpi_world_size
 from tensorrt_llm.logger import logger
 from tensorrt_llm.tools.layer_wise_benchmarks import BalanceMethod, get_runner_cls, mark_ranges
 
@@ -173,6 +174,8 @@ def comma_separated_floats(s):
 )
 if args.enable_autotuner:
     cache_path = os.getenv("TLLM_AUTOTUNER_CACHE_PATH") or None
+    dist = TorchDist(mapping=mapping) if mpi_disabled() else MPIDist(mapping=mapping)
+    AutoTuner.get().setup_distributed_state(mapping, dist)
     with autotune(cache_path=cache_path):
         run_pack()
 else:
 
@@ -19,7 +19,8 @@ mpirun -n 1 --allow-run-as-root --oversubscribe python3 examples/llm-api/quickst
     --max_tokens 100 \
     --checkpoint_format mistral \
     --model_type mistral_large_3 \
-    --moe_backend TRTLLM
+    --moe_backend TRTLLM \
+    --image_format pil
 ```
 
 ## LLM-only run
 
@@ -808,7 +808,7 @@ def getPytestBaseCommandLine(
         portEnvVars,
         pytestUtil,
         "pytest",
-        "-v",
+        "-vv",
         testFilter[(DETAILED_LOG)] ? "-s" : "",
         "--timeout-method=thread",
         "--apply-test-list-correction",