
Commit b4e5df0

Breaking change: perf: Enable scheduling overlap by default (#4174)
Signed-off-by: Kaiyu Xie <[email protected]>
1 parent 404fbe9 commit b4e5df0
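
With this change, the PyTorch backend runs the overlap scheduler unless it is explicitly turned off, and the old opt-in flag is removed. A minimal sketch of the new opt-out; the `PyTorchConfig` import path is assumed from the example files touched below:

```python
# Import path assumed from the examples in this commit; verify against your install.
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig

# Before: overlap scheduling had to be requested with enable_overlap_scheduler=True.
# After: it is the default, so a plain config already overlaps scheduling with execution.
config_default = PyTorchConfig()

# Opt out only where overlap must stay off, e.g. disaggregated context servers (see below).
config_no_overlap = PyTorchConfig(disable_overlap_scheduler=True)
```

Downstream code that still passes `enable_overlap_scheduler` must be updated, as the file changes below show.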

File tree

54 files changed: +110 −127 lines

Some content is hidden: large commits do not show every changed file by default.


docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md

Lines changed: 0 additions & 4 deletions

@@ -135,7 +135,6 @@ YOUR_DATA_PATH=<your dataset file following the format>
 
 cat >./extra-llm-api-config.yml<<EOF
 pytorch_backend_config:
-  enable_overlap_scheduler: true
   use_cuda_graph: true
   moe_backend: TRTLLM
 speculative_config:
@@ -218,7 +217,6 @@ pytorch_backend_config:
   - 256
   - 384
   print_iter_log: true
-  enable_overlap_scheduler: true
   enable_attention_dp: true
 EOF
 
@@ -260,7 +258,6 @@ YOUR_DATA_PATH=<your dataset file following the format>
 
 cat >./extra-llm-api-config.yml<<EOF
 pytorch_backend_config:
-  enable_overlap_scheduler: true
   use_cuda_graph: true
 speculative_config:
   decoding_type: MTP
@@ -314,7 +311,6 @@ pytorch_backend_config:
   use_cuda_graph: true
   cuda_graph_batch_sizes:
   - 128
-  enable_overlap_scheduler: true
   enable_attention_dp: true
 EOF
 
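
The documentation changes above simply delete `enable_overlap_scheduler: true` from the YAML snippets, since that is now the default. Existing `extra-llm-api-config.yml` files can be migrated the same way: drop the old key when it was `true`, or replace it with `disable_overlap_scheduler: true` when it was `false`. A hedged helper sketch (the use of PyYAML and the in-place rewrite are illustrative, not part of this commit):

```python
# Hypothetical migration helper, not part of this commit.
import yaml  # PyYAML

def migrate_overlap_key(path: str) -> None:
    """Rewrite a config file for the renamed overlap-scheduler option."""
    with open(path) as f:
        cfg = yaml.safe_load(f) or {}
    backend_cfg = cfg.get("pytorch_backend_config") or {}
    if "enable_overlap_scheduler" in backend_cfg:
        was_enabled = backend_cfg.pop("enable_overlap_scheduler")
        if not was_enabled:
            # Overlap scheduling is now on by default; only opting out needs a key.
            backend_cfg["disable_overlap_scheduler"] = True
        cfg["pytorch_backend_config"] = backend_cfg
    with open(path, "w") as f:
        yaml.safe_dump(cfg, f, sort_keys=False)

migrate_overlap_key("extra-llm-api-config.yml")
```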

examples/disaggregated/README.md

Lines changed: 2 additions & 2 deletions

@@ -9,7 +9,7 @@ You can use multiple `trtllm-serve` commands to launch the context and generatio
 for disaggregated serving. For example, you could launch two context servers and one generation servers as follows:
 
 ```
-echo -e "pytorch_backend_config:\n enable_overlap_scheduler: False\ncache_transceiver_config:\n max_num_tokens: 2048" > context_extra-llm-api-config.yml
+echo -e "pytorch_backend_config:\n disable_overlap_scheduler: True\ncache_transceiver_config:\n max_num_tokens: 2048" > context_extra-llm-api-config.yml
 echo -e "cache_transceiver_config:\n max_num_tokens: 2048" > gen_extra-llm-api-config.yml
 
 export TRTLLM_USE_UCX_KVCACHE=1
@@ -65,7 +65,7 @@ model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 1

examples/disaggregated/disagg_config.yaml

Lines changed: 1 addition & 1 deletion

@@ -5,7 +5,7 @@ free_gpu_memory_fraction: 0.25
 backend: "pytorch"
 pytorch_backend_config:
   use_cuda_graph: False
-  enable_overlap_scheduler: False
+  disable_overlap_scheduler: True
 context_servers:
   num_instances: 1
   tensor_parallel_size: 1

examples/llm-api/llm_inference_kv_events.py

Lines changed: 1 addition & 2 deletions

@@ -6,8 +6,7 @@
 
 
 def main():
-    pytorch_config = PyTorchConfig(enable_overlap_scheduler=True,
-                                   autotuner_enabled=False,
+    pytorch_config = PyTorchConfig(autotuner_enabled=False,
                                    kv_cache_dtype='auto')
 
     llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",

examples/llm-api/llm_mgmn_trtllm_bench.sh

Lines changed: 0 additions & 1 deletion

@@ -76,7 +76,6 @@ srun -l \
 cat > /tmp/pytorch_extra_args.txt << EOF
 pytorch_backend_config:
   use_cuda_graph: false
-  enable_overlap_scheduler: true
   cuda_graph_padding_enabled: false
   print_iter_log: true
   enable_attention_dp: false

examples/models/core/deepseek_v3/README.md

Lines changed: 4 additions & 5 deletions

@@ -21,7 +21,10 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/
 - [Quick Start](#quick-start)
 - [Run a single inference](#run-a-single-inference)
 - [Multi-Token Prediction (MTP)](#multi-token-prediction-mtp)
+- [Relaxed acceptance](#relaxed-acceptance)
 - [Long context support](#long-context-support)
+- [ISL-64k-OSL-1024](#isl-64k-osl-1024)
+- [ISL-128k-OSL-1024](#isl-128k-osl-1024)
 - [Evaluation](#evaluation)
 - [Serving](#serving)
 - [Use trtllm-serve](#use-trtllm-serve)
@@ -36,6 +39,7 @@ Please refer to [this guide](https://nvidia.github.io/TensorRT-LLM/installation/
 - [FP8 KV Cache and MLA](#fp8-kv-cache-and-mla)
 - [W4AFP8](#w4afp8)
 - [Notes and Troubleshooting](#notes-and-troubleshooting)
+- [Known Issues](#known-issues)
 
 
 ## Hardware Requirements
@@ -136,7 +140,6 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 
 cat <<EOF > /tmp/extra-llm-api-config.yml
 pytorch_backend_config:
-  enable_overlap_scheduler: true
   use_cuda_graph: true
   cuda_graph_padding_enabled: true
   cuda_graph_batch_sizes: [1, 4, 8, 12]
@@ -165,7 +168,6 @@ python /app/tensorrt_llm/benchmarks/cpp/prepare_dataset.py \
 
 cat <<EOF > /tmp/extra-llm-api-config.yml
 pytorch_backend_config:
-  enable_overlap_scheduler: true
   use_cuda_graph: true
   cuda_graph_padding_enabled: true
   cuda_graph_batch_sizes: [1, 2]
@@ -192,7 +194,6 @@ Evaluate the model accuracy using `trtllm-eval`.
 cat >./extra-llm-api-config.yml <<EOF
 pytorch_backend_config:
   use_cuda_graph: true
-  enable_overlap_scheduler: true
   enable_attention_dp: true
 EOF
 ```
@@ -249,7 +250,6 @@ pytorch_backend_config:
   - 256
   - 384
   print_iter_log: true
-  enable_overlap_scheduler: true
   enable_attention_dp: true
 EOF
 
@@ -441,7 +441,6 @@ pytorch_backend_config:
   - 256
   - 384
   print_iter_log: true
-  enable_overlap_scheduler: true
   enable_attention_dp: true
 EOF
 ```

examples/models/core/qwen/README.md

Lines changed: 1 addition & 2 deletions

@@ -22,7 +22,7 @@ This document shows how to build and run a [Qwen](https://huggingface.co/Qwen) m
 - [Run a single inference](#run-a-single-inference)
 - [Evaluation](#evaluation)
 - [Serving](#serving)
-- [Notes and Troubleshooting](#notes-and-troubleshooting)
+- [Notes and Troubleshooting](#notes-and-troubleshooting)
 - [Credits](#credits)
 
 ## Overview
@@ -668,7 +668,6 @@ pytorch_backend_config:
   - 256
   - 384
   print_iter_log: true
-  enable_overlap_scheduler: true
   enable_attention_dp: true
 EOF
 

examples/pytorch/quickstart_advanced.py

Lines changed: 2 additions & 2 deletions

@@ -72,7 +72,7 @@ def add_llm_args(parser):
     parser.add_argument("--kv_cache_fraction", type=float, default=None)
 
     # Runtime
-    parser.add_argument('--enable_overlap_scheduler',
+    parser.add_argument('--disable_overlap_scheduler',
                         default=False,
                         action='store_true')
     parser.add_argument('--enable_chunked_prefill',
@@ -124,7 +124,7 @@ def parse_arguments():
 
 def setup_llm(args):
     pytorch_config = PyTorchConfig(
-        enable_overlap_scheduler=args.enable_overlap_scheduler,
+        disable_overlap_scheduler=args.disable_overlap_scheduler,
         kv_cache_dtype=args.kv_cache_dtype,
         attn_backend=args.attention_backend,
         use_cuda_graph=args.use_cuda_graph,
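
Scripts built on top of this example that want to keep an "enable"-style CLI flag can simply invert the value before it reaches `PyTorchConfig`. A hypothetical adaptation (the flag name, wrapper, and import path are illustrative, not part of this commit):

```python
# Hypothetical adaptation for a script that keeps an enable-style CLI flag.
import argparse

from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig  # import path assumed

parser = argparse.ArgumentParser()
parser.add_argument("--enable_overlap_scheduler",
                    default=True,
                    action=argparse.BooleanOptionalAction)  # also adds --no-enable_overlap_scheduler
args = parser.parse_args()

# The renamed PyTorchConfig field takes the inverted value.
pytorch_config = PyTorchConfig(
    disable_overlap_scheduler=not args.enable_overlap_scheduler)
```

This keeps existing launch commands working while targeting the renamed option.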

examples/scaffolding/run_best_of_n_with_reward.py

Lines changed: 1 addition & 1 deletion

@@ -39,7 +39,7 @@ def main():
         max_batch_size=args.sample_num,
         max_num_tokens=8192,
         kv_cache_free_gpu_memory_fraction=0.2,
-        enable_overlap_scheduler=False)
+        disable_overlap_scheduler=True)
     workers[NativeGenerationController.WorkerTag.GENERATION] = gen_worker
     workers[QwenRewardController.WorkerTag.REWARD] = reward_worker
 

tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py

Lines changed: 1 addition & 1 deletion

@@ -302,7 +302,7 @@ def create_autodeploy_executor(
         model_engine=engine,
         decoder=decoder,
         dist=mpi_dist,
-        enable_overlap_scheduler=py_config.enable_overlap_scheduler,
+        disable_overlap_scheduler=py_config.disable_overlap_scheduler,
         max_input_len=executor_config.max_input_len,
         max_batch_size=executor_config.max_batch_size,
         max_draft_tokens=executor_config.speculative_config.max_draft_tokens
