
Commit 4ab3889

Update GitHub pages in root to v1.2.0rc0
1 parent 4b90534 commit 4ab3889

359 files changed: 87053 additions, 22531 deletions


.buildinfo

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 2820ecdc6d7d98c139c21bcb2df54fee
+config: 05441684cb2c0903bdac9ebb5abe267d
 tags: 645f666f9bcd5a90fca523b33c5a78b7
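As context for the changed `config:` line, Sphinx compares this digest on the next build to decide whether the configuration changed and a full rebuild is needed. The sketch below illustrates the general idea only; it is not Sphinx's actual implementation, and the option names are made up.

```python
import hashlib


def config_fingerprint(config: dict) -> str:
    # Serialize the options in a stable (sorted) order so the digest only
    # changes when an option actually changes.
    serialized = repr(sorted(config.items())).encode("utf-8")
    return hashlib.md5(serialized).hexdigest()


def needs_full_rebuild(stored_digest, config: dict) -> bool:
    # "When it is not found, a full rebuild will be done."
    return stored_digest is None or stored_digest != config_fingerprint(config)


if __name__ == "__main__":
    cfg = {"html_theme": "alabaster", "language": "en"}  # hypothetical options
    print(needs_full_rebuild(None, cfg))                     # True: no stored digest
    print(needs_full_rebuild(config_fingerprint(cfg), cfg))  # False: unchanged config
```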

_cpp_gen/executor.html

Lines changed: 8725 additions & 8669 deletions
Large diffs are not rendered by default.

_cpp_gen/runtime.html

Lines changed: 8226 additions & 8224 deletions
Large diffs are not rendered by default.

llm_kv_cache_offloading.py

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
### :title KV Cache Offloading
### :order 6
### :section Customization
'''
This script demonstrates the effectiveness of KV cache host offloading in TensorRT-LLM.

**Scenario:**
The script simulates a scenario where the GPU's KV cache is severely limited,
while multiple requests with recurring prompts (like system prompts) are processed.

1. **Constrained GPU Cache:** The GPU KV cache is configured to be very small,
   only large enough to hold the state for a single request.
2. **Alternating Prompts:** Four requests are sent sequentially (batch size of 1)
   with two distinct prompts in an A, B, A, B pattern.
3. **Cache Eviction:** Due to the small GPU cache, processing prompt B will
   force the eviction of the cache generated for prompt A.

**Demonstration:**

* **Without Offloading (Default):**
    - When the first prompt 'A' is processed, its KV cache is stored on the GPU.
    - When prompt 'B' arrives, the cache manager needs space and discards the cache for 'A'.
    - When prompt 'A' is sent again, its cache must be recomputed from scratch.
    - **Expected Outcome:** The log will show `reused blocks: 0` and `cache hit rate: 0`.

* **With Offloading (`--enable_offloading`):**
    - When prompt 'B' arrives, the cache for 'A' is not discarded but is instead
      *offloaded* from the fast GPU VRAM to the slower (but larger) host CPU RAM.
    - When prompt 'A' is sent again, its KV cache is loaded back from host RAM
      to the GPU, which is significantly faster than recomputing it.
    - **Expected Outcome:** The log will show positive values for `reused blocks`
      and a non-zero `cache hit rate`, confirming that the cache was successfully
      reused from the host.

**How to Run & Verify:**

1. **Without Offloading:**
   ```bash
   TLLM_LOG_LEVEL=DEBUG python llm_kv_cache_offloading.py 2>&1 | tee offloading_disabled.log
   ```
   (Check the log for zero reuse)

2. **With Offloading:**
   ```bash
   TLLM_LOG_LEVEL=DEBUG python llm_kv_cache_offloading.py --enable_offloading 2>&1 | tee offloading_enabled.log
   ```
   (Check the log for non-zero reuse)
'''

import argparse

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig


def main(args):
    # Define two distinct prompts to simulate different requests or system prompts.
    prompt_a = (
        "Returns the per-iterations statistics computed since last call to this method. "
        "Contains at most iter_stats_max_iterations iterations.")
    prompt_b = ("Use for skipping decoding step for non generation model, "
                "and return the batch_output (such as mm_embeddings)")

    # Use a batch size of 1 to process requests sequentially, making the cache
    # eviction and reuse cycle easy to observe.
    max_batch_size = 1
    max_seq_len = 256

    # --- KV Cache Configuration ---
    # Set a small GPU KV cache size (in number of tokens). This is crucial for the demo,
    # as it's only large enough to hold the KV cache for a single request.
    kv_cache_max_tokens = 256
    # Define the size of a single cache block.
    kv_cache_page_size = 16
    # Enable a 1 GB host cache if offloading is requested, otherwise disable it (size 0).
    # This is the key toggle for the experiment.
    kv_cache_host_size = 1024**3 if args.enable_offloading else 0

    sampling_params = SamplingParams(max_tokens=max_seq_len)

    llm = LLM(
        model="Qwen/Qwen3-8B",
        max_batch_size=max_batch_size,
        max_seq_len=max_seq_len,
        kv_cache_config=KvCacheConfig(
            enable_block_reuse=True,  # Enable reuse of cached blocks
            max_tokens=kv_cache_max_tokens,  # Max tokens in GPU cache
            tokens_per_block=kv_cache_page_size,
            host_cache_size=kv_cache_host_size  # Host cache size for offloading
        ))

    # Process four requests sequentially using two distinct prompts (A, B, A, B).
    # This pattern is designed to showcase the cache eviction and reuse behavior.
    print("--- First Round ---")
    # 1. Process prompt A. Its cache is stored on the GPU.
    output_a = llm.generate(prompt_a, sampling_params)
    print(
        f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}"
    )
    # 2. Process prompt B. Its cache replaces/offloads A's cache.
    output_b = llm.generate(prompt_b, sampling_params)
    print(
        f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}"
    )

    print("\n--- Second Round ---")
    # 3. Process prompt A again.
    #    - Without offloading: Must recompute from scratch.
    #    - With offloading: Recovers cache from host RAM.
    output_a = llm.generate(prompt_a, sampling_params)
    print(
        f"Prompt: {output_a.prompt!r}, Generated text: {output_a.outputs[0].text!r}"
    )
    # 4. Process prompt B again.
    #    - Without offloading: Must recompute from scratch.
    #    - With offloading: Recovers cache from host RAM.
    output_b = llm.generate(prompt_b, sampling_params)
    print(
        f"Prompt: {output_b.prompt!r}, Generated text: {output_b.outputs[0].text!r}"
    )

    llm.shutdown()


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=
        "A script to demonstrate the effectiveness of KV cache host offloading."
    )
    parser.add_argument('--enable_offloading',
                        action='store_true',
                        help='Enable host RAM for KV cache offloading.')
    args = parser.parse_args()
    main(args)
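For the verification step described in the docstring, a small helper can scan the captured logs for the reuse statistics. This is a hypothetical convenience script, not part of the commit; the phrases `reused blocks` and `cache hit rate` are taken from the expected-outcome description above, and the exact wording of the DEBUG log lines may differ between TensorRT-LLM versions.

```python
# summarize_kv_reuse.py -- hypothetical helper, not part of this commit.
import re
import sys


def summarize_reuse(log_path: str) -> None:
    # Look for the statistics named in the example's docstring.
    pattern = re.compile(r"(reused blocks|cache hit rate)\D*([0-9]*\.?[0-9]+)",
                         re.IGNORECASE)
    found = False
    with open(log_path, encoding="utf-8", errors="replace") as f:
        for line in f:
            match = pattern.search(line)
            if match:
                found = True
                print(f"{log_path}: {match.group(1).lower()} = {match.group(2)}")
    if not found:
        print(f"{log_path}: no reuse statistics found in this log")


if __name__ == "__main__":
    # e.g. python summarize_kv_reuse.py offloading_disabled.log offloading_enabled.log
    for path in sys.argv[1:]:
        summarize_reuse(path)
```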

_downloads/b509390ba70e52fabb10dbd9d15d5118/attention.py

Lines changed: 49 additions & 43 deletions
@@ -5,7 +5,7 @@
 import torch
 from torch import nn
 
-from tensorrt_llm._utils import get_sm_version
+from tensorrt_llm._utils import get_sm_version, is_sm_100f
 from tensorrt_llm.logger import logger
 from tensorrt_llm.mapping import Mapping
 
@@ -24,7 +24,7 @@
 from .linear import Linear, TensorParallelMode, WeightMode, WeightsLoadingConfig
 from .multi_stream_utils import maybe_execute_in_parallel
 from .rms_norm import RMSNorm
-from .rotary_embedding import RotaryEmbedding
+from .rotary_embedding import MRotaryEmbedding, RotaryEmbedding
 
 
 def extract_extra_attrs(layer_idx: str, attn_type: str):
@@ -67,6 +67,16 @@ def extract_extra_attrs(layer_idx: str, attn_type: str):
     return metadata, attn_layer
 
 
+@torch.compile
+def compiled_copy_(dst, src):
+    dst.copy_(src)
+
+
+@torch.compile
+def compiled_cat(tensors, dim):
+    return torch.cat(tensors, dim)
+
+
 @torch.library.custom_op("trtllm::attn_custom_op_inplace",
                          mutates_args=("output", ))
 def attn_custom_op_inplace(
@@ -271,11 +281,19 @@ def __init__(
 
         self.rotary_emb = None
         if not self.rope_fusion and self.pos_embd_params is not None:
-            self.rotary_emb = RotaryEmbedding(
-                self.pos_embd_params.rope,
-                head_dim=self.head_dim,
-                is_neox=self.pos_embd_params.is_neox,
-            )
+            if self.pos_embd_params.type.is_mrope():
+                self.rotary_emb = MRotaryEmbedding(
+                    self.pos_embd_params.rope,
+                    head_dim=self.head_dim,
+                    is_neox=self.pos_embd_params.is_neox,
+                    mrope_section=self.pos_embd_params.mrope_section,
+                )
+            else:
+                self.rotary_emb = RotaryEmbedding(
+                    self.pos_embd_params.rope,
+                    head_dim=self.head_dim,
+                    is_neox=self.pos_embd_params.is_neox,
+                )
 
         self.attn = create_attention(
             self.attn_backend,
@@ -301,6 +319,12 @@ def create_weights(self):
         # which could be modified after __init__
         self.attn.update_quant_config(self.quant_config)
 
+        self.o_proj.create_weights()
+        self.has_quant_scale = (self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4
+                                or self.o_proj.has_fp8_block_scales
+                                or self.o_proj.has_fp8_rowwise
+                                or self.o_proj.has_w4a8_nvfp4_fp8)
+
     def split_qkv(self, q, k=None, v=None):
         if k is None and v is None:
             q, k, v = q.split([self.q_size, self.kv_size, self.kv_size], dim=-1)
@@ -320,12 +344,8 @@ def create_output(self, q: torch.Tensor):
         out_dtype = q.dtype
 
         if self.attn_backend == "TRTLLM":
-            has_quant_scale = (self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4
-                               or self.o_proj.has_fp8_block_scales
-                               or self.o_proj.has_fp8_rowwise
-                               or self.o_proj.has_w4a8_nvfp4_fp8)
-            if has_quant_scale and (self.attn.has_fp8_kv_cache
-                                    or self.attn.has_fp4_kv_cache):
+            if self.has_quant_scale and (self.attn.has_fp8_kv_cache
+                                         or self.attn.has_fp4_kv_cache):
                 out_dtype = torch.float8_e4m3fn
         output = q.new_empty([num_tokens, hidden_size], dtype=out_dtype)
         return output
@@ -356,11 +376,7 @@ def _attn_impl(
 
         out_scale = None
         out_scale_sf = None
-        has_quant_scale = (self.o_proj.has_fp8_qdq or self.o_proj.has_nvfp4
-                           or self.o_proj.has_fp8_block_scales
-                           or self.o_proj.has_fp8_rowwise
-                           or self.o_proj.has_w4a8_nvfp4_fp8)
-        if has_quant_scale:
+        if self.has_quant_scale:
             out_scale = self.o_proj.inv_input_scale
         if self.o_proj.has_nvfp4 and self.support_nvfp4_output and enable_attn_nvfp4_output:
             out_scale_sf = self.o_proj.input_scale
@@ -585,7 +601,7 @@ def fp8_block_scaling_bmm_out(
                                                    output)
         out.copy_(output)
 
-    elif sm_version == 100:
+    elif is_sm_100f(sm_version):
         torch.bmm(mat1.transpose(0, 1), mat2_dequant.transpose(1, 2), out=out)
     else:
         raise NotImplementedError(f"SM{sm_version} is not supported")
@@ -858,6 +874,9 @@ def create_weights(self):
         self.mha.update_quant_config(self.quant_config)
         self.mqa.update_quant_config(self.quant_config)
 
+        # Although we use FP8 MLA for context/generation phase, the output is still in BF16
+        self.out_scale = None
+
         # k_b_proj_trans's dtype must be consistent with self.kv_b_proj,
         # which can be modified after __init__
         has_fp8_block_scales = (
@@ -900,7 +919,7 @@ def create_weights(self):
             ),
             requires_grad=False,
         )
-        if get_sm_version() == 100:
+        if is_sm_100f():
             assert self.dtype == torch.bfloat16
             self.k_b_proj_trans_dequant = nn.Parameter(
                 torch.empty(
@@ -1054,24 +1073,21 @@ def forward_context_default(
         )
 
         k = torch.empty_like(q).view(-1, self.num_heads, self.qk_head_dim)
-        k[..., :self.qk_nope_head_dim] = k_nope.view(-1, self.num_heads,
-                                                     self.qk_nope_head_dim)
+        compiled_copy_(k[..., :self.qk_nope_head_dim],
+                       k_nope.view(-1, self.num_heads, self.qk_nope_head_dim))
         if self.apply_rotary_emb:
             k[..., self.qk_nope_head_dim:] = k_pe.view(-1, 1,
                                                        self.qk_rope_head_dim)
         k = k.view(-1, self.num_heads * self.qk_head_dim)
 
-        # out_scale = getattr(self.o_proj, "inv_input_scale", None)
-        out_scale = None  # Currently we use BF16 MHA for context phase
-
         attn_output = self.mha.forward(
             q,
             k,
             v,
             attn_metadata,
             attention_input_type=AttentionInputType.context_only,
             latent_cache=latent_cache,
-            out_scale=out_scale,
+            out_scale=self.out_scale,
             output=output,
         )
 
@@ -1116,7 +1132,7 @@ def forward_context_with_cached_kv(
             full_k_nope = full_k_nope.view(-1, self.num_heads,
                                            self.qk_nope_head_dim)
             full_k_pe = full_k_pe.view(-1, 1, self.qk_rope_head_dim)
-            full_k = torch.cat(
+            full_k = compiled_cat(
                 (full_k_nope, full_k_pe.expand(-1, self.num_heads, -1)), dim=-1)
             full_k = full_k.view(-1, self.num_heads * self.qk_head_dim)
 
@@ -1126,9 +1142,6 @@ def forward_context_with_cached_kv(
         full_kv = None
         full_k_nope = None
 
-        # out_scale = getattr(self.o_proj, "inv_input_scale", None)
-        out_scale = None  # Currently we use BF16 MHA for context phase
-
         # latent_cache must be None to differentiate from normal context phase,
         # so that we can skip applying RoPE and appending KV cache inside attention op
         attn_output = self.mha.forward(
@@ -1138,7 +1151,7 @@ def forward_context_with_cached_kv(
             attn_metadata,
             attention_input_type=AttentionInputType.context_only,
             latent_cache=None,
-            out_scale=out_scale,
+            out_scale=self.out_scale,
             output=output,
         )
 
@@ -1214,7 +1227,7 @@ def forward_context_with_chunked_prefill(
             chunked_k_nope = chunked_k_nope.view(-1, self.num_heads,
                                                  self.qk_nope_head_dim)
             chunked_k_pe = chunked_k_pe.view(-1, 1, self.qk_rope_head_dim)
-            chunked_k = torch.cat(
+            chunked_k = compiled_cat(
                 (chunked_k_nope, chunked_k_pe.expand(-1, self.num_heads, -1)),
                 dim=-1)
             chunked_k = chunked_k.view(-1, self.num_heads * self.qk_head_dim)
@@ -1232,7 +1245,6 @@ def forward_context_with_chunked_prefill(
                 loop_idx]
             attn_metadata.host_total_kv_lens[0] = total_ctx_chunked_tokens
 
-            out_scale = None
             # do not apply mask for attention within loop
             # latent_cache must be None to differentiate from normal context phase,
             # so that we can skip applying RoPE and appending KV cache inside attention op
@@ -1243,7 +1255,7 @@ def forward_context_with_chunked_prefill(
                 attn_metadata,
                 attention_input_type=AttentionInputType.context_only,
                 latent_cache=None,
-                out_scale=out_scale,
+                out_scale=self.out_scale,
                 attention_mask=PredefinedAttentionMask.FULL,
                 softmax_stats_tensor=self.temp_softmax_stats_tensor,
                 chunked_prefill_buffer_batch_size=attn_metadata.
@@ -1273,7 +1285,7 @@ def forward_context_with_chunked_prefill(
 
         k_nope = k_nope.view(-1, self.num_heads, self.qk_nope_head_dim)
         k_pe = k_pe.view(-1, 1, self.qk_rope_head_dim)
-        k = torch.cat((k_nope, k_pe.expand(-1, self.num_heads, -1)), dim=-1)
+        k = compiled_cat((k_nope, k_pe.expand(-1, self.num_heads, -1)), dim=-1)
         k = k.view(-1, self.num_heads * self.qk_head_dim)
 
         # copy q_lens to replace kv_lens_runtime
@@ -1284,9 +1296,6 @@ def forward_context_with_chunked_prefill(
                                                       num_contexts].sum().item(
                                                       )
 
-        # out_scale = getattr(self.o_proj, "inv_input_scale", None)
-        out_scale = None  # Currently we use BF16 MHA for context phase
-
         # latent_cache must be None to differentiate from normal context phase,
         # so that we can skip applying RoPE and appending KV cache inside attention op
         temp_attn_output = self.mha.forward(
@@ -1296,7 +1305,7 @@ def forward_context_with_chunked_prefill(
             attn_metadata,
             attention_input_type=AttentionInputType.context_only,
             latent_cache=None,
-            out_scale=out_scale,
+            out_scale=self.out_scale,
             softmax_stats_tensor=self.temp_softmax_stats_tensor,
             chunked_prefill_buffer_batch_size=attn_metadata.runtime_features.
             chunked_prefill_buffer_batch_size,
@@ -1394,16 +1403,13 @@ def forward_generation(
             self.num_heads * (self.kv_lora_rank + self.qk_rope_head_dim)
         ])
 
-        # out_scale = getattr(self.o_proj, "inv_input_scale", None)
-        out_scale = None  # Although we use FP8 MLA for generation phase, the output is still in BF16
-
         attn_out_latent = self.mqa.forward(
             fused_q,
             None,
             None,
             attn_metadata,
             attention_input_type=AttentionInputType.generation_only,
-            out_scale=out_scale,
+            out_scale=self.out_scale,
             latent_cache=latent_cache,  # kvcache and k_pe
             q_pe=q_pe,  # used by `invokeMLARopeGeneration`
         )
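A note on the recurring changes in this diff: `has_quant_scale` and `out_scale` are now computed once in `create_weights()` and reused across forward calls, and the small `compiled_copy_` / `compiled_cat` helpers wrap per-token tensor ops in `torch.compile`. The usual motivation for the latter pattern is letting the compiler fuse the surrounding `view`/`expand` work instead of materializing intermediates. Below is a standalone sketch of the same pattern with made-up shapes; it assumes a recent PyTorch with `torch.compile` and is illustrative only, not the TensorRT-LLM code path.

```python
import torch


@torch.compile
def compiled_cat(tensors, dim):
    # Compiled concatenation, mirroring the shape of the helper in the diff:
    # the expand below does not need to be materialized as a separate tensor.
    return torch.cat(tensors, dim)


def build_full_k(k_nope: torch.Tensor, k_pe: torch.Tensor) -> torch.Tensor:
    # k_nope: [tokens, num_heads, nope_dim], k_pe: [tokens, 1, rope_dim]
    num_heads = k_nope.shape[1]
    full_k = compiled_cat((k_nope, k_pe.expand(-1, num_heads, -1)), dim=-1)
    # Flatten heads back into a [tokens, num_heads * head_dim] layout.
    return full_k.reshape(full_k.shape[0], -1)


if __name__ == "__main__":
    tokens, num_heads, nope_dim, rope_dim = 4, 8, 128, 64
    k_nope = torch.randn(tokens, num_heads, nope_dim)
    k_pe = torch.randn(tokens, 1, rope_dim)
    print(build_full_k(k_nope, k_pe).shape)  # torch.Size([4, 1536])
```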
