Skip to content

Commit b94e207

Browse files
authored
Merge branch 'ROCm:main' into main
2 parents 3f8fe9b + ca0de0e commit b94e207

File tree

17 files changed

+271
-744
lines changed

17 files changed

+271
-744
lines changed

.github/workflows/aiter-test.yaml

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,6 @@ jobs:
9292
git submodule update --init --recursive --depth 1 --jobs 4
9393
fi
9494
95-
- name: Clean up Rocm processes
96-
run: |
97-
./.github/scripts/clean_up_rocm.sh
98-
9995
- name: Run the container
10096
run: |
10197
set -ex
@@ -158,11 +154,6 @@ jobs:
158154
if: always()
159155
run: |
160156
docker rm -f aiter_test || true
161-
162-
- name: Clean up Rocm processes
163-
if: always()
164-
run: |
165-
./.github/scripts/clean_up_rocm.sh
166157
167158
multi-gpu:
168159
name: Multi-GPU Tests (8 GPU)

.github/workflows/sglang_downstream.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ jobs:
9797
docker run -dt --user root --device=/dev/kfd $DEVICE_FLAG \
9898
-v "${GITHUB_WORKSPACE:-$PWD}/sglang:/sglang-checkout" \
9999
--ipc=host --group-add video \
100+
--network=host \
100101
--shm-size 32g \
101102
--cap-add=SYS_PTRACE \
102103
-e HF_TOKEN="${HF_TOKEN:-}" \

.github/workflows/triton-test.yaml

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -51,10 +51,6 @@ jobs:
5151
fetch-depth: 1
5252
submodules: 'recursive'
5353

54-
- name: Clean up Rocm processes
55-
run: |
56-
./.github/scripts/clean_up_rocm.sh
57-
5854
- name: Run the container
5955
run: |
6056
set -ex
@@ -148,8 +144,3 @@ jobs:
148144
if: always()
149145
run: |
150146
docker rm -f triton_test || true
151-
152-
- name: Clean up Rocm processes
153-
if: always()
154-
run: |
155-
./.github/scripts/clean_up_rocm.sh

aiter/mla.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -150,8 +150,6 @@ def mla_decode_fwd(
150150
kv_indices,
151151
kv_last_page_lens,
152152
max_seqlen_q,
153-
page_size=1,
154-
nhead_kv=1,
155153
sm_scale=None, # 1.0 / (qk_head_dim**0.5)
156154
logit_cap=0.0,
157155
num_kv_splits=None, # for experts only!!!
@@ -170,11 +168,7 @@ def mla_decode_fwd(
170168
):
171169
device = q.device
172170
assert logit_cap <= 0, f"{logit_cap=} is not support yet"
173-
if kv_buffer.dtype != torch.uint8:
174-
_, _, _, qk_head_dim = kv_buffer.shape
175-
else:
176-
_, _, qk_head_dim = q.shape
177-
171+
num_page, page_size, nhead_kv, qk_head_dim = kv_buffer.shape
178172
if sm_scale is None:
179173
sm_scale = 1.0 / (qk_head_dim**0.5)
180174

@@ -233,8 +227,6 @@ def mla_decode_fwd(
233227
None,
234228
None,
235229
max_seqlen_q,
236-
page_size,
237-
nhead_kv,
238230
sm_scale,
239231
logits,
240232
attn_lse,
@@ -327,8 +319,6 @@ def mla_decode_fwd(
327319
work_indptr,
328320
work_info_set,
329321
max_seqlen_q,
330-
page_size,
331-
nhead_kv,
332322
sm_scale,
333323
logits,
334324
attn_lse,

aiter/ops/attention.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -566,8 +566,6 @@ def mla_decode_stage1_asm_fwd(
566566
work_indptr: Optional[torch.Tensor],
567567
work_info_set: Optional[torch.Tensor],
568568
max_seqlen_q: int,
569-
page_size: int,
570-
nhead_kv: int,
571569
softmax_scale: float,
572570
# [batch_size, num_kv_splits, num_heads, v_head_dim]
573571
splitData: torch.Tensor,
@@ -856,7 +854,6 @@ def get_mla_metadata_info_v1(
856854
def get_mla_metadata_v1(
857855
seqlens_qo_indptr: torch.Tensor,
858856
seqlens_kv_indptr: torch.Tensor,
859-
kv_last_page_lens: torch.Tensor,
860857
num_heads_per_head_k: int,
861858
num_heads_k: int,
862859
is_causal: bool,
@@ -866,7 +863,6 @@ def get_mla_metadata_v1(
866863
reduce_indptr: torch.Tensor,
867864
reduce_final_map: torch.Tensor,
868865
reduce_partial_map: torch.Tensor,
869-
page_size: int = 1,
870866
kv_granularity: int = 16,
871867
max_seqlen_qo: int = -1,
872868
uni_seqlen_qo: int = -1,
@@ -880,14 +876,12 @@ def get_mla_metadata_v1(
880876
"""
881877
Inputs:
882878
cumulated seqlens of q/o: (batch_size + 1), dtype torch.int32.
883-
cumulated page indices of k/v: (batch_size + 1), dtype torch.int32.
884-
Length of last page of k/v: (batch_size), dtype torch.int32.
879+
cumulated seqlens of k/v: (batch_size + 1), dtype torch.int32.
885880
num_heads_per_head_k: Equals to num_heads_q // num_heads_k.
886881
num_heads_k: num_heads_k.
887882
is_causal: Whether causal mask is enabled.
888883
Options: Detailed settings for spliting. All of them are optional.
889-
page_size: default=1. The size of a page.
890-
kv_granularity: default=16. The granularity on kv page nums when cutting batch.
884+
kv_granularity: default=16. The granularity on kv sequence length when cutting batch.
891885
max_seqlen_qo: default=-1. Used to check lds usage and save time. value less than 1 means unknown.
892886
uni_seqlen_qo: default=-1. Sequence length of qo is uniform across batches. value less than 1 means the
893887
length is not fixed.
@@ -905,11 +899,11 @@ def get_mla_metadata_v1(
905899
[2.2] q_start: (#work), The global index in seq where q/o starts. Use global index here can
906900
reduce memory access count in kernel.
907901
[2.3] q_end: (#work), The global index in seq where q/o ends (not included).
908-
[2.4] kv_start: (#work), The global index in page where k/v starts.
909-
[2.5] kv_end: (#work), The global index in page where k/v ends (not included). Note that
902+
[2.4] kv_start: (#work), The global index in seq where k/v starts.
903+
[2.5] kv_end: (#work), The global index in seq where k/v ends (not included). Note that
910904
this value indicates the end of last qo sequence if there are
911905
multiple qo sequences included in the current work and causal mask
912-
is enabled when page_size is 1.
906+
is enabled.
913907
[2.6] kv_offset: (#work), Remaining length in seq from kv_end to the end of current batch.
914908
[2.7] pad (#work, 1), Pad to 8 DWs.
915909
[3] reduce_indptr: (sum(qo_seqlen_blk_count) + 1),

csrc/include/attention_asm_mla.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,6 @@ void mla_decode_stage1_asm_fwd(
1515
std::optional<torch::Tensor>& work_indptr, // metadata
1616
std::optional<torch::Tensor>& work_info_set, // [batch_size+1]
1717
int max_seqlen_q,
18-
int page_size,
19-
int nhead_kv,
2018
float softmax_scale,
2119
// following are output
2220
torch::Tensor& splitData, //[batch_size, num_kv_splits, num_heads, v_head_dim]

csrc/include/mla.h

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// SPDX-License-Identifier: MIT
2-
// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. All rights reserved.
2+
// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
33

44
#pragma once
55

@@ -37,7 +37,6 @@ static_assert(kSizeMlaPartialTileInfoInDw == 2);
3737

3838
void get_mla_metadata_v1(const torch::Tensor& seqlens_qo_indptr, // [batch size + 1]
3939
const torch::Tensor& seqlens_kv_indptr, // [batch size + 1]
40-
const torch::Tensor& kv_last_page_lens, // [batch size]
4140
const int32_t num_heads_per_head_k,
4241
const int32_t num_heads_k,
4342
const bool is_causal,
@@ -47,14 +46,13 @@ void get_mla_metadata_v1(const torch::Tensor& seqlens_qo_indptr, // [batch size
4746
torch::Tensor& reduce_indptr,
4847
torch::Tensor& reduce_final_map,
4948
torch::Tensor& reduce_partial_map,
50-
const int32_t page_size,
5149
const int32_t kv_granularity,
5250
const int32_t max_seqlen_qo,
5351
const int32_t uni_seqlen_qo,
5452
const bool fast_mode,
5553
const int32_t topk,
5654
const int32_t max_split_per_batch,
57-
const bool intra_batch_mode,
55+
const bool intra_batch_mode,
5856
const std::optional<at::ScalarType> dtype_q,
5957
const std::optional<at::ScalarType> dtype_kv);
6058

csrc/include/rocm_ops.hpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,6 @@ namespace py = pybind11;
5757
py::arg("work_indptr"), \
5858
py::arg("work_info_set"), \
5959
py::arg("max_seqlen_q"), \
60-
py::arg("page_size"), \
61-
py::arg("nhead_kv"), \
6260
py::arg("softmax_scale"), \
6361
py::arg("splitData"), \
6462
py::arg("splitLse"), \
@@ -1656,7 +1654,6 @@ namespace py = pybind11;
16561654
"get_mla_metadata_v1", \
16571655
py::arg("seqlens_qo_indptr"), \
16581656
py::arg("seqlens_kv_indptr"), \
1659-
py::arg("kv_last_page_lens"), \
16601657
py::arg("num_heads_per_head_k"), \
16611658
py::arg("num_heads_k"), \
16621659
py::arg("is_causal"), \
@@ -1666,7 +1663,6 @@ namespace py = pybind11;
16661663
py::arg("reduce_indptr"), \
16671664
py::arg("reduce_final_map"), \
16681665
py::arg("reduce_partial_map"), \
1669-
py::arg("page_size") = 1, \
16701666
py::arg("kv_granularity") = 16, \
16711667
py::arg("max_seqlen_qo") = -1, \
16721668
py::arg("uni_seqlen_qo") = -1, \

csrc/kernels/mla/metadata.cu

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
// SPDX-License-Identifier: MIT
2-
// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. All rights reserved.
2+
// Copyright (C) 2025, Advanced Micro Devices, Inc. All rights reserved.
33

44
#include <ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h>
55
#include "metadata/v1_0_device.cuh"
@@ -40,7 +40,6 @@
4040
void get_mla_metadata_v1(
4141
const torch::Tensor& seqlens_qo_indptr, // [batch size + 1]
4242
const torch::Tensor& seqlens_kv_indptr, // [batch size + 1]
43-
const torch::Tensor& kv_last_page_lens, // [batch size]
4443
const int32_t num_heads_per_head_k,
4544
const int32_t num_heads_k,
4645
const bool is_causal,
@@ -50,7 +49,6 @@ void get_mla_metadata_v1(
5049
torch::Tensor& reduce_indptr,
5150
torch::Tensor& reduce_final_map,
5251
torch::Tensor& reduce_partial_map,
53-
const int32_t page_size,
5452
const int32_t kv_granularity,
5553
const int32_t max_seqlen_qo,
5654
const int32_t uni_seqlen_qo,
@@ -65,8 +63,6 @@ void get_mla_metadata_v1(
6563

6664
TORCH_CHECK((kv_granularity & (kv_granularity - 1)) == 0,
6765
__func__, ": kv_granularity Must be power of 2!");
68-
TORCH_CHECK((page_size & (page_size - 1)) == 0,
69-
__func__, ": page_size Must be power of 2!");
7066
TORCH_CHECK(seqlens_qo_indptr.stride(0) == 1,
7167
__func__, ": seqlens_qo_indptr should be continuous!");
7268
TORCH_CHECK(seqlens_qo_indptr.scalar_type() == at::ScalarType::Int,
@@ -75,10 +71,6 @@ void get_mla_metadata_v1(
7571
__func__, ": seqlens_kv_indptr should be continuous!");
7672
TORCH_CHECK(seqlens_kv_indptr.scalar_type() == at::ScalarType::Int,
7773
__func__, ": seqlens_kv_indptr's element type should be int!");
78-
TORCH_CHECK(kv_last_page_lens.stride(0) == 1,
79-
__func__, ": kv_last_page_lens should be continuous!");
80-
TORCH_CHECK(kv_last_page_lens.scalar_type() == at::ScalarType::Int,
81-
__func__, ": kv_last_page_lens's element type should be int!");
8274

8375
at::ScalarType q_dtype = dtype_q.has_value() ? dtype_q.value() : at::ScalarType::BFloat16;
8476
at::ScalarType kv_dtype = dtype_kv.has_value() ? dtype_kv.value() : at::ScalarType::BFloat16;
@@ -88,11 +80,9 @@ void get_mla_metadata_v1(
8880
get_mla_metadata_v1_2_device(
8981
seqlens_qo_indptr,
9082
seqlens_kv_indptr,
91-
kv_last_page_lens,
9283
num_heads_per_head_k,
9384
num_heads_k,
9485
is_causal,
95-
page_size,
9686
kv_granularity,
9787
max_seqlen_qo,
9888
uni_seqlen_qo,

0 commit comments

Comments (0)