PaddlePaddle
diff --git a/‎.github/workflows/ci_iluvatar.yml‎
Lines changed: 5 additions & 1 deletion b/‎.github/workflows/ci_iluvatar.yml‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎custom_ops/gpu_ops/helper.h‎
Lines changed: 2 additions & 0 deletions b/‎custom_ops/gpu_ops/helper.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎custom_ops/iluvatar_ops/mixed_fused_attn.cu‎
Lines changed: 376 additions & 0 deletions b/‎custom_ops/iluvatar_ops/mixed_fused_attn.cu‎
Lines changed: 376 additions & 0 deletions
diff --git a/‎custom_ops/iluvatar_ops/moe_dispatch.cu‎
Lines changed: 5 additions & 1 deletion b/‎custom_ops/iluvatar_ops/moe_dispatch.cu‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎custom_ops/iluvatar_ops/paged_attn.cu‎
Lines changed: 33 additions & 47 deletions b/‎custom_ops/iluvatar_ops/paged_attn.cu‎
Lines changed: 33 additions & 47 deletions
@@ -28,18 +28,22 @@ jobs:
           REPO="https://github.com/${{ github.repository }}.git"
           FULL_REPO="${{ github.repository }}"
           REPO_NAME="${FULL_REPO##*/}"
+          BASE_BRANCH="${{ github.base_ref }}"
           # Clean the repository directory before starting
           docker run --rm --net=host -v $(pwd):/workspace -w /workspace \
           -e "REPO_NAME=${REPO_NAME}" \
+          -e "BASE_BRANCH=${BASE_BRANCH}" \
           ${docker_image} /bin/bash -c '
             if [ -d ${REPO_NAME} ]; then
               echo "Directory ${REPO_NAME} exists, removing it..."
               rm -rf ${REPO_NAME}
             fi
           '
+          git config --global http.proxy "http://61.151.249.150:33128"
+          git config --global https.proxy "http://61.151.249.150:33128"
           git config --global user.name "FastDeployCI"
           git config --global user.email "[email protected]"
-          git clone ${REPO} ${REPO_NAME}
+          git clone --recursive ${BASE_BRANCH:+-b "${BASE_BRANCH}"} "${REPO}" "${REPO_NAME}"  # github.base_ref is empty outside pull_request events; then clone the default branch
           cd FastDeploy
           if [ "${{ github.event_name }}" = "pull_request" ]; then
             git fetch origin pull/${{ github.event.pull_request.number }}/head:pr/${{ github.event.pull_request.number }}
 
@@ -193,11 +193,13 @@ template <> class PDTraits<paddle::DataType::UINT8> {
   typedef uint8_t data_t;
 };
 
+#ifndef PADDLE_WITH_COREX  // FP8 traits are compiled only when PADDLE_WITH_COREX is not defined
 template <> class PDTraits<paddle::DataType::FLOAT8_E4M3FN> {
 public:
   typedef __nv_fp8_e4m3 DataType;  // NOTE(review): presumably this type is unavailable in CoreX builds, hence the guard — confirm
   typedef paddle::float8_e4m3fn data_t;
 };
+#endif  // !PADDLE_WITH_COREX
 
 template <typename T, int Size> struct alignas(sizeof(T) * Size) AlignedVector {
   T val[Size];
 
@@ -53,6 +53,7 @@ void MoeDispatchKernel(const paddle::Tensor& input,
                        const paddle::optional<paddle::Tensor>& gating_correction_bias,
                        const int moe_topk,
                        const bool group_moe,
+                       const std::string &moe_quant_type,
                        const bool topk_only_mode,
                        const int num_rows,
                        const int hidden_size,
@@ -183,6 +184,7 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
     const paddle::optional<paddle::Tensor>& w4a8_in_scale,
     const int moe_topk,
     const bool group_moe,
+    const std::string &moe_quant_type,
     const bool topk_only_mode) {
   const auto input_type = input.dtype();
   auto place = input.place();
@@ -220,6 +222,7 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
                                                     gating_correction_bias,
                                                     moe_topk,
                                                     group_moe,
+                                                    moe_quant_type,
                                                     topk_only_mode,
                                                     num_rows,
                                                     hidden_size,
@@ -236,6 +239,7 @@ std::vector<paddle::Tensor> MoeExpertDispatch(
                                                    gating_correction_bias,
                                                    moe_topk,
                                                    group_moe,
+                                                   moe_quant_type,
                                                    topk_only_mode,
                                                    num_rows,
                                                    hidden_size,
@@ -305,7 +309,7 @@ PD_BUILD_STATIC_OP(moe_expert_dispatch)
               "top_k_weight",
               "top_k_indices",
               "expert_idx_per_token"})
-    .Attrs({"moe_topk:int", "group_moe:bool", "topk_only_mode:bool"})
+    .Attrs({"moe_topk:int", "group_moe:bool", "moe_quant_type:std::string", "topk_only_mode:bool"})
     .SetKernelFn(PD_KERNEL(MoeExpertDispatch))
     .SetInferShapeFn(PD_INFER_SHAPE(MoeExpertDispatchInferShape))
     .SetInferDtypeFn(PD_INFER_DTYPE(MoeExpertDispatchInferDtype));
@@ -27,6 +27,8 @@ void PagedAttnKernel(const paddle::Tensor& q,
                      const paddle::optional<paddle::Tensor> &v,
                      const paddle::optional<paddle::Tensor> &rope_sin,
                      const paddle::optional<paddle::Tensor> &rope_cos,
+                     int num_heads,
+                     int head_dim,
                      int num_kv_heads,
                      float scale,
                      int block_size,
@@ -86,32 +88,36 @@ void PagedAttnKernel(const paddle::Tensor& q,
                       common::errors::InvalidArgument(
                           "paged_attention expects seq_lens is contiguous"));
     // check dim and shape
-    // k_cache: [num_blocks, kv_num_heads, block_size, head_size]
-    // v_cache: [num_blocks, kv_num_heads, block_size, head_size]
+    // k_cache: [num_blocks, kv_num_heads, block_size, head_dim]
+    // v_cache: [num_blocks, kv_num_heads, block_size, head_dim]
     // block_table: [num_seqs, max_num_blocks_per_seq]
     // seq_lens: [num_seqs]
     // q and out:
-    // merged_qkv = false: [num_seqs, num_heads, head_size]
-    // merged_qkv = true: [num_seqs, num_heads+2*num_kv_heads, head_size]
+    // if merged_qkv = false:
+    // q:[num_seqs, hidden_size]
+    // out:[num_seqs, hidden_size], where hidden_size = num_heads * head_dim
+    // if merged_qkv = true:
+    // q: [num_seqs, (num_heads+2*num_kv_heads)*head_dim]
+    // out: [num_seqs, hidden_size]
 
     const auto& q_dims = q.dims();
     PADDLE_ENFORCE_EQ(q_dims.size(),
-                      3,
+                      2,
                       common::errors::InvalidArgument(
                           "paged_attn receive query dims is "
-                          "[num_seqs, num_heads, head_size]"));
+                          "[num_seqs, (num_heads+2*num_kv_heads)*head_dim]"));
     PADDLE_ENFORCE_EQ(out.dims().size(),
-                      3,
+                      2,
                       common::errors::InvalidArgument(
                           "paged_attn receive out dims is "
-                          "[num_seqs, num_heads, head_size]"));
+                          "[num_seqs, hidden_size]"));
 
     const auto& kv_cache_dims = k_cache.dims();
     PADDLE_ENFORCE_EQ(kv_cache_dims.size(),
                       4,
                       common::errors::InvalidArgument(
                           "paged_attn receive kv cache dims is "
-                          "[num_blocks, kv_num_heads, block_size, head_size]"));
+                          "[num_blocks, kv_num_heads, block_size, head_dim]"));
 
     const auto& block_table_dims = block_table.dims();
     PADDLE_ENFORCE_EQ(block_table_dims.size(),
@@ -127,8 +133,6 @@ void PagedAttnKernel(const paddle::Tensor& q,
                           "paged_attn receive seq_lens dims is [num_seqs]"));
 
     int num_seqs = q_dims[0];
-    int num_heads = merged_qkv ? q_dims[1] - 2 * num_kv_heads : q_dims[1];
-    int head_size = q_dims[2];
     int max_num_blocks_per_seq = block_table_dims[1];
     int q_stride = q.strides()[0];
     int num_blocks = kv_cache_dims[0];
@@ -142,9 +146,9 @@ void PagedAttnKernel(const paddle::Tensor& q,
                       common::errors::InvalidArgument(
                           "kv_cache_dims[2] must be equal to block_size"));
     PADDLE_ENFORCE_EQ(kv_cache_dims[3],
-                      head_size,
+                      head_dim,
                       common::errors::InvalidArgument(
-                          "kv_cache_dims[3] must be equal to head_size"));
+                          "kv_cache_dims[3] must be equal to head_dim"));
     PADDLE_ENFORCE_EQ(block_table_dims[0],
                       num_seqs,
                       common::errors::InvalidArgument(
@@ -162,14 +166,13 @@ void PagedAttnKernel(const paddle::Tensor& q,
     const float *rope_sin_ptr = merged_qkv ? rope_sin.get().data<float>() : nullptr;
     const float *rope_cos_ptr = merged_qkv ? rope_cos.get().data<float>() : nullptr;
 
-    auto dev_ctx = static_cast<const phi::CustomContext*>(paddle::experimental::DeviceContextPool::Instance().Get(q.place()));
     cuinferHandle_t cuinfer_handle = iluvatar::getContextInstance()->getIxInferHandle();
 
     size_t workspace_size = 0;
     CUINFER_CHECK(cuInferPageAttentionGetWorkspaceV7(num_seqs,
                                                      num_heads,
                                                      num_kv_heads,
-                                                     head_size,
+                                                     head_dim,
                                                      block_size,
                                                      max_context_len,
                                                      &workspace_size));
@@ -189,7 +192,7 @@ void PagedAttnKernel(const paddle::Tensor& q,
                                          num_seqs,
                                          num_heads,
                                          num_kv_heads,
-                                         head_size,
+                                         head_dim,
                                          q_stride,
                                          kv_block_stride,
                                          kv_head_stride,
@@ -215,6 +218,8 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
                                       const paddle::optional<paddle::Tensor> &v,
                                       const paddle::optional<paddle::Tensor> &rope_sin,
                                       const paddle::optional<paddle::Tensor> &rope_cos,
+                                      int num_heads,
+                                      int head_dim,
                                       int num_kv_heads,
                                       float scale,
                                       int block_size,
@@ -228,11 +233,7 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
                                       bool merged_qkv) {
 
     const auto dtype = q.dtype();
-    auto out_shape = q.shape();
-    if (merged_qkv) {
-        out_shape[1] -=  2 * num_kv_heads;
-    }
-    auto out = paddle::empty(out_shape, dtype, q.place());
+    auto out = paddle::empty({q.shape()[0], num_heads * head_dim}, dtype, q.place());
 
     switch (dtype) {
         case paddle::DataType::BFLOAT16:
@@ -246,6 +247,8 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
                                                         v,
                                                         rope_sin,
                                                         rope_cos,
+                                                        num_heads,
+                                                        head_dim,
 						                                num_kv_heads,
                                                         scale,
                                                         block_size,
@@ -270,6 +273,8 @@ std::vector<paddle::Tensor> PagedAttn(const paddle::Tensor& q,
                                                        v,
                                                        rope_sin,
                                                        rope_cos,
+                                                       num_heads,
+                                                       head_dim,
 						                               num_kv_heads,
                                                        scale,
                                                        block_size,
@@ -299,6 +304,8 @@ std::vector<std::vector<int64_t>> PagedAttnInferShape(const std::vector<int64_t>
                                                       const std::vector<int64_t>& v_shape,
                                                       const std::vector<int64_t>& rope_sin_shape,
                                                       const std::vector<int64_t>& rope_cos_shape,
+                                                      int num_heads,
+                                                      int head_dim,
                                                       int num_kv_heads,
                                                       float scale,
                                                       int block_size,
@@ -311,36 +318,13 @@ std::vector<std::vector<int64_t>> PagedAttnInferShape(const std::vector<int64_t>
                                                       bool use_sqrt_alibi,
                                                       bool merged_qkv) {
     if (merged_qkv) {
-        int64_t num_tokens = q_shape[0];
-        int64_t num_heads = q_shape[1] - 2 * num_kv_heads;
-        int64_t head_dim = q_shape[2];
-        return {{num_tokens, num_heads, head_dim}};
+        return {{q_shape[0], num_heads * head_dim}};
     } else {
         return {q_shape};
     }
 }
 
-std::vector<paddle::DataType> PagedAttnInferDtype(const paddle::DataType& q_dtype,
-                                                  const paddle::DataType& k_cache_dtype,
-                                                  const paddle::DataType& v_cache_dtype,
-                                                  const paddle::DataType& block_table_dtype,
-                                                  const paddle::DataType& seq_lens_dtype,
-                                                  const paddle::DataType& alibi_slopes_dtype,
-                                                  const paddle::DataType& k_dtype,
-                                                  const paddle::DataType& v_dtype,
-                                                  const paddle::DataType& rope_sin_dtype,
-                                                  const paddle::DataType& rope_cos_dtype,
-                                                  int num_kv_heads,
-                                                  float scale,
-                                                  int block_size,
-                                                  int max_context_len,
-                                                  bool causal,
-                                                  int window_left,
-                                                  int window_right,
-                                                  float softcap,
-                                                  bool enable_cuda_graph,
-                                                  bool use_sqrt_alibi,
-                                                  bool merged_qkv) {
+std::vector<paddle::DataType> PagedAttnInferDtype(const paddle::DataType& q_dtype) {
     return {q_dtype};  // one output dtype, equal to the dtype of q
 }
 
@@ -351,7 +335,9 @@ PD_BUILD_STATIC_OP(paged_attn)
              paddle::Optional("v"), paddle::Optional("rope_sin"),
              paddle::Optional("rope_cos")})
     .Outputs({"out"})
-    .Attrs({"num_kv_heads:int",
+    .Attrs({"num_heads:int",
+            "head_dim:int",
+            "num_kv_heads:int",
             "scale:float",
             "block_size:int",
             "max_context_len:int",