Commit cee8578

[xpu] support llama2 7b in xpu (#65036)
* [xpu] support block_multi_head_attention_xpu op (#64637)
* [xpu] fix block_multi_head_attention_kernel (#64926)
1 parent 09fc618 commit cee8578

File tree

14 files changed: +1627 -6 lines changed

paddle/fluid/framework/ir/CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -334,6 +334,8 @@ if(WITH_XPU)
                DEPS ${XPU_PASS_DEPS})
   pass_library(weight_only_linear_xpu_pass inference DIR xpu DEPS
                ${XPU_PASS_DEPS})
+  pass_library(block_multihead_attention_xpu_pass inference DIR xpu DEPS
+               ${XPU_PASS_DEPS})
 endif()

 cc_library(
paddle/fluid/framework/ir/xpu/block_multihead_attention_xpu_pass.cc

Lines changed: 125 additions & 0 deletions

@@ -0,0 +1,125 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string>

#include "glog/logging.h"

#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
#include "paddle/fluid/framework/ir/pass.h"
#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"

namespace phi {
class DenseTensor;
}  // namespace phi

namespace paddle {
namespace framework {
class Scope;
}  // namespace framework
}  // namespace paddle

namespace paddle {
namespace framework {
namespace ir {

class BlockMultiHeadAttentionXPUPass : public FusePassBase {
 protected:
  void ApplyImpl(ir::Graph* graph) const override;

 private:
  void InplaceBlockMultiHeadAttentionXPU(ir::Graph* graph) const;

  const std::string name_scope_{"block_multihead_attention_xpu_pass"};
};

void BlockMultiHeadAttentionXPUPass::ApplyImpl(ir::Graph* graph) const {
  PADDLE_ENFORCE_NOT_NULL(
      graph, platform::errors::PreconditionNotMet("graph should not be null."));
  Init(name_scope_, graph);

  InplaceBlockMultiHeadAttentionXPU(graph);
}

void BlockMultiHeadAttentionXPUPass::InplaceBlockMultiHeadAttentionXPU(
    ir::Graph* graph) const {
  const int64_t max_batch_size = 10;
  auto* scope = param_scope();
  for (auto* node : graph->Nodes()) {
    if (node->IsOp() && node->Op()->Type() == "block_multihead_attention") {
      auto* op_desc = node->Op();
      op_desc->SetType("block_multihead_attention_xpu");
      phi::DenseTensor cache_k_per_batch_maxs;
      auto base_name = op_desc->Input("qkv")[0];
      int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1);
      std::string cache_k_per_batch_maxs_name = base_name + "_max_cache_k";
      VarDesc cache_k_per_batch_maxs_desc(cache_k_per_batch_maxs_name);
      cache_k_per_batch_maxs_desc.SetPersistable(true);
      cache_k_per_batch_maxs_desc.SetShape(
          {max_batch_size, static_cast<int64_t>(max_ptr_size)});
      cache_k_per_batch_maxs_desc.SetDataType(
          proto::VarType::Type::VarType_Type_FP32);
      Node* cache_k_per_batch_maxs_in =
          graph->CreateVarNode(&cache_k_per_batch_maxs_desc);
      phi::DenseTensor cpu_tensor;
      auto* cpu_ctx = static_cast<phi::CPUContext*>(
          platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
      cpu_tensor.set_type(phi::DataType::FLOAT32);
      cpu_tensor.Resize({max_batch_size, max_ptr_size});
      std::vector<float> tmp(max_batch_size * max_ptr_size, 0);
      memcpy(cpu_ctx->Alloc<float>(&cpu_tensor),
             tmp.data(),
             max_batch_size * max_ptr_size * sizeof(float));
      Assign(cpu_tensor,
             scope->Var(cache_k_per_batch_maxs_name)
                 ->GetMutable<phi::DenseTensor>());
      op_desc->SetInput("cache_k_per_batch_maxs",
                        {cache_k_per_batch_maxs_name});

      std::string cache_v_per_batch_maxs_name = base_name + "_max_cache_v";
      VarDesc cache_v_per_batch_maxs_desc(cache_v_per_batch_maxs_name);
      cache_v_per_batch_maxs_desc.SetPersistable(true);
      cache_v_per_batch_maxs_desc.SetShape(
          {max_batch_size, static_cast<int64_t>(max_ptr_size)});
      cache_v_per_batch_maxs_desc.SetDataType(
          proto::VarType::Type::VarType_Type_FP32);
      Node* cache_v_per_batch_maxs_in =
          graph->CreateVarNode(&cache_v_per_batch_maxs_desc);
      Assign(cpu_tensor,
             scope->Var(cache_v_per_batch_maxs_name)
                 ->GetMutable<phi::DenseTensor>());
      op_desc->SetInput("cache_v_per_batch_maxs",
                        {cache_v_per_batch_maxs_name});

      IR_NODE_LINK_TO(cache_k_per_batch_maxs_in, node);
      IR_NODE_LINK_TO(cache_v_per_batch_maxs_in, node);
    }
  }
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle

REGISTER_PASS(block_multihead_attention_xpu_pass,
              paddle::framework::ir::BlockMultiHeadAttentionXPUPass);

REGISTER_PASS_CAPABILITY(block_multihead_attention_xpu_pass)
    .AddCombination(
        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
            "block_multihead_attention_xpu", 0));

paddle/fluid/inference/api/paddle_pass_builder.cc

Lines changed: 1 addition & 0 deletions

@@ -538,6 +538,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
       "group_norm_silu_xpu_fuse_pass",
       "embedding_with_eltwise_add_xpu_fuse_pass",
       "qk_qkv_attention_xpu_fuse_pass",
+      "block_multihead_attention_xpu_pass",
       "multi_encoder_xpu_fuse_pass",
       "multi_encoder_xpu_adaptive_seqlen_fuse_pass",
       "multi_encoder_xpu_slice_fuse_pass",

paddle/phi/backends/xpu/xpu2_op_list.cc

Lines changed: 3 additions & 1 deletion

@@ -1055,7 +1055,8 @@ XPUOpMap& get_kl2_ops() {
                     phi::DataType::INT64,
                     phi::DataType::BOOL,
                     phi::DataType::FLOAT64,
-                    phi::DataType::FLOAT32})},
+                    phi::DataType::FLOAT32,
+                    phi::DataType::FLOAT16})},
     {"tile_grad", XPUKernelSet({phi::DataType::FLOAT32})},
     {"transpose2_grad",
      XPUKernelSet({phi::DataType::FLOAT32,
@@ -1248,6 +1249,7 @@ XPUOpMap& get_kl2_ops() {
      XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})},
     {"sequence_unpad_xpu",
      XPUKernelSet({phi::DataType::FLOAT16, phi::DataType::FLOAT32})},
+    {"block_multihead_attention_xpu", XPUKernelSet({phi::DataType::FLOAT16})},
   };

  return s_xpu2_kernels;
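The KL2 op list registers the new fused op as FP16-only and extends tile with FP16. As a simplified illustration of how such a map gates XPU dispatch; the types and lookup function below are stand-ins, not Paddle's actual dispatch code:

#include <iostream>
#include <map>
#include <set>
#include <string>

// Simplified stand-ins for Paddle's XPUOpMap / XPUKernelSet.
using KernelSet = std::set<std::string>;
using OpMap = std::map<std::string, KernelSet>;

// An op/dtype pair runs on XPU only if it appears in the map.
bool SupportedOnXPU(const OpMap& ops, const std::string& op,
                    const std::string& dtype) {
  auto it = ops.find(op);
  return it != ops.end() && it->second.count(dtype) > 0;
}

int main() {
  OpMap kl2_ops{
      {"block_multihead_attention_xpu", {"float16"}},
      {"tile", {"int64", "bool", "float64", "float32", "float16"}},
  };
  // Prints 1; a pair absent from the map would fall back to another backend.
  std::cout << SupportedOnXPU(kl2_ops, "block_multihead_attention_xpu",
                              "float16")
            << "\n";
  return 0;
}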

paddle/phi/infermeta/fusion.cc

Lines changed: 83 additions & 0 deletions

@@ -377,6 +377,89 @@ void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv,
   }
 }

+void BlockMultiheadAttentionInferXPUMeta(
+    const MetaTensor& qkv,
+    const MetaTensor& key_cache,
+    const MetaTensor& value_cache,
+    const MetaTensor& seq_lens_encoder,
+    const MetaTensor& seq_lens_decoder,
+    const MetaTensor& seq_lens_this_time,
+    const MetaTensor& padding_offsets,
+    const MetaTensor& cum_offsets,
+    const MetaTensor& cu_seqlens_q,
+    const MetaTensor& cu_seqlens_k,
+    const MetaTensor& cache_k_per_batch_maxs,
+    const MetaTensor& cache_v_per_batch_maxs,
+    const MetaTensor& block_tables,
+    const MetaTensor& pre_key_cache,
+    const MetaTensor& pre_value_cache,
+    const MetaTensor& rope_emb,
+    const MetaTensor& mask,
+    const MetaTensor& tgt_mask,
+    const MetaTensor& cache_k_quant_scales,
+    const MetaTensor& cache_v_quant_scales,
+    const MetaTensor& cache_k_dequant_scales,
+    const MetaTensor& cache_v_dequant_scales,
+    const MetaTensor& qkv_out_scale,
+    const MetaTensor& qkv_bias,
+    const MetaTensor& out_shift,
+    const MetaTensor& out_smooth,
+    const MetaTensor& max_enc_len_this_time,
+    const MetaTensor& max_dec_len_this_time,
+    int max_seq_len,
+    int block_size,
+    bool use_neox_style,
+    bool dynamic_cachekv_quant,
+    const int quant_round_type,
+    const float quant_max_bound,
+    const float quant_min_bound,
+    const float out_scale,
+    const std::string& compute_dtype,
+    MetaTensor* fmha_out,
+    MetaTensor* qkv_out,
+    MetaTensor* key_cache_out,
+    MetaTensor* value_cache_out) {
+  BlockMultiheadAttentionInferMeta(qkv,
+                                   key_cache,
+                                   value_cache,
+                                   seq_lens_encoder,
+                                   seq_lens_decoder,
+                                   seq_lens_this_time,
+                                   padding_offsets,
+                                   cum_offsets,
+                                   cu_seqlens_q,
+                                   cu_seqlens_k,
+                                   block_tables,
+                                   pre_key_cache,
+                                   pre_value_cache,
+                                   rope_emb,
+                                   mask,
+                                   tgt_mask,
+                                   cache_k_quant_scales,
+                                   cache_v_quant_scales,
+                                   cache_k_dequant_scales,
+                                   cache_v_dequant_scales,
+                                   qkv_out_scale,
+                                   qkv_bias,
+                                   out_shift,
+                                   out_smooth,
+                                   max_enc_len_this_time,
+                                   max_dec_len_this_time,
+                                   max_seq_len,
+                                   block_size,
+                                   use_neox_style,
+                                   dynamic_cachekv_quant,
+                                   quant_round_type,
+                                   quant_max_bound,
+                                   quant_min_bound,
+                                   out_scale,
+                                   compute_dtype,
+                                   fmha_out,
+                                   qkv_out,
+                                   key_cache_out,
+                                   value_cache_out);
+}
+
 void Conv1dXPUInferMeta(const MetaTensor& x,
                         const MetaTensor& x_max,
                         const MetaTensor& filter,
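Note that the XPU variant simply forwards to the existing BlockMultiheadAttentionInferMeta: the two new cache_k_per_batch_maxs / cache_v_per_batch_maxs inputs are run-time bookkeeping buffers that are accepted but never passed through, since they do not influence any output shape, so no additional shape logic is needed.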

paddle/phi/infermeta/fusion.h

Lines changed: 43 additions & 0 deletions

@@ -128,6 +128,49 @@ void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv,
                                       MetaTensor* key_cache_out,
                                       MetaTensor* value_cache_out);

+void BlockMultiheadAttentionInferXPUMeta(
+    const MetaTensor& qkv,
+    const MetaTensor& key_cache,
+    const MetaTensor& value_cache,
+    const MetaTensor& seq_lens_encoder,
+    const MetaTensor& seq_lens_decoder,
+    const MetaTensor& seq_lens_this_time,
+    const MetaTensor& padding_offsets,
+    const MetaTensor& cum_offsets,
+    const MetaTensor& cu_seqlens_q,
+    const MetaTensor& cu_seqlens_k,
+    const MetaTensor& cache_k_per_batch_maxs,
+    const MetaTensor& cache_v_per_batch_maxs,
+    const MetaTensor& block_tables,
+    const MetaTensor& pre_key_cache,
+    const MetaTensor& pre_value_cache,
+    const MetaTensor& rope_emb,
+    const MetaTensor& mask,
+    const MetaTensor& tgt_mask,
+    const MetaTensor& cache_k_quant_scales,
+    const MetaTensor& cache_v_quant_scales,
+    const MetaTensor& cache_k_dequant_scales,
+    const MetaTensor& cache_v_dequant_scales,
+    const MetaTensor& qkv_out_scale,
+    const MetaTensor& qkv_bias,
+    const MetaTensor& out_shift,
+    const MetaTensor& out_smooth,
+    const MetaTensor& max_enc_len_this_time,
+    const MetaTensor& max_dec_len_this_time,
+    int max_seq_len,
+    int block_size,
+    bool use_neox_style,
+    bool dynamic_cachekv_quant,
+    const int quant_round_type,
+    const float quant_max_bound,
+    const float quant_min_bound,
+    const float out_scale,
+    const std::string& compute_dtype,
+    MetaTensor* fmha_out,
+    MetaTensor* qkv_out,
+    MetaTensor* key_cache_out,
+    MetaTensor* value_cache_out);
+
 void Conv1dXPUInferMeta(const MetaTensor& x,
                         const MetaTensor& x_max,
                         const MetaTensor& filter,

paddle/phi/kernels/cpu/tile_kernel.cc

Lines changed: 1 addition & 0 deletions

@@ -27,5 +27,6 @@ PD_REGISTER_KERNEL(tile,
                    double,
                    int,
                    int64_t,
+                   phi::dtype::float16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
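This mirrors the FP16 entry added to tile in xpu2_op_list.cc above, presumably so that the FP16 graphs exercised by the XPU llama2 path remain runnable when tile executes on CPU.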
