
Commit 6e154fc

ShawnNew, qipengh, cifar10, Ashelly1, PeiyuLau authored
[MLU] cherry-pick from develop to release/2.4 (#48313)
* [MLU] fix compute error of dropout op (#45923)
* [MLU] add mergedAdam kernel. (#45965)
* [MLU] add int64 support for mlu one_hot_v2 (#46313)
* [MLU] fix profiler compile failure (#46208)
* [MLU] add barrier_op kernel. (#46417)
* [MLU] fluid: add mluop (#46429)
* [MLU] add huber_loss kernel. (#46455)
* [MLU] add mlu kernel for add_reduce_max_grad (#45651) Co-authored-by: liupeiyu <[email protected]>
* [MLU] add_fluid_mluop_yolo_box (#46573)
* [MLU] fix phi::Tensor compile error of mlu. (#46649)
* [MLU] add fluid MLUOps prior_box (#46585)
* [MLU] fix cmake error (#46772)
* [MLU] fix unittest of sync_bn (#46797)
* [MLU] add masterparam support for mlu adamw. (#46804)
* [MLU] add int64 support for allgather. (#46830)
* [MLU] fix compile error & add mlu blacklist function. (#47439)
* [MLU] fix softmax_with_cross_entropy failed in 370-X8.
* [MLU] fix cncl stuck caused by multiple initializations.
* [MLU] fix code style check.

Co-authored-by: qipengh <[email protected]>
Co-authored-by: cifar10 <[email protected]>
Co-authored-by: Lux et Veritas <[email protected]>
Co-authored-by: liupeiyu <[email protected]>
Co-authored-by: ronnywang <[email protected]>
1 parent 96e974a commit 6e154fc

37 files changed, +3247 −702 lines

cmake/neuware.cmake

Lines changed: 3 additions & 1 deletion
@@ -15,12 +15,14 @@ set(NEUWARE_LIB_DIR ${NEUWARE_HOME}/lib64)
 include_directories(${NEUWARE_INCLUDE_DIR})

 set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so)
+set(MLUOP_LIB ${NEUWARE_LIB_DIR}/libmluops.so)
 set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so)
 set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so)
 set(CNPAPI_LIB ${NEUWARE_LIB_DIR}/libcnpapi.so)

 generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake")
-set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB} ${CNPAPI_LIB})
+set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${MLUOP_LIB} ${CNRT_LIB} ${CNDRV_LIB}
+    ${CNPAPI_LIB})

 if(WITH_CNCL)
   message(STATUS "Compile with CNCL!")
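Note: the hunk above adds the MLU operator library (libmluops.so) to the dummy neuware_lib dependency set so the MLUOP calls introduced elsewhere in this commit link. A minimal sanity-check sketch, not part of the commit, assuming a POSIX system with ${NEUWARE_HOME}/lib64 on the loader path:

// check_mluops.cc -- build with: g++ check_mluops.cc -ldl
#include <dlfcn.h>
#include <cstdio>

int main() {
  // RTLD_NOW forces immediate symbol resolution, surfacing link problems.
  void* handle = dlopen("libmluops.so", RTLD_NOW | RTLD_GLOBAL);
  if (handle == nullptr) {
    std::fprintf(stderr, "failed to load libmluops.so: %s\n", dlerror());
    return 1;
  }
  std::puts("libmluops.so resolved");
  dlclose(handle);
  return 0;
}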

paddle/fluid/imperative/prepared_operator.cc

Lines changed: 48 additions & 0 deletions
@@ -146,6 +146,48 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op,
       kernel_signature_(std::move(kernel_signature)),
       phi_kernel_(phi_kernel) {}

+#ifdef PADDLE_WITH_MLU
+
+static void tokenize(const std::string& ops,
+                     char delim,
+                     std::unordered_set<std::string>* op_set) {
+  std::string::size_type beg = 0;
+  for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos;
+       ++end) {
+    op_set->insert(ops.substr(beg, end - beg));
+    beg = end + 1;
+  }
+
+  op_set->insert(ops.substr(beg));
+}
+
+static bool is_in_mlu_black_list(const std::string& op_name) {
+  static bool inited = false;
+  static std::unordered_set<std::string> mlu_black_list;
+  static std::mutex s_mtx;
+  if (!inited) {
+    std::lock_guard<std::mutex> guard(s_mtx);
+    if (!inited) {
+      if (std::getenv("MLU_BLACK_LIST") != nullptr) {
+        std::string ops(std::getenv("MLU_BLACK_LIST"));
+        tokenize(ops, ',', &mlu_black_list);
+      }
+      inited = true;
+      VLOG(3) << "MLU Black List: ";
+      for (auto iter = mlu_black_list.begin(); iter != mlu_black_list.end();
+           ++iter) {
+        VLOG(3) << *iter << " ";
+      }
+    }
+  }
+  if (mlu_black_list.find(op_name) != mlu_black_list.end()) {
+    return true;
+  }
+  return false;
+}
+
+#endif
+
 template <typename VarType>
 PreparedOp PrepareImpl(
     const NameVarMap<VarType>& ins,
@@ -194,6 +236,12 @@ PreparedOp PrepareImpl(

 #endif

+#ifdef PADDLE_WITH_MLU
+  if (is_in_mlu_black_list(op.Type())) {
+    expected_kernel_key.place_ = platform::CPUPlace();
+  }
+#endif
+
   bool has_phi_kernel = false;

   const auto* arg_map_fn = phi_op_utils_map.GetArgumentMappingFn(op.Type());
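Note: is_in_mlu_black_list() parses the MLU_BLACK_LIST environment variable (a comma-separated list of op names) exactly once under double-checked locking, and PrepareImpl() then pins any listed op to CPUPlace so it falls back to its CPU kernel. A self-contained sketch of the same parsing, with "relu,conv2d" standing in for a hypothetical blacklist value:

#include <iostream>
#include <string>
#include <unordered_set>

// Same comma-separated split that the tokenize() helper above performs.
static void tokenize(const std::string& ops, char delim,
                     std::unordered_set<std::string>* op_set) {
  std::string::size_type beg = 0;
  for (std::string::size_type end = ops.find(delim); end != std::string::npos;
       end = ops.find(delim, beg)) {
    op_set->insert(ops.substr(beg, end - beg));
    beg = end + 1;
  }
  op_set->insert(ops.substr(beg));
}

int main() {
  std::unordered_set<std::string> black_list;
  tokenize("relu,conv2d", ',', &black_list);  // hypothetical op names
  // An op found in the set is routed to CPUPlace by PrepareImpl().
  std::cout << "conv2d blacklisted: " << black_list.count("conv2d") << "\n";
  return 0;
}

In practice this corresponds to exporting something like MLU_BLACK_LIST=relu,conv2d before launching the program.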
paddle/fluid/operators/collective/barrier_op_mlu.cc

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/collective/barrier_op.h"
+#if defined(PADDLE_WITH_CNCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/device/mlu/cncl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class BarrierOpMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#if defined(PADDLE_WITH_CNCL)
+    auto in = ctx.Input<phi::DenseTensor>("X");
+    auto out = ctx.Output<phi::DenseTensor>("Out");
+
+    auto place = ctx.GetPlace();
+    cnclDataType_t dtype =
+        platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype()));
+    int64_t numel = in->numel();
+    const void* sendbuff = in->data();
+    void* recvbuff = out->mutable_data<T>(place);
+
+    int rid = ctx.Attr<int>("ring_id");
+    auto cncl_comm = platform::CNCLCommContext::Instance().Get(rid, place);
+    auto* comm = cncl_comm->comm();
+    auto comm_stream = cncl_comm->stream();
+    auto& dev_ctx =
+        ctx.template device_context<paddle::platform::MLUDeviceContext>();
+    cnclReduceOp_t cncl_red_type = cnclSum;
+    dev_ctx.Wait();
+    PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(
+        sendbuff, recvbuff, numel, dtype, cncl_red_type, comm, comm_stream));
+    PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream));
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should compile with CNCL."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_MLU_KERNEL(barrier, ops::BarrierOpMLUKernel<int>);
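Note: CNCL exposes no dedicated barrier primitive, so the kernel emulates one: dev_ctx.Wait() drains the compute stream, a one-element cnclAllReduce runs on the communicator stream, and cnrtQueueSync blocks the host until it completes. Since no rank can finish an all-reduce before every rank has entered it, this gives barrier semantics. The same trick expressed with MPI, purely as an illustrative analogue (the kernel itself uses CNCL on the MLU device):

#include <mpi.h>

// A barrier emulated by an all-reduce of a single element: no rank can
// leave the all-reduce before every rank has entered it.
void barrier_via_allreduce(MPI_Comm comm) {
  int send = 0, recv = 0;
  MPI_Allreduce(&send, &recv, 1, MPI_INT, MPI_SUM, comm);
}

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  barrier_via_allreduce(MPI_COMM_WORLD);  // all ranks rendezvous here
  MPI_Finalize();
  return 0;
}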

paddle/fluid/operators/collective/c_allgather_op_mlu.cc

Lines changed: 46 additions & 8 deletions
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/collective/c_allgather_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"

 #if defined(PADDLE_WITH_CNCL)
 #include "paddle/fluid/platform/collective_helper.h"
@@ -27,15 +28,14 @@ template <typename T>
 class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    auto place = ctx.GetPlace();
+    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
 #if defined(PADDLE_WITH_CNCL)
-    auto x = ctx.Input<framework::Tensor>("X");
-    auto out = ctx.Output<framework::Tensor>("Out");
-    cnclDataType_t dtype =
-        platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype()));
+    auto x = ctx.Input<phi::DenseTensor>("X");
+    auto out = ctx.Output<phi::DenseTensor>("Out");

     int nranks = ctx.Attr<int>("nranks");
     int rid = ctx.Attr<int>("ring_id");
-    auto place = ctx.GetPlace();
     auto comm = platform::CNCLCommContext::Instance().Get(rid, place);
     PADDLE_ENFORCE_EQ(
         nranks,
@@ -48,19 +48,56 @@ class CAllGatherOpMLUKernel : public framework::OpKernel<T> {
     out->mutable_data<T>(out_dims, place);

     uint32_t send_numel = x->numel();
-    void* send_buff = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
-    void* recv_buff = reinterpret_cast<void*>(out->data<T>());
+    void* send_buff;
+    void* recv_buff;
+    phi::DenseTensor in_tensor, out_tensor;
+    if (framework::TransToProtoVarType(x->dtype()) ==
+        framework::proto::VarType::INT64) {
+      // cast from int64 to int32 since cncl do not support int64
+      in_tensor.mutable_data<int32_t>(x->dims(), place);
+      out_tensor.mutable_data<int32_t>(out->dims(), place);
+      MLUCnnlTensorDesc x_int64_desc(*x);
+      MLUCnnlTensorDesc x_int32_desc(in_tensor);
+      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32);
+      MLUCnnl::Cast(ctx,
+                    cast_type,
+                    x_int64_desc.get(),
+                    GetBasePtr(x),
+                    x_int32_desc.get(),
+                    GetBasePtr(&in_tensor));
+      send_buff = reinterpret_cast<void*>(in_tensor.data<int32_t>());
+      recv_buff = reinterpret_cast<void*>(out_tensor.data<int32_t>());
+    } else {
+      in_tensor.ShareDataWith(*x);
+      out_tensor.ShareDataWith(*out);
+      send_buff = reinterpret_cast<void*>(in_tensor.data<T>());
+      recv_buff = reinterpret_cast<void*>(out_tensor.data<T>());
+    }

     mluStream stream = nullptr;
     if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
       stream = static_cast<platform::MLUDeviceContext*>(dev_ctx)->stream();
     } else {
       stream = comm->stream();
     }
+    cnclDataType_t dtype = platform::ToCNCLDataType(
+        framework::TransToProtoVarType(in_tensor.dtype()));

     PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather(
         send_buff, recv_buff, send_numel, dtype, comm->comm(), stream));
+    if (framework::TransToProtoVarType(x->dtype()) ==
+        framework::proto::VarType::INT64) {
+      // cast back from int64 out_tensor to out
+      MLUCnnlTensorDesc out_int64_desc(*out);
+      MLUCnnlTensorDesc out_int32_desc(out_tensor);
+      cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64);
+      MLUCnnl::Cast(ctx,
+                    cast_type,
+                    out_int32_desc.get(),
+                    GetBasePtr(&out_tensor),
+                    out_int64_desc.get(),
+                    GetBasePtr(out));
+    }
 #else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with MLU."));
@@ -80,4 +117,5 @@ REGISTER_OP_MLU_KERNEL(c_allgather,
                        ops::CAllGatherOpMLUKernel<int>,
                        ops::CAllGatherOpMLUKernel<int8_t>,
                        ops::CAllGatherOpMLUKernel<int16_t>,
+                       ops::CAllGatherOpMLUKernel<int64_t>,
                        ops::CAllGatherOpMLUKernel<plat::float16>);
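Note: the int64 path above exists because CNCL has no int64 data type, so the kernel casts to int32, gathers, and casts the result back; presumably values outside the int32 range would not survive the narrowing cast. A plain-C++ illustration of that round trip (not part of the commit):

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  // Values that fit in 32 bits survive the narrow-then-widen round trip
  // the kernel performs around cnclAllGather.
  std::vector<int64_t> src = {42, -7, (1LL << 31) - 1};
  std::vector<int32_t> narrowed;
  for (int64_t v : src) narrowed.push_back(static_cast<int32_t>(v));
  std::vector<int64_t> restored(narrowed.begin(), narrowed.end());
  std::cout << "round trip ok: " << (restored == src ? "yes" : "no") << "\n";
  return 0;
}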

paddle/fluid/operators/detection/CMakeLists.txt

Lines changed: 5 additions & 2 deletions
@@ -42,19 +42,23 @@ if(WITH_XPU)
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op_xpu.cc)
   detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
   detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
 elseif(WITH_MLU)
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op_mlu.cc)
-  detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc)
 elseif(WITH_ASCEND_CL)
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op_npu.cc)
   detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
 else()
   detection_library(iou_similarity_op SRCS iou_similarity_op.cc
                     iou_similarity_op.cu)
   detection_library(prior_box_op SRCS prior_box_op.cc)
+  detection_library(yolo_box_op SRCS yolo_box_op.cc)
   # detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc)
 endif()

@@ -73,7 +77,6 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc)
 detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc)
 detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu)
 detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc)
-detection_library(yolo_box_op SRCS yolo_box_op.cc)
 detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc
                   box_decoder_and_assign_op.cu)
 detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc
paddle/fluid/operators/detection/prior_box_op_mlu.cc

Lines changed: 104 additions & 0 deletions
@@ -0,0 +1,104 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/operators/detection/prior_box_op.h"
+#include "paddle/fluid/operators/mlu/mlu_baseop.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class PriorBoxMLUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<phi::DenseTensor>("Input");
+    auto* image = ctx.Input<phi::DenseTensor>("Image");
+    auto* boxes = ctx.Output<phi::DenseTensor>("Boxes");
+    auto* variances = ctx.Output<phi::DenseTensor>("Variances");
+    float step_w = ctx.Attr<float>("step_w");
+    float step_h = ctx.Attr<float>("step_h");
+    float offset = ctx.Attr<float>("offset");
+    bool clip = ctx.Attr<bool>("clip");
+    bool min_max_aspect_ratios_order =
+        ctx.Attr<bool>("min_max_aspect_ratios_order");
+
+    int im_width = image->dims()[3];
+    int im_height = image->dims()[2];
+    int width = input->dims()[3];
+    int height = input->dims()[2];
+
+    auto aspect_ratios = ctx.Attr<std::vector<float>>("aspect_ratios");
+    bool flip = ctx.Attr<bool>("flip");
+    std::vector<float> new_aspect_ratios;
+    ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios);
+    auto& dev_ctx = ctx.template device_context<platform::MLUDeviceContext>();
+    phi::DenseTensor ratios;
+    paddle::framework::TensorFromVector(new_aspect_ratios, dev_ctx, &ratios);
+    MLUOpTensorDesc new_aspect_ratios_desc(ratios);
+
+    auto min_sizes = ctx.Attr<std::vector<float>>("min_sizes");
+    phi::DenseTensor min;
+    paddle::framework::TensorFromVector(min_sizes, dev_ctx, &min);
+    MLUOpTensorDesc min_sizes_desc(min);
+
+    auto max_sizes = ctx.Attr<std::vector<float>>("max_sizes");
+    phi::DenseTensor max;
+    paddle::framework::TensorFromVector(max_sizes, dev_ctx, &max);
+    MLUOpTensorDesc max_sizes_desc(max);
+
+    auto variances_attr = ctx.Attr<std::vector<float>>("variances");
+    phi::DenseTensor var_tensor;
+    paddle::framework::TensorFromVector(variances_attr, dev_ctx, &var_tensor);
+    MLUOpTensorDesc variances_attr_desc(var_tensor);
+
+    auto place = ctx.GetPlace();
+
+    boxes->mutable_data<T>(place);
+    variances->mutable_data<T>(place);
+
+    MLUOpTensorDesc var_desc(*variances);
+    MLUOpTensorDesc output_desc(*boxes);
+    MLUOP::OpPriorBox(ctx,
+                      min_sizes_desc.get(),
+                      GetBasePtr(&min),
+                      new_aspect_ratios_desc.get(),
+                      GetBasePtr(&ratios),
+                      variances_attr_desc.get(),
+                      GetBasePtr(&var_tensor),
+                      max_sizes_desc.get(),
+                      GetBasePtr(&max),
+                      height,
+                      width,
+                      im_height,
+                      im_width,
+                      step_h,
+                      step_w,
+                      offset,
+                      clip,
+                      min_max_aspect_ratios_order,
+                      output_desc.get(),
+                      GetBasePtr(boxes),
+                      var_desc.get(),
+                      GetBasePtr(variances));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_MLU_KERNEL(prior_box, ops::PriorBoxMLUKernel<float>);
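Note: the kernel lifts the prior-box attributes (min_sizes, max_sizes, the expanded aspect_ratios, variances) into device tensors via TensorFromVector, wraps each in an MLUOpTensorDesc, and issues a single MLUOP::OpPriorBox call. A sketch of what ExpandAspectRatios is assumed to do here, following the usual SSD prior-box convention (an assumption about the helper, not its verbatim source):

#include <cmath>
#include <vector>

// Assumed behavior: start from ratio 1.0, append each input ratio and,
// when flip is set, its reciprocal, skipping near-duplicates.
void ExpandAspectRatiosSketch(const std::vector<float>& ratios, bool flip,
                              std::vector<float>* out) {
  constexpr float kEps = 1e-6f;
  out->assign({1.0f});
  for (float r : ratios) {
    bool seen = false;
    for (float v : *out) seen = seen || std::fabs(r - v) < kEps;
    if (seen) continue;
    out->push_back(r);
    if (flip) out->push_back(1.0f / r);
  }
}

// e.g. ratios = {2.0f, 3.0f} with flip = true expands to
// {1.0f, 2.0f, 0.5f, 3.0f, 1.0f/3.0f}.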
