Skip to content

Commit 583de90

Browse files
zzf/add custom fallback for addmm linear bmm mm (#566)
* add custom fallback for addmm linear bmm mm * add custom fallback for addmm linear bmm mm * add custom fallback for addmm linear bmm mm * add custom fallback for addmm linear bmm mm
1 parent 99dadd7 commit 583de90

File tree

2 files changed

+70
-3
lines changed

2 files changed

+70
-3
lines changed

dipu/scripts/autogen_diopi_wrapper/diopi_functions.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,7 @@
481481
interface: diopiSum(ctx, out, self_dtype_diopi, diopi_size)
482482

483483
- schema: "addmm.out(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, Tensor(a!) out) -> Tensor(a!)"
484+
custom_fallback: True
484485
custom_code_at_the_beginning: |
485486
interface: diopiAddmm(&context, out, self, mat1, mat2, beta, alpha)
486487

@@ -744,6 +745,7 @@
744745
interface: diopiLinearBackward(ctx, grad_input, grad_weight, grad_bias, grad_output, input, weight)
745746

746747
- schema: "linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor"
748+
custom_fallback: True
747749
device: [all, -cuda]
748750
custom_code_at_the_beginning: |
749751
std::vector<int64_t> output_size(input.sizes().begin(), input.sizes().end());
@@ -1470,6 +1472,7 @@
14701472
interface: diopiCosInp(ctx, self)
14711473

14721474
- schema: "bmm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)"
1475+
custom_fallback: True
14731476
interface: diopiBmm(ctx, out, self, mat2)
14741477

14751478
- schema: "silu.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)"
@@ -1484,6 +1487,7 @@
14841487
interface: diopiNormalInp(ctx, self, mean, std, generator)
14851488

14861489
- schema: "mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)"
1490+
custom_fallback: True
14871491
interface: diopiMm(ctx, out, self, mat2)
14881492

14891493
- schema: "matmul(Tensor self, Tensor other) -> Tensor"
@@ -2434,6 +2438,7 @@
24342438

24352439
# this copy_ aten op may use both diopiCastDtype and diopiCopyInp. it's a proxy/composite op
24362440
- schema: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
2441+
autocompare: disable
24372442
dummy_call_diopi: True
24382443
custom_fallback: True
24392444
device: [cuda, camb, ascend, droplet, supa, kunlunxin]
@@ -2445,6 +2450,7 @@
24452450

24462451
# vendor who has no fully implemented diopi and proper fallback DIPUCopy sub-class
24472452
- schema: copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)
2453+
autocompare: disable
24482454
custom_fallback: True
24492455
dummy_call_diopi: True
24502456
custom_code_at_the_beginning: |
@@ -2453,6 +2459,7 @@
24532459
interface: diopiCopyInp(ctx, src, self)
24542460

24552461
- schema: _amp_foreach_non_finite_check_and_unscale_(at::TensorList self, Tensor(b!) found_inf, Tensor inv_scale) -> void
2462+
autocompare: disable
24562463
custom_fallback: True
24572464
custom_code_at_the_beginning: |
24582465
std::vector<diopiTensorHandle_t> diopiTensorHandles(self.size(), nullptr);

dipu/torch_dipu/csrc_dipu/aten/ops/CustomFallbackFunctions.hpp

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ static c10::optional<at::Tensor> dipu_to_cpu(
1717
return cpu_tensor;
1818
}
1919

20-
static at::Tensor to_cpu_no_half(const at::Tensor& devtensor) {
20+
static at::Tensor to_cpu_with_half_to_float(const at::Tensor& devtensor) {
2121
auto cpu_tensor = devtensor.cpu();
2222
auto intype = devtensor.options().dtype_opt()->toScalarType();
2323
if (intype == at::ScalarType::Half) {
@@ -30,8 +30,9 @@ static at::Tensor& custom_fallback_dipu_silu_out(const at::Tensor& self,
3030
at::Tensor& out) {
3131
DIPU_OP_LOG_WARNING_ONCE("custom fallback to cpu, name=silu_out"
3232
<< std::endl);
33-
auto self_cpu = to_cpu_no_half(self);
34-
auto out_cpu = to_cpu_no_half(self);
33+
auto self_cpu = to_cpu_with_half_to_float(self);
34+
auto out_cpu = to_cpu_with_half_to_float(self);
35+
3536
// NOLINTNEXTLINE(readability-suspicious-call-argument): It's the correct order
3637
out_cpu = at::silu_out(self_cpu, out_cpu);
3738
out.copy_(out_cpu);
@@ -339,5 +340,64 @@ at::Tensor& custom_fallback_dipu__amp_update_scale_(at::Tensor& current_scale,
339340
double backoff_factor,
340341
int64_t growth_interval);
341342

343+
static at::Tensor& custom_fallback_dipu_addmm_out(
344+
const at::Tensor& self, const at::Tensor& mat1, const at::Tensor& mat2,
345+
const at::Scalar& beta, const at::Scalar& alpha, at::Tensor& out) {
346+
auto self_cpu = to_cpu_with_half_to_float(self);
347+
auto mat1_cpu = to_cpu_with_half_to_float(mat1);
348+
auto mat2_cpu = to_cpu_with_half_to_float(mat2);
349+
auto out_cpu = to_cpu_with_half_to_float(out);
350+
out_cpu = at::addmm_out(out_cpu, self_cpu, mat1_cpu, mat2_cpu, beta, alpha);
351+
out.copy_(out_cpu);
352+
return out;
353+
}
354+
355+
static at::Tensor& custom_fallback_dipu_bmm_out(const at::Tensor& self,
356+
const at::Tensor& mat2,
357+
at::Tensor& out) {
358+
auto self_cpu = to_cpu_with_half_to_float(self);
359+
auto mat2_cpu = to_cpu_with_half_to_float(mat2);
360+
auto out_cpu = to_cpu_with_half_to_float(out);
361+
out_cpu = at::bmm_out(out_cpu, self_cpu, mat2_cpu);
362+
out.copy_(out_cpu);
363+
return out;
364+
}
365+
366+
static at::Tensor& custom_fallback_dipu_mm_out(const at::Tensor& self,
367+
const at::Tensor& mat2,
368+
at::Tensor& out) {
369+
auto self_cpu = to_cpu_with_half_to_float(self);
370+
auto mat2_cpu = to_cpu_with_half_to_float(mat2);
371+
auto out_cpu = to_cpu_with_half_to_float(out);
372+
out_cpu = at::mm_out(out_cpu, self_cpu, mat2_cpu);
373+
out.copy_(out_cpu);
374+
return out;
375+
}
376+
377+
static at::Tensor custom_fallback_dipu_linear(
378+
const at::Tensor& input, const at::Tensor& weight,
379+
const c10::optional<at::Tensor>& bias) {
380+
auto input_cpu = to_cpu_with_half_to_float(input);
381+
auto weight_cpu = to_cpu_with_half_to_float(weight);
382+
c10::optional<at::Tensor> bias_cpu = c10::nullopt;
383+
384+
at::Tensor out;
385+
at::Tensor out_cpu;
386+
387+
if (bias.has_value() && bias.value().defined()) {
388+
if (bias.value().options().dtype_opt()->toScalarType() ==
389+
at::ScalarType::Half) {
390+
bias_cpu = bias.value().to(at::ScalarType::Float).cpu();
391+
} else {
392+
bias_cpu = bias.value().cpu();
393+
}
394+
}
395+
396+
out_cpu = at::linear(input_cpu, weight_cpu, bias_cpu);
397+
out = out_cpu.to(input.device())
398+
.to(input.options().dtype_opt()->toScalarType());
399+
return out;
400+
}
401+
342402
} // namespace native
343403
} // namespace dipu

0 commit comments

Comments (0)