  * LICENSE file in the root directory of this source tree.
  */
 
+#include <array>
+// std::memcpy (used by the bias-broadcast helper below) lives in <cstring>.
+#include <cstring>
+
+#include <c10/util/irange.h>
+
 #include <executorch/kernels/optimized/blas/CPUBlas.h>
+#include <executorch/kernels/optimized/vec/functional_base.h>
+#include <executorch/kernels/optimized/vec/vec.h>
 #include <executorch/kernels/portable/cpu/util/matmul_ops_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
-#include <array>
-
 namespace torch {
 namespace executor {
 namespace native {
 
-using Tensor = executorch::aten::Tensor;
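+// Helpers below live in an anonymous namespace so they stay local to this
+// kernel's translation unit.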
+namespace {
+using ::executorch::aten::Tensor;
+using ::executorch::cpublas::gemm;
+using ::executorch::cpublas::TransposeType;
+using ::executorch::vec::map;
+using ::executorch::vec::Vectorized;
+
+// Use vector stores to fill the output with a scalar bias value.
+template <typename scalar_t>
+void initialize_scalar(
+    const ssize_t out_numel,
+    const scalar_t init,
+    scalar_t* out) {
+  using Vec = Vectorized<scalar_t>;
+
+  // Broadcast the scalar initial value across one vector register.
+  Vec init_vec(init);
+
+  ssize_t d = 0;
+  for (; d < out_numel - (out_numel % Vec::size()); d += Vec::size()) {
+    // Full vector-length store.
+    init_vec.store(out + d);
+  }
+  if (out_numel - d > 0) {
+    // Partial store covering only the remaining tail elements.
+    init_vec.store(out + d, out_numel - d);
+  }
+}
+
+// Use std::memcpy to broadcast a 1-D bias vector into every row of the output.
+template <typename scalar_t>
+void initialize_to_vector(
+    const ssize_t n,
+    const ssize_t m,
+    const scalar_t* bias,
+    scalar_t* out) {
+  // Output holds n x m scalar_t elements, while bias holds m scalar_t
+  // elements; copy the bias into each of the n rows.
+  const size_t row_size = static_cast<size_t>(m) * sizeof(scalar_t);
+  for (const auto row : c10::irange(n)) {
+    std::memcpy(
+        // Point to row `row` of the output tensor.
+        out + row * m,
+        bias,
+        row_size);
+  }
+}
+
+} // namespace
 
 Tensor& opt_linear_out(
     RuntimeContext& ctx,
     const Tensor& in,
     const Tensor& mat2,
     const optional<Tensor>& bias,
     Tensor& out) {
-  ET_KERNEL_CHECK_MSG(
-      ctx,
-      !bias.has_value(),
-      InvalidArgument,
-      out,
-      "bias not supported yet in linear");
   ET_KERNEL_CHECK(ctx, check_linear_args(in, mat2, out), InvalidArgument, out);
 
   size_t output_ndim = 0;
@@ -46,28 +91,63 @@ Tensor& opt_linear_out(
     return out;
   }
 
-  int flattened_input_dim = 1;
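+  // Collapse every leading dim of `in` into a single batch dim n, so the
+  // whole linear op reduces to one GEMM call.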
+  ssize_t n = 1;
   for (int ii = 0; ii < in.dim() - 1; ++ii) {
-    flattened_input_dim *= in.sizes()[ii];
+    n *= in.sizes()[ii];
   }
+  const ssize_t k = in.sizes()[in.dim() - 1];
+  const ssize_t m = mat2.size(0);
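+  // Shapes after flattening: `in` is (n x k), `mat2` is (m x k), and
+  // `out` is (n x m); the kernel computes out = in * mat2^T (+ bias).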
+
+  if (bias.has_value()) {
+    ET_KERNEL_CHECK_MSG(
+        ctx,
+        // Bias must be a 1-D tensor holding either m elements (one per
+        // output channel) or a single element broadcast to all channels.
+        bias->dim() == 1 && (bias->numel() == m || bias->numel() == 1),
+        InvalidArgument,
+        out,
+        "Bias has wrong dimensionality! Expected 1-D tensor of size %zd or empty,"
+        " but got %zd-D tensor with %zd elements",
+        m,
+        bias->dim(),
+        bias->numel());
+  }
+
   ET_SWITCH_REAL_TYPES_AND2(
-      Half, BFloat16, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() {
-        size_t n = flattened_input_dim;
-        size_t k = in.sizes()[in.dim() - 1];
-        size_t m = mat2.size(0);
-
-        executorch::cpublas::gemm(
-            executorch::cpublas::TransposeType::Transpose,
-            executorch::cpublas::TransposeType::NoTranspose,
+      Half, BFloat16, out.scalar_type(), ctx, "linear.out", CTYPE, [&] {
+        // Pre-fill the output with the bias, if one is provided.
+        if (bias.has_value() && bias->numel() == 1) {
+          // Scalar bias: splat the single value across the whole output.
+          initialize_scalar<CTYPE>(
+              out.numel(),
+              *bias->const_data_ptr<CTYPE>(),
+              out.mutable_data_ptr<CTYPE>());
+        } else if (bias.has_value()) {
+          // Vector bias: validated above to be a 1-D tensor of size m.
+          initialize_to_vector<CTYPE>(
+              n,
+              m,
+              bias->const_data_ptr<CTYPE>(),
+              out.mutable_data_ptr<CTYPE>());
+        }
+
+        // Set beta to 1 when a bias was applied so that GEMM accumulates
+        // into the pre-filled output; otherwise use 0 so GEMM fully
+        // overwrites the output.
+        const CTYPE beta =
+            bias.has_value() ? static_cast<CTYPE>(1) : static_cast<CTYPE>(0);
+
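+        // cpublas::gemm follows the BLAS column-major convention. Read
+        // column-major, row-major `mat2` (m x k) is a (k x m) matrix, so
+        // transa=Transpose yields the (m x k) left operand; row-major `in`
+        // (n x k) read column-major with ldb=k is the (k x n) right operand.
+        // Their (m x n) column-major product is exactly row-major (n x m)
+        // `out`.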
+        gemm(
+            /*transa=*/TransposeType::Transpose,
+            /*transb=*/TransposeType::NoTranspose,
             m,
             n,
             k,
-            static_cast<CTYPE>(1),
+            /*alpha=*/static_cast<CTYPE>(1),
             mat2.const_data_ptr<CTYPE>(),
             k,
             in.const_data_ptr<CTYPE>(),
             k,
-            static_cast<CTYPE>(0),
+            beta,
             out.mutable_data_ptr<CTYPE>(),
             m);
       });