
Commit e69df21

hsharma35 authored and facebook-github-bot committed
Add support for bias in optimized op_linear.cpp. (#11210)
Summary:
Pull Request resolved: #11210

Diff uses `op_add_sub_impl` to add bias after the optimized gemm call.

Reviewed By: zonglinpeng

Differential Revision: D75491158
1 parent b308544 · commit e69df21

2 files changed: +79, -17 lines

kernels/optimized/cpu/op_linear.cpp

Lines changed: 29 additions & 17 deletions
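
In linear terms, the kernel computes out = in · mat2ᵀ + bias, with bias broadcast across the rows of the output. The hunks below drop the old "bias not supported" check and wire bias in by pre-filling the output with it, then letting GEMM accumulate on top (beta = 1).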
@@ -16,20 +16,16 @@ namespace torch {
 namespace executor {
 namespace native {

-using Tensor = executorch::aten::Tensor;
+using ::executorch::aten::Tensor;
+using ::executorch::cpublas::gemm;
+using ::executorch::cpublas::TransposeType;

 Tensor& opt_linear_out(
     RuntimeContext& ctx,
     const Tensor& in,
     const Tensor& mat2,
     const optional<Tensor>& bias,
     Tensor& out) {
-  ET_KERNEL_CHECK_MSG(
-      ctx,
-      !bias.has_value(),
-      InvalidArgument,
-      out,
-      "bias not supported yet in linear");
   ET_KERNEL_CHECK(ctx, check_linear_args(in, mat2, out), InvalidArgument, out);

   size_t output_ndim = 0;
@@ -46,28 +42,44 @@ Tensor& opt_linear_out(
     return out;
   }

-  int flattened_input_dim = 1;
+  ssize_t flattened_input_dim = 1;
   for (int ii = 0; ii < in.dim() - 1; ++ii) {
     flattened_input_dim *= in.sizes()[ii];
   }
+
   ET_SWITCH_REAL_TYPES_AND2(
-      Half, BFloat16, in.scalar_type(), ctx, "mm.out", CTYPE, [&]() {
-        size_t n = flattened_input_dim;
-        size_t k = in.sizes()[in.dim() - 1];
-        size_t m = mat2.size(0);
+      Half, BFloat16, in.scalar_type(), ctx, "linear.out", CTYPE, [&] {
+        const ssize_t n = flattened_input_dim;
+        const ssize_t k = in.sizes()[in.dim() - 1];
+        const ssize_t m = mat2.size(0);
+
+        // Output is an n x m matrix of CTYPE, while bias is m x CTYPE.
+        const size_t row_size = static_cast<size_t>(m) * sizeof(CTYPE);
+        for (const auto col : c10::irange(n)) {
+          std::memcpy(
+              // Point to row `col` of the output tensor.
+              out.mutable_data_ptr<CTYPE>() + col * m,
+              bias->const_data_ptr<CTYPE>(),
+              row_size);
+        }
+        // Set beta to 1 if bias was applied so that GEMM adds to the pre-filled
+        // bias, otherwise beta remains 0 (i.e. the output is fully overwritten
+        // by GEMM).
+        const CTYPE beta =
+            bias.has_value() ? static_cast<CTYPE>(1) : static_cast<CTYPE>(0);

-        executorch::cpublas::gemm(
-            executorch::cpublas::TransposeType::Transpose,
-            executorch::cpublas::TransposeType::NoTranspose,
+        gemm(
+            /*transa=*/TransposeType::Transpose,
+            /*transb=*/TransposeType::NoTranspose,
             m,
             n,
             k,
-            static_cast<CTYPE>(1),
+            /*alpha=*/static_cast<CTYPE>(1),
             mat2.const_data_ptr<CTYPE>(),
             k,
             in.const_data_ptr<CTYPE>(),
             k,
-            static_cast<CTYPE>(0),
+            beta,
             out.mutable_data_ptr<CTYPE>(),
             m);
       });
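
For readers skimming the hunk above: BLAS-style gemm computes C = alpha * op(A) * op(B) + beta * C, so pre-filling the output with the bias and passing beta = 1 folds the bias add into the matmul itself. Below is a minimal standalone sketch of that trick. It is plain C++ with a naive gemm_ref, a hypothetical stand-in for executorch::cpublas::gemm rather than its real API, and it reuses the all-ones/all-twos shapes from the BiasTest further down.

#include <cstdio>
#include <vector>

// Reference GEMM: C (n x m) = alpha * X (n x k) * W^T + beta * C,
// with W stored row-major as an m x k matrix.
void gemm_ref(
    int n, int m, int k, float alpha,
    const float* x, const float* w, float beta, float* c) {
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < m; ++j) {
      float acc = 0;
      for (int p = 0; p < k; ++p) {
        acc += x[i * k + p] * w[j * k + p];
      }
      c[i * m + j] = alpha * acc + beta * c[i * m + j];
    }
  }
}

int main() {
  const int n = 3, m = 5, k = 4;
  std::vector<float> x(n * k, 1.0f);  // input, all ones
  std::vector<float> w(m * k, 2.0f);  // weight, all twos
  std::vector<float> bias(m, 4.0f);   // bias, all fours

  // Pre-fill each output row with the bias, as the kernel's memcpy loop does.
  std::vector<float> out(n * m);
  for (int i = 0; i < n; ++i) {
    for (int j = 0; j < m; ++j) {
      out[i * m + j] = bias[j];
    }
  }

  // beta = 1: GEMM accumulates into the pre-filled bias instead of
  // overwriting it.
  gemm_ref(n, m, k, /*alpha=*/1.0f, x.data(), w.data(), /*beta=*/1.0f,
           out.data());

  std::printf("out[0] = %g (expected 1*2*4 + 4 = 12)\n", out[0]);
  return 0;
}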

kernels/test/op_linear_test.cpp

Lines changed: 50 additions & 0 deletions
@@ -31,6 +31,14 @@ class OpLinearOutTest : public OperatorTest {
     return torch::executor::aten::linear_outf(context_, self, mat2, {}, out);
   }

+  Tensor& op_linear_out(
+      const Tensor& self,
+      const Tensor& mat2,
+      const Tensor& bias,
+      Tensor& out) {
+    return torch::executor::aten::linear_outf(context_, self, mat2, bias, out);
+  }
+
   template <class CTYPE, executorch::aten::ScalarType DTYPE>
   void test_dtype() {
     TensorFactory<DTYPE> tf;
@@ -88,6 +96,48 @@ TEST_F(OpLinearOutTest, AllDtypesSupported) {
   // for those types.
 }

+TEST_F(OpLinearOutTest, BiasTest) {
+  TensorFactory<ScalarType::Int> tf;
+
+  // Initialize input tensors.
+  constexpr int kReduceDim = 4;
+  constexpr int kDimX = 3, kDimY = 5;
+  constexpr int kValueX = 1;
+  constexpr int kValueY = 2;
+  constexpr int kValueBias = 4;
+  Tensor x = tf.full({kDimX, kReduceDim}, kValueX);
+  Tensor y = tf.full({kDimY, kReduceDim}, kValueY);
+  Tensor b = tf.full({kDimY}, kValueBias);
+  // Output starts zeroed.
+  Tensor out = tf.zeros({kDimX, kDimY});
+  // Initialize expected tensor.
+  constexpr int kValueExpected = kValueX * kValueY * kReduceDim + kValueBias;
+  Tensor expected = tf.full({kDimX, kDimY}, kValueExpected);
+
+  EXPECT_TENSOR_EQ(op_linear_out(x, y, b, out), expected);
+}
+
+TEST_F(OpLinearOutTest, BiasBroadcastTest) {
+  TensorFactory<ScalarType::Int> tf;
+
+  // Initialize input tensors.
+  constexpr int kReduceDim = 4;
+  constexpr int kDimX = 3, kDimY = 5;
+  constexpr int kValueX = 1;
+  constexpr int kValueY = 2;
+  constexpr int kValueBias = 4;
+  Tensor x = tf.full({kDimX, kReduceDim}, kValueX);
+  Tensor y = tf.full({kDimY, kReduceDim}, kValueY);
+  Tensor b = tf.full({1}, kValueBias);
+  // Output starts zeroed.
+  Tensor out = tf.zeros({kDimX, kDimY});
+  // Initialize expected tensor.
+  constexpr int kValueExpected = kValueX * kValueY * kReduceDim + kValueBias;
+  Tensor expected = tf.full({kDimX, kDimY}, kValueExpected);
+
+  EXPECT_TENSOR_EQ(op_linear_out(x, y, b, out), expected);
+}
+
 TEST_F(OpLinearOutTest, EmptyInputWithEmptyOutTensorPasses) {
   TensorFactory<ScalarType::Float> tf;

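Both bias tests assert the same worked arithmetic: each output element is a dot product over kReduceDim entries plus the bias, i.e. kValueExpected = kValueX * kValueY * kReduceDim + kValueBias = 1 * 2 * 4 + 4 = 12. The broadcast variant differs only in passing a bias of shape {1}, which must be broadcast across all kDimY output columns.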