Update on "[ET][Portable][Build Size] REALHBF16 binary ops: maximum, minimum, mul"

manuelcandales · manuelcandales · commit c3085d520346 · 2024-10-08T15:38:44.000-07:00
- mul: 1.69 M -> 15 K
- maximum: 353 K -> 11 K
- minimum: 353 K -> 11 K

Differential Revision: [D63909726](https://our.internmc.facebook.com/intern/diff/D63909726/)

[ghstack-poisoned]
diff --git a/kernels/portable/cpu/op_mul.cpp b/kernels/portable/cpu/op_mul.cpp
@@ -43,6 +43,12 @@ Tensor& mul_out(
 
   static constexpr const char op_name[] = "mul.out";
 
+  ET_KERNEL_CHECK(
+      ctx,
+      (executorch::runtime::isRealType(compute_type) || compute_type == ScalarType::Bool),
+      InvalidArgument,
+      out);
+
   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
     utils::apply_bitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
         [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) {
@@ -87,9 +93,7 @@ Tensor& mul_scalar_out(
   ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() {
     const CTYPE_COMPUTE val_b = utils::scalar_to<CTYPE_COMPUTE>(b);
     utils::apply_unitensor_elementwise_fn<CTYPE_COMPUTE, op_name>(
-        [val_b](const CTYPE_COMPUTE val_a) {
-          return val_a * val_b;
-        },
+        [val_b](const CTYPE_COMPUTE val_a) {return val_a * val_b;},
         ctx,
         a,
         utils::SupportedTensorDtypes::REALHBBF16,
diff --git a/kernels/portable/test/op_mul_test.cpp b/kernels/portable/test/op_mul_test.cpp
@@ -49,12 +49,14 @@ TEST_F(OpMulOutKernelTest, UnhandledDtypeDies) {
   std::vector<exec_aten::qint8> b_data(a_data);
   std::vector<exec_aten::qint8> out_data(a_data);
 
+  std::vector<exec_aten::DimOrderType> dim_order = {0, 1};
+
   auto a_impl = torch::executor::TensorImpl(
-      ScalarType::QInt8, 2, sizes.data(), a_data.data());
+      ScalarType::QInt8, 2, sizes.data(), a_data.data(), dim_order.data());
   auto b_impl = torch::executor::TensorImpl(
-      ScalarType::QInt8, 2, sizes.data(), b_data.data());
+      ScalarType::QInt8, 2, sizes.data(), b_data.data(), dim_order.data());
   auto out_impl = torch::executor::TensorImpl(
-      ScalarType::QInt8, 2, sizes.data(), out_data.data());
+      ScalarType::QInt8, 2, sizes.data(), out_data.data(), dim_order.data());
 
   // Two input tensors.
   Tensor a(&a_impl);