Skip to content

Commit 99e6349

Browse files
[ET][Kernels] Increase Half/Bfloat16 support (#13719)
Add Half/Bfloat16 dtype support for the following ops: - bmm.out - max.dim_max - min.dim_min - scatter_add.out Differential Revision: D80963875 @diff-train-skip-merge Co-authored-by: Manuel Candales <[email protected]>
1 parent 3eb7947 commit 99e6349

File tree

9 files changed

+28
-28
lines changed

9 files changed

+28
-28
lines changed

kernels/optimized/cpu/op_bmm.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ Tensor& opt_bmm_out(
158158
bmm_kernel<CTYPE>(self, mat2, out);
159159
});
160160
} else {
161-
ET_SWITCH_REALH_TYPES(self_type, ctx, name, CTYPE, [&]() {
161+
ET_SWITCH_REALHBF16_TYPES(self_type, ctx, name, CTYPE, [&]() {
162162
bmm_kernel<CTYPE>(self, mat2, out);
163163
});
164164
}

kernels/portable/cpu/op_bmm.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ Tensor& bmm_out(
4646
internal::bmm_out_impl<CTYPE>(in, mat2, out);
4747
});
4848
} else {
49-
ET_SWITCH_REALH_TYPES(in_type, ctx, op_name, CTYPE, [&]() {
49+
ET_SWITCH_REALHBF16_TYPES(in_type, ctx, op_name, CTYPE, [&]() {
5050
internal::bmm_out_impl<CTYPE>(in, mat2, out);
5151
});
5252
}

kernels/portable/cpu/op_max.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ std::tuple<Tensor&, Tensor&> max_out(
7979

8080
dim = dim < 0 ? dim + in.dim() : dim;
8181

82-
ET_SWITCH_REAL_TYPES_AND(
83-
Bool, in.scalar_type(), ctx, "max.dim_max", CTYPE, [&]() {
82+
ET_SWITCH_REALHBBF16_TYPES(
83+
in.scalar_type(), ctx, "max.dim_max", CTYPE, [&]() {
8484
CTYPE* max_data = max.mutable_data_ptr<CTYPE>();
8585
long* max_indices_data = max_indices.mutable_data_ptr<long>();
8686

kernels/portable/cpu/op_min.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ std::tuple<Tensor&, Tensor&> min_out(
7979

8080
dim = dim < 0 ? dim + in.dim() : dim;
8181

82-
ET_SWITCH_REAL_TYPES_AND(
83-
Bool, in.scalar_type(), ctx, "min.dim_min", CTYPE, [&]() {
82+
ET_SWITCH_REALHBBF16_TYPES(
83+
in.scalar_type(), ctx, "min.dim_min", CTYPE, [&]() {
8484
CTYPE* min_data = min.mutable_data_ptr<CTYPE>();
8585
long* min_indices_data = min_indices.mutable_data_ptr<long>();
8686

kernels/portable/cpu/op_scatter_add.cpp

Lines changed: 18 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -79,24 +79,24 @@ Tensor& scatter_add_out(
7979

8080
ScalarType self_type = self.scalar_type();
8181

82-
ET_SWITCH_REAL_TYPES_AND(
83-
Bool, self_type, ctx, "scatter_add.out", CTYPE, [&]() {
84-
const CTYPE* self_data = self.const_data_ptr<CTYPE>();
85-
const long* index_data = index.const_data_ptr<long>();
86-
const CTYPE* src_data = src.const_data_ptr<CTYPE>();
87-
CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
88-
89-
memcpy(out_data, self_data, self.nbytes());
90-
91-
if (index.numel() != 0) {
92-
if (self.dim() == 0) {
93-
out_data[0] += nonempty_size(index, 0) * src_data[0];
94-
} else {
95-
scatter_add_helper<CTYPE>(
96-
src_data, index_data, out_data, src, index, out, dim);
97-
}
98-
}
99-
});
82+
ET_SWITCH_REALHBBF16_TYPES(self_type, ctx, "scatter_add.out", CTYPE, [&]() {
83+
const CTYPE* self_data = self.const_data_ptr<CTYPE>();
84+
const long* index_data = index.const_data_ptr<long>();
85+
const CTYPE* src_data = src.const_data_ptr<CTYPE>();
86+
CTYPE* out_data = out.mutable_data_ptr<CTYPE>();
87+
88+
memcpy(out_data, self_data, self.nbytes());
89+
90+
if (index.numel() != 0) {
91+
if (self.dim() == 0) {
92+
out_data[0] +=
93+
static_cast<CTYPE>(nonempty_size(index, 0)) * src_data[0];
94+
} else {
95+
scatter_add_helper<CTYPE>(
96+
src_data, index_data, out_data, src, index, out, dim);
97+
}
98+
}
99+
});
100100

101101
return out;
102102
}

kernels/test/op_bmm_test.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,7 @@ TEST_F(OpBmmOutTest, OutputDimFloat) {
189189
/// zeros().
190190
TEST_F(OpBmmOutTest, AllRealDtypesSupported) {
191191
#define TEST_ENTRY(ctype, dtype) test_dtype<ctype, ScalarType::dtype>();
192-
ET_FORALL_REAL_TYPES(TEST_ENTRY);
192+
ET_FORALL_REALHBF16_TYPES(TEST_ENTRY);
193193
#undef TEST_ENTRY
194194
// TODO: Also add tests for half, complex, quantized, and other types. Easiest
195195
// way to do that would be to make TensorFactory support zeros() and ones()

kernels/test/op_max_test.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ TEST_F(OpMaxOutTest, MismatchedDTypesDies) {
316316

317317
TEST_F(OpMaxOutTest, AllRealInputLongOutputPasses) {
318318
#define TEST_ENTRY(ctype, dtype) test_max_out_dtype<ScalarType::dtype>();
319-
ET_FORALL_REAL_TYPES_AND(Bool, TEST_ENTRY);
319+
ET_FORALL_REALHBBF16_TYPES(TEST_ENTRY);
320320
#undef TEST_ENTRY
321321
}
322322

kernels/test/op_min_test.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -312,7 +312,7 @@ TEST_F(OpMinOutTest, MismatchedDTypesDies) {
312312

313313
TEST_F(OpMinOutTest, AllRealInputLongOutputPasses) {
314314
#define TEST_ENTRY(ctype, dtype) test_min_out_dtype<ScalarType::dtype>();
315-
ET_FORALL_REAL_TYPES_AND(Bool, TEST_ENTRY);
315+
ET_FORALL_REALHBBF16_TYPES(TEST_ENTRY);
316316
#undef TEST_ENTRY
317317
}
318318

kernels/test/op_scatter_add_test.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ class OpScatterAddOutTest : public OperatorTest {
281281

282282
TEST_F(OpScatterAddOutTest, AllValidInputOutputSupport) {
283283
#define TEST_ENTRY(CTYPE, DTYPE) test_scatter_add_out<ScalarType::DTYPE>();
284-
ET_FORALL_REAL_TYPES(TEST_ENTRY);
284+
ET_FORALL_REALHBF16_TYPES(TEST_ENTRY);
285285
#undef TEST_ENTRY
286286
}
287287

0 commit comments

Comments (0)