Commit d13c6b6

swolchok authored and Zonglin Peng committed
Support Half/BFloat16 in softmax (pytorch#7867)
Partial fix for pytorch#7748.
1 parent 5fcc092 commit d13c6b6

2 files changed: 52 additions, 42 deletions

kernels/portable/cpu/op_softmax.cpp

Lines changed: 39 additions & 38 deletions
@@ -42,47 +42,48 @@ Tensor& softmax_out(
   // Adjust for negative dim
   dim = dim < 0 ? dim + nonzero_dim(in) : dim;
 
-  ET_SWITCH_FLOATH_TYPES(in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() {
-    const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
-    CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
+  ET_SWITCH_FLOATHBF16_TYPES(
+      in.scalar_type(), ctx, "_softmax.out", CTYPE, [&]() {
+        const CTYPE* const in_data = in.const_data_ptr<CTYPE>();
+        CTYPE* const out_data = out.mutable_data_ptr<CTYPE>();
 
-    apply_over_dim(
-        [in_data, out_data](
-            const size_t size, const size_t stride, const size_t base) {
-          // calculate max in softmax dim. During softmax computation each
-          // value is subtracted by the maximum in value before calling exp
-          // to preserve numerical stability.
-          const CTYPE max_in = apply_unary_reduce_fn(
-              [](const CTYPE val_in, CTYPE val_accum) {
-                return std::max(val_in, val_accum);
-              },
-              in_data + base,
-              size,
-              stride);
+        apply_over_dim(
+            [in_data, out_data](
+                const size_t size, const size_t stride, const size_t base) {
+              // calculate max in softmax dim. During softmax computation each
+              // value is subtracted by the maximum in value before calling exp
+              // to preserve numerical stability.
+              const CTYPE max_in = apply_unary_reduce_fn(
+                  [](const CTYPE val_in, CTYPE val_accum) {
+                    return std::max(val_in, val_accum);
+                  },
+                  in_data + base,
+                  size,
+                  stride);
 
-          const CTYPE temp_sum = apply_unary_map_reduce_fn<CTYPE, CTYPE>(
-              [max_in](const CTYPE val_in) {
-                return std::exp(val_in - max_in);
-              },
-              [](const CTYPE mapped_in, CTYPE val_accum) {
-                return val_accum + mapped_in;
-              },
-              in_data + base,
-              size,
-              stride);
+              const CTYPE temp_sum = apply_unary_map_reduce_fn<CTYPE, CTYPE>(
+                  [max_in](const CTYPE val_in) {
+                    return std::exp(val_in - max_in);
+                  },
+                  [](const CTYPE mapped_in, CTYPE val_accum) {
+                    return val_accum + mapped_in;
+                  },
+                  in_data + base,
+                  size,
+                  stride);
 
-          apply_unary_map_fn(
-              [max_in, temp_sum](const CTYPE val_in) {
-                return std::exp(val_in - max_in) / temp_sum;
-              },
-              in_data + base,
-              out_data + base,
-              size,
-              stride);
-        },
-        in,
-        dim);
-  });
+              apply_unary_map_fn(
+                  [max_in, temp_sum](const CTYPE val_in) {
+                    return std::exp(val_in - max_in) / temp_sum;
+                  },
+                  in_data + base,
+                  out_data + base,
+                  size,
+                  stride);
+            },
+            in,
+            dim);
+      });
 
   return out;
 }
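For reference, the kernel body above is the standard three-pass, numerically stable softmax: find the maximum along the softmax dim, accumulate the sum of exp(x - max), then normalize. Below is a minimal standalone sketch of the same computation over a contiguous slice; the function name and template parameter are illustrative only and not part of the ExecuTorch API.

#include <algorithm>
#include <cmath>
#include <cstddef>

// Hypothetical helper shown only to illustrate the algorithm the kernel
// implements; T stands in for CTYPE (e.g. float or double).
template <typename T>
void stable_softmax_1d(const T* in, T* out, std::size_t n) {
  // Pass 1: find the maximum so exp() never sees a large positive argument.
  T max_in = in[0];
  for (std::size_t i = 1; i < n; ++i) {
    max_in = std::max(max_in, in[i]);
  }
  // Pass 2: accumulate the sum of shifted exponentials.
  T sum = T(0);
  for (std::size_t i = 0; i < n; ++i) {
    sum = sum + static_cast<T>(std::exp(in[i] - max_in));
  }
  // Pass 3: normalize.
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = static_cast<T>(std::exp(in[i] - max_in) / sum);
  }
}

Subtracting max_in matters most for the reduced-precision types this commit enables: Half's largest finite value is 65504, so exp() overflows for arguments above roughly 11, and unshifted logits would saturate quickly.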

kernels/test/op_softmax_test.cpp

Lines changed: 13 additions & 4 deletions
@@ -61,7 +61,15 @@ class OpSoftmaxOutTest : public OperatorTest {
     });
     // clang-format on
 
-    EXPECT_TENSOR_CLOSE(out, expected);
+    if (DTYPE == ScalarType::BFloat16) {
+      EXPECT_TENSOR_CLOSE_WITH_TOL(
+          out,
+          expected,
+          1e-2,
+          executorch::runtime::testing::internal::kDefaultAtol);
+    } else {
+      EXPECT_TENSOR_CLOSE(out, expected);
+    }
   }
 };
 
@@ -100,9 +108,10 @@ TEST_F(OpSoftmaxOutTest, HalfSupport) {
 }
 
 TEST_F(OpSoftmaxOutTest, AllDtypesSupported) {
-  test_dtype<float, ScalarType::Float>();
-  test_dtype<double, ScalarType::Double>();
-  // TODO: Also add tests for half, complex, quantized, and other types. Easiest
+#define TEST_ENTRY(ctype, dtype) test_dtype<ctype, ScalarType::dtype>();
+  ET_FORALL_FLOATHBF16_TYPES(TEST_ENTRY);
+#undef TEST_ENTRY
+  // TODO: Also add tests for complex, quantized, and other types. Easiest
   // way to do that would be to make TensorFactory support zeros() and ones()
   // for those types.
 }
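The BFloat16-only tolerance in the first hunk reflects the format's precision rather than a kernel bug: bfloat16 keeps float32's exponent range but stores only 7 mantissa bits, so values round-trip with a relative error on the order of 2^-8 (about 0.4%), which is why the test compares with an rtol of 1e-2 instead of the float default. Here is a small self-contained sketch, with a hypothetical helper that is not part of the test suite, that makes the error visible by truncating a float to a bfloat16 bit pattern.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Truncate a float32 to the bfloat16 bit pattern (round-toward-zero for
// simplicity; real conversions typically round to nearest even).
float truncate_to_bfloat16(float x) {
  std::uint32_t bits;
  std::memcpy(&bits, &x, sizeof(bits));
  bits &= 0xFFFF0000u;  // keep sign, exponent, and top 7 mantissa bits
  float y;
  std::memcpy(&y, &bits, sizeof(y));
  return y;
}

int main() {
  const float v = 0.333333f;  // a typical softmax output value
  const float bf = truncate_to_bfloat16(v);
  // The relative error lands near 0.4%, inside the 1e-2 rtol used by the test.
  std::printf("float32=%.6f bfloat16=%.6f rel_err=%.4f\n", v, bf, (v - bf) / v);
  return 0;
}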
