diff --git a/kernels/optimized/cpu/op_sub.cpp b/kernels/optimized/cpu/op_sub.cpp
index db2f1dd97f7..58f8d2a7fdf 100644
--- a/kernels/optimized/cpu/op_sub.cpp
+++ b/kernels/optimized/cpu/op_sub.cpp
@@ -85,7 +85,11 @@ Tensor& opt_sub_out(
   ScalarType b_type = b.scalar_type();
   ScalarType out_type = out.scalar_type();
 
-  ET_KERNEL_CHECK(ctx, tensor_is_realh_type(out), InvalidArgument, out);
+  ET_KERNEL_CHECK(
+      ctx,
+      executorch::runtime::tensor_is_realhbf16_type(out),
+      InvalidArgument,
+      out);
   if (a.numel() == 1 || b.numel() == 1) {
     if (a_type == b_type && a_type == out_type && a_type != ScalarType::Half) {
       const Tensor* tensor;
@@ -169,7 +173,7 @@ Tensor& opt_sub_scalar_out(
   ET_CHECK_MSG(error == Error::Ok, "Failed to resize output tensor.");
 
   if (a_type == common_type && a_type == out_type &&
-      a_type != ScalarType::Half) {
+      a_type != ScalarType::Half && a_type != ScalarType::BFloat16) {
     ET_SWITCH_REAL_TYPES(a_type, ctx, "sub.Scalar_out", CTYPE, [&]() {
       CTYPE b_casted = utils::scalar_to<CTYPE>(b);
       CTYPE alpha_val;
@@ -186,9 +190,9 @@ Tensor& opt_sub_scalar_out(
           out.numel());
     });
   } else {
-    ET_SWITCH_REALH_TYPES(a_type, ctx, "sub.Scalar_out", CTYPE_A, [&]() {
+    ET_SWITCH_REALHBF16_TYPES(a_type, ctx, "sub.Scalar_out", CTYPE_A, [&]() {
       ET_SWITCH_REAL_TYPES(common_type, ctx, "sub.Scalar_out", CTYPE_IN, [&]() {
-        ET_SWITCH_REALH_TYPES(
+        ET_SWITCH_REALHBF16_TYPES(
             out_type, ctx, "sub.Scalar_out", CTYPE_OUT, [&]() {
               CTYPE_IN b_casted = utils::scalar_to<CTYPE_IN>(b);
               CTYPE_IN alpha_val;
diff --git a/kernels/test/op_floor_divide_test.cpp b/kernels/test/op_floor_divide_test.cpp
index d871b8d5216..8be1168eee1 100644
--- a/kernels/test/op_floor_divide_test.cpp
+++ b/kernels/test/op_floor_divide_test.cpp
@@ -57,10 +57,9 @@ class OpFloorDivideTest : public OperatorTest {
     Tensor out = tf.zeros(sizes);
 
     // floor_divide two tensors.
-    // std::floor(-0.5 / -0.1) == 5.0, but -0.5 // -0.1 yeilds 4.0
     op_floor_divide_out(
-        tf.make(sizes, /*data=*/{-5.3, 1.1, 2.2, 4.4, 6.8, -0.5}),
-        tf.make(sizes, /*data=*/{2.7, 2.0, 2.0, 2.0, 2.0, -0.1}),
+        tf.make(sizes, /*data=*/{-5.3, 1.1, 2.2, 4.4, 6.8, -0.9}),
+        tf.make(sizes, /*data=*/{2.7, 2.0, 2.0, 2.0, 2.0, -0.2}),
         out);
 
     // Check that it matches the expected output.
@@ -113,6 +112,14 @@ TEST_F(OpFloorDivideTest, DoubleTensors) {
   test_floating_point_floor_divide<ScalarType::Double>();
 }
 
+TEST_F(OpFloorDivideTest, HalfTensors) {
+  test_floating_point_floor_divide<ScalarType::Half>();
+}
+
+TEST_F(OpFloorDivideTest, BFloat16Tensors) {
+  test_floating_point_floor_divide<ScalarType::BFloat16>();
+}
+
 TEST_F(OpFloorDivideTest, UnhandledDtypeDies) {
   // floor_divide() doesn't handle Bool.
   TensorFactory<ScalarType::Bool> tf;
@@ -331,3 +338,17 @@ TEST_F(OpFloorDivideTest, DynamicShapeUnbound) {
   Tensor ret = op_floor_divide_out(x, y, out);
   EXPECT_TENSOR_CLOSE(out, expected_result);
 }
+
+// std::floor(0.5 / 0.1) == 5.0, but 0.5 // 0.1 yields 4.0
+TEST_F(OpFloorDivideTest, FloatFloorDivideEdgeCase) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor x = tf.make({1, 2}, {0.5, -0.5});
+  Tensor y = tf.make({1, 2}, {0.1, -0.1});
+  Tensor expected_result = tf.make({1, 2}, {4.0, 4.0});
+
+  Tensor out = tf.zeros({1, 2});
+  Tensor ret = op_floor_divide_out(x, y, out);
+  EXPECT_TENSOR_EQ(ret, out);
+  EXPECT_TENSOR_CLOSE(out, expected_result);
+}
diff --git a/kernels/test/op_rsub_test.cpp b/kernels/test/op_rsub_test.cpp
index f3fa5eedf9e..e2bcbd78dcc 100644
--- a/kernels/test/op_rsub_test.cpp
+++ b/kernels/test/op_rsub_test.cpp
@@ -64,14 +64,17 @@ class OpRSubScalarOutTest : public OperatorTest {
     Tensor out = tf.zeros(sizes);
 
     // Performs substraction of tensor from scalar.
+    // Values selected to be exactly representable to avoid throwing off
+    // half/bfloat16 tests.
     op_rsub_scalar_out(
-        tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}),
-        1.1,
+        tf.make(sizes, /*data=*/{1.25, 2.25, 4.5, 8.875}),
+        1.0,
         /*alpha=*/1,
         out);
 
     // Check that it matches the expected output.
-    EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{0.0, -1.1, -3.3, -7.7}));
+    EXPECT_TENSOR_CLOSE(
+        out, tf.make(sizes, /*data=*/{-0.25, -1.25, -3.5, -7.875}));
   }
 
   /* %python
@@ -168,6 +171,14 @@ TEST_F(OpRSubScalarOutTest, DoubleTensors) {
   test_floating_point_rsub_scalar_out<ScalarType::Double>();
 }
 
+TEST_F(OpRSubScalarOutTest, HalfTensors) {
+  test_floating_point_rsub_scalar_out<ScalarType::Half>();
+}
+
+TEST_F(OpRSubScalarOutTest, BFloat16Tensors) {
+  test_floating_point_rsub_scalar_out<ScalarType::BFloat16>();
+}
+
 TEST_F(OpRSubScalarOutTest, UnhandledDtypeDies) {
   // op_rsub_scalar_out() doesn't handle Bool.
   TensorFactory<ScalarType::Bool> tf;
diff --git a/kernels/test/op_sub_test.cpp b/kernels/test/op_sub_test.cpp
index aafaf688b0d..aa7d4d51e4e 100644
--- a/kernels/test/op_sub_test.cpp
+++ b/kernels/test/op_sub_test.cpp
@@ -90,13 +90,15 @@ class OpSubOutTest : public OperatorTest {
 
     // Performs substraction on two tensors.
     op_sub_out(
-        tf.make(sizes, /*data=*/{1.1, 2.2, 4.4, 8.8}),
+        tf.make(sizes, /*data=*/{1.25, 2.25, 4.5, 8.875}),
         tf.ones(sizes),
         /*alpha=*/1,
         out);
 
-    // Check that it matches the expected output.
-    EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{0.1, 1.2, 3.4, 7.8}));
+    // Check that it matches the expected output. Values selected to
+    // be exactly representable to avoid throwing off half/bfloat16
+    // tests.
+    EXPECT_TENSOR_CLOSE(out, tf.make(sizes, /*data=*/{0.25, 1.25, 3.5, 7.875}));
   }
 
   template <ScalarType DTYPE>
@@ -260,6 +262,14 @@ TEST_F(OpSubOutTest, DoubleTensors) {
   test_floating_point_sub_out<ScalarType::Double>();
 }
 
+TEST_F(OpSubOutTest, HalfTensors) {
+  test_floating_point_sub_out<ScalarType::Half>();
+}
+
+TEST_F(OpSubOutTest, BFloat16Tensors) {
+  test_floating_point_sub_out<ScalarType::BFloat16>();
+}
+
 TEST_F(OpSubOutTest, BroadcastSupported) {
   TensorFactory<ScalarType::Float> tf;