
Commit 810253f

Update base for Update on "[executorch][flat_tensor] Serialize flat tensor"
Serialize a flat tensor file. The resulting file looks like:

Header
- flatbuffer offset and size
- segment data offset and size
Flatbuffer
Tensor data (in segment)

Differential Revision: [D66374253](https://our.internmc.facebook.com/intern/diff/D66374253/)

[ghstack-poisoned]
2 parents 4586c68 + 51a107a commit 810253f
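For readers unfamiliar with the layout described in the commit message, the sketch below shows one way such a file could be assembled in Python. It is illustrative only: the struct format, field order, and `FT01` magic are assumptions made for exposition, not the actual flat_tensor header defined by ExecuTorch.

```python
import struct

def serialize_flat_tensor(flatbuffer: bytes, tensor_segment: bytes) -> bytes:
    """Pack a header (offsets/sizes), then the flatbuffer, then the tensor segment."""
    header_fmt = "<4sQQQQ"  # 4-byte magic + four little-endian uint64 fields (assumed layout)
    header_size = struct.calcsize(header_fmt)
    flatbuffer_offset = header_size
    segment_offset = flatbuffer_offset + len(flatbuffer)
    header = struct.pack(
        header_fmt,
        b"FT01",              # hypothetical magic, not ExecuTorch's real value
        flatbuffer_offset,
        len(flatbuffer),
        segment_offset,
        len(tensor_segment),
    )
    return header + flatbuffer + tensor_segment
```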

9 files changed: +242, −30 lines changed

backends/cadence/fusion_g3/operators/op_mean.cpp

Lines changed: 2 additions & 2 deletions
@@ -59,7 +59,7 @@ int prepare_data(
   return num_axis_dims;
 }
 
-Tensor& mean_dim_out(
+Tensor& mean_out(
     KernelRuntimeContext& ctx,
     const Tensor& in,
    optional<ArrayRef<int64_t>> dim_list,
@@ -199,4 +199,4 @@ Tensor& mean_dim_out(
 } // namespace native
 } // namespace G3
 } // namespace impl
-} // namespace cadence
+} // namespace cadence

extension/llm/modules/test/test_attention.py

Lines changed: 129 additions & 24 deletions
@@ -33,6 +33,7 @@ def setUp(self):
         self.num_kv_heads = 8
         self.head_dim = 64
         self.max_seq_len = 128
+        self.encoder_max_seq_len = 128
         self.rope_base = 500_000
         self.scale_factor = 32
 
@@ -86,16 +87,26 @@ def setUp(self):
             max_seq_len=self.max_seq_len,
         )
         self.et_mha.load_state_dict(self.tt_mha.state_dict())
+
         # Common inputs.
         seq_len = 10
         self.x = torch.randn(1, seq_len, self.embed_dim)
+        self.y = torch.randn(1, seq_len, self.embed_dim)
         self.input_pos = torch.arange(seq_len).unsqueeze(0)  # shape [1, seq_len]
-        seq_len_dim = torch.export.Dim("seq_len", min=1, max=100)
-        self.dynamic_shapes = (
-            {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC},
-            {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC},
-            {0: torch.export.Dim.STATIC, 1: seq_len_dim},
-        )
+        self.seq_len_dim = torch.export.Dim("seq_len", min=1, max=self.max_seq_len)
+        self.dynamic_shapes = {
+            "x": {
+                0: torch.export.Dim.STATIC,
+                1: self.seq_len_dim,
+                2: torch.export.Dim.STATIC,
+            },
+            "y": {
+                0: torch.export.Dim.STATIC,
+                1: self.seq_len_dim,
+                2: torch.export.Dim.STATIC,
+            },
+            "input_pos": {0: torch.export.Dim.STATIC, 1: self.seq_len_dim},
+        }
         self.causal_mask = torch.tril(
             torch.ones(
                 size=(self.max_seq_len, self.max_seq_len),
@@ -110,8 +121,8 @@ def test_attention_eager(self):
         assert_close(et_res, tt_res)
 
         # test with kv cache
-        self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=20)
-        self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=20)
+        self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
+        self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
 
         et_res = self.et_mha(self.x, self.x)  # Self attention.
         tt_res = self.tt_mha(self.x, self.x)  # Self attention.
@@ -144,12 +155,12 @@ def test_attention_export(self):
         # Self attention.
 
         # test with kv cache
-        self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100)
-        self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100)
+        self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
+        self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
         with torch.no_grad():
             et_mha_ep = torch.export.export(
                 self.et_mha,
-                (self.x, self.x),
+                (self.x, self.y),
                 kwargs={"input_pos": self.input_pos},
                 dynamic_shapes=self.dynamic_shapes,
                 strict=True,
@@ -166,8 +177,8 @@ def test_attention_aoti(self):
         # Self attention.
 
         # test with kv cache
-        self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100)
-        self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100)
+        self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
+        self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
         with torch.no_grad():
             so = torch._export.aot_compile(
                 self.et_mha,
@@ -189,13 +200,13 @@ def test_attention_aoti(self):
 
     def test_attention_executorch(self):
         # Self attention.
-        self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100)
-        self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100)
+        self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
+        self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
 
         with torch.no_grad():
             et_mha_ep = torch.export.export(
                 self.et_mha,
-                (self.x, self.x),
+                (self.x, self.y),
                 kwargs={"input_pos": self.input_pos},
                 dynamic_shapes=self.dynamic_shapes,
                 strict=True,
@@ -222,22 +233,18 @@ def test_attention_executorch(self):
 
     def test_attention_torch_cond_eager(self):
         # Different from vanilla torchtune MHA, we rewrite the if condition with torch.cond. We need to make sure they are giving the same results regarding the if condition.
-        # For the first run of MHA we provide `y` (self.x) but for the second run it will be a tensor full of nan.
+        # For the first run of MHA we provide `y` but for the second run it will be a tensor full of nan.
         self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
         self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
 
         mask = self.causal_mask[self.input_pos, :]
         # First run.
-        et_res = self.et_mha(
-            self.x, self.x, mask=mask, input_pos=self.input_pos
-        )  # Self attention with input pos.
-        tt_res = self.tt_mha(
-            self.x, self.x, mask=mask, input_pos=self.input_pos
-        )  # Self attention with input pos.
+        et_res = self.et_mha(self.x, self.y, mask=mask, input_pos=self.input_pos)
+        tt_res = self.tt_mha(self.x, self.y, mask=mask, input_pos=self.input_pos)
 
         assert_close(et_res, tt_res)
 
-        # Second run test kv cache read. Input pos is [10, 11, ..., 19]
+        # Second run tests kv cache read. Input pos is [10, 11, ..., 19]
         next_input_pos = torch.arange(10, 20).unsqueeze(0)
 
         empty_y = torch.full_like(self.x, torch.nan)
@@ -246,3 +253,101 @@ def test_attention_torch_cond_eager(self):
         tt_res = self.tt_mha(self.x, None, mask=mask, input_pos=next_input_pos)
 
         assert_close(et_res, tt_res)
+
+    def test_attention_torch_cond_export(self):
+        self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
+        self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
+        mask = self.causal_mask[self.input_pos, :]
+        dynamic_shapes = {
+            **self.dynamic_shapes,
+            **{
+                "mask": {
+                    0: torch.export.Dim.STATIC,
+                    1: self.seq_len_dim,
+                    2: torch.export.Dim.STATIC,
+                }
+            },
+        }
+        with torch.no_grad():
+            et_mha_ep = torch.export.export(
+                self.et_mha,
+                (self.x, self.y),
+                kwargs={
+                    "mask": mask,
+                    "input_pos": self.input_pos,
+                },
+                dynamic_shapes=dynamic_shapes,
+                strict=True,
+            )
+
+        # First run.
+        et_res = et_mha_ep.module()(self.x, self.y, mask=mask, input_pos=self.input_pos)
+        tt_res = self.tt_mha(self.x, self.y, mask=mask, input_pos=self.input_pos)
+
+        assert_close(et_res, tt_res)
+
+        # Second run tests kv cache read. Input pos is [10, 11, ..., 19]
+        next_input_pos = torch.arange(10, 20).unsqueeze(0)
+        empty_y = torch.full_like(self.y, torch.nan)
+        mask = self.causal_mask[next_input_pos, :]
+        et_res = et_mha_ep.module()(
+            self.x, empty_y, mask=mask, input_pos=next_input_pos
+        )
+        tt_res = self.tt_mha(self.x, None, mask=mask, input_pos=next_input_pos)
+
+        assert_close(et_res, tt_res)
+
+    def test_attention_torch_cond_executorch(self):
+        self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
+        self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
+        mask = self.causal_mask[self.input_pos, :]
+        dynamic_shapes = {
+            **self.dynamic_shapes,
+            **{
+                "mask": {
+                    0: torch.export.Dim.STATIC,
+                    1: self.seq_len_dim,
+                    2: torch.export.Dim.STATIC,
+                }
+            },
+        }
+        with torch.no_grad():
+            et_mha_ep = torch.export.export(
+                self.et_mha,
+                (self.x, self.y),
+                kwargs={
+                    "mask": mask,
+                    "input_pos": self.input_pos,
+                },
+                dynamic_shapes=dynamic_shapes,
+                strict=True,
+            )
+        et_program = to_edge(
+            et_mha_ep,
+            compile_config=EdgeCompileConfig(
+                _core_aten_ops_exception_list=[torch.ops.aten._assert_async.msg],
+                _check_ir_validity=False,
+            ),
+        ).to_executorch(
+            config=ExecutorchBackendConfig(
+                passes=[InitializedMutableBufferPass(["cache_pos"])],
+            )
+        )
+
+        # First run.
+        runtime = Runtime.get()
+        program = runtime.load_program(et_program.buffer)
+        method = program.load_method("forward")
+        et_res = method.execute((self.x, self.y, mask, self.input_pos))
+        tt_res = self.tt_mha(self.x, self.y, mask=mask, input_pos=self.input_pos)
+
+        assert_close(et_res[0], tt_res)
+
+        # Second run tests kv cache read. Input pos is [10, 11, ..., 19]
+        next_input_pos = torch.arange(10, 20).unsqueeze(0)
+        empty_y = torch.full_like(self.y, torch.nan)
+        mask = self.causal_mask[next_input_pos, :]
+        et_res = method.execute((self.x, empty_y, mask, next_input_pos))
+        tt_res = self.tt_mha(self.x, None, mask=mask, input_pos=next_input_pos)
+
+        assert_close(et_res[0], tt_res)
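The main change in this test file is switching `dynamic_shapes` from a positional tuple to a dict keyed by argument name, with a single shared `seq_len` Dim reused across `x`, `y`, `input_pos`, and (in the torch.cond tests) `mask`. Below is a minimal sketch of that pattern on a toy module; the `Toy` module and tensor sizes are made up for illustration, and it assumes a recent PyTorch where `torch.export.Dim.STATIC` is available.

```python
import torch

class Toy(torch.nn.Module):
    def forward(self, x, y):
        return x + y

# One shared Dim ties the dynamic seq_len of both inputs together.
seq_len_dim = torch.export.Dim("seq_len", min=1, max=128)
dynamic_shapes = {
    "x": {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC},
    "y": {0: torch.export.Dim.STATIC, 1: seq_len_dim, 2: torch.export.Dim.STATIC},
}
ep = torch.export.export(
    Toy(),
    (torch.randn(1, 10, 64), torch.randn(1, 10, 64)),
    dynamic_shapes=dynamic_shapes,
    strict=True,
)
# The exported module now accepts any sequence length in [1, 128].
print(ep.module()(torch.randn(1, 32, 64), torch.randn(1, 32, 64)).shape)
```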

kernels/aten/functions.yaml

Lines changed: 2 additions & 0 deletions
@@ -257,6 +257,8 @@
 
 - op: mean.out
 
+- op: mean.dtype_out
+
 - op: min.dim_min
 
 - op: min.unary_out

kernels/portable/cpu/op_mean.cpp

Lines changed: 8 additions & 0 deletions
@@ -66,6 +66,14 @@ Tensor& mean_dim_out(
   return out;
 }
 
+Tensor& mean_dtype_out(
+    KernelRuntimeContext& ctx,
+    const Tensor& in,
+    optional<ScalarType> dtype,
+    Tensor& out) {
+  return mean_dim_out(ctx, in, ArrayRef<int64_t>(), false, dtype, out);
+}
+
 } // namespace native
 } // namespace executor
 } // namespace torch
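The new `mean_dtype_out` kernel simply forwards to `mean_dim_out` with an empty dim list and `keepdim=false`, i.e. a full reduction over every dimension. The snippet below is a sanity illustration of that equivalence using plain eager `torch.mean`, not the ExecuTorch kernel itself.

```python
import torch

x = torch.rand(10, 10)
# mean with dtype only == mean reduced over every dimension with keepdim=False.
full_reduction = torch.mean(x, dim=list(range(x.dim())), keepdim=False, dtype=torch.float64)
assert torch.allclose(torch.mean(x, dtype=torch.float64), full_reduction)
```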

kernels/portable/cpu/util/reduce_util.cpp

Lines changed: 1 addition & 0 deletions
@@ -386,6 +386,7 @@ bool check_mean_dim_args(
       check_reduction_args(in, dim_list, keepdim, dtype, out));
 
   if (dtype) {
+    ET_LOG(Info, "dtype is %hhd", static_cast<int8_t>(dtype.value()));
     ET_LOG_AND_RETURN_IF_FALSE(torch::executor::isFloatingType(dtype.value()));
     ET_LOG_AND_RETURN_IF_FALSE(out.scalar_type() == dtype.value());
   } else {

kernels/portable/functions.yaml

Lines changed: 5 additions & 0 deletions
@@ -577,6 +577,11 @@
   - arg_meta: null
     kernel_name: torch::executor::mean_dim_out
 
+- op: mean.dtype_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::mean_dtype_out
+
 - op: min.dim_min
   kernels:
     - arg_meta: null

kernels/test/op_mean_test.cpp

Lines changed: 74 additions & 1 deletion
@@ -9,7 +9,7 @@
 #include <executorch/kernels/test/FunctionHeaderWrapper.h> // Declares the operator
 #include <executorch/kernels/test/TestUtil.h>
 #include <executorch/kernels/test/supported_features.h>
-#include <executorch/runtime/core/exec_aten/exec_aten.h>
+#include <executorch/runtime/core/error.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_util.h>
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
@@ -22,6 +22,7 @@ using exec_aten::ArrayRef;
 using exec_aten::optional;
 using exec_aten::ScalarType;
 using exec_aten::Tensor;
+using executorch::runtime::Error;
 using torch::executor::testing::TensorFactory;
 
 class OpMeanOutTest : public OperatorTest {
@@ -36,6 +37,13 @@ class OpMeanOutTest : public OperatorTest {
         context_, self, dim, keepdim, dtype, out);
   }
 
+  Tensor& op_mean_dtype_out(
+      const Tensor& self,
+      optional<ScalarType> dtype,
+      Tensor& out) {
+    return torch::executor::aten::mean_outf(context_, self, dtype, out);
+  }
+
   template <ScalarType IN_DTYPE, ScalarType OUT_DTYPE>
   void test_mean_dim_out_invalid_dimensions() {
     TensorFactory<IN_DTYPE> tf_in;
@@ -466,3 +474,68 @@ TEST_F(OpMeanOutTest, DynamicShapeUnbound) {
   op_mean_out(x, ArrayRef<int64_t>{1}, false, ScalarType::Float, out);
   EXPECT_TENSOR_CLOSE(out, expected_result);
 }
+
+TEST_F(OpMeanOutTest, DTypeOutFloatValid) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor x = tf.make(
+      {10, 10},
+      {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0});
+  Tensor expected_result = tf.make({}, {1.0});
+
+  Tensor out = tf.zeros({});
+  Tensor ret = op_mean_dtype_out(x, ScalarType::Float, out);
+  EXPECT_TENSOR_CLOSE(out, expected_result);
+}
+
+TEST_F(OpMeanOutTest, DTypeOutFloatToBoolInvalid) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor x = tf.make(
+      {10, 10},
+      {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+       1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0});
+  Tensor expected_result = tf.make({}, {1.0});
+
+  Tensor out = tf.zeros({});
+
+  ET_EXPECT_KERNEL_FAILURE(
+      context_, op_mean_dtype_out(x, ScalarType::Bool, out));
+}
+
+TEST_F(OpMeanOutTest, DTypeOutFloatInfinity) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor x = tf.make({2, 1}, {INFINITY, INFINITY});
+  Tensor expected_result = tf.make({}, {INFINITY});
+
+  Tensor out = tf.zeros({});
+
+  Tensor ret = op_mean_dtype_out(x, ScalarType::Float, out);
+  EXPECT_TENSOR_CLOSE(out, expected_result);
+}
+
+TEST_F(OpMeanOutTest, DTypeOutFloatNAN) {
+  TensorFactory<ScalarType::Float> tf;
+
+  Tensor x = tf.make({2, 1}, {NAN, INFINITY});
+  Tensor expected_result = tf.make({}, {NAN});
+
+  Tensor out = tf.zeros({});
+
+  Tensor ret = op_mean_dtype_out(x, ScalarType::Float, out);
+  EXPECT_TENSOR_CLOSE(out, expected_result);
+}