
Commit eb50c46

digantdesai authored and facebook-github-bot committed
Run example llama2 model with fp16 (#1902)
Summary: Pull Request resolved: #1902

FYI - there are hardcoded `float` casts in rmsnorm, which leave a bunch of nodes in the graph in fp32:

```
aten_embedding_default: "f16[1, 3, 64]" = executorch_exir_dialects_edge__ops_aten_embedding_default(arg11_1, arg55_1);  arg11_1 = arg55_1 = None
aten_slice_copy_tensor: "f16[3, 4]" = executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor(arg48_1, 0, 0, 3);  arg48_1 = None
aten_slice_copy_tensor_1: "f16[3, 4]" = executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor(arg49_1, 0, 0, 3);  arg49_1 = None
aten__to_copy_default: "f32[1, 3, 64]" = executorch_exir_dialects_edge__ops_aten__to_copy_default(aten_embedding_default, dtype = torch.float32)
(a lot of nodes are in fp32 after this, then we go back to fp16, and so on)
```

Copy op from - https://www.internalfb.com/code/fbsource/%5B7e45e7bcd969%5D/xplat/executorch/examples/models/llama2/model.py?lines=78

Reviewed By: larryliu0820

Differential Revision: D53596500

fbshipit-source-id: b6b3ebddfb9a25d1e52e9202d216e9ead9a6c62d
1 parent 636f9a7 · commit eb50c46
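For context on the fp16 -> fp32 round trips called out in the summary, here is a minimal sketch of the Llama-style rmsnorm pattern with a hardcoded `float` cast. This is an illustration of the pattern the summary describes, not the exact ExecuTorch source; the real code lives at the model.py link above.

```python
import torch


class RMSNorm(torch.nn.Module):
    """Sketch of a Llama-style RMSNorm with a hardcoded float cast.

    Illustrative only; see the model.py link in the summary for the
    actual implementation.
    """

    def __init__(self, dim: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.weight = torch.nn.Parameter(torch.ones(dim))

    def _norm(self, x: torch.Tensor) -> torch.Tensor:
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x.float() up-casts fp16 activations to fp32, so every op inside
        # _norm is exported as an fp32 node; .type_as(x) then casts back to
        # fp16. This produces the f16 -> f32 -> f16 hops visible in the
        # graph dump above.
        return self._norm(x.float()).type_as(x) * self.weight
```

Keeping the norm computation in the input dtype (or up-casting only where numerically necessary) would avoid the extra `_to_copy` nodes in the exported graph.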

File tree: 1 file changed (+26, −6 lines)


backends/xnnpack/test/models/llama2_et_example.py

Lines changed: 26 additions & 6 deletions
@@ -6,25 +6,45 @@

 import unittest

+import torch
+
 from executorch.backends.xnnpack.test.tester import Tester
 from executorch.examples.models.llama2.model import Llama2Model


 class TestLlama2ETExample(unittest.TestCase):
-    llama2 = Llama2Model()
-    model = llama2.get_eager_model()
-    example_inputs = llama2.get_example_inputs()
+    def test_f32(self):
+        self._test()
+
+    def test_f16(self):
+        self._test(torch.float16)

     # TODO - dynamic shape

-    def test_fp32(self):
+    def _test(self, dtype: torch.dtype = torch.float):
+        assert dtype in [
+            torch.float,
+            torch.float16,
+        ], f"Only fp32 and fp16 are supported, but got dtype: {dtype}"
+
+        llama2 = Llama2Model()
+        model = llama2.get_eager_model().to(dtype)
+
+        # Only convert fp32 inputs to dtype
+        example_inputs = tuple(
+            tensor.to(dtype) if tensor.dtype == torch.float32 else tensor
+            for tensor in llama2.get_example_inputs()
+        )
+
         (
-            Tester(self.model, self.example_inputs)
+            Tester(model, example_inputs)
             .export()
             .to_edge()
+            .dump_artifact()
             .partition()
+            .dump_artifact()
             .to_executorch()
             .serialize()
             .run_method()
-            .compare_outputs()
+            .compare_outputs(atol=5e-2)
         )
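Usage note: the new fp16 case can be run on its own with unittest. The module path below is an assumption based on the file location in this diff and may differ in your checkout:

```
python -m unittest backends.xnnpack.test.models.llama2_et_example.TestLlama2ETExample.test_f16
```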
