
Commit 827d50f

Update base for Update on "[ET-VK] Refine partitioner to account for storage type and memory layout"
## Context

There are a variety of ways that tensors can be represented in Vulkan. The two main descriptors for how a tensor is laid out in memory are:

1. Storage type (buffer or texture)
2. Memory layout (which dim is packed along a texel, which dim has a stride of 1, etc.)

Due to the differences between buffers and textures, and between different memory layouts, an implementation of an operator may only support a specific set of (storage type, memory layout) combinations. Furthermore, if an operator implementation supports multiple (storage type, memory layout) combinations, there may be a "preferred" setting that results in optimal performance.

These changes lay the foundation for a memory metadata tagging graph transform, which will make sure that every tensor participating in an operator call has a valid/optimal (storage type, memory layout) setting, and will insert transition operators to transfer input tensors to the correct memory settings when necessary.

An additional required change arises from the fact that Vulkan limits texture and buffer sizes. The partitioner therefore needs to account for the storage types and memory layouts supported by the operator implementation, and check that all tensors participating in a computation can be represented with some (storage type, memory layout) combination supported by the implementation.

## Changes

Improvements to the operator registry:

* Introduce utility functions to check the optimal and enabled storage types and memory layouts for an operator.

Improvements to the partitioner:

* Account for the storage types and memory layouts supported by an operator when deciding whether a node should be partitioned.
* Improve the logic for fusable ops (i.e. the permute/transpose before a mm, which can be fused into linear) to check whether the final target op is supported in Vulkan, and only partition those nodes if so. Otherwise, leave the node unpartitioned so that it can be fused by another backend.

Differential Revision: [D65428843](https://our.internmc.facebook.com/intern/diff/D65428843/)

[ghstack-poisoned]
2 parents 09cf982 + 244546b commit 827d50f
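
As a companion to the description above, here is a minimal sketch of what a (storage type, memory layout) aware operator registry and partitioner check could look like. Everything in it (the enum names, the `OP_FEATURES` registry layout, and the texture-size limit) is an illustrative assumption for exposition, not the actual ExecuTorch Vulkan backend API.

```python
# Illustrative sketch only: VkStorageType, VkMemoryLayout, OP_FEATURES, and the
# texture limit below are assumptions, not the real ExecuTorch Vulkan registry.
from enum import Enum, auto


class VkStorageType(Enum):
    BUFFER = auto()
    TEXTURE_3D = auto()


class VkMemoryLayout(Enum):
    WIDTH_PACKED = auto()
    CHANNELS_PACKED = auto()


# Per-operator entry: which (storage type, memory layout) combinations the
# Vulkan implementation supports, and which combination is preferred.
OP_FEATURES = {
    "aten.mm.default": {
        "storage_types": [VkStorageType.TEXTURE_3D, VkStorageType.BUFFER],
        "memory_layouts": [VkMemoryLayout.WIDTH_PACKED],
        "optimal": (VkStorageType.TEXTURE_3D, VkMemoryLayout.WIDTH_PACKED),
    },
}


def enabled_storage_types(op_name: str) -> list:
    """Utility of the kind described above: storage types enabled for an op."""
    return OP_FEATURES.get(op_name, {}).get("storage_types", [])


def can_partition(op_name, tensor_shapes, max_texture_extent: int = 16384) -> bool:
    """Partition a node only if every tensor in the call can be represented
    with some storage type supported by the op's Vulkan implementation."""
    storage_types = enabled_storage_types(op_name)
    if not storage_types:
        return False  # no Vulkan implementation registered for this op
    for storage in storage_types:
        if storage == VkStorageType.TEXTURE_3D:
            # Textures have per-dimension extent limits (simplified here).
            if any(max(shape, default=1) > max_texture_extent for shape in tensor_shapes):
                continue
        return True
    return False


# Example: a matmul whose operands fit in a texture is partitioned, while an
# op with no registered Vulkan implementation is left for other backends.
assert can_partition("aten.mm.default", [(128, 256), (256, 64)])
assert not can_partition("aten.unknown.default", [(128, 256)])
```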

File tree

10 files changed: +160 -41 lines

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-e47e8794499a4a0130ff4efb8713ff93f4b40c36
+c8a648d4dffb9f0133ff4a2ea0e660b42105d3ad

backends/arm/quantizer/TARGETS

Lines changed: 0 additions & 3 deletions

@@ -3,7 +3,6 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
 python_library(
     name = "arm_quantizer",
     srcs = ["arm_quantizer.py"],
-    typing = True,
     deps = [
         ":arm_quantizer_utils",
         "//caffe2:torch",
@@ -15,7 +14,6 @@ python_library(
 python_library(
     name = "quantization_config",
     srcs = ["quantization_config.py"],
-    typing = True,
     deps = [
         "//caffe2:torch",
     ],
@@ -24,7 +22,6 @@ python_library(
 python_library(
     name = "arm_quantizer_utils",
     srcs = ["arm_quantizer_utils.py"],
-    typing = True,
     deps = [
         ":quantization_config",
     ],

backends/cadence/aot/functions.yaml

Lines changed: 4 additions & 0 deletions

@@ -154,6 +154,10 @@
   kernels:
     - arg_meta: null
       kernel_name: impl::reference::quantized_layer_norm_out
+- func: cadence::quantized_layer_norm.per_tensor_out(Tensor input, float in_scale, int in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_layer_norm_per_tensor_out
 
 - func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:

backends/cadence/aot/functions_hifi.yaml

Lines changed: 4 additions & 0 deletions

@@ -125,6 +125,10 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_layer_norm_out
+- func: cadence::quantized_layer_norm.per_tensor_out(Tensor input, float in_scale, int in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_layer_norm_per_tensor_out
 
 - func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:

backends/cadence/aot/ops_registrations.py

Lines changed: 21 additions & 0 deletions

@@ -36,6 +36,12 @@
 lib.define(
     "quantized_layer_norm.out(Tensor X, Tensor X_scale, Tensor X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
 )
+lib.define(
+    "quantized_layer_norm.per_tensor(Tensor X, float X_scale, int X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point) -> (Tensor Y)"
+)
+lib.define(
+    "quantized_layer_norm.per_tensor_out(Tensor X, float X_scale, int X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
+)
 
 lib.define(
     "quantized_linear(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset) -> (Tensor Z)"
@@ -180,6 +186,21 @@ def quantized_layer_norm_meta(
     return input.new_empty(input.size(), dtype=input.dtype)
 
 
+@register_fake("cadence::quantized_layer_norm.per_tensor")
+def quantized_layer_norm_per_tensor_meta(
+    input: torch.Tensor,
+    X_scale: float,
+    X_zero_point: int,
+    normalized_shape: int,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    output_scale: float,
+    output_zero_point: int,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
+
+
 @register_fake("cadence::quantized_relu")
 def quantized_relu_meta(
     X: torch.Tensor,
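
Not part of the diff, but for context: a minimal sketch of how the newly defined `per_tensor` schema could be smoke-tested. It assumes the cadence op registrations above have been imported (so `lib.define` and `register_fake` have already run) and uses `FakeTensorMode`, which only needs the registered fake implementation rather than a real kernel.

```python
# Hypothetical smoke test (not part of this commit): assumes the cadence op
# registrations above have been imported, making the per_tensor schema and its
# fake implementation available.
import torch
from torch._subclasses.fake_tensor import FakeTensorMode

with FakeTensorMode():
    x = torch.empty(2, 8, dtype=torch.int8)
    weight = torch.empty(8)
    bias = torch.empty(8)
    # Arguments follow the schema defined above:
    # (X, X_scale, X_zero_point, normalized_shape, weight, bias, eps,
    #  output_scale, output_zero_point) -> Tensor Y
    y = torch.ops.cadence.quantized_layer_norm.per_tensor(
        x, 0.05, 0, [8], weight, bias, 1e-5, 0.05, 0
    )
    # The registered fake returns an empty tensor with the input's size and
    # dtype, which is what export-time shape propagation relies on.
    assert y.shape == x.shape and y.dtype == x.dtype
```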

backends/cadence/hifi/operators/quantized_layer_norm.cpp

Lines changed: 41 additions & 3 deletions

@@ -27,7 +27,7 @@ namespace native {
 // Compute quantized layer_norm. The current implementation assumes that the
 // input is per-tensor quantized.
 template <typename T>
-void quantized_layer_norm_(
+void quantized_layer_norm_per_tensor_(
     const Tensor& input,
     float input_scale,
     int64_t input_zero_point,
@@ -107,7 +107,7 @@ void quantized_layer_norm_(
   int64_t input_zero_point = in_zero_point.const_data_ptr<int64_t>()[0];
 
   // Call other overload
-  quantized_layer_norm_<T>(
+  quantized_layer_norm_per_tensor_<T>(
       input,
       input_scale,
       input_zero_point,
@@ -120,7 +120,7 @@
 }
 
 void quantized_layer_norm_out(
-    KernelRuntimeContext& ctx,
+    __ET_UNUSED KernelRuntimeContext& ctx,
     const Tensor& input,
     const Tensor& in_scale,
     const Tensor& in_zero_point,
@@ -157,6 +157,44 @@ void quantized_layer_norm_out(
 #undef typed_quantized_layer_norm
 }
 
+void quantized_layer_norm_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    double in_scale,
+    int64_t in_zero_point,
+    __ET_UNUSED const IntArrayRef normalized_shape,
+    const Tensor& weight,
+    const Tensor& bias,
+    double eps,
+    double output_scale,
+    int64_t output_zero_point,
+    Tensor& out) {
+#define typed_quantized_layer_norm(ctype, dtype) \
+  case ScalarType::dtype: {                      \
+    quantized_layer_norm_per_tensor_<ctype>(     \
+        input,                                   \
+        in_scale,                                \
+        in_zero_point,                           \
+        weight,                                  \
+        bias,                                    \
+        eps,                                     \
+        output_scale,                            \
+        output_zero_point,                       \
+        out);                                    \
+    break;                                       \
+  }
+
+  ScalarType dtype = input.scalar_type();
+  switch (dtype) {
+    ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_layer_norm)
+    default:
+      ET_DCHECK_MSG(
+          false, "Unhandled dtype %s", torch::executor::toString(dtype));
+  }
+
+#undef typed_quantized_layer_norm
+}
+
 }; // namespace native
 }; // namespace HiFi
 }; // namespace impl

backends/cadence/reference/operators/quantized_layer_norm.cpp

Lines changed: 51 additions & 7 deletions

@@ -11,9 +11,11 @@
 
 #include <cmath>
 
-using executorch::aten::Tensor;
-using executorch::runtime::getLeadingDims;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::getLeadingDims;
+using ::executorch::runtime::KernelRuntimeContext;
 
 namespace impl {
 namespace reference {
@@ -22,7 +24,7 @@ namespace native {
 // Compute quantized layer_norm. The current implementation assumes that the
 // input is per-tensor quantized.
 template <typename T>
-void quantized_layer_norm_(
+void quantized_layer_norm_per_tensor_(
     const Tensor& input,
     double input_scale,
     int64_t input_zero_point,
@@ -98,7 +100,7 @@ void quantized_layer_norm_(
   int64_t input_zero_point = in_zero_point.const_data_ptr<int64_t>()[0];
 
   // Call other overload
-  quantized_layer_norm_<T>(
+  quantized_layer_norm_per_tensor_<T>(
       input,
       input_scale,
       input_zero_point,
@@ -111,11 +113,11 @@
 }
 
 void quantized_layer_norm_out(
-    KernelRuntimeContext& ctx,
+    __ET_UNUSED KernelRuntimeContext& ctx,
     const Tensor& input,
     const Tensor& in_scale,
     const Tensor& in_zero_point,
-    const executorch::aten::IntArrayRef normalized_shape,
+    __ET_UNUSED const executorch::aten::IntArrayRef normalized_shape,
     const Tensor& weight,
     const Tensor& bias,
     double eps,
@@ -152,6 +154,48 @@ void quantized_layer_norm_out(
   }
 }
 
+void quantized_layer_norm_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    double in_scale,
+    int64_t in_zero_point,
+    __ET_UNUSED const executorch::aten::IntArrayRef normalized_shape,
+    const Tensor& weight,
+    const Tensor& bias,
+    double eps,
+    double output_scale,
+    int64_t output_zero_point,
+    Tensor& out) {
+  if (input.scalar_type() == executorch::aten::ScalarType::Byte) {
+    quantized_layer_norm_per_tensor_<uint8_t>(
+        input,
+        in_scale,
+        in_zero_point,
+        weight,
+        bias,
+        eps,
+        output_scale,
+        output_zero_point,
+        out);
+  } else if (input.scalar_type() == executorch::aten::ScalarType::Char) {
+    quantized_layer_norm_per_tensor_<int8_t>(
+        input,
+        in_scale,
+        in_zero_point,
+        weight,
+        bias,
+        eps,
+        output_scale,
+        output_zero_point,
+        out);
+  } else {
+    ET_CHECK_MSG(
+        false,
+        "Unhandled input dtype %hhd",
+        static_cast<int8_t>(input.scalar_type()));
+  }
+}
+
 }; // namespace native
 }; // namespace reference
 }; // namespace impl

examples/models/llama3_2_vision/preprocess/export_preprocess.py

Lines changed: 9 additions & 16 deletions

@@ -24,29 +24,22 @@ def main():
         strict=False,
     )
 
-    # Executorch
+    # AOTInductor. Note: export AOTI before ExecuTorch, as
+    # ExecuTorch will modify the ExportedProgram.
+    torch._inductor.aot_compile(
+        ep.module(),
+        model.get_example_inputs(),
+        options={"aot_inductor.output_path": "preprocess_aoti.so"},
+    )
+
+    # Executorch.
     edge_program = to_edge(
         ep, compile_config=EdgeCompileConfig(_check_ir_validity=False)
     )
     et_program = edge_program.to_executorch()
     with open("preprocess_et.pte", "wb") as file:
         et_program.write_to_file(file)
 
-    # Export.
-    # ep = torch.export.export(
-    #     model.get_eager_model(),
-    #     model.get_example_inputs(),
-    #     dynamic_shapes=model.get_dynamic_shapes(),
-    #     strict=False,
-    # )
-    #
-    # # AOTInductor
-    # torch._inductor.aot_compile(
-    #     ep.module(),
-    #     model.get_example_inputs(),
-    #     options={"aot_inductor.output_path": "preprocess_aoti.so"},
-    # )
-
 
 if __name__ == "__main__":
     main()

examples/models/llama3_2_vision/preprocess/test_preprocess.py

Lines changed: 28 additions & 10 deletions

@@ -26,6 +26,7 @@
 )
 
 from PIL import Image
+from torch._inductor.package import package_aoti
 
 from torchtune.models.clip.inference._transform import CLIPImageTransform
 
@@ -55,31 +56,46 @@ def initialize_models(resize_to_max_canvas: bool) -> Dict[str, Any]:
         possible_resolutions=None,
     )
 
+    # Eager model.
     model = CLIPImageTransformModel(config)
 
+    # Exported model.
     exported_model = torch.export.export(
         model.get_eager_model(),
         model.get_example_inputs(),
         dynamic_shapes=model.get_dynamic_shapes(),
         strict=False,
     )
 
-    # aoti_path = torch._inductor.aot_compile(
-    #     exported_model.module(),
-    #     model.get_example_inputs(),
-    # )
+    # AOTInductor model.
+    so = torch._export.aot_compile(
+        exported_model.module(),
+        args=model.get_example_inputs(),
+        options={"aot_inductor.package": True},
+        dynamic_shapes=model.get_dynamic_shapes(),
+    )
+    aoti_path = "preprocess.pt2"
+    package_aoti(aoti_path, so)
 
     edge_program = to_edge(
         exported_model, compile_config=EdgeCompileConfig(_check_ir_validity=False)
    )
     executorch_model = edge_program.to_executorch()
 
+    # Re-export as ExecuTorch edits the ExportedProgram.
+    exported_model = torch.export.export(
+        model.get_eager_model(),
+        model.get_example_inputs(),
+        dynamic_shapes=model.get_dynamic_shapes(),
+        strict=False,
+    )
+
     return {
         "config": config,
         "reference_model": reference_model,
         "model": model,
         "exported_model": exported_model,
-        # "aoti_path": aoti_path,
+        "aoti_path": aoti_path,
         "executorch_model": executorch_model,
     }
 
@@ -265,11 +281,13 @@ def run_preprocess(
         ), f"Executorch model: expected {reference_ar} but got {et_ar.tolist()}"
 
         # Run aoti model and check it matches reference model.
-        # aoti_path = models["aoti_path"]
-        # aoti_model = torch._export.aot_load(aoti_path, "cpu")
-        # aoti_image, aoti_ar = aoti_model(image_tensor, inscribed_size, best_resolution)
-        # self.assertTrue(torch.allclose(reference_image, aoti_image))
-        # self.assertEqual(reference_ar, aoti_ar.tolist())
+        aoti_path = models["aoti_path"]
+        aoti_model = torch._inductor.aoti_load_package(aoti_path)
+        aoti_image, aoti_ar = aoti_model(image_tensor, inscribed_size, best_resolution)
+        assert_expected(aoti_image, reference_image, rtol=0, atol=1e-4)
+        assert (
+            reference_ar == aoti_ar.tolist()
+        ), f"AOTI model: expected {reference_ar} but got {aoti_ar.tolist()}"
 
         # This test setup mirrors the one in torchtune:
         # https://github.com/pytorch/torchtune/blob/main/tests/torchtune/models/clip/test_clip_image_transform.py

install_requirements.py

Lines changed: 1 addition & 1 deletion

@@ -112,7 +112,7 @@ def python_is_compatible():
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-NIGHTLY_VERSION = "dev20241030"
+NIGHTLY_VERSION = "dev20241101"
 
 # The pip repository that hosts nightly torch packages.
 TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu"
