2024-11-03 nightly release (97a4600)

pytorchbot · pytorchbot · commit 480c4b5f8f27 · 2024-11-03T11:35:13.000Z
diff --git a/.ci/docker/ci_commit_pins/pytorch.txt b/.ci/docker/ci_commit_pins/pytorch.txt
@@ -1 +1 @@
-e47e8794499a4a0130ff4efb8713ff93f4b40c36
+c8a648d4dffb9f0133ff4a2ea0e660b42105d3ad
diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml
@@ -154,6 +154,10 @@
   kernels:
     - arg_meta: null
       kernel_name: impl::reference::quantized_layer_norm_out
+- func: cadence::quantized_layer_norm.per_tensor_out(Tensor input, float in_scale, int in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_layer_norm_per_tensor_out
 
 - func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
@@ -125,6 +125,10 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_layer_norm_out
+- func: cadence::quantized_layer_norm.per_tensor_out(Tensor input, float in_scale, int in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_layer_norm_per_tensor_out
 
 - func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py
@@ -36,6 +36,12 @@
 lib.define(
     "quantized_layer_norm.out(Tensor X, Tensor X_scale, Tensor X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
 )
+lib.define(
+    "quantized_layer_norm.per_tensor(Tensor X, float X_scale, int X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point) -> (Tensor Y)"
+)
+lib.define(
+    "quantized_layer_norm.per_tensor_out(Tensor X, float X_scale, int X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
+)
 
 lib.define(
     "quantized_linear(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset) -> (Tensor Z)"
@@ -180,6 +186,21 @@ def quantized_layer_norm_meta(
     return input.new_empty(input.size(), dtype=input.dtype)
 
 
+@register_fake("cadence::quantized_layer_norm.per_tensor")
+def quantized_layer_norm_per_tensor_meta(
+    input: torch.Tensor,
+    X_scale: float,
+    X_zero_point: int,
+    normalized_shape: list[int],
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    output_scale: float,
+    output_zero_point: int,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
+
+
 @register_fake("cadence::quantized_relu")
 def quantized_relu_meta(
     X: torch.Tensor,
diff --git a/backends/cadence/hifi/operators/quantized_layer_norm.cpp b/backends/cadence/hifi/operators/quantized_layer_norm.cpp
@@ -27,7 +27,7 @@ namespace native {
 // Compute quantized layer_norm. The current implementation assumes that the
 // input is per-tensor quantized.
 template <typename T>
-void quantized_layer_norm_(
+void quantized_layer_norm_per_tensor_(
     const Tensor& input,
     float input_scale,
     int64_t input_zero_point,
@@ -107,7 +107,7 @@ void quantized_layer_norm_(
   int64_t input_zero_point = in_zero_point.const_data_ptr<int64_t>()[0];
 
   // Call other overload
-  quantized_layer_norm_<T>(
+  quantized_layer_norm_per_tensor_<T>(
       input,
       input_scale,
       input_zero_point,
@@ -120,7 +120,7 @@ void quantized_layer_norm_(
 }
 
 void quantized_layer_norm_out(
-    KernelRuntimeContext& ctx,
+    __ET_UNUSED KernelRuntimeContext& ctx,
     const Tensor& input,
     const Tensor& in_scale,
     const Tensor& in_zero_point,
@@ -157,6 +157,44 @@ void quantized_layer_norm_out(
 #undef typed_quantized_layer_norm
 }
 
+void quantized_layer_norm_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    double in_scale,
+    int64_t in_zero_point,
+    __ET_UNUSED const IntArrayRef normalized_shape,
+    const Tensor& weight,
+    const Tensor& bias,
+    double eps,
+    double output_scale,
+    int64_t output_zero_point,
+    Tensor& out) {
+#define typed_quantized_layer_norm(ctype, dtype) \
+  case ScalarType::dtype: {                      \
+    quantized_layer_norm_per_tensor_<ctype>(     \
+        input,                                   \
+        in_scale,                                \
+        in_zero_point,                           \
+        weight,                                  \
+        bias,                                    \
+        eps,                                     \
+        output_scale,                            \
+        output_zero_point,                       \
+        out);                                    \
+    break;                                       \
+  }
+
+  ScalarType dtype = input.scalar_type();
+  switch (dtype) {
+    ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_layer_norm)
+    default:
+      ET_DCHECK_MSG(
+          false, "Unhandled dtype %s", torch::executor::toString(dtype));
+  }
+
+#undef typed_quantized_layer_norm
+}
+
 }; // namespace native
 }; // namespace HiFi
 }; // namespace impl
diff --git a/backends/cadence/reference/operators/quantized_layer_norm.cpp b/backends/cadence/reference/operators/quantized_layer_norm.cpp
@@ -11,9 +11,11 @@
 
 #include <cmath>
 
-using executorch::aten::Tensor;
-using executorch::runtime::getLeadingDims;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::getLeadingDims;
+using ::executorch::runtime::KernelRuntimeContext;
 
 namespace impl {
 namespace reference {
@@ -22,7 +24,7 @@ namespace native {
 // Compute quantized layer_norm. The current implementation assumes that the
 // input is per-tensor quantized.
 template <typename T>
-void quantized_layer_norm_(
+void quantized_layer_norm_per_tensor_(
     const Tensor& input,
     double input_scale,
     int64_t input_zero_point,
@@ -98,7 +100,7 @@ void quantized_layer_norm_(
   int64_t input_zero_point = in_zero_point.const_data_ptr<int64_t>()[0];
 
   // Call other overload
-  quantized_layer_norm_<T>(
+  quantized_layer_norm_per_tensor_<T>(
       input,
       input_scale,
       input_zero_point,
@@ -111,11 +113,11 @@ void quantized_layer_norm_(
 }
 
 void quantized_layer_norm_out(
-    KernelRuntimeContext& ctx,
+    __ET_UNUSED KernelRuntimeContext& ctx,
     const Tensor& input,
     const Tensor& in_scale,
     const Tensor& in_zero_point,
-    const executorch::aten::IntArrayRef normalized_shape,
+    __ET_UNUSED const executorch::aten::IntArrayRef normalized_shape,
     const Tensor& weight,
     const Tensor& bias,
     double eps,
@@ -152,6 +154,48 @@ void quantized_layer_norm_out(
   }
 }
 
+void quantized_layer_norm_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    double in_scale,
+    int64_t in_zero_point,
+    __ET_UNUSED const executorch::aten::IntArrayRef normalized_shape,
+    const Tensor& weight,
+    const Tensor& bias,
+    double eps,
+    double output_scale,
+    int64_t output_zero_point,
+    Tensor& out) {
+  if (input.scalar_type() == executorch::aten::ScalarType::Byte) {
+    quantized_layer_norm_per_tensor_<uint8_t>(
+        input,
+        in_scale,
+        in_zero_point,
+        weight,
+        bias,
+        eps,
+        output_scale,
+        output_zero_point,
+        out);
+  } else if (input.scalar_type() == executorch::aten::ScalarType::Char) {
+    quantized_layer_norm_per_tensor_<int8_t>(
+        input,
+        in_scale,
+        in_zero_point,
+        weight,
+        bias,
+        eps,
+        output_scale,
+        output_zero_point,
+        out);
+  } else {
+    ET_CHECK_MSG(
+        false,
+        "Unhandled input dtype %hhd",
+        static_cast<int8_t>(input.scalar_type()));
+  }
+}
+
 }; // namespace native
 }; // namespace reference
 }; // namespace impl
diff --git a/examples/models/llama3_2_vision/preprocess/export_preprocess.py b/examples/models/llama3_2_vision/preprocess/export_preprocess.py
@@ -24,29 +24,22 @@ def main():
         strict=False,
     )
 
-    # Executorch
+    # AOTInductor. Note: export AOTI before ExecuTorch, as
+    # ExecuTorch will modify the ExportedProgram.
+    torch._inductor.aot_compile(
+        ep.module(),
+        model.get_example_inputs(),
+        options={"aot_inductor.output_path": "preprocess_aoti.so"},
+    )
+
+    # ExecuTorch.
     edge_program = to_edge(
         ep, compile_config=EdgeCompileConfig(_check_ir_validity=False)
     )
     et_program = edge_program.to_executorch()
     with open("preprocess_et.pte", "wb") as file:
         et_program.write_to_file(file)
 
-    # Export.
-    # ep = torch.export.export(
-    #     model.get_eager_model(),
-    #     model.get_example_inputs(),
-    #     dynamic_shapes=model.get_dynamic_shapes(),
-    #     strict=False,
-    # )
-    #
-    # # AOTInductor
-    # torch._inductor.aot_compile(
-    #     ep.module(),
-    #     model.get_example_inputs(),
-    #     options={"aot_inductor.output_path": "preprocess_aoti.so"},
-    # )
-
 
 if __name__ == "__main__":
     main()
diff --git a/examples/models/llama3_2_vision/preprocess/test_preprocess.py b/examples/models/llama3_2_vision/preprocess/test_preprocess.py
@@ -26,6 +26,7 @@
 )
 
 from PIL import Image
+from torch._inductor.package import package_aoti
 
 from torchtune.models.clip.inference._transform import CLIPImageTransform
 
@@ -55,31 +56,46 @@ def initialize_models(resize_to_max_canvas: bool) -> Dict[str, Any]:
         possible_resolutions=None,
     )
 
+    # Eager model.
     model = CLIPImageTransformModel(config)
 
+    # Exported model.
     exported_model = torch.export.export(
         model.get_eager_model(),
         model.get_example_inputs(),
         dynamic_shapes=model.get_dynamic_shapes(),
         strict=False,
     )
 
-    # aoti_path = torch._inductor.aot_compile(
-    #     exported_model.module(),
-    #     model.get_example_inputs(),
-    # )
+    # AOTInductor model.
+    so = torch._export.aot_compile(
+        exported_model.module(),
+        args=model.get_example_inputs(),
+        options={"aot_inductor.package": True},
+        dynamic_shapes=model.get_dynamic_shapes(),
+    )
+    aoti_path = "preprocess.pt2"
+    package_aoti(aoti_path, so)
 
     edge_program = to_edge(
         exported_model, compile_config=EdgeCompileConfig(_check_ir_validity=False)
     )
     executorch_model = edge_program.to_executorch()
 
+    # Re-export as ExecuTorch edits the ExportedProgram.
+    exported_model = torch.export.export(
+        model.get_eager_model(),
+        model.get_example_inputs(),
+        dynamic_shapes=model.get_dynamic_shapes(),
+        strict=False,
+    )
+
     return {
         "config": config,
         "reference_model": reference_model,
         "model": model,
         "exported_model": exported_model,
-        # "aoti_path": aoti_path,
+        "aoti_path": aoti_path,
         "executorch_model": executorch_model,
     }
 
@@ -265,11 +281,13 @@ def run_preprocess(
         ), f"Executorch model: expected {reference_ar} but got {et_ar.tolist()}"
 
         # Run aoti model and check it matches reference model.
-        # aoti_path = models["aoti_path"]
-        # aoti_model = torch._export.aot_load(aoti_path, "cpu")
-        # aoti_image, aoti_ar = aoti_model(image_tensor, inscribed_size, best_resolution)
-        # self.assertTrue(torch.allclose(reference_image, aoti_image))
-        # self.assertEqual(reference_ar, aoti_ar.tolist())
+        aoti_path = models["aoti_path"]
+        aoti_model = torch._inductor.aoti_load_package(aoti_path)
+        aoti_image, aoti_ar = aoti_model(image_tensor, inscribed_size, best_resolution)
+        assert_expected(aoti_image, reference_image, rtol=0, atol=1e-4)
+        assert (
+            reference_ar == aoti_ar.tolist()
+        ), f"AOTI model: expected {reference_ar} but got {aoti_ar.tolist()}"
 
     # This test setup mirrors the one in torchtune:
     # https://github.com/pytorch/torchtune/blob/main/tests/torchtune/models/clip/test_clip_image_transform.py
diff --git a/install_requirements.py b/install_requirements.py
@@ -112,7 +112,7 @@ def python_is_compatible():
 # NOTE: If a newly-fetched version of the executorch repo changes the value of
 # NIGHTLY_VERSION, you should re-run this script to install the necessary
 # package versions.
-NIGHTLY_VERSION = "dev20241030"
+NIGHTLY_VERSION = "dev20241101"
 
 # The pip repository that hosts nightly torch packages.
 TORCH_NIGHTLY_URL = "https://download.pytorch.org/whl/nightly/cpu"

Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-e47e8794499a4a0130ff4efb8713ff93f4b40c36`
	`1`	`+c8a648d4dffb9f0133ff4a2ea0e660b42105d3ad`