
Commit 3324113

committed
Update on "[ET-VK] Refine partitioner to account for storage type and memory layout"
## Context

There are a variety of ways that tensors can be represented in Vulkan. The two main descriptors for how a tensor is laid out in memory are:

1. Storage type (buffer or texture)
2. Memory layout (which dim is packed along a texel, which dim has a stride of 1, etc.)

Due to the differences between buffers and textures, and between different memory layouts, an operator implementation may support only a specific set of (storage type, memory layout) combinations. Furthermore, if an operator implementation supports multiple (storage type, memory layout) combinations, there may be a "preferred" setting which results in optimal performance.

These changes lay the foundation for a memory metadata tagging graph transform, which will make sure that every tensor participating in an operator call has a valid/optimal (storage type, memory layout) setting, and insert transition operators to transfer input tensors to the correct memory settings when necessary.

An additional required change arises from the fact that Vulkan imposes limits on texture and buffer sizes. The partitioner therefore needs to account for the storage types and memory layouts supported by the operator implementation, and check that all tensors participating in a computation can be represented with some (storage type, memory layout) combination supported by the implementation.

## Changes

Improvements to the operator registry:

* Introduce utility functions to check the optimal and enabled storage types and memory layouts for an operator.

Improvements to the partitioner:

* Account for the storage types and memory layouts supported by an operator when deciding if a node should be partitioned.
* Improve the logic for fusable ops (i.e. the permute/transpose before a mm, which can be fused into linear) to check if the final target op is supported in Vulkan, and only partition those nodes if so. Otherwise, leave them unpartitioned so they can be fused by another backend.

Differential Revision: [D65428843](https://our.internmc.facebook.com/intern/diff/D65428843/)

[ghstack-poisoned]
2 parents ba4bb54 + 827d50f commit 3324113
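The size check described in the commit message can be sketched in plain Python. Everything below is a simplified stand-in: `StorageType`, `MemoryLayout`, the limit constants, and `can_partition` are illustrative, not the actual ExecuTorch partitioner types, and real limits come from the Vulkan device properties at runtime:

```python
from enum import Enum, auto

class StorageType(Enum):
    BUFFER = auto()
    TEXTURE_3D = auto()

class MemoryLayout(Enum):
    WIDTH_PACKED = auto()
    CHANNELS_PACKED = auto()

# Hypothetical device limits; actual values are queried from the Vulkan driver.
MAX_TEXTURE_EXTENT = 2048  # per-dimension texel limit for 3D textures
MAX_BUFFER_NUMEL = 2**27   # element limit for a storage buffer

def fits(shape, storage):
    """Check whether a tensor of this shape is representable in the storage type."""
    numel = 1
    for d in shape:
        numel *= d
    if storage == StorageType.BUFFER:
        return numel <= MAX_BUFFER_NUMEL
    # Texture: every extent must fit (texel-packing details omitted in this sketch).
    return all(d <= MAX_TEXTURE_EXTENT for d in shape)

def can_partition(tensor_shapes, supported_combos):
    """Partition the node only if some supported (storage, layout) combination
    can represent every tensor participating in the computation."""
    return any(
        all(fits(shape, storage) for shape in tensor_shapes)
        for storage, _layout in supported_combos
    )
```

In this sketch a node with a tensor exceeding the texture extent limit is still partitionable if the op also has a buffer implementation, which mirrors the intent of the partitioner change.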

File tree

13 files changed: +217 −61 lines changed

Lines changed: 1 addition & 1 deletion
```diff
@@ -1 +1 @@
-e47e8794499a4a0130ff4efb8713ff93f4b40c36
+c8a648d4dffb9f0133ff4a2ea0e660b42105d3ad
```

backends/arm/quantizer/TARGETS

Lines changed: 0 additions & 3 deletions
```diff
@@ -3,7 +3,6 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
 python_library(
     name = "arm_quantizer",
     srcs = ["arm_quantizer.py"],
-    typing = True,
     deps = [
         ":arm_quantizer_utils",
         "//caffe2:torch",
@@ -15,7 +14,6 @@ python_library(
 python_library(
     name = "quantization_config",
     srcs = ["quantization_config.py"],
-    typing = True,
     deps = [
         "//caffe2:torch",
     ],
@@ -24,7 +22,6 @@ python_library(
 python_library(
     name = "arm_quantizer_utils",
     srcs = ["arm_quantizer_utils.py"],
-    typing = True,
     deps = [
         ":quantization_config",
     ],
```

backends/cadence/aot/functions.yaml

Lines changed: 4 additions & 0 deletions
```diff
@@ -154,6 +154,10 @@
   kernels:
     - arg_meta: null
       kernel_name: impl::reference::quantized_layer_norm_out
+- func: cadence::quantized_layer_norm.per_tensor_out(Tensor input, float in_scale, int in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: impl::reference::quantized_layer_norm_per_tensor_out
 
 - func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
```

backends/cadence/aot/functions_hifi.yaml

Lines changed: 4 additions & 0 deletions
```diff
@@ -125,6 +125,10 @@
   kernels:
     - arg_meta: null
       kernel_name: cadence::impl::HiFi::quantized_layer_norm_out
+- func: cadence::quantized_layer_norm.per_tensor_out(Tensor input, float in_scale, int in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!)
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::quantized_layer_norm_per_tensor_out
 
 - func: cadence::quantized_linear.out(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
```

backends/cadence/aot/ops_registrations.py

Lines changed: 21 additions & 0 deletions
```diff
@@ -36,6 +36,12 @@
 lib.define(
     "quantized_layer_norm.out(Tensor X, Tensor X_scale, Tensor X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
 )
+lib.define(
+    "quantized_layer_norm.per_tensor(Tensor X, float X_scale, int X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point) -> (Tensor Y)"
+)
+lib.define(
+    "quantized_layer_norm.per_tensor_out(Tensor X, float X_scale, int X_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
+)
 
 lib.define(
     "quantized_linear(Tensor src, Tensor weight, Tensor bias, int src_zero_point, Tensor weight_zero_point, Tensor out_multiplier, Tensor out_shift, int out_zero_point, Tensor? offset) -> (Tensor Z)"
@@ -180,6 +186,21 @@ def quantized_layer_norm_meta(
     return input.new_empty(input.size(), dtype=input.dtype)
 
 
+@register_fake("cadence::quantized_layer_norm.per_tensor")
+def quantized_layer_norm_per_tensor_meta(
+    input: torch.Tensor,
+    X_scale: float,
+    X_zero_point: int,
+    normalized_shape: int,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    output_scale: float,
+    output_zero_point: int,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
+
+
 @register_fake("cadence::quantized_relu")
 def quantized_relu_meta(
     X: torch.Tensor,
```

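The fake (meta) kernel registered above only propagates shape and dtype; the distinguishing feature of the `.per_tensor` variant is that the quantization parameters are Python scalars rather than tensors. A torch-free sketch of that contract, where `FakeTensor` is a hypothetical stand-in for a meta tensor:

```python
from dataclasses import dataclass
from typing import List, Sequence

@dataclass
class FakeTensor:
    # Minimal stand-in for a meta tensor: shape and dtype only, no data.
    shape: List[int]
    dtype: str

    def new_empty(self, shape: Sequence[int]) -> "FakeTensor":
        return FakeTensor(list(shape), self.dtype)

def quantized_layer_norm_per_tensor_meta(
    input: FakeTensor,
    X_scale: float,          # scalar, not a Tensor: the .per_tensor variant
    X_zero_point: int,       # hoists per-tensor quant params out of tensors
    normalized_shape: Sequence[int],
    weight: FakeTensor,
    bias: FakeTensor,
    eps: float,
    output_scale: float,
    output_zero_point: int,
) -> FakeTensor:
    # Layer norm is shape-preserving: the output mirrors the input's shape/dtype.
    return input.new_empty(input.shape)
```

This mirrors the `input.new_empty(input.size(), dtype=input.dtype)` body in the diff: the export-time tracer needs only the output metadata, never real values.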
backends/cadence/hifi/operators/quantized_layer_norm.cpp

Lines changed: 41 additions & 3 deletions
```diff
@@ -27,7 +27,7 @@ namespace native {
 // Compute quantized layer_norm. The current implementation assumes that the
 // input is per-tensor quantized.
 template <typename T>
-void quantized_layer_norm_(
+void quantized_layer_norm_per_tensor_(
     const Tensor& input,
     float input_scale,
     int64_t input_zero_point,
@@ -107,7 +107,7 @@ void quantized_layer_norm_(
   int64_t input_zero_point = in_zero_point.const_data_ptr<int64_t>()[0];
 
   // Call other overload
-  quantized_layer_norm_<T>(
+  quantized_layer_norm_per_tensor_<T>(
       input,
       input_scale,
       input_zero_point,
@@ -120,7 +120,7 @@ void quantized_layer_norm_(
 }
 
 void quantized_layer_norm_out(
-    KernelRuntimeContext& ctx,
+    __ET_UNUSED KernelRuntimeContext& ctx,
     const Tensor& input,
     const Tensor& in_scale,
     const Tensor& in_zero_point,
@@ -157,6 +157,44 @@ void quantized_layer_norm_out(
 #undef typed_quantized_layer_norm
 }
 
+void quantized_layer_norm_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    double in_scale,
+    int64_t in_zero_point,
+    __ET_UNUSED const IntArrayRef normalized_shape,
+    const Tensor& weight,
+    const Tensor& bias,
+    double eps,
+    double output_scale,
+    int64_t output_zero_point,
+    Tensor& out) {
+#define typed_quantized_layer_norm(ctype, dtype) \
+  case ScalarType::dtype: {                      \
+    quantized_layer_norm_per_tensor_<ctype>(     \
+        input,                                   \
+        in_scale,                                \
+        in_zero_point,                           \
+        weight,                                  \
+        bias,                                    \
+        eps,                                     \
+        output_scale,                            \
+        output_zero_point,                       \
+        out);                                    \
+    break;                                       \
+  }
+
+  ScalarType dtype = input.scalar_type();
+  switch (dtype) {
+    ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_layer_norm)
+    default:
+      ET_DCHECK_MSG(
+          false, "Unhandled dtype %s", torch::executor::toString(dtype));
+  }
+
+#undef typed_quantized_layer_norm
+}
+
 }; // namespace native
 }; // namespace HiFi
 }; // namespace impl
```

backends/cadence/reference/operators/quantized_layer_norm.cpp

Lines changed: 51 additions & 7 deletions
```diff
@@ -11,9 +11,11 @@
 
 #include <cmath>
 
-using executorch::aten::Tensor;
-using executorch::runtime::getLeadingDims;
-using executorch::runtime::KernelRuntimeContext;
+using ::executorch::aten::IntArrayRef;
+using ::executorch::aten::ScalarType;
+using ::executorch::aten::Tensor;
+using ::executorch::runtime::getLeadingDims;
+using ::executorch::runtime::KernelRuntimeContext;
 
 namespace impl {
 namespace reference {
@@ -22,7 +24,7 @@ namespace native {
 // Compute quantized layer_norm. The current implementation assumes that the
 // input is per-tensor quantized.
 template <typename T>
-void quantized_layer_norm_(
+void quantized_layer_norm_per_tensor_(
     const Tensor& input,
     double input_scale,
     int64_t input_zero_point,
@@ -98,7 +100,7 @@ void quantized_layer_norm_(
   int64_t input_zero_point = in_zero_point.const_data_ptr<int64_t>()[0];
 
   // Call other overload
-  quantized_layer_norm_<T>(
+  quantized_layer_norm_per_tensor_<T>(
       input,
       input_scale,
       input_zero_point,
@@ -111,11 +113,11 @@ void quantized_layer_norm_(
 }
 
 void quantized_layer_norm_out(
-    KernelRuntimeContext& ctx,
+    __ET_UNUSED KernelRuntimeContext& ctx,
     const Tensor& input,
     const Tensor& in_scale,
     const Tensor& in_zero_point,
-    const executorch::aten::IntArrayRef normalized_shape,
+    __ET_UNUSED const executorch::aten::IntArrayRef normalized_shape,
     const Tensor& weight,
     const Tensor& bias,
     double eps,
@@ -152,6 +154,48 @@ void quantized_layer_norm_out(
 }
 }
 
+void quantized_layer_norm_per_tensor_out(
+    __ET_UNUSED KernelRuntimeContext& ctx,
+    const Tensor& input,
+    double in_scale,
+    int64_t in_zero_point,
+    __ET_UNUSED const executorch::aten::IntArrayRef normalized_shape,
+    const Tensor& weight,
+    const Tensor& bias,
+    double eps,
+    double output_scale,
+    int64_t output_zero_point,
+    Tensor& out) {
+  if (input.scalar_type() == executorch::aten::ScalarType::Byte) {
+    quantized_layer_norm_per_tensor_<uint8_t>(
+        input,
+        in_scale,
+        in_zero_point,
+        weight,
+        bias,
+        eps,
+        output_scale,
+        output_zero_point,
+        out);
+  } else if (input.scalar_type() == executorch::aten::ScalarType::Char) {
+    quantized_layer_norm_per_tensor_<int8_t>(
+        input,
+        in_scale,
+        in_zero_point,
+        weight,
+        bias,
+        eps,
+        output_scale,
+        output_zero_point,
+        out);
+  } else {
+    ET_CHECK_MSG(
+        false,
+        "Unhandled input dtype %hhd",
+        static_cast<int8_t>(input.scalar_type()));
+  }
+}
+
 }; // namespace native
 }; // namespace reference
 }; // namespace impl
```

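The body of the new reference kernel is the usual dequantize, normalize, requantize pipeline. As a rough illustration only (the actual C++ kernel templates over int8/uint8 and iterates over the leading dimensions), a single-row Python sketch:

```python
import math

def quantized_layer_norm_per_tensor_ref(
    x, in_scale, in_zero_point, weight, bias, eps,
    output_scale, output_zero_point, qmin=-128, qmax=127,
):
    """Per-tensor quantized layer norm over one row of quantized values."""
    # 1. Dequantize the input with the per-tensor scale/zero point.
    deq = [(v - in_zero_point) * in_scale for v in x]
    # 2. Standard layer norm over the row.
    mean = sum(deq) / len(deq)
    var = sum((v - mean) ** 2 for v in deq) / len(deq)
    inv_std = 1.0 / math.sqrt(var + eps)
    normed = [(v - mean) * inv_std * w + b for v, w, b in zip(deq, weight, bias)]
    # 3. Requantize the result into the output's quantization domain, clamped
    #    to the representable range.
    return [
        max(qmin, min(qmax, round(v / output_scale) + output_zero_point))
        for v in normed
    ]
```

The per-tensor variant passes `in_scale`/`in_zero_point` directly as scalars, which is exactly what lets the `.per_tensor_out` overload skip reading them out of one-element tensors as the original overload does.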
backends/vulkan/op_registry.py

Lines changed: 30 additions & 8 deletions
```diff
@@ -55,6 +55,10 @@ def __init__(
         self.valid_packed_dims = valid_packed_dims
 
     def valid_memory_layouts(self) -> Set[VkMemoryLayout]:
+        """
+        Derive the set of memory layouts supported by the texture implementation
+        based on the valid packed dimensions.
+        """
         layouts = set()
 
         if PackedDim.WIDTH in self.valid_packed_dims:
@@ -112,6 +116,15 @@ def __init__(
         self.check_node_fn = check_node_fn
 
     def propose_storage_type(self) -> Optional[VkStorageType]:
+        """
+        Propose a storage type that should be used for this operator. A proposal can
+        be made if one of the following is true:
+        1. The operator specifies an optimal storage type
+        2. Only one storage type is supported.
+
+        If both storage types are supported and no optimal storage type is specified,
+        then None is returned to indicate that there is no preference in storage type.
+        """
         if self.optimal_storage is not None:
             return self.optimal_storage
 
@@ -123,6 +136,9 @@ def propose_storage_type(self) -> Optional[VkStorageType]:
         return None
 
     def supported_storage_types(self) -> Set[VkStorageType]:
+        """
+        Return the set of storage types supported by this operator.
+        """
         storage_types = set()
         if self.texture_impl is not None:
             storage_types.add(VkStorageType.TEXTURE_3D)
@@ -132,6 +148,16 @@ def supported_storage_types(self) -> Set[VkStorageType]:
         return storage_types
 
     def propose_memory_layout(self, storage: VkStorageType) -> Optional[VkMemoryLayout]:
+        """
+        Given a storage type as a precondition, propose a memory layout that should be
+        used for this operator. A proposal can be made if one of the following is true:
+        1. The operator specifies an optimal memory layout
+        2. Only one memory layout is supported.
+
+        If multiple memory layouts are supported and no optimal memory layout is
+        specified then return None to indicate that the "best" memory layout for the
+        operator is ambiguous.
+        """
         if self.optimal_layout is not None:
             return self.optimal_layout
 
@@ -144,6 +170,10 @@ def propose_memory_layout(self, storage: VkStorageType) -> Optional[VkMemoryLayo
         return None
 
     def supported_memory_layouts(self, storage: VkStorageType) -> Set[VkMemoryLayout]:
+        """
+        Return the set of memory layouts supported by this operator for a given
+        storage type.
+        """
         if storage == VkStorageType.TEXTURE_3D:
             assert self.texture_impl is not None
             return self.texture_impl.valid_memory_layouts()
@@ -517,13 +547,5 @@ def get_op_features(target: OpKey) -> OpFeatures:
     return vulkan_supported_ops[target]
 
 
-def optimal_storage_type(target: OpKey) -> Optional[VkStorageType]:
-    return get_op_features(target).optimal_storage
-
-
-def optimal_memory_layout(target: OpKey) -> Optional[VkMemoryLayout]:
-    return get_op_features(target).optimal_layout
-
-
 def handles_own_prepacking(target: OpKey) -> bool:
     return get_op_features(target).handles_own_prepacking
```

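Both `propose_*` methods in this diff follow the same pattern: prefer the declared optimal setting, fall back to the unique supported option, and otherwise decline to choose. A standalone sketch of that shared shape, generic over the setting type (the name `propose` and the use of plain strings below are illustrative, not the actual `OpFeatures` API):

```python
from typing import Optional, Set, TypeVar

T = TypeVar("T")

def propose(optimal: Optional[T], supported: Set[T]) -> Optional[T]:
    """Shared shape of propose_storage_type / propose_memory_layout:
    return the declared optimal setting if any, else the only supported
    setting, else None to signal that the choice is ambiguous."""
    if optimal is not None:
        return optimal
    if len(supported) == 1:
        return next(iter(supported))
    return None
```

Returning `None` rather than picking arbitrarily is what lets the planned memory metadata tagging transform defer the decision, e.g. to the settings of neighboring tensors in the graph.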