
Commit cceac32

Update on "[Executorch] Renable operator optimization flags"
Previous attempt at this resulted in a revert due to an app size increase; much of that increase came from op_div exploding. Two diffs underneath solve this issue.

Differential Revision: [D65606666](https://our.internmc.facebook.com/intern/diff/D65606666/)

[ghstack-poisoned]
2 parents: 22a2235 + bd6a641

File tree

30 files changed (+606 −884 lines)


.github/workflows/ghstack_land.yml

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ on:
   branches:
     - 'gh/cccclai/[0-9]+/base'
     - 'gh/dbort/[0-9]+/base'
+    - 'gh/dvorjackz/[0-9]+/base'
     - 'gh/guangy10/[0-9]+/base'
     - 'gh/helunwencser/[0-9]+/base'
     - 'gh/jorgep31415/[0-9]+/base'

backends/arm/test/runner_utils.py

Lines changed: 7 additions & 2 deletions
@@ -448,16 +448,21 @@ def run_tosa_ref_model(
             ), "There are no quantization parameters, check output parameters"
             tosa_ref_output = (tosa_ref_output - quant_param.zp) * quant_param.scale

+        if tosa_ref_output.dtype == np.double:
+            tosa_ref_output = tosa_ref_output.astype("float32")
+
         # tosa_output is a numpy array, convert to torch tensor for comparison
-        tosa_ref_outputs.append(torch.from_numpy(tosa_ref_output.astype("float32")))
+        tosa_ref_outputs.append(torch.from_numpy(tosa_ref_output))

     return tosa_ref_outputs


 def prep_data_for_save(
     data, is_quantized: bool, input_name: str, quant_param: QuantizationParams
 ):
-    data_np = np.array(data.detach(), order="C").astype(np.float32)
+    data_np = np.array(data.detach(), order="C").astype(
+        f"{data.dtype}".replace("torch.", "")
+    )

     if is_quantized:
         assert quant_param.node_name in input_name, (
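Why the prep_data_for_save change works: a torch.dtype stringifies as "torch.<name>", and NumPy accepts the bare "<name>" as a dtype string, so the new code round-trips the input's dtype instead of forcing float32. A minimal standalone sketch (illustrative, not part of this diff; dtypes with no NumPy equivalent, such as torch.bfloat16, would still fail):

    import numpy as np
    import torch

    t = torch.zeros(4, dtype=torch.int8)
    # str(torch.int8) == "torch.int8"; stripping the prefix gives a NumPy dtype name.
    dtype_name = f"{t.dtype}".replace("torch.", "")  # "int8"
    data_np = np.array(t.detach(), order="C").astype(dtype_name)
    assert data_np.dtype == np.int8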

backends/cadence/aot/ops_registrations.py

Lines changed: 54 additions & 0 deletions
@@ -66,6 +66,12 @@
 lib.define(
     "quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
 )
+lib.define(
+    "quantized_conv.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False) -> (Tensor Z)"
+)
+lib.define(
+    "quantized_conv.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
+)

 lib.define(
     "quantized_matmul(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False) -> (Tensor Z)"
@@ -171,6 +177,54 @@ def quantized_conv_meta(
     return input.new_empty(output_size, dtype=input.dtype)


+@register_fake("cadence::quantized_conv.per_tensor")
+def quantized_conv_per_tensor_meta(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    stride: Tuple[int],
+    padding: Tuple[int],
+    dilation: Tuple[int],
+    groups: int,
+    in_zero_point: int,
+    weight_zero_point: int,
+    bias_scale: float,
+    output_scale: float,
+    output_zero_point: int,
+    out_multiplier: int,
+    out_shift: int,
+    channel_last: bool = False,
+) -> torch.Tensor:
+    if channel_last:
+        out_channels, *kernel_size, _ = weight.shape
+    else:
+        out_channels, _, *kernel_size = weight.shape
+
+    in_size = input.shape
+    # Assert that the input tensor has at least 3 dimensions, and at most 6
+    assert len(in_size) > 2
+    assert len(in_size) < 6
+
+    # Compute the output tensor size
+    output_size = (
+        get_conv1d_output_size(
+            in_size,
+            out_channels,
+            stride[1],
+            padding[1],
+            dilation[1],
+            kernel_size[0],
+            channel_last,
+        )
+        if len(in_size) == 3
+        else get_conv2d_output_size(
+            in_size, out_channels, stride, padding, dilation, kernel_size, channel_last
+        )
+    )
+
+    return input.new_empty(output_size, dtype=input.dtype)
+
+
 @register_fake("cadence::quantized_layer_norm")
 def quantized_layer_norm_meta(
     input: torch.Tensor,
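The per_tensor variant takes plain int/float quantization parameters where quantized_conv takes Tensors, and the @register_fake registration gives the exporter a meta kernel that computes only output shape and dtype, never touching data. A minimal sketch of the register_fake pattern on a hypothetical op (the demo::scale names below are illustrative only, not part of the cadence library):

    import torch
    from torch.library import Library, register_fake

    demo_lib = Library("demo", "DEF")
    demo_lib.define("scale(Tensor x, float s) -> Tensor")

    @register_fake("demo::scale")
    def scale_meta(x: torch.Tensor, s: float) -> torch.Tensor:
        # A fake/meta kernel only propagates shape and dtype; it is what lets
        # export trace through a custom op without running a real implementation.
        return x.new_empty(x.shape, dtype=x.dtype)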

backends/vulkan/runtime/gen_vulkan_spv.py

Lines changed: 25 additions & 1 deletion
@@ -540,6 +540,7 @@ def __init__(
         env: Dict[Any, Any],
         glslc_path: Optional[str],
         glslc_flags: str = "",
+        replace_u16vecn: bool = False,
     ) -> None:
         if isinstance(src_dir_paths, str):
             self.src_dir_paths = [src_dir_paths]
@@ -549,6 +550,7 @@ def __init__(
         self.env = env
         self.glslc_path = glslc_path
         self.glslc_flags = glslc_flags
+        self.replace_u16vecn = replace_u16vecn

         self.glsl_src_files: Dict[str, str] = {}
         self.template_yaml_files: List[str] = []
@@ -705,6 +707,22 @@ def constructOutputMap(self) -> None:
             self.create_shader_params(),
         )

+    def maybe_replace_u16vecn(self, input_text: str) -> str:
+        """
+        There is a latency benefit to using u16vecn variables to store texture position
+        variables instead of ivecn, likely due to reduced register pressure. However,
+        SwiftShader does not support 16 bit integer types in shaders, so this is a crude
+        way to fallback to using ivecn to store texture positions so that testing with
+        SwiftShader is still possible.
+        """
+        if not self.replace_u16vecn:
+            return input_text
+        if "codegen-nosub" in input_text:
+            return input_text
+
+        input_text = input_text.replace("u16vec", "ivec")
+        return input_text
+
     def generateSPV(self, output_dir: str) -> Dict[str, str]:
         output_file_map = {}

@@ -716,6 +734,7 @@ def process_shader(shader_paths_pair):

             with codecs.open(source_glsl, "r", encoding="utf-8") as input_file:
                 input_text = input_file.read()
+                input_text = self.maybe_replace_u16vecn(input_text)
                 output_text = preprocess(input_text, shader_params)

             glsl_out_path = os.path.join(output_dir, f"{shader_name}.glsl")
@@ -1029,6 +1048,7 @@ def main(argv: List[str]) -> int:
     parser.add_argument("-c", "--glslc-path", required=True, help="")
     parser.add_argument("-t", "--tmp-dir-path", required=True, help="/tmp")
     parser.add_argument("-o", "--output-path", required=True, help="")
+    parser.add_argument("--replace-u16vecn", action="store_true", default=False)
     parser.add_argument("--optimize_size", action="store_true", help="")
     parser.add_argument("--optimize", action="store_true", help="")
     parser.add_argument(
@@ -1056,7 +1076,11 @@ def main(argv: List[str]) -> int:
         glslc_flags += "-O"

     shader_generator = SPVGenerator(
-        options.glsl_paths, env, options.glslc_path, glslc_flags
+        options.glsl_paths,
+        env,
+        options.glslc_path,
+        glslc_flags=glslc_flags,
+        replace_u16vecn=options.replace_u16vecn,
     )
     output_spv_files = shader_generator.generateSPV(options.tmp_dir_path)

backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */

+// codegen-nosub
+
 #version 450 core

 #define PRECISION ${PRECISION}

backends/vulkan/targets.bzl

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False):
         select({
             "DEFAULT": "",
             "ovr_config//os:android": "--optimize",
+            "ovr_config//os:linux": "--replace-u16vecn",
         })
     )

devtools/inspector/_inspector_utils.py

Lines changed: 1 addition & 0 deletions
@@ -112,6 +112,7 @@ def get_scalar_type_size(scalar_type: ScalarType) -> Tuple[torch.dtype, int]:
         ScalarType.BYTE: (torch.uint8, 1),
         ScalarType.CHAR: (torch.int8, 1),
         ScalarType.BOOL: (torch.bool, 1),
+        ScalarType.BITS16: (torch.uint16, 2),
         ScalarType.SHORT: (torch.int16, 2),
         ScalarType.HALF: (torch.float16, 2),
         ScalarType.INT: (torch.int, 4),
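With this entry, BITS16 data decodes as torch.uint16 at 2 bytes per element instead of failing the lookup. A usage sketch (assuming the surrounding get_scalar_type_size helper, whose signature appears in the hunk header above):

    dtype, elem_size = get_scalar_type_size(ScalarType.BITS16)
    # dtype == torch.uint16, elem_size == 2, so a [4, 8] BITS16 tensor
    # occupies 4 * 8 * 2 = 64 bytes of raw debug-output data.
    num_bytes = 4 * 8 * elem_size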

exir/passes/executorch_prim_ops_registry.py

Lines changed: 9 additions & 0 deletions
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

+import math
 import operator
 from typing import Dict, Set, Union

@@ -14,6 +15,8 @@
 from torch._ops import OpOverload
 from torch.library import Library

+# pyre-unsafe
+

 executorch_prims_lib = Library("executorch_prim", "DEF")

@@ -91,7 +94,13 @@ def neg(a: _SymScalar) -> _SymScalar:
     return -a  # pyre-ignore


+@bind_pattern_to_op(executorch_prims_lib, "trunc.Scalar(Scalar a) -> Scalar")
+def trunc(a: _SymScalar) -> _SymScalar:
+    return math.trunc(a)  # pyre-ignore
+
+
 _PYTHON_SYM_OPS_TO_EXECUTORCH_SYM_OPS: Dict[OpOverload, OpOverload] = {
+    math.trunc: ops.backend.executorch_prim.trunc.Scalar,
     operator.sub: ops.backend.executorch_prim.sub.Scalar,
     operator.mul: ops.backend.executorch_prim.mul.Scalar,
     operator.add: ops.backend.executorch_prim.add.Scalar,
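With this registration, math.trunc calls appearing in symbolic scalar computations can be lowered like the operator.* entries: the mapping table rewrites the call target to ops.backend.executorch_prim.trunc.Scalar while keeping Python's truncate-toward-zero semantics. A small sanity sketch of those semantics (illustrative only):

    import math

    assert math.trunc(2.7) == 2    # truncation, not floor ...
    assert math.trunc(-2.7) == -2  # ... so negatives round toward zero
    assert math.floor(-2.7) == -3  # contrast with floor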

extension/llm/custom_ops/targets.bzl

Lines changed: 7 additions & 2 deletions
@@ -1,10 +1,14 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load(
+    "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
+    "get_vec_preprocessor_flags",
+    "get_vec_deps",
+)
 load(
     "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl",
     "get_compiler_optimization_flags",
 )

-
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.

@@ -26,6 +30,7 @@ def define_common_targets():
             "op_sdpa.h",
             "op_update_quantized_cache.h",
         ],
+        preprocessor_flags = get_vec_preprocessor_flags(),
         exported_deps = [
             "//executorch/runtime/kernel:kernel_includes",
             "//executorch/kernels/portable/cpu:scalar_utils",
@@ -38,7 +43,7 @@ def define_common_targets():
         deps = [
             "//executorch/kernels/portable/cpu/util:reduce_util",
             "//executorch/extension/llm/custom_ops/spinquant:fast_hadamard_transform",
-        ],
+        ] + get_vec_deps(),
         compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"] + get_compiler_optimization_flags(),
         visibility = [
             "//executorch/...",

kernels/optimized/cpu/op_add.cpp

Lines changed: 3 additions & 53 deletions
@@ -9,6 +9,7 @@
 #include <executorch/kernels/optimized/cpu/binary_ops.h>
 #include <executorch/kernels/optimized/vec/functional.h>
 #include <executorch/kernels/optimized/vec/vec.h>
+#include <executorch/kernels/portable/cpu/op_add_impl.h>
 #include <executorch/kernels/portable/cpu/scalar_utils.h>
 #include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
@@ -176,35 +177,7 @@ Tensor& opt_add_out(
           lhs->sizes()[lhs->dim() - 1]);
     });
   } else {
-    ScalarType common_type =
-        promoteTypes(a_type, b_type, /*half_to_float*/ true);
-    ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out);
-
-    ET_KERNEL_CHECK(
-        ctx,
-        resize_to_broadcast_target_size(a, b, out) == Error::Ok,
-        InvalidArgument,
-        out);
-
-    ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "add.out", CTYPE_A, [&]() {
-      ET_SWITCH_REALHBBF16_TYPES(b_type, ctx, "add.out", CTYPE_B, [&]() {
-        using CTYPE_IN = typename torch::executor::
-            promote_types<CTYPE_A, CTYPE_B, /*half_to_float*/ true>::type;
-        ET_DCHECK(CppTypeToScalarType<CTYPE_IN>::value == common_type);
-        ET_SWITCH_REALHBBF16_TYPES(out_type, ctx, "add.out", CTYPE_OUT, [&]() {
-          CTYPE_IN alpha_val;
-          ET_KERNEL_CHECK(
-              ctx, utils::extract_scalar(alpha, &alpha_val), InvalidArgument, );
-
-          AddInner<
-              can_cast<CTYPE_IN, CTYPE_OUT>::value,
-              CTYPE_A,
-              CTYPE_B,
-              CTYPE_IN,
-              CTYPE_OUT>::run(a, b, alpha_val, out);
-        });
-      });
-    });
+    add_out_impl(ctx, a, b, alpha, out);
   }

   return out;
@@ -255,30 +228,7 @@ Tensor& opt_add_scalar_out(
       });
     });
   } else {
-    ET_SWITCH_REALHBBF16_TYPES(a_type, ctx, "add.Scalar_out", CTYPE_A, [&]() {
-      ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, "add.Scalar_out", CTYPE_B, [&]() {
-        ET_SWITCH_REALB_TYPES(
-            common_type, ctx, "add.Scalar_out", CTYPE_IN, [&]() {
-              ET_SWITCH_REALHBBF16_TYPES(
-                  out_type, ctx, "add.Scalar_out", CTYPE_OUT, [&]() {
-                    CTYPE_B b_val;
-                    ET_EXTRACT_SCALAR(b, b_val);
-                    CTYPE_IN b_casted = static_cast<CTYPE_IN>(b_val);
-                    CTYPE_IN alpha_val;
-                    ET_EXTRACT_SCALAR(alpha, alpha_val);
-
-                    const size_t n = a.numel();
-                    const CTYPE_A* a_data = a.const_data_ptr<CTYPE_A>();
-                    CTYPE_OUT* out_data = out.mutable_data_ptr<CTYPE_OUT>();
-                    for (auto i = 0; i < n; ++i) {
-                      out_data[i] = static_cast<CTYPE_OUT>(
-                          static_cast<CTYPE_IN>(a_data[i]) +
-                          alpha_val * b_casted);
-                    }
-                  });
-            });
-      });
-    });
+    add_scalar_out_impl(ctx, a, b, alpha, out);
   }

   return out;
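Both else-branches, the slow paths taken when the fast vectorized path does not apply, now delegate to the shared portable implementations (add_out_impl / add_scalar_out_impl from op_add_impl.h) instead of expanding their own nested ET_SWITCH_* type-dispatch macros. Each such macro nest instantiates a template body per combination of input and output dtypes, so deduplicating it against the portable kernel is presumably part of how the size regression described in the commit message was addressed.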
