Merge branch 'main' into fix-macos-runners

shoumikhin · web-flow · commit 44991cf7e9cb · 2025-08-29T13:39:04.000-07:00
diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS
@@ -604,3 +604,17 @@ python_unittest(
         "//later:lib",
     ],
 )
+
+python_unittest(
+    name = "test_ref_implementations",
+    srcs = [
+        "tests/test_ref_implementations.py",
+    ],
+    supports_static_listing = False,  # NOTE(review): presumably because the cases are generated by @expand - confirm
+    typing = True,
+    deps = [
+        ":typing_stubs",  # the test imports `expand` from typing_stubs
+        "//executorch/backends/cadence/aot:ref_implementations",
+        "//caffe2:torch",
+    ]
+)
diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py
@@ -16,6 +16,7 @@
     get_im2row_output_size,
 )
 from executorch.exir.scalar_type import ScalarType
+from torch._meta_registrations import _linalg_svd_meta
 from torch.library import Library, register_fake
 
 lib = Library("cadence", "DEF")
@@ -250,6 +251,12 @@
     "int in_zero_point, bool channel_last=False) -> (Tensor out)"
 )
 lib.define("linalg_vector_norm(Tensor X) -> (Tensor Y)")
+lib.define(  # functional overload: returns the tuple (U, S, Vh)
+    "linalg_svd(Tensor A, bool full_matrices=False, bool compute_uv=True, str? driver=None) -> (Tensor U, Tensor S, Tensor Vh)"
+)
+lib.define(  # out overload: U, S, Vh are keyword-only mutable (`!`) arguments that are also returned
+    "linalg_svd.out(Tensor A, bool full_matrices=False, bool compute_uv=True, str? driver=None, *, Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh) -> (Tensor(a!) U, Tensor(b!) S, Tensor(c!) Vh)"
+)
 lib.define(
     "transposed_im2row(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, "
     "int[2] output_padding, Tensor in_zero_point, bool channel_last=False) -> (Tensor out)"
@@ -1576,6 +1583,26 @@ def linalg_vector_norm_meta(
     return X.new_empty([], dtype=X.dtype)
 
 
+@register_fake("cadence::linalg_svd")
+def linalg_svd_meta(
+    A: torch.Tensor,
+    full_matrices: bool = False,
+    compute_uv: bool = True,
+    driver: Optional[str] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+    # Fake kernel for cadence::linalg_svd: shapes come from _linalg_svd_meta, outputs are made contiguous
+
+    # Run the reference meta function; only the shapes of its U, S, Vh results are used below
+    U, S, Vh = _linalg_svd_meta(A, full_matrices, compute_uv, driver)
+
+    # Re-allocate each output from A (same dtype as A) so the returned tensors have contiguous strides
+    U_contiguous = A.new_empty(U.shape, dtype=A.dtype).contiguous()
+    S_contiguous = A.new_empty(S.shape, dtype=A.dtype).contiguous()
+    Vh_contiguous = A.new_empty(Vh.shape, dtype=A.dtype).contiguous()
+
+    return U_contiguous, S_contiguous, Vh_contiguous
+
+
 @register_fake("cadence::requantize")
 def requantize_meta(
     input: torch.Tensor,
diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py
@@ -20,6 +20,42 @@
 }
 
 
+@impl(m, "quantize_per_tensor")
+def quantize_per_tensor(
+    input: torch.Tensor,
+    scale: float,
+    zero_point: int,
+    quant_min: int,
+    quant_max: int,
+    dtype: torch.dtype,
+) -> torch.Tensor:
+    """
+    Quantizes a floating-point tensor to an integral tensor, computed as
+    round(input / scale + zero_point) and then cast to `dtype`.
+
+    Args:
+        - input (Tensor): input tensor
+        - scale (float): Quantization scale. Derived from the ratio between the min/max
+            of the floating-point tensor and the min/max of the quantized range.
+        - zero_point (int): The point which represents 0 in the quantized range. For
+            example, with the floating-point range [-1., 2.] and the quantized integer
+            range [-7, 7], 0 is 1/3 of the way from -1. to 2., so the zero point is 1/3
+            of the way from -7 to 7, which rounds to -2 in the integer space.
+        - quant_min (int): The smallest value in the quantized domain. Unused: no clamping.
+        - quant_max (int): The largest value in the quantized domain. Unused: no clamping.
+        - dtype (torch.dtype): The type of the output tensor (int8, int16 or int32).
+
+    Raises:
+        ValueError: if `dtype` is not one of the supported quantized types.
+    """
+    supported_quant_types = [torch.int8, torch.int16, torch.int32]
+    if dtype not in supported_quant_types:
+        raise ValueError(
+            f"Unsupported dtype to quantize to. Supported dtypes must be one of {supported_quant_types}"
+        )
+    return torch.round(input / scale + zero_point).to(dtype)
+
+
 @impl(m, "requantize")
 def requantize(
     input: torch.Tensor,
diff --git a/backends/cadence/aot/tests/test_ref_implementations.py b/backends/cadence/aot/tests/test_ref_implementations.py
@@ -0,0 +1,50 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+import unittest
+
+import torch
+
+from executorch.backends.cadence.aot.ref_implementations import quantize_per_tensor
+from executorch.backends.cadence.aot.typing_stubs import expand
+
+
+class TestRefImplementations(unittest.TestCase):  # checks quantize_per_tensor from ref_implementations
+    @expand(
+        [  # case layout: (name, input_value, f_min, f_max, q_min, q_max, target_dtype, expected_value)
+            ("basic_int8", 0.42, -1.0, 2.0, -7, 7, torch.int8, 0),
+            ("basic_int16", 0.42, -1.0, 5.0, -6, 7, torch.int16, -3),
+        ]
+    )
+    def test_quantize_per_tensor(
+        self,
+        name: str,
+        input_value: float,
+        f_min: float,
+        f_max: float,
+        q_min: int,
+        q_max: int,
+        target_dtype: torch.dtype,
+        expected_value: int,
+    ) -> None:
+        input_tensor = torch.tensor([input_value])
+        scale = (f_max - f_min) / (q_max - q_min)  # width of the float range per quantized step
+        zero_point = round(-f_min / scale) + q_min  # quantized value that float 0.0 maps to
+        expected_output = torch.tensor([expected_value], dtype=target_dtype)
+
+        output = quantize_per_tensor(
+            input_tensor, scale, zero_point, q_min, q_max, target_dtype
+        )
+
+        self.assertEqual(
+            output.dtype, expected_output.dtype, f"Dtype mismatch in {name}"
+        )
+        self.assertTrue(
+            torch.equal(output, expected_output),
+            f"Values don't match in {name}: got {output}, expected {expected_output}",
+        )
diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp
diff --git a/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/optimized/op_registration_util.bzl