Commit 3e0cd20

Andrew Grebenisan authored and facebook-github-bot committed
Backend-agnostic implementation of quantized_layer_norm_per_tensor
Summary: Continuing support for backend-agnostic Cadence custom ops.

Differential Revision: D81459333
1 parent ee3c9a0 commit 3e0cd20

File tree

backends/cadence/aot/ref_implementations.py
backends/cadence/aot/tests/test_ref_implementations.py

2 files changed: +174 −0 lines changed

backends/cadence/aot/ref_implementations.py

Lines changed: 79 additions & 0 deletions
@@ -239,6 +239,85 @@ def quantized_linear(
     return out.reshape(*leading_dims, N)
 
 
+@impl(m, "quantized_layer_norm_per_tensor")
+def quantized_layer_norm_per_tensor(
+    input: torch.Tensor,
+    X_scale: float,
+    X_zero_point: int,
+    normalized_shape: int,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    eps: float,
+    output_scale: float,
+    output_zero_point: int,
+) -> torch.Tensor:
+    """
+    Quantized layer norm operation.
+
+    Args:
+    - input (Tensor): The activations tensor
+    - X_scale (float): The scale of the input
+    - X_zero_point (int): The zero point of the input
+    - normalized_shape (int): The shape of the input (unused)
+    - weight (Tensor): The weight tensor
+    - bias (Tensor): The bias tensor
+    - eps (float): The epsilon value
+    - output_scale (float): The scale of the output
+    - output_zero_point (int): The zero point of the output
+    """
+    supported_dtypes = [torch.int8, torch.uint8]
+    if input.dtype not in supported_dtypes:
+        raise ValueError(
+            f"Input dtype must be one of {supported_dtypes}. Got {input.dtype}"
+        )
+
+    # Get dimensions
+    last_dim = input.size(-1)
+    leading_dims = input.numel() // last_dim
+
+    # Reshape input to process as 1D vectors
+    input_flat = input.view(leading_dims, last_dim)
+    output = torch.empty_like(input)
+    output_flat = output.view(leading_dims, last_dim)
+
+    output_inv_scale = 1.0 / output_scale
+
+    # Process each 1D vector
+    for i in range(leading_dims):
+        x = input_flat[i]
+
+        # Compute sum and squared sum in quantized space,
+        # following the C++ implementation logic
+        sum_val = torch.sum(x.to(torch.int32))
+        sq_sum = last_dim * X_zero_point * X_zero_point + torch.sum(
+            x.to(torch.int32) * x.to(torch.int32)
+        )
+        sq_sum -= 2 * sum_val * X_zero_point
+        sum_val -= last_dim * X_zero_point
+
+        # Convert to floating-point mean and variance
+        mean = (X_scale * sum_val) / last_dim
+        variance = (sq_sum * X_scale * X_scale) / last_dim - mean * mean
+        inv_std = 1.0 / torch.sqrt(torch.tensor(variance + eps))  # type: ignore[arg-type]
+
+        # Apply layer norm: (x - mean) / std * weight + bias
+        for j in range(last_dim):
+            # Dequantize input value
+            val = dequantize_per_tensor(
+                x[j], X_scale, X_zero_point, -128, 127, torch.float32
+            )
+
+            # Apply layer norm formula
+            val = (val - mean) * inv_std * weight[j] + bias[j]
+
+            # Quantize result
+            output_flat[i, j] = quantize_per_tensor(
+                val, output_inv_scale, output_zero_point, -128, 127, input.dtype
+            )
+
+    return output
+
+
 @impl(m, "requantize")
 def requantize(
     input: torch.Tensor,
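For intuition, the new reference op is the quantized counterpart of dequantize → torch.nn.functional.layer_norm → quantize. The sketch below is not part of the commit; it re-derives the expected_output of test case 1 using only public torch APIs (the function name and the round-then-clamp requantization rule are illustrative assumptions, not the module's quantize_per_tensor/dequantize_per_tensor internals):

    import torch

    def float_layer_norm_reference(
        x_q: torch.Tensor, x_scale: float, x_zp: int,
        weight: torch.Tensor, bias: torch.Tensor, eps: float,
        out_scale: float, out_zp: int, dtype: torch.dtype,
    ) -> torch.Tensor:
        # Dequantize: real_value = scale * (quantized - zero_point)
        x = (x_q.to(torch.float32) - x_zp) * x_scale
        # Plain floating-point layer norm over the last dimension
        y = torch.nn.functional.layer_norm(x, x.shape[-1:], weight, bias, eps)
        # Requantize (assumed rule): round(real / scale) + zero_point, clamped to dtype range
        info = torch.iinfo(dtype)
        y_q = torch.round(y / out_scale) + out_zp
        return y_q.clamp(info.min, info.max).to(dtype)

    # Test case 1 from the new unit test: int8 input [-1, 1], scale 0.1, zero point 0
    x_q = torch.tensor([[-1, 1]], dtype=torch.int8)
    out = float_layer_norm_reference(
        x_q, 0.1, 0, torch.ones(2), torch.zeros(2), 1e-5, 0.1, 0, torch.int8
    )
    print(out)  # tensor([[-10,  10]], dtype=torch.int8), matching expected_output

The committed version differs in that it computes the row statistics without dequantizing each element first, mirroring the fixed-point arithmetic of the C++ kernel its comments reference.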

backends/cadence/aot/tests/test_ref_implementations.py

Lines changed: 95 additions & 0 deletions
@@ -15,6 +15,7 @@
     dequantize_per_tensor,
     quantize_per_tensor,
     quantized_add,
+    quantized_layer_norm_per_tensor,
     quantized_linear,
 )
 from executorch.backends.cadence.aot.typing_stubs import expand
@@ -232,3 +233,97 @@ def test_quantized_linear(
             torch.equal(output, expected_output),
             f"Values don't match: got {output}, expected {expected_output}",
         )
+
+    @expand(
+        [
+            # Test case 1: Simple case with int8, zero mean input
+            (
+                torch.tensor(
+                    [[-1, 1]], dtype=torch.int8
+                ),  # input: dequantized to [-0.1, 0.1]
+                0.1,  # X_scale
+                0,  # X_zero_point
+                2,  # normalized_shape (last dimension)
+                torch.tensor([1.0, 1.0]),  # weight
+                torch.tensor([0.0, 0.0]),  # bias
+                1e-5,  # eps
+                0.1,  # output_scale
+                0,  # output_zero_point
+                torch.int8,  # dtype
+                torch.tensor([[-10, 10]], dtype=torch.int8),  # expected_output
+            ),
+            # Test case 2: uint8 with zero_point offset
+            (
+                torch.tensor(
+                    [[127, 129]], dtype=torch.uint8
+                ),  # input: dequantized to [-0.05, 0.05]
+                0.05,  # X_scale
+                128,  # X_zero_point
+                2,  # normalized_shape (last dimension)
+                torch.tensor([1.0, 1.0]),  # weight
+                torch.tensor([0.0, 0.0]),  # bias
+                1e-5,  # eps
+                0.05,  # output_scale
+                128,  # output_zero_point
+                torch.uint8,  # dtype
+                torch.tensor([[108, 148]], dtype=torch.uint8),  # expected_output
+            ),
+            # Test case 3: Test with weight and bias scaling
+            (
+                torch.tensor(
+                    [[-2, 2]], dtype=torch.int8
+                ),  # input: dequantized to [-0.2, 0.2]
+                0.1,  # X_scale
+                0,  # X_zero_point
+                2,  # normalized_shape (last dimension)
+                torch.tensor(
+                    [2.0, 0.5]
+                ),  # weight: scale first element by 2, second by 0.5
+                torch.tensor(
+                    [0.1, -0.1]
+                ),  # bias: add 0.1 to first, subtract 0.1 from second
+                1e-5,  # eps
+                0.1,  # output_scale
+                0,  # output_zero_point
+                torch.int8,  # dtype
+                torch.tensor([[-19, 4]], dtype=torch.int8),  # expected_output
+            ),
+        ]
+    )
+    def test_quantized_layer_norm_per_tensor(
+        self,
+        input_tensor: torch.Tensor,
+        X_scale: float,
+        X_zero_point: int,
+        normalized_shape: int,
+        weight: torch.Tensor,
+        bias: torch.Tensor,
+        eps: float,
+        output_scale: float,
+        output_zero_point: int,
+        dtype: torch.dtype,
+        expected_output: torch.Tensor,
+    ) -> None:
+        output = quantized_layer_norm_per_tensor(
+            input_tensor,
+            X_scale,
+            X_zero_point,
+            normalized_shape,
+            weight,
+            bias,
+            eps,
+            output_scale,
+            output_zero_point,
+        )
+
+        # Verify output properties
+        self.assertEqual(output.dtype, dtype, f"Output dtype should be {dtype}")
+        self.assertEqual(
+            output.shape, input_tensor.shape, "Output shape should match input shape"
+        )
+
+        # Verify output matches expected values
+        self.assertTrue(
+            torch.equal(output, expected_output),
+            f"Output values don't match expected. Got {output}, expected {expected_output}",
+        )
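Test case 2 exercises the nonzero-zero-point path. The statistics loop in the new op stays in integer space via the identities sum(x - zp) = sum(x) - n*zp and sum((x - zp)^2) = sum(x^2) - 2*zp*sum(x) + n*zp^2. A quick standalone check of that accumulation against direct float math, using test case 2's row (variable names here are illustrative, not from the commit):

    import torch

    x = torch.tensor([127, 129], dtype=torch.int32)  # quantized uint8 row from test case 2
    zp, scale, n = 128, 0.05, x.numel()

    # Integer-space accumulation, mirroring the new reference op
    sum_val = int(x.sum()) - n * zp                                     # sum(x - zp)
    sq_sum = n * zp * zp + int((x * x).sum()) - 2 * int(x.sum()) * zp   # sum((x - zp)^2)
    mean = scale * sum_val / n
    var = sq_sum * scale * scale / n - mean * mean

    # Direct float-space statistics on the dequantized values
    x_f = (x.to(torch.float32) - zp) * scale
    assert abs(mean - x_f.mean().item()) < 1e-6
    assert abs(var - x_f.var(unbiased=False).item()) < 1e-6  # biased variance, as layer norm uses

The new test can then be run on its own, e.g. with python -m pytest backends/cadence/aot/tests/test_ref_implementations.py -k quantized_layer_norm (assuming a standard pytest setup for the repo).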
