Commit 08dba74

Add layernorm fusion
Signed-off-by: Ganesan Ramalingam <grama@microsoft.com>
1 parent 7407431 commit 08dba74

3 files changed (+268 -0 lines changed)

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
from __future__ import annotations

import onnx_ir as ir

from onnxscript.rewriter import _fusion_utils, _ir_utils, pattern

"""
Layer Normalization fusion optimization.

This module contains rewrite rules for fusing Layer Normalization patterns into the
ONNX LayerNormalization operator.

Layer Normalization performs normalization over the last D dimensions as specified by the axis.
The computation follows: Y = scale * (X - mean) / sqrt(variance + epsilon) + bias

Key points for the fusion optimization:
* Following restrictions from opset 17 LayerNormalization:
    * Input, scale, and bias must be of the same type T in {float16, bfloat16, float, double}
    * The normalization can be done in a different precision than the input type (bfloat16 or float),
      which is also the precision of the output mean/invstddev
"""

float_types = frozenset(
    [
        ir.DataType.FLOAT,
        ir.DataType.FLOAT16,
        ir.DataType.BFLOAT16,
        ir.DataType.DOUBLE,
    ]
)
fp_float_types = frozenset([ir.DataType.FLOAT, ir.DataType.DOUBLE])


class LayerNormFusion(pattern.RewriteRuleClassBase):
    def __init__(self, name: str, *, has_bias: bool = False):
        super().__init__(name)
        self._has_bias = has_bias

    def pattern(self, op, x, scale, bias, epsilon):
        # Compute mean: Mean = ReduceMean(X, axes=normalized_axes)
        # TODO: support axes attribute too
        mean = op.ReduceMean(x, [-1], keepdims=1)

        # Compute deviation: D = Sub(X, Mean)
        deviation = op.Sub(x, mean)

        # Compute squared deviation: DD = Mul(D, D)
        # TODO: support Pow(D, 2) as well
        deviation_squared = op.Mul(deviation, deviation)

        # Compute variance: Var = ReduceMean(DD, axes=normalized_axes)
        variance = op.ReduceMean(deviation_squared, [-1], keepdims=1)

        # Add epsilon: VarEps = Add(Var, epsilon)
        variance_plus_epsilon = op.Add(variance, epsilon)

        # Compute standard deviation: StdDev = Sqrt(VarEps)
        std_dev = op.Sqrt(variance_plus_epsilon)

        # Compute reciprocal: InvStdDev = Reciprocal(StdDev)
        # TODO: support Div(deviation, std_dev) as well?
        inv_std_dev = op.Reciprocal(std_dev)

        # Normalize: Normalized = Mul(D, InvStdDev)
        normalized = op.Mul(deviation, inv_std_dev)

        # Scale: NormalizedScaled = Mul(Normalized, Scale)
        normalized_scaled = op.Mul(normalized, scale)

        # Add bias (if present): Y = Add(NormalizedScaled, B)
        if self._has_bias:
            return op.Add(normalized_scaled, bias)
        return normalized_scaled

    def check(self, op, x, scale, epsilon, bias=None, **_) -> pattern.MatchResult:  # type: ignore[name-defined]
        """Check if the pattern matches conditions for use of LayerNormalization op."""
        check_result = pattern.MatchResult()

        # epsilon must be a scalar
        epsilon_value = _ir_utils.get_singleton_value(epsilon)
        if not isinstance(epsilon_value, float):  # TODO: support other types
            return check_result.fail("Epsilon is not a float value.", epsilon)

        if x.dtype not in fp_float_types:
            return check_result.fail("Input is not a float type.", x)

        # Stash type records the precision used for the mean/invstddev computation.
        self._stash_dtype = x.dtype

        return check_result

    def rewrite(self, op, x, scale, epsilon, bias=None, **_):
        epsilon_value = _ir_utils.get_singleton_value(epsilon)
        if bias is not None:
            return op.LayerNormalization(
                x,
                scale,
                bias,
                axis=-1,
                epsilon=epsilon_value,
                stash_type=self._stash_dtype,
            )
        return op.LayerNormalization(
            x,
            scale,
            axis=-1,
            epsilon=epsilon_value,
            stash_type=self._stash_dtype,
        )


# Create rules for both with and without bias
_layer_norm_with_bias_rule = LayerNormFusion.rule("LayerNormWithBias", has_bias=True)
_layer_norm_rule = LayerNormFusion.rule("LayerNorm", has_bias=False)

layer_normalization_rules = [_layer_norm_with_bias_rule, _layer_norm_rule]
layer_normalization_ruleset = pattern.RewriteRuleSet(layer_normalization_rules)

fuse_layer_normalization = _fusion_utils.apply_fusion_rules(layer_normalization_ruleset)
Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import unittest

import onnx_ir as ir

import onnxscript.rewriter.ort_fusions._test_utils as test_utils
import onnxscript.rewriter.testing
from onnxscript import FLOAT, OnnxFunction, script
from onnxscript import opset17 as op
from onnxscript.optimizer import remove_unused_nodes
from onnxscript.rewriter.layer_normalization import fuse_layer_normalization


@script()
def _test_layer_norm_without_bias(x: FLOAT[2, 4, 8], scale: FLOAT[8]) -> FLOAT[2, 4, 8]:
    """LayerNorm pattern without bias."""
    # Compute mean: Mean = ReduceMean(X, axes=normalized_axes)
    mean = op.ReduceMean(x, [-1], keepdims=1)

    # Compute deviation: D = Sub(X, Mean)
    deviation = op.Sub(x, mean)

    # Compute squared deviation: DD = Mul(D, D)
    deviation_squared = op.Mul(deviation, deviation)

    # Compute variance: Var = ReduceMean(DD, axes=normalized_axes)
    variance = op.ReduceMean(deviation_squared, [-1], keepdims=1)

    # Add epsilon: VarEps = Add(Var, epsilon)
    epsilon = op.Constant(value_float=1e-5)
    variance_plus_epsilon = op.Add(variance, epsilon)

    # Compute standard deviation: StdDev = Sqrt(VarEps)
    std_dev = op.Sqrt(variance_plus_epsilon)

    # Compute reciprocal: InvStdDev = Reciprocal(StdDev)
    inv_std_dev = op.Reciprocal(std_dev)

    # Normalize: Normalized = Mul(D, InvStdDev)
    normalized = op.Mul(deviation, inv_std_dev)

    # Scale: NormalizedScaled = Mul(Normalized, Scale)
    normalized_scaled = op.Mul(normalized, scale)

    return normalized_scaled


@script()
def _test_layer_norm_with_bias(
    x: FLOAT[2, 4, 8], scale: FLOAT[8], bias: FLOAT[8]
) -> FLOAT[2, 4, 8]:
    """LayerNorm pattern with bias."""
    # Compute mean: Mean = ReduceMean(X, axes=normalized_axes)
    mean = op.ReduceMean(x, [-1], keepdims=1)

    # Compute deviation: D = Sub(X, Mean)
    deviation = op.Sub(x, mean)

    # Compute squared deviation: DD = Mul(D, D)
    deviation_squared = op.Mul(deviation, deviation)

    # Compute variance: Var = ReduceMean(DD, axes=normalized_axes)
    variance = op.ReduceMean(deviation_squared, [-1], keepdims=1)

    # Add epsilon: VarEps = Add(Var, epsilon)
    epsilon = op.Constant(value_float=1e-5)
    variance_plus_epsilon = op.Add(variance, epsilon)

    # Compute standard deviation: StdDev = Sqrt(VarEps)
    std_dev = op.Sqrt(variance_plus_epsilon)

    # Compute reciprocal: InvStdDev = Reciprocal(StdDev)
    inv_std_dev = op.Reciprocal(std_dev)

    # Normalize: Normalized = Mul(D, InvStdDev)
    normalized = op.Mul(deviation, inv_std_dev)

    # Scale: NormalizedScaled = Mul(Normalized, Scale)
    normalized_scaled = op.Mul(normalized, scale)

    # Add bias: Y = Add(NormalizedScaled, B)
    result = op.Add(normalized_scaled, bias)

    return result


class LayerNormFusionTest(unittest.TestCase):
    def _check(
        self,
        test_data_constructor: OnnxFunction,
        expected_graph_len: int,
        expected_op_type: str,
    ):
        """Helper method to run a fusion test scenario."""
        model_proto = test_data_constructor.to_model_proto()

        # Create test inputs
        input_data = onnxscript.rewriter.testing.generate_random_inputs(model_proto)

        model = ir.serde.deserialize_model(model_proto)

        # Run original model
        original_output = test_utils.ort_run("Original", model, input_data)

        # Apply fusion
        fuse_layer_normalization(model)
        remove_unused_nodes(model)

        # Verify fusion occurred
        self.assertEqual(len(model.graph), expected_graph_len)
        self.assertEqual(model.graph.node(0).op_type, expected_op_type)

        # Run optimized model and verify outputs match
        optimized_output = test_utils.ort_run("Optimized", model, input_data)
        test_utils.assert_allclose(original_output, optimized_output, rtol=1e-4, atol=1e-4)

    def test_layer_norm_fusion_without_bias(self):
        """Test LayerNorm fusion without bias."""
        self._check(_test_layer_norm_without_bias, 1, "LayerNormalization")

    def test_layer_norm_fusion_with_bias(self):
        """Test LayerNorm fusion with bias."""
        self._check(_test_layer_norm_with_bias, 1, "LayerNormalization")


if __name__ == "__main__":
    unittest.main()
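For context, applying the fusion to a serialized model outside the unit tests would look roughly like this (an illustrative sketch built from the APIs exercised above; "model.onnx" and "model_fused.onnx" are placeholder paths):

import onnx
import onnx_ir as ir

from onnxscript.optimizer import remove_unused_nodes
from onnxscript.rewriter.layer_normalization import fuse_layer_normalization

# Load a model that contains the decomposed LayerNorm subgraph (placeholder path).
model_proto = onnx.load("model.onnx")
model = ir.serde.deserialize_model(model_proto)

# Rewrite matching subgraphs into LayerNormalization nodes and drop dead nodes.
fuse_layer_normalization(model)
remove_unused_nodes(model)

onnx.save(ir.serde.serialize_model(model), "model_fused.onnx")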

onnxscript/rewriter/testing.py

Lines changed: 18 additions & 0 deletions
@@ -10,6 +10,24 @@
from onnxscript import ir


def generate_random_inputs(model: onnx.ModelProto) -> dict[str, Any]:
    feeds: dict[str, Any] = {}
    for input in model.graph.input:
        input_type = input.type.tensor_type
        dims = input_type.shape.dim
        if not all(d.HasField("dim_value") for d in dims):
            raise ValueError(f"Input {input.name} has dynamic shape dimensions.")
        shape = tuple(d.dim_value for d in dims)
        if input_type.elem_type == onnx.TensorProto.FLOAT:
            if shape:
                feeds[input.name] = np.random.randn(*shape).astype(np.float32)
            else:
                feeds[input.name] = np.random.randn(1).astype(np.float32)
        else:
            raise ValueError(f"Not implemented for input type {input_type.elem_type}")
    return feeds


def assert_numerically_equal(
    original_model_proto: onnx.ModelProto | ir.Model,
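The new generate_random_inputs helper can be paired with onnxruntime for a quick smoke test (an illustrative sketch, not part of the commit; assumes onnxruntime is installed and "model.onnx" is a placeholder path):

import onnx
import onnxruntime

from onnxscript.rewriter.testing import generate_random_inputs

model_proto = onnx.load("model.onnx")  # placeholder path
feeds = generate_random_inputs(model_proto)  # random float32 tensors keyed by input name

# Run the model once on the random inputs.
session = onnxruntime.InferenceSession(
    model_proto.SerializeToString(), providers=["CPUExecutionProvider"]
)
outputs = session.run(None, feeds)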
