From 8c069ac698f5959ef4cc5f2acd075a42cdb528de Mon Sep 17 00:00:00 2001 From: naomiOvad Date: Thu, 6 Nov 2025 12:07:50 +0200 Subject: [PATCH 1/5] WIP: partial work on rmsnorm tests --- .../test/providers/cpu/nn/rms_norm_op_test.cc | 261 ++++++++++++++++++ 1 file changed, 261 insertions(+) diff --git a/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc index 4bdd3ea5adaff..479110bbf57c9 100644 --- a/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc @@ -67,6 +67,267 @@ TEST(RMSNormalizationOpTest, RMSNorm_Scale_Float16) { {kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider, kNnapiExecutionProvider, kQnnExecutionProvider}); } +//------------ +TEST(RMSNormalizationOpTest, RMSNorm_Scale_Scalar_Broadcast_ShouldPass) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("axis", 2); + + // X: values 0..29 reshaped to (2,5,3) + std::vector x(2 * 5 * 3); + for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); + test.AddInput("X", {2, 5, 3}, x); + + test.AddInput("scale", /*dims*/ {}, /*vals*/ {1.5f}, /*is_initializer*/ true); + + test.AddOutput( + "Y", {2, 5, 3}, + {0.0000f, 1.1619f, 2.3238f, + 1.1023f, 1.4697f, 1.8371f, + 1.2771f, 1.4899f, 1.7027f, + 1.3455f, 1.4950f, 1.6445f, + 1.3819f, 1.4971f, 1.6122f, + + 1.4044f, 1.4981f, 1.5917f, + 1.4197f, 1.4986f, 1.5775f, + 1.4308f, 1.4990f, 1.5671f, + 1.4392f, 1.4992f, 1.5592f, + 1.4458f, 1.4994f, 1.5529f}); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider, + kNnapiExecutionProvider, kQnnExecutionProvider}); +} + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1x1x1_Broadcast_ShouldPass) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("axis", 2); + + std::vector x(2 * 5 * 3); + for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); + test.AddInput("X", {2, 5, 3}, x); + + test.AddInput("scale", {1, 1, 1}, {1.0f}, /*is_initializer*/ true); + + test.AddOutput( + "Y", {2, 5, 3}, + {0.0000f, 0.7746f, 1.5492f, + 0.7348f, 0.9798f, 1.2247f, + 0.8514f, 0.9933f, 1.1352f, + 0.8970f, 0.9967f, 1.0964f, + 0.9213f, 0.9980f, 1.0748f, + + 0.9363f, 0.9987f, 1.0611f, + 0.9465f, 0.9991f, 1.0517f, + 0.9539f, 0.9993f, 1.0447f, + 0.9595f, 0.9995f, 1.0394f, + 0.9639f, 0.9996f, 1.0353f}); + + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider, + kNnapiExecutionProvider, kQnnExecutionProvider}); +} + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_3_ShouldPass_NoBroadcast) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("axis", 2); + + std::vector x(2 * 5 * 3); + for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); + test.AddInput("X", {2, 5, 3}, x); + + test.AddInput("scale", {3}, {1.5f, 1.5f, 1.5f}, /*is_initializer*/ true); + + test.AddOutput("Y", {2, 5, 3}, + { + 0.0000f, 1.1619f, 2.3238f, + 1.1023f, 1.4697f, 1.8371f, + 1.2771f, 1.4899f, 1.7027f, + 1.3455f, 1.4950f, 1.6445f, + 1.3819f, 1.4971f, 1.6122f, + + 1.4044f, 1.4981f, 1.5917f, + 1.4197f, 1.4986f, 1.5775f, + 1.4308f, 1.4990f, 1.5671f, + 1.4392f, 1.4992f, 1.5592f, + 1.4458f, 1.4994f, 1.5529f + }); + + test.Run(OpTester::ExpectResult::kExpectSuccess); +} + +TEST(RMSNormalizationOpTest, 
RMSNorm_Scale_1x1x3_ShouldPass_WhenBroadcastSupported) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("axis", 2); + + std::vector x(2 * 5 * 3); + for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); + test.AddInput("X", {2, 5, 3}, x); + + test.AddInput("scale", {1, 1, 3}, {1.5f, 1.5f, 1.5f}, /*is_initializer*/ true); + + test.AddOutput("Y", {2, 5, 3}, + { + 0.0000f, 1.1619f, 2.3238f, + 1.1023f, 1.4697f, 1.8371f, + 1.2771f, 1.4899f, 1.7027f, + 1.3455f, 1.4950f, 1.6445f, + 1.3819f, 1.4971f, 1.6122f, + + 1.4044f, 1.4981f, 1.5917f, + 1.4197f, 1.4986f, 1.5775f, + 1.4308f, 1.4990f, 1.5671f, + 1.4392f, 1.4992f, 1.5592f, + 1.4458f, 1.4994f, 1.5529f + }); + + test.Run(OpTester::ExpectResult::kExpectSuccess); +} + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_Bx1x3_ShouldPass_WhenBroadcastSupported) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("axis", 2); + + std::vector x(2 * 5 * 3); + for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); + test.AddInput("X", {2, 5, 3}, x); + + // batch 0: 1.25; batch 1: 1.75 + test.AddInput("scale", {2, 1, 3}, + {1.25f,1.25f,1.25f, 1.75f,1.75f,1.75f}, /*is_initializer*/ true); + + test.AddOutput("Y", {2, 5, 3}, + { + // batch 0 (S=0..4) + 0.0000f, 0.9682f, 1.9365f, + 0.9186f, 1.2247f, 1.5309f, + 1.0642f, 1.2416f, 1.4190f, + 1.1213f, 1.2459f, 1.3704f, + 1.1516f, 1.2475f, 1.3435f, + + // batch 1 (S=0..4) + 1.6385f, 1.7477f, 1.8570f, + 1.6564f, 1.7484f, 1.8404f, + 1.6693f, 1.7488f, 1.8283f, + 1.6791f, 1.7491f, 1.8190f, + 1.6868f, 1.7493f, 1.8117f + }); + + test.Run(OpTester::ExpectResult::kExpectSuccess); +} +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1xSx3_ShouldPass_WhenBroadcastSupported) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("axis", 2); + + std::vector x(2 * 5 * 3); + for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); + test.AddInput("X", {2, 5, 3}, x); + + test.AddInput("scale", {1, 5, 3}, + { + 1.1f,1.1f,1.1f, 1.2f,1.2f,1.2f, 1.3f,1.3f,1.3f, + 1.4f,1.4f,1.4f, 1.5f,1.5f,1.5f + }, /*is_initializer*/ true); + + test.AddOutput("Y", {2, 5, 3}, + { + // batch 0 + 0.0000f, 0.8521f, 1.7041f, + 0.8818f, 1.1758f, 1.4697f, + 1.1068f, 1.2912f, 1.4757f, + 1.2558f, 1.3954f, 1.5349f, + 1.3819f, 1.4971f, 1.6122f, + + // batch 1 + 1.0299f, 1.0986f, 1.1672f, + 1.1358f, 1.1989f, 1.2620f, + 1.2401f, 1.2991f, 1.3582f, + 1.3433f, 1.3993f, 1.4552f, + 1.4458f, 1.4994f, 1.5529f + }); + + test.Run(OpTester::ExpectResult::kExpectSuccess); +} +TEST(RMSNormalizationOpTest, RMSNorm_Scale_BxSx3_ShouldPass_NoBroadcast) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("axis", 2); + + std::vector x(2 * 5 * 3); + for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); + test.AddInput("X", {2, 5, 3}, x); + + std::vector scale(2 * 5 * 3, 1.5f); + test.AddInput("scale", {2, 5, 3}, scale, /*is_initializer*/ true); + + test.AddOutput("Y", {2, 5, 3}, + { + 0.0000f, 1.1619f, 2.3238f, + 1.1023f, 1.4697f, 1.8371f, + 1.2771f, 1.4899f, 1.7027f, + 1.3455f, 1.4950f, 1.6445f, + 1.3819f, 1.4971f, 1.6122f, + + 1.4044f, 1.4981f, 1.5917f, + 1.4197f, 1.4986f, 1.5775f, + 1.4308f, 1.4990f, 1.5671f, + 1.4392f, 1.4992f, 1.5592f, + 1.4458f, 1.4994f, 1.5529f + }); + + test.Run(OpTester::ExpectResult::kExpectSuccess); +} + + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1xCx1x1_ShouldPass_WhenBroadcastSupported) { + OpTester 
test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("axis", 1); // normalize over [C,H,W] + + // X: 0..15 reshaped to (1,4,2,2) + std::vector x(1 * 4 * 2 * 2); + for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); + test.AddInput("X", {1, 4, 2, 2}, x); + + // scale: [1,4,1,1] + test.AddInput("scale", {1, 4, 1, 1}, + {1.1f, 1.2f, 1.3f, 1.4f}, + /*is_initializer*/ true); + + // expected Y (מחושב מראש) + test.AddOutput("Y", {1, 4, 2, 2}, { + // c=0 (scale=1.1) + 0.0000f, 0.1250f, + 0.2499f, 0.3749f, + + // c=1 (scale=1.2) + 0.5452f, 0.6816f, + 0.8179f, 0.9542f, + + // c=2 (scale=1.3) + 1.1814f, 1.3290f, + 1.4767f, 1.6244f, + + // c=3 (scale=1.4) + 1.9084f, 2.0674f, + 2.2264f, 2.3854f + }); + + test.Run(OpTester::ExpectResult::kExpectSuccess); +} + + + + + + + + } // namespace test } // namespace onnxruntime From ebec6ec0decf7182b1c8439b808da1d18f04df1c Mon Sep 17 00:00:00 2001 From: naomiOvad Date: Wed, 19 Nov 2025 17:13:53 +0200 Subject: [PATCH 2/5] Fix RMSNormalization & LayerNormalization broadcast handling and add comprehensive tests --- .../core/providers/cpu/nn/layer_norm_helper.h | 161 +++- .../core/providers/cpu/nn/layer_norm_impl.cc | 242 +++++- .../test/contrib_ops/layer_norm_op_test.cc | 330 ++++++++- .../test/providers/cpu/nn/rms_norm_op_test.cc | 695 +++++++++++++----- 4 files changed, 1236 insertions(+), 192 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_helper.h b/onnxruntime/core/providers/cpu/nn/layer_norm_helper.h index ed5ea83d9de30..5ce0024ef3bec 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_helper.h +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_helper.h @@ -6,12 +6,12 @@ #include "core/framework/tensor_shape.h" #include "core/common/status.h" #include "core/common/narrow.h" +#include "core/common/inlined_containers.h" namespace onnxruntime { constexpr const char* kLayerNormInputShapeMismatchError = - "Size of scale and bias (if provided) must match X.shape[axis:], " - "or scale and bias (with same shape) can be broadcasted to X when axis is 2."; + "Scale and (optional) bias must match X.shape[axis:] or be NumPy-broadcastable to it."; constexpr const char* kLayerNormInvalidSize = "Size of X.shape[axis:] must be larger than 1, got "; @@ -23,15 +23,31 @@ struct LayerNormParams { int64_t scale_size; int64_t bias_size; int64_t broadcast_param; + bool use_generic_broadcast{false}; // true: full NumPy-style broadcast; false: legacy broadcast_param path + onnxruntime::InlinedVector x_dims; + onnxruntime::InlinedVector x_inner_dims; // X.shape[axis:] + onnxruntime::InlinedVector sc_dims; + onnxruntime::InlinedVector bi_dims; + onnxruntime::InlinedVector sc_strides; + onnxruntime::InlinedVector bi_strides; + int64_t axis{0}; + int64_t last_rank{0}; + onnxruntime::InlinedVector sc_inner_inc; // scale strides for inner dims [axis..] + onnxruntime::InlinedVector bi_inner_inc; // bias strides for inner dims [axis..] + onnxruntime::InlinedVector sc_outer_inc; // how much the scale pointer moves (stride) when an outer-dimension index of X changes (dims 0..axis-1) + onnxruntime::InlinedVector bi_outer_inc; // how much the bias pointer moves (stride) when an outer-dimension index of X changes (dims 0..axis-1) + onnxruntime::InlinedVector x_outer_strides; // X strides for outer dims [0..axis-1] }; -// We support broadcasting for axis=2, where the first two dimensions are rows, and the rest are columns. 
+// Fast-path broadcasting for axis = 2: // When X shape is (B, S, ...), and x_row (index of one row in X) is in the range of [0, B * S). -// We support scale and bias shape like below: +// We support the following scale/bias shapes in this path: // When scale and bias shape is (1, 1, ...) or (...), value of broadcast_param is 0. // When scale and bias shape is (B, 1, ...), value of broadcast_param is S. // When scale and bias shape is (B, S, ...), value of broadcast_param is 1. // When scale and bias shape is (1, S, ...), value of broadcast_param is -S. +// For all other NumPy-broadcastable shapes we fall back to the generic +// broadcasting path (use_generic_broadcast = true) and ignore broadcast_param. // Below is a macro to compute the offset for scale and bias data for a row of X. #ifndef LAYER_NORM_SCALE_BIAS_OFFSET @@ -48,30 +64,151 @@ class LayerNormHelper { bool has_bias, int64_t axis, LayerNormParams& params) { + // Initialize basic layout parameters: how many rows we have and how many elements + // are normalized per row, as well as the total scale/bias sizes. params.num_rows = x_shape.SizeToDimension(onnxruntime::narrow(axis)); params.norm_size = x_shape.SizeFromDimension(onnxruntime::narrow(axis)); params.scale_size = scale_shape.Size(); - params.bias_size = bias_shape.Size(); + params.bias_size = has_bias ? bias_shape.Size() : 0; + params.broadcast_param = 0; + params.axis = axis; if (params.norm_size <= 1) { params.broadcast_param = kLayerNormInvalidInput; return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, kLayerNormInvalidSize, params.norm_size); } else if (params.scale_size != params.norm_size || (has_bias && params.bias_size != params.scale_size)) { params.broadcast_param = GetBroadcastParam(x_shape, scale_shape, has_bias ? &bias_shape : nullptr, axis); - if (params.broadcast_param == kLayerNormInvalidInput) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - kLayerNormInputShapeMismatchError, - " X.shape=", x_shape, - " scale.shape=", scale_shape, - " bias.shape=", bias_shape, - " and axis=", axis); + // Try to encode simple (B, S, ...) layouts into broadcast_param so that the + // fast-path can be used. If this fails, broadcast_param will be set to + // kLayerNormInvalidInput and we may fall back to generic broadcasting later. + } + + const size_t xr = x_shape.NumDimensions(); + const size_t sr = scale_shape.NumDimensions(); + const size_t br = has_bias ? bias_shape.NumDimensions() : 0; + params.x_dims.clear(); + params.x_dims.reserve(xr); + for (size_t i = 0; i < xr; ++i) params.x_dims.push_back(x_shape.GetDims()[i]); + + // Right-align the scale (and bias) shape to match X's rank, filling leading + // dimensions with 1 so that NumPy-style broadcasting rules can be applied. + params.sc_dims.clear(); + params.sc_dims.resize(xr, 1); + { + for (size_t i = 0; i < sr; ++i) { + params.sc_dims[xr - 1 - i] = scale_shape.GetDims()[sr - 1 - i]; + } + } + + params.bi_dims.clear(); + if (has_bias) { + params.bi_dims.resize(xr, 1); + for (size_t i = 0; i < br; ++i) { + params.bi_dims[xr - 1 - i] = bias_shape.GetDims()[br - 1 - i]; + } + } + // Validate that scale and bias shapes are NumPy-broadcastable to X. + // If not, we fail early with a clear shape mismatch error. 
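+    // Worked example (hypothetical shapes, for illustration only): with
+    // X = (2, 4, 3, 5), a scale of shape (3, 1), (1, 5) or (2, 1, 3, 5)
+    // right-aligns to (1, 1, 3, 1), (1, 1, 1, 5) or (2, 1, 3, 5) and passes
+    // the check below, while a scale of shape (4, 5) right-aligns to
+    // (1, 1, 4, 5) and is rejected because 4 matches neither 3 nor 1.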
+ const bool sc_ok = IsNumpyBroadcastable(params.sc_dims, params.x_dims); + const bool bi_ok = !has_bias || IsNumpyBroadcastable(params.bi_dims, params.x_dims); + if (!sc_ok || !bi_ok) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + kLayerNormInputShapeMismatchError, + " X.shape=", x_shape, + " scale.shape=", scale_shape, + " bias.shape=", bias_shape, + " and axis=", axis); + } + // Cache the inner dimensions X.shape[axis:] that are normalized together + // for each logical row. + params.last_rank = onnxruntime::narrow(xr) - axis; + params.x_inner_dims.clear(); + params.x_inner_dims.reserve(params.last_rank > 0 ? static_cast(params.last_rank) : 0); + for (size_t i = static_cast(axis); i < xr; ++i) { + params.x_inner_dims.push_back(params.x_dims[i]); + } + + params.sc_strides = MakeStrides(params.sc_dims); + params.bi_strides.clear(); + if (has_bias) { + params.bi_strides = MakeStrides(params.bi_dims); + } + + // Precompute how scale/bias advance along the inner dimensions [axis..]: + // these increments are used inside the per-row normalization loop. + params.sc_inner_inc.clear(); + params.bi_inner_inc.clear(); + for (size_t i = static_cast(axis); i < xr; ++i) { + params.sc_inner_inc.push_back(params.sc_strides[i]); + if (has_bias) { + params.bi_inner_inc.push_back(params.bi_strides[i]); } } + // Compute strides for X over the outer dimensions [0..axis-1], + // used to locate the base address of each logical row in X. + params.x_outer_strides.clear(); + params.x_outer_strides.resize(static_cast(axis), 1); + if (axis > 1) { + for (int64_t d = axis - 2; d >= 0; --d) { + const size_t du = static_cast(d); + params.x_outer_strides[du] = + params.x_outer_strides[du + 1] * params.x_dims[du + 1]; + } + } + // Detect whether scale/bias depend on any outer dimensions [0..axis-1]. + // If any outer stride is non-zero, scale/bias are not purely "inner-only" + // and the simple fast-path based on broadcast_param is not sufficient. + params.sc_outer_inc.clear(); + params.bi_outer_inc.clear(); + for (int64_t i = 0; i < axis; ++i) { + params.sc_outer_inc.push_back(params.sc_strides[static_cast(i)]); + params.bi_outer_inc.push_back(has_bias ? params.bi_strides[static_cast(i)] : 0); + } + + bool outer_dep = false; + for (int64_t i = 0; i < axis; ++i) { + if (params.sc_outer_inc[static_cast(i)] != 0 || + (has_bias && params.bi_outer_inc[static_cast(i)] != 0)) { + outer_dep = true; + break; + } + } + // Enable the generic NumPy-style broadcasting path if either: + // - the fast-path cannot represent this shape (broadcast_param is invalid), or + // - scale/bias have any dependency on outer dimensions. + params.use_generic_broadcast = outer_dep || (params.broadcast_param == kLayerNormInvalidInput); + return Status::OK(); } private: + static bool IsNumpyBroadcastable(gsl::span a, + gsl::span b) { + ORT_ENFORCE(a.size() == b.size()); + for (size_t k = 0; k < a.size(); ++k) { + const int64_t ak = a[k]; + const int64_t bk = b[k]; + if (!(ak == 1 || ak == bk)) { + return false; + } + } + return true; + } + static InlinedVector MakeStrides(const InlinedVector& dims) { + InlinedVector strides(dims.size(), 0); + if (dims.empty()) return strides; + + int64_t running = 1; + for (ptrdiff_t i = dims.size() - 1; i >= 0; --i) { + size_t idx = static_cast(i); + strides[idx] = (dims[idx] == 1) ? 
0 : running; + running *= std::max(1, dims[idx]); + } + + return strides; + } + static int64_t GetBroadcastParam(const TensorShape& x_shape, const TensorShape& scale_shape, const TensorShape* bias_shape, diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 237110483416c..51e175da27caa 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -159,6 +159,205 @@ void ComputeJob( inv_std_dev_data[task_idx] = MLFloat16(1.0f / mean_square); } } +// Write a statistic value (mean or 1/denom) into the output buffer, +// converting from double to the target type U (including MLFloat16). +template +ORT_FORCEINLINE void WriteStat(U* dst, ptrdiff_t index, double v) { + if constexpr (std::is_same_v) { + dst[index] = MLFloat16(static_cast(v)); + } else { + dst[index] = gsl::narrow_cast(v); + } +} +// Generic per-row LayerNorm computation that supports full NumPy-style +// broadcasting of scale and bias over both outer and inner dimensions. +// task_idx selects the logical row in X (and the matching slices of scale/bias). +template +void ComputeJobGeneric( + const T* X_data, + const T* scale_data, + const T* bias_data, + const ptrdiff_t task_idx, + const LayerNormParams& params, + const float* scale_float_ptr, + const float* bias_float_ptr, + float epsilon, + bool simplified, + T* Y_data, + U* mean_data, + U* inv_std_dev_data) { + ORT_UNUSED_PARAMETER(scale_float_ptr); + ORT_UNUSED_PARAMETER(bias_float_ptr); + const auto& sc_inner_inc = params.sc_inner_inc; + const auto& bi_inner_inc = params.bi_inner_inc; + const int64_t norm_size = params.norm_size; + const int64_t last_rank = params.last_rank; + const T* p_input = X_data + task_idx * norm_size; + T* p_output = Y_data + task_idx * norm_size; + double mean = 0.0; + double mean_sq = 0.0; + for (int64_t h = 0; h < norm_size; ++h) { + mean += p_input[h]; + mean_sq += static_cast(p_input[h]) * p_input[h]; + } + mean /= static_cast(norm_size); + double denom = simplified + ? std::sqrt(mean_sq / norm_size + epsilon) + : std::sqrt(mean_sq / norm_size - mean * mean + epsilon); + + // Decode the outer-dimension indices for this task_idx and compute the + // base offsets into scale/bias for the current logical row of X. + int64_t off_sc_row = 0; + int64_t off_bi_row = 0; + + if (params.axis > 0) { + const auto& outer_strides = params.x_outer_strides; + + for (int64_t d = 0; d < params.axis; ++d) { + const size_t du = static_cast(d); + const int64_t dim = params.x_dims[du]; + const int64_t idx_d = (dim == 0) + ? 0 + : (task_idx / outer_strides[du]) % dim; + + off_sc_row += idx_d * params.sc_strides[du]; + if (bias_data) { + off_bi_row += idx_d * params.bi_strides[du]; + } + } + } + // Iterate over the inner dimensions using a small multi-dimensional index + // (idx), and use the precomputed inner increments to locate the correct + // scale/bias element for each position within the row. + onnxruntime::InlinedVector idx(static_cast(last_rank), 0); + + for (int64_t h = 0; h < norm_size; ++h) { + int64_t off_sc = off_sc_row; + int64_t off_bi = off_bi_row; + for (size_t d = 0; d < static_cast(last_rank); ++d) { + off_sc += idx[d] * sc_inner_inc[d]; + if (bias_data) off_bi += idx[d] * bi_inner_inc[d]; + } + + const double s = static_cast(scale_data[off_sc]); + const double b = (bias_data ? static_cast(bias_data[off_bi]) : 0.0); + const double x = static_cast(p_input[h]); + const double y = simplified ? 
(x / denom) * s + : ((x - mean) / denom) * s + b; + p_output[h] = static_cast(y); + + for (int64_t d = last_rank - 1; d >= 0; --d) { + if (++idx[static_cast(d)] < params.x_inner_dims[static_cast(d)]) break; + idx[static_cast(d)] = 0; + } + } + if (mean_data) { + WriteStat(mean_data, task_idx, mean); + } + if (inv_std_dev_data) { + WriteStat(inv_std_dev_data, task_idx, 1.0 / denom); + } +} +// Specialization for MLFloat16 input/output: we compute statistics and the +// normalized values in float, optionally using pre-converted float scale/bias +// buffers (scale_float_ptr / bias_float_ptr) for better performance and reuse. +template +void ComputeJobGeneric( + const MLFloat16* X_data, + const MLFloat16* scale_data, + const MLFloat16* bias_data, + const ptrdiff_t task_idx, + const LayerNormParams& params, + const float* scale_float_ptr, + const float* bias_float_ptr, + float epsilon, + bool simplified, + MLFloat16* Y_data, + U* mean_data, + U* inv_std_dev_data) { + const auto& sc_inner_inc = params.sc_inner_inc; + const auto& bi_inner_inc = params.bi_inner_inc; + const int64_t norm_size = params.norm_size; + const int64_t last_rank = params.last_rank; + + const MLFloat16* p_input = X_data + task_idx * norm_size; + MLFloat16* p_output = Y_data + task_idx * norm_size; + + double mean = 0.0; + double mean_sq = 0.0; + for (int64_t h = 0; h < norm_size; ++h) { + const float xv = static_cast(p_input[h]); + mean += xv; + mean_sq += static_cast(xv) * xv; + } + mean /= static_cast(norm_size); + const double denom = simplified + ? std::sqrt(mean_sq / norm_size + epsilon) + : std::sqrt(mean_sq / norm_size - mean * mean + epsilon); + int64_t off_sc_row = 0; + int64_t off_bi_row = 0; + + if (params.axis > 0) { + const auto& outer_strides = params.x_outer_strides; + + for (int64_t d = 0; d < params.axis; ++d) { + const size_t du = static_cast(d); + const int64_t dim = params.x_dims[du]; + const int64_t idx_d = (dim == 0) + ? 0 + : (task_idx / outer_strides[du]) % dim; + + off_sc_row += idx_d * params.sc_strides[du]; + if (bias_data || bias_float_ptr) { + off_bi_row += idx_d * params.bi_strides[du]; + } + } + } + + onnxruntime::InlinedVector idx(static_cast(last_rank), 0); + + for (int64_t h = 0; h < norm_size; ++h) { + int64_t off_sc = off_sc_row; + int64_t off_bi = off_bi_row; + + for (size_t d = 0; d < static_cast(last_rank); ++d) { + off_sc += idx[d] * sc_inner_inc[d]; + if (bias_data || bias_float_ptr) { + off_bi += idx[d] * bi_inner_inc[d]; + } + } + + const float s = scale_float_ptr + ? scale_float_ptr[off_sc] + : static_cast(scale_data[off_sc]); + + const float b = bias_float_ptr + ? bias_float_ptr[off_bi] + : (bias_data ? static_cast(bias_data[off_bi]) : 0.0f); + + const float x = static_cast(p_input[h]); + const float y = simplified + ? 
(x / static_cast(denom)) * s + : ((x - static_cast(mean)) / static_cast(denom)) * s + b; + + p_output[h] = MLFloat16(y); + + for (int64_t d = last_rank - 1; d >= 0; --d) { + const size_t du = static_cast(d); + if (++idx[du] < params.x_inner_dims[du]) { + break; + } + idx[du] = 0; + } + } + + if (mean_data) { + WriteStat(mean_data, task_idx, mean); + } + if (inv_std_dev_data) { + WriteStat(inv_std_dev_data, task_idx, 1.0 / denom); + } +} void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr& dest, bool& is_packed) { if (tensor.GetElementType() == utils::ToTensorProtoElementType()) { @@ -277,7 +476,13 @@ Status LayerNormImpl::ComputeWithoutContext( bool simplified, AllocatorPtr alloc) const { LayerNormParams params; - ORT_RETURN_IF_ERROR(LayerNormHelper::CheckInputs(x_shape, scale_shape, bias_shape, bias_data != nullptr, axis, params)); + const bool has_bias = + !simplified && + (bias_data != nullptr || + (std::is_same_v && prepacked_bias_fp32_data_ != nullptr)); + + ORT_RETURN_IF_ERROR( + LayerNormHelper::CheckInputs(x_shape, scale_shape, bias_shape, has_bias, axis, params)); IAllocatorUniquePtr scale_fp32; IAllocatorUniquePtr bias_fp32; @@ -294,17 +499,42 @@ Status LayerNormImpl::ComputeWithoutContext( } } + // Resolve the float32 pointers for scale/bias (scf/bif) in the MLFloat16 case. + // For non-MLFloat16 types, these remain null and the original T* buffers are used. + const float* scf = nullptr; + const float* bif = nullptr; + + if constexpr (std::is_same_v) { + scf = prepacked_scale_fp32_data_ ? prepacked_scale_fp32_data_.get() + : scale_fp32.get(); + + if (has_bias) { + bif = prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() + : (bias_fp32 ? bias_fp32.get() : nullptr); + } else { + bif = nullptr; + } + } + // Launch one normalization job per logical row in X. For each row we either: + // - use the generic NumPy-style broadcasting path, or + // - use the existing fast-path based on broadcast_param. concurrency::ThreadPool::TryBatchParallelFor( thread_pool, static_cast(params.num_rows), [&](ptrdiff_t task_idx) { - ComputeJob(X_data, scale_data, bias_data, task_idx, params.norm_size, params.broadcast_param, - prepacked_scale_fp32_data_ ? prepacked_scale_fp32_data_.get() : scale_fp32.get(), - prepacked_bias_fp32_data_ ? 
prepacked_bias_fp32_data_.get() : bias_fp32.get(), - epsilon, simplified, Y_data, mean_data, inv_std_dev_data, alloc); + if (params.use_generic_broadcast) { + ComputeJobGeneric(X_data, scale_data, bias_data, task_idx, params, + scf, bif, + epsilon, simplified, Y_data, mean_data, inv_std_dev_data); + } else { + ComputeJob(X_data, scale_data, bias_data, task_idx, + params.norm_size, params.broadcast_param, + scf, bif, + epsilon, simplified, Y_data, mean_data, inv_std_dev_data, alloc); + } }, 0); return Status::OK(); } -} // namespace onnxruntime \ No newline at end of file +} // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc index 0d4fc5af68b4f..f7271f673cc49 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc @@ -443,21 +443,161 @@ TEST(LayerNormTest, LayerNorm_InvalidNormSize) { RunTestOnCpuAndCuda(test, kLayerNormInvalidSize); } -TEST(LayerNormTest, LayerNorm_InvalidScaleBias) { +TEST(LayerNormTest, LayerNorm_ValidScaleBias_Broadcast) { OpTester test("LayerNormalization"); test.AddAttribute("epsilon", 1e-05f); - // as axis is 1, the scale and bias should have size 6 + // With axis = 1, scale and bias of shape {2} are NumPy-broadcastable + // to X.shape[axis:] = {3, 2}, so this configuration is now valid. std::vector dims{1, 3, 2}; test.AddInput("x", dims, {1.2416f, 0.946123f, 13.1685f, 0.36423f, 21.145f, 0.03941f}); test.AddInput("gamma", {2}, {-0.6953f, 5.1824f}); test.AddInput("bias", {2}, {0.6435f, -0.3964f}); test.AddAttribute("axis", 1); - test.AddOutput("output", dims, {-0.0516f, -5.5776f, -0.0518f, -5.5788f, -0.0518f, -5.5788f}); + test.AddOutput("output", dims, + {1.063606f, -3.716114f, + 0.042961f, -4.087264f, + -0.639629f, -4.294445f}); + + // This configuration used to be rejected, but with generic NumPy-style + // broadcasting support it is now valid and should run successfully. 
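+  // For illustration: with axis = 1 the single sample is normalized over all
+  // 3 * 2 = 6 elements of X.shape[axis:] = {3, 2}, and gamma/bias of shape {2}
+  // right-align to {1, 1, 2}, so the same two values are reused at each of the
+  // 3 positions along dimension 1.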
+  auto cpu = DefaultCpuExecutionProvider();
+  if (!cpu) GTEST_SKIP() << "CPU EP not available in this build.";
+  test.ConfigEp(std::move(cpu)).RunWithConfig();
+}
+
+TEST(LayerNormTest, LayerNorm_Scale_Scalar_NoBias_Axis2) {
+  OpTester test("LayerNormalization", 17);
+  test.AddAttribute<float>("epsilon", 1e-05f);
+  test.AddAttribute<int64_t>("axis", 2);
+  std::vector<float> x(2 * 2 * 2);
+  for (int i = 0; i < static_cast<int>(x.size()); ++i) {
+    x[static_cast<size_t>(i)] = static_cast<float>(i);
+  }
+  test.AddInput<float>("X", {2, 2, 2}, x);
+
+  test.AddInput<float>("Scale", {}, {1.5f}, true);
+  test.AddOutput<float>("Y", {2, 2, 2},
+                        {
+                            -1.5f,
+                            1.5f,
+                            -1.5f,
+                            1.5f,
+                            -1.5f,
+                            1.5f,
+                            -1.5f,
+                            1.5f,
+                        });
+
+  test.SetOutputAbsErr("Y", 1e-4f);
+  auto cpu = DefaultCpuExecutionProvider();
+  if (!cpu) GTEST_SKIP() << "CPU EP not available in this build.";
+  test.ConfigEp(std::move(cpu)).RunWithConfig();
+}
+
+TEST(LayerNormTest, LayerNorm_Scale_Bias_Scalar_Axis2) {
+  OpTester test("LayerNormalization", 17);
+  test.AddAttribute<float>("epsilon", 1e-05f);
+  test.AddAttribute<int64_t>("axis", 2);
+  std::vector<float> x(2 * 2 * 2);
+  for (int i = 0; i < static_cast<int>(x.size()); ++i) {
+    x[static_cast<size_t>(i)] = static_cast<float>(i);
+  }
+  test.AddInput<float>("X", {2, 2, 2}, x);
+
+  test.AddInput<float>("Scale", {}, {1.5f}, true);
+
+  test.AddInput<float>("B", {}, {0.1f}, true);
+  test.AddOutput<float>("Y", {2, 2, 2},
+                        {
+                            -1.4f,
+                            1.6f,
+                            -1.4f,
+                            1.6f,
+                            -1.4f,
+                            1.6f,
+                            -1.4f,
+                            1.6f,
+                        });
+
+  test.SetOutputAbsErr("Y", 1e-4f);
+
+  auto cpu = DefaultCpuExecutionProvider();
+  if (!cpu) GTEST_SKIP() << "CPU EP not available in this build.";
+  test.ConfigEp(std::move(cpu)).RunWithConfig();
+}
-  // CPU and CUDA EPs have check for unexpected scale or bias sizes. Exclude other EPs with a LayerNormalization
-  // implementation for which we don't control the check or error message.
- RunTestOnCpuAndCuda(test, kLayerNormInputShapeMismatchError); +TEST(LayerNormTest, LayerNorm_Scale_Bias_Axis2) { + OpTester test("LayerNormalization", 17); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 2); + std::vector x(2 * 2 * 2); + for (int i = 0; i < static_cast(x.size()); ++i) { + x[static_cast(i)] = static_cast(i); + } + test.AddInput("X", {2, 2, 2}, x); + test.AddInput("Scale", {2}, {1.0f, 2.0f}, true); + test.AddInput("B", {2}, {0.0f, 0.5f}, true); + test.AddOutput("Y", {2, 2, 2}, + { + -1.0f, + 2.5f, + -1.0f, + 2.5f, + -1.0f, + 2.5f, + -1.0f, + 2.5f, + }); + + test.SetOutputAbsErr("Y", 1e-4f); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} + +TEST(LayerNormTest, LayerNorm_Scale_Bias_4D_OuterInnerBroadcast_Axis3) { + OpTester test("LayerNormalization", 17); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 3); + std::vector x(1 * 2 * 2 * 2); + for (int i = 0; i < static_cast(x.size()); ++i) { + x[static_cast(i)] = static_cast(i); + } + test.AddInput("X", {1, 2, 2, 2}, x); + test.AddInput("Scale", {1, 2, 1, 2}, + { + 1.0f, + 1.1f, + 1.2f, + 1.3f, + }, + true); + test.AddInput("B", {1, 2, 1, 2}, + { + 0.0f, + 0.5f, + 1.0f, + 1.5f, + }, + true); + test.AddOutput("Y", {1, 2, 2, 2}, + { + -1.0f, + 1.6f, + -1.0f, + 1.6f, + + -0.2f, + 2.8f, + -0.2f, + 2.8f, + }); + + test.SetOutputAbsErr("Y", 1e-4f); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } #if defined(USE_DNNL) @@ -478,6 +618,184 @@ TEST(LayerNormTest, LayerNorm17_Scale_Bias_bfloat16) { test.AddOutput("output", dims, MakeBFloat16({-0.0516f, -5.5776f, -0.0518f, -5.5788f, -0.0518f, -5.5788f})); test.Run(); } +// +TEST(LayerNormTest, LayerNorm_Scale_Scalar_NoBias) { + OpTester test("LayerNormalization", 17); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 2); // normalize over last dim + + // X: shape = {2, 2, 2}, values 0..7 + std::vector x(2 * 2 * 2); + for (int i = 0; i < static_cast(x.size()); ++i) { + x[static_cast(i)] = static_cast(i); + } + test.AddInput("X", {2, 2, 2}, x); + + // Scale: scalar + test.AddInput("Scale", {}, {1.5f}, /*is_initializer*/ true); + + // Expected Y ( לפי ONNX LayerNormalization: standardization + scale ) + // מחושב בפייתון בהתאם לספציפיקציה: + // + // Normalized = (X - Mean) / sqrt(Var + eps) + // Y = Normalized * Scale + // + test.AddOutput("Y", {2, 2, 2}, + { + -1.5f, + 1.5f, + -1.5f, + 1.5f, + -1.5f, + 1.5f, + -1.5f, + 1.5f, + }); + + // הפרש קטן בגלל חישובי float + test.SetOutputAbsErr("Y", 1e-4f); + + test.Run(); +} +TEST(LayerNormTest, LayerNorm_Scale_Bias_Scalar) { + OpTester test("LayerNormalization", 17); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 2); // normalize over last dim + + // X: shape = {2, 2, 2}, values 0..7 + std::vector x(2 * 2 * 2); + for (int i = 0; i < static_cast(x.size()); ++i) { + x[static_cast(i)] = static_cast(i); + } + test.AddInput("X", {2, 2, 2}, x); + + // Scale: scalar + test.AddInput("Scale", {}, {1.5f}, /*is_initializer*/ true); + + // Bias: scalar + test.AddInput("B", {}, {0.1f}, /*is_initializer*/ true); + + // Y = Normalized * 1.5 + 0.1 + test.AddOutput("Y", {2, 2, 2}, + { + -1.4f, + 1.6f, + -1.4f, + 1.6f, + -1.4f, + 1.6f, + -1.4f, + 1.6f, + }); + + test.SetOutputAbsErr("Y", 1e-4f); + + test.Run(); +} +TEST(LayerNormTest, 
LayerNorm_Scale_Bias_PerLastDim) { + OpTester test("LayerNormalization", 17); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 2); // normalize over last dim + + // X: shape = {2, 2, 2}, values 0..7 + std::vector x(2 * 2 * 2); + for (int i = 0; i < static_cast(x.size()); ++i) { + x[static_cast(i)] = static_cast(i); + } + test.AddInput("X", {2, 2, 2}, x); + + // Scale: shape = {2} -> broadcast לממד האחרון + // scale[0] לכל העמודות במיקום 0, scale[1] למיקום 1 + test.AddInput("Scale", {2}, {1.0f, 2.0f}, /*is_initializer*/ true); + + // Bias: shape = {2} -> broadcast זהה + test.AddInput("B", {2}, {0.0f, 0.5f}, /*is_initializer*/ true); + + // Y = Normalized * Scale + B + // מחושב לפי הספציפיקציה (Python reference): + // יוצא: + // [[[-1.0, 2.5], + // [-1.0, 2.5]], + // [[-1.0, 2.5], + // [-1.0, 2.5]]] + test.AddOutput("Y", {2, 2, 2}, + { + -1.0f, + 2.5f, + -1.0f, + 2.5f, + -1.0f, + 2.5f, + -1.0f, + 2.5f, + }); + + test.SetOutputAbsErr("Y", 1e-4f); + + test.Run(); +} +TEST(LayerNormTest, LayerNorm_Scale_Bias_4D_OuterInnerBroadcast) { + OpTester test("LayerNormalization", 17); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 3); // normalize over last dim (W) + + // X: shape = {1, 2, 2, 2}, values 0..7 + std::vector x(1 * 2 * 2 * 2); + for (int i = 0; i < static_cast(x.size()); ++i) { + x[static_cast(i)] = static_cast(i); + } + test.AddInput("X", {1, 2, 2, 2}, x); + + // Scale: shape = {1, 2, 1, 2} + // S=0: [1.0, 1.1] + // S=1: [1.2, 1.3] + test.AddInput("Scale", {1, 2, 1, 2}, + { + 1.0f, + 1.1f, + 1.2f, + 1.3f, + }, + /*is_initializer*/ true); + + // Bias: shape = {1, 2, 1, 2} + // S=0: [0.0, 0.5] + // S=1: [1.0, 1.5] + test.AddInput("B", {1, 2, 1, 2}, + { + 0.0f, + 0.5f, + 1.0f, + 1.5f, + }, + /*is_initializer*/ true); + + // Expected Y (מחושב בפייתון לפי ה־spec): + // + // [[[[ -1.0, 1.6], + // [ -1.0, 1.6]], + // + // [[ -0.2, 2.8], + // [ -0.2, 2.8]]]] + // + test.AddOutput("Y", {1, 2, 2, 2}, + { + -1.0f, + 1.6f, + -1.0f, + 1.6f, + + -0.2f, + 2.8f, + -0.2f, + 2.8f, + }); + + test.SetOutputAbsErr("Y", 1e-4f); + + test.Run(); +} + #endif // USE_DNNL } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc index 479110bbf57c9..74d5303b1e932 100644 --- a/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc @@ -67,19 +67,15 @@ TEST(RMSNormalizationOpTest, RMSNorm_Scale_Float16) { {kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider, kNnapiExecutionProvider, kQnnExecutionProvider}); } -//------------ -TEST(RMSNormalizationOpTest, RMSNorm_Scale_Scalar_Broadcast_ShouldPass) { + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_Scalar_Axis2) { OpTester test("RMSNormalization", 23); - test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("epsilon", 1e-05f); test.AddAttribute("axis", 2); - - // X: values 0..29 reshaped to (2,5,3) std::vector x(2 * 5 * 3); for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); test.AddInput("X", {2, 5, 3}, x); - - test.AddInput("scale", /*dims*/ {}, /*vals*/ {1.5f}, /*is_initializer*/ true); - + test.AddInput("scale", {}, {1.5f}, true); test.AddOutput( "Y", {2, 5, 3}, {0.0000f, 1.1619f, 2.3238f, @@ -93,169 +89,224 @@ TEST(RMSNormalizationOpTest, RMSNorm_Scale_Scalar_Broadcast_ShouldPass) { 1.4308f, 1.4990f, 1.5671f, 1.4392f, 1.4992f, 1.5592f, 1.4458f, 1.4994f, 1.5529f}); - - 
test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider, - kNnapiExecutionProvider, kQnnExecutionProvider}); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } -TEST(RMSNormalizationOpTest, RMSNorm_Scale_1x1x1_Broadcast_ShouldPass) { +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1x1x1_Axis2) { OpTester test("RMSNormalization", 23); - test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("epsilon", 1e-05f); test.AddAttribute("axis", 2); - - std::vector x(2 * 5 * 3); - for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); - test.AddInput("X", {2, 5, 3}, x); - - test.AddInput("scale", {1, 1, 1}, {1.0f}, /*is_initializer*/ true); - - test.AddOutput( - "Y", {2, 5, 3}, - {0.0000f, 0.7746f, 1.5492f, - 0.7348f, 0.9798f, 1.2247f, - 0.8514f, 0.9933f, 1.1352f, - 0.8970f, 0.9967f, 1.0964f, - 0.9213f, 0.9980f, 1.0748f, - - 0.9363f, 0.9987f, 1.0611f, - 0.9465f, 0.9991f, 1.0517f, - 0.9539f, 0.9993f, 1.0447f, - 0.9595f, 0.9995f, 1.0394f, - 0.9639f, 0.9996f, 1.0353f}); - - test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kTensorrtExecutionProvider, kDnnlExecutionProvider, kOpenVINOExecutionProvider, - kNnapiExecutionProvider, kQnnExecutionProvider}); + std::vector x(2 * 2 * 2); + for (int i = 0; i < static_cast(x.size()); ++i) { + x[i] = static_cast(i); + } + test.AddInput("X", {2, 2, 2}, x); + + test.AddInput("scale", {1, 1, 1}, {1.0f}, true); + + test.AddOutput("Y", {2, 2, 2}, + { + 0.0000f, + 1.4142f, + 0.7845f, + 1.1767f, + + 0.8835f, + 1.1043f, + 0.9204f, + 1.0738f, + }); + + test.SetOutputAbsErr("Y", 1e-4f); + + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } -TEST(RMSNormalizationOpTest, RMSNorm_Scale_3_ShouldPass_NoBroadcast) { +TEST(RMSNormalizationOpTest, RMSNorm_Scale_Vector3_Axis2) { OpTester test("RMSNormalization", 23); - test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("epsilon", 1e-05f); test.AddAttribute("axis", 2); std::vector x(2 * 5 * 3); for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); test.AddInput("X", {2, 5, 3}, x); - test.AddInput("scale", {3}, {1.5f, 1.5f, 1.5f}, /*is_initializer*/ true); + test.AddInput("scale", {3}, {1.5f, 1.5f, 1.5f}, true); test.AddOutput("Y", {2, 5, 3}, - { - 0.0000f, 1.1619f, 2.3238f, - 1.1023f, 1.4697f, 1.8371f, - 1.2771f, 1.4899f, 1.7027f, - 1.3455f, 1.4950f, 1.6445f, - 1.3819f, 1.4971f, 1.6122f, - - 1.4044f, 1.4981f, 1.5917f, - 1.4197f, 1.4986f, 1.5775f, - 1.4308f, 1.4990f, 1.5671f, - 1.4392f, 1.4992f, 1.5592f, - 1.4458f, 1.4994f, 1.5529f - }); - - test.Run(OpTester::ExpectResult::kExpectSuccess); + {0.0000f, 1.1619f, 2.3238f, + 1.1023f, 1.4697f, 1.8371f, + 1.2771f, 1.4899f, 1.7027f, + 1.3455f, 1.4950f, 1.6445f, + 1.3819f, 1.4971f, 1.6122f, + + 1.4044f, 1.4981f, 1.5917f, + 1.4197f, 1.4986f, 1.5775f, + 1.4308f, 1.4990f, 1.5671f, + 1.4392f, 1.4992f, 1.5592f, + 1.4458f, 1.4994f, 1.5529f}); + + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } -TEST(RMSNormalizationOpTest, RMSNorm_Scale_1x1x3_ShouldPass_WhenBroadcastSupported) { +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1x1x3_Axis2) { OpTester test("RMSNormalization", 23); - test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("epsilon", 
1e-05f); test.AddAttribute("axis", 2); std::vector x(2 * 5 * 3); for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); test.AddInput("X", {2, 5, 3}, x); - test.AddInput("scale", {1, 1, 3}, {1.5f, 1.5f, 1.5f}, /*is_initializer*/ true); + test.AddInput("scale", {1, 1, 3}, {1.5f, 1.5f, 1.5f}, true); test.AddOutput("Y", {2, 5, 3}, - { - 0.0000f, 1.1619f, 2.3238f, - 1.1023f, 1.4697f, 1.8371f, - 1.2771f, 1.4899f, 1.7027f, - 1.3455f, 1.4950f, 1.6445f, - 1.3819f, 1.4971f, 1.6122f, - - 1.4044f, 1.4981f, 1.5917f, - 1.4197f, 1.4986f, 1.5775f, - 1.4308f, 1.4990f, 1.5671f, - 1.4392f, 1.4992f, 1.5592f, - 1.4458f, 1.4994f, 1.5529f - }); - - test.Run(OpTester::ExpectResult::kExpectSuccess); + {0.0000f, 1.1619f, 2.3238f, + 1.1023f, 1.4697f, 1.8371f, + 1.2771f, 1.4899f, 1.7027f, + 1.3455f, 1.4950f, 1.6445f, + 1.3819f, 1.4971f, 1.6122f, + + 1.4044f, 1.4981f, 1.5917f, + 1.4197f, 1.4986f, 1.5775f, + 1.4308f, 1.4990f, 1.5671f, + 1.4392f, 1.4992f, 1.5592f, + 1.4458f, 1.4994f, 1.5529f}); + + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } -TEST(RMSNormalizationOpTest, RMSNorm_Scale_Bx1x3_ShouldPass_WhenBroadcastSupported) { +TEST(RMSNormalizationOpTest, RMSNorm_Scale_Bx1x3_Axis2) { OpTester test("RMSNormalization", 23); test.AddAttribute("epsilon", 1e-5f); test.AddAttribute("axis", 2); - std::vector x(2 * 5 * 3); - for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); - test.AddInput("X", {2, 5, 3}, x); - - // batch 0: 1.25; batch 1: 1.75 - test.AddInput("scale", {2, 1, 3}, - {1.25f,1.25f,1.25f, 1.75f,1.75f,1.75f}, /*is_initializer*/ true); + std::vector x(3 * 2 * 3); + for (int i = 0; i < static_cast(x.size()); ++i) { + x[i] = static_cast(i); + } + test.AddInput("X", {3, 2, 3}, x); + + test.AddInput( + "scale", {3, 1, 3}, + { + 1.0f, + 1.0f, + 1.0f, + 1.2f, + 1.2f, + 1.2f, + 1.4f, + 1.4f, + 1.4f, + }, + true); - test.AddOutput("Y", {2, 5, 3}, - { - // batch 0 (S=0..4) - 0.0000f, 0.9682f, 1.9365f, - 0.9186f, 1.2247f, 1.5309f, - 1.0642f, 1.2416f, 1.4190f, - 1.1213f, 1.2459f, 1.3704f, - 1.1516f, 1.2475f, 1.3435f, - - // batch 1 (S=0..4) - 1.6385f, 1.7477f, 1.8570f, - 1.6564f, 1.7484f, 1.8404f, - 1.6693f, 1.7488f, 1.8283f, - 1.6791f, 1.7491f, 1.8190f, - 1.6868f, 1.7493f, 1.8117f - }); - - test.Run(OpTester::ExpectResult::kExpectSuccess); + test.AddOutput( + "Y", {3, 2, 3}, + { + 0.0000f, + 0.7746f, + 1.5492f, + 0.7348f, + 0.9798f, + 1.2247f, + + 1.0216f, + 1.1919f, + 1.3622f, + 1.0764f, + 1.1960f, + 1.3156f, + + 1.2898f, + 1.3972f, + 1.5047f, + 1.3108f, + 1.3982f, + 1.4856f, + }); + + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } -TEST(RMSNormalizationOpTest, RMSNorm_Scale_1xSx3_ShouldPass_WhenBroadcastSupported) { + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1xSx3_Axis2) { OpTester test("RMSNormalization", 23); test.AddAttribute("epsilon", 1e-5f); test.AddAttribute("axis", 2); - std::vector x(2 * 5 * 3); - for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); - test.AddInput("X", {2, 5, 3}, x); + std::vector x(2 * 4 * 3); + for (int i = 0; i < static_cast(x.size()); ++i) { + x[i] = static_cast(i); + } + test.AddInput("X", {2, 4, 3}, x); - test.AddInput("scale", {1, 5, 3}, + test.AddInput("scale", + {1, 4, 3}, { - 1.1f,1.1f,1.1f, 1.2f,1.2f,1.2f, 1.3f,1.3f,1.3f, - 1.4f,1.4f,1.4f, 1.5f,1.5f,1.5f - }, /*is_initializer*/ true); + 1.1f, + 1.1f, + 
1.1f, + 1.2f, + 1.2f, + 1.2f, + 1.3f, + 1.3f, + 1.3f, + 1.4f, + 1.4f, + 1.4f, + }, + true); - test.AddOutput("Y", {2, 5, 3}, - { - // batch 0 - 0.0000f, 0.8521f, 1.7041f, - 0.8818f, 1.1758f, 1.4697f, - 1.1068f, 1.2912f, 1.4757f, - 1.2558f, 1.3954f, 1.5349f, - 1.3819f, 1.4971f, 1.6122f, - - // batch 1 - 1.0299f, 1.0986f, 1.1672f, - 1.1358f, 1.1989f, 1.2620f, - 1.2401f, 1.2991f, 1.3582f, - 1.3433f, 1.3993f, 1.4552f, - 1.4458f, 1.4994f, 1.5529f - }); - - test.Run(OpTester::ExpectResult::kExpectSuccess); + test.AddOutput( + "Y", {2, 4, 3}, + { + 0.0000f, + 0.8521f, + 1.7041f, + 0.8818f, + 1.1758f, + 1.4697f, + 1.1068f, + 1.2912f, + 1.4757f, + 1.2558f, + 1.3954f, + 1.5349f, + + 1.0134f, + 1.0978f, + 1.1823f, + 1.1235f, + 1.1984f, + 1.2733f, + 1.2304f, + 1.2988f, + 1.3672f, + 1.3354f, + 1.3990f, + 1.4626f, + }); + + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } -TEST(RMSNormalizationOpTest, RMSNorm_Scale_BxSx3_ShouldPass_NoBroadcast) { + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_NoBroadcast_BxSx3_Axis2) { OpTester test("RMSNormalization", 23); - test.AddAttribute("epsilon", 1e-5f); + test.AddAttribute("epsilon", 1e-05f); test.AddAttribute("axis", 2); std::vector x(2 * 5 * 3); @@ -263,71 +314,379 @@ TEST(RMSNormalizationOpTest, RMSNorm_Scale_BxSx3_ShouldPass_NoBroadcast) { test.AddInput("X", {2, 5, 3}, x); std::vector scale(2 * 5 * 3, 1.5f); - test.AddInput("scale", {2, 5, 3}, scale, /*is_initializer*/ true); + test.AddInput("scale", {2, 5, 3}, scale, true); test.AddOutput("Y", {2, 5, 3}, - { - 0.0000f, 1.1619f, 2.3238f, - 1.1023f, 1.4697f, 1.8371f, - 1.2771f, 1.4899f, 1.7027f, - 1.3455f, 1.4950f, 1.6445f, - 1.3819f, 1.4971f, 1.6122f, - - 1.4044f, 1.4981f, 1.5917f, - 1.4197f, 1.4986f, 1.5775f, - 1.4308f, 1.4990f, 1.5671f, - 1.4392f, 1.4992f, 1.5592f, - 1.4458f, 1.4994f, 1.5529f - }); - - test.Run(OpTester::ExpectResult::kExpectSuccess); + {0.0000f, 1.1619f, 2.3238f, + 1.1023f, 1.4697f, 1.8371f, + 1.2771f, 1.4899f, 1.7027f, + 1.3455f, 1.4950f, 1.6445f, + 1.3819f, 1.4971f, 1.6122f, + + 1.4044f, 1.4981f, 1.5917f, + 1.4197f, 1.4986f, 1.5775f, + 1.4308f, 1.4990f, 1.5671f, + 1.4392f, 1.4992f, 1.5592f, + 1.4458f, 1.4994f, 1.5529f}); + + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } - -TEST(RMSNormalizationOpTest, RMSNorm_Scale_1xCx1x1_ShouldPass_WhenBroadcastSupported) { +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1xCx1x1_Axis1) { OpTester test("RMSNormalization", 23); - test.AddAttribute("epsilon", 1e-5f); - test.AddAttribute("axis", 1); // normalize over [C,H,W] - - // X: 0..15 reshaped to (1,4,2,2) + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 1); std::vector x(1 * 4 * 2 * 2); for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); test.AddInput("X", {1, 4, 2, 2}, x); - // scale: [1,4,1,1] test.AddInput("scale", {1, 4, 1, 1}, {1.1f, 1.2f, 1.3f, 1.4f}, - /*is_initializer*/ true); + true); + + test.AddOutput("Y", {1, 4, 2, 2}, {0.0000000, 0.1249516, 0.2499032, 0.3748548, + + 0.5452434, 0.6815542, 0.8178651, 0.9541759, + + 1.1813605, 1.3290305, 1.4767007, 1.6243708, + + 1.9083518, 2.0673811, 2.2264102, 2.3854396}); + test.SetOutputAbsErr("Y", 1e-4f); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} + +TEST(RMSNormalizationOpTest, 
RMSNorm_Scale_1xCx1_Axis1) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 1); + std::vector x(2 * 3 * 2); + for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); + test.AddInput("X", {2, 3, 2}, x); + test.AddInput("scale", {1, 3, 1}, {1.0f, 1.2f, 1.4f}, true); + test.AddOutput( + "Y", {2, 3, 2}, + {0.0f, 0.33028895f, + 0.79269350f, 1.18904030f, + 1.84961808f, 2.31202269f, + 0.69205177f, 0.80739373f, + 1.10728300f, 1.24569333f, + 1.61478746f, 1.77626622f}); + + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1x3x2x1_Axis1) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 1); + std::vector x(1 * 3 * 2 * 2); + for (int i = 0; i < static_cast(x.size()); ++i) + x[i] = static_cast(i); + test.AddInput("X", {1, 3, 2, 2}, x); + test.AddInput( + "scale", {1, 3, 2, 1}, + { + 1.0f, + 1.1f, + 1.2f, + 1.3f, + 1.4f, + 1.5f, + }, + true); + test.AddOutput( + "Y", {1, 3, 2, 2}, + {0.0f, 0.15399808f, + 0.33879578f, 0.50819367f, + 0.73919082f, 0.92398852f, + 1.20118499f, 1.40138257f, + 1.72477841f, 1.94037580f, + 2.30997109f, 2.54096842f}); + + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1xSx1xW_Axis2) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 2); + std::vector x(1 * 2 * 2 * 2); + for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); + test.AddInput("X", {1, 2, 2, 2}, x); + + test.AddInput("scale", {1, 2, 1, 2}, + {1.0f, 1.2f, + 1.4f, 1.6f}, + true); + test.AddOutput("Y", {1, 2, 2, 2}, + {0.0000f, 0.6414f, + 1.0690f, 1.9243f, + + 0.9978f, 1.4254f, + 1.4967f, 1.9956f}); + + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1x1xHx1_Axis2) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 2); + + std::vector x(1 * 2 * 2 * 2); + for (int i = 0; i < static_cast(x.size()); ++i) x[i] = static_cast(i); + test.AddInput("X", {1, 2, 2, 2}, x); + + test.AddInput("scale", {1, 1, 2, 1}, {1.0f, 1.3f}, true); + + test.AddOutput("Y", {1, 2, 2, 2}, + {0.0000f, 0.5345f, + 1.3898f, 2.0846f, + + 0.7127f, 0.8909f, + 1.3898f, 1.6214f}); - // expected Y (מחושב מראש) - test.AddOutput("Y", {1, 4, 2, 2}, { - // c=0 (scale=1.1) - 0.0000f, 0.1250f, - 0.2499f, 0.3749f, + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} - // c=1 (scale=1.2) - 0.5452f, 0.6816f, - 0.8179f, 0.9542f, +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1x1x1xW_Axis2) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 2); - // c=2 (scale=1.3) - 1.1814f, 1.3290f, - 1.4767f, 1.6244f, + std::vector x(1 * 2 * 2 * 3); + for (int i = 0; i < (int)x.size(); ++i) x[i] = (float)i; + test.AddInput("X", {1, 2, 2, 3}, x); - // c=3 (scale=1.4) - 1.9084f, 2.0674f, - 2.2264f, 2.3854f - }); + test.AddInput("scale", {1, 1, 1, 3}, {1.0f, 1.2f, 1.4f}, true); - 
test.Run(OpTester::ExpectResult::kExpectSuccess); + test.AddOutput("Y", {1, 2, 2, 3}, + {0.0000f, 0.3963f, 0.9248f, + 0.9909f, 1.5854f, 2.3120f, + 0.6921f, 0.9689f, 1.2918f, + 1.0381f, 1.3841f, 1.7763f}); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1xSx1x1_Axis2) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 2); + std::vector x(1 * 3 * 2 * 2); + for (int i = 0; i < (int)x.size(); ++i) x[i] = (float)i; + test.AddInput("X", {1, 3, 2, 2}, x); + test.AddInput("scale", {1, 3, 1, 1}, {1.0f, 1.2f, 1.4f}, true); + test.AddOutput("Y", {1, 3, 2, 2}, + {0.0000f, 0.5345f, 1.0690f, 1.6036f, + 0.8552f, 1.0690f, 1.2829f, 1.4967f, + 1.1709f, 1.3172f, 1.4636f, 1.6099f}); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} +TEST(RMSNormalizationOpTest, RMSNorm_Scale_Bx1x1xW_Axis2) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 2); + std::vector x(2 * 1 * 2 * 2); + for (int i = 0; i < (int)x.size(); ++i) x[i] = (float)i; + test.AddInput("X", {2, 1, 2, 2}, x); + test.AddInput("scale", {2, 1, 1, 2}, {1.0f, 1.1f, 1.3f, 1.4f}, true); + test.AddOutput("Y", {2, 1, 2, 2}, + {0.0000f, 0.5880f, 1.0690f, 1.7639f, + 0.9265f, 1.2472f, 1.3898f, 1.7461f}); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1x1xHxW_Axis2) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 2); + + test.AddInput("X", {1, 2, 2, 3}, + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + + test.AddInput("scale", {1, 1, 2, 3}, + {1.0f, 1.1f, 1.2f, + 1.3f, 1.4f, 1.5f}, + true); + + test.AddOutput("Y", {1, 2, 2, 3}, + {0.0000f, 0.3633f, 0.7927f, 1.2881f, 1.8496f, 2.4772f, + 0.6921f, 0.8881f, 1.1073f, 1.3495f, 1.6148f, 1.9031f}); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1xSx1xW_AxisNeg2) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", -2); + + std::vector x(1 * 2 * 2 * 2); + for (int i = 0; i < (int)x.size(); ++i) x[i] = (float)i; + test.AddInput("X", {1, 2, 2, 2}, x); + + test.AddInput("scale", {1, 2, 1, 2}, + {1.0f, 1.2f, 1.4f, 1.6f}, true); + + test.AddOutput("Y", {1, 2, 2, 2}, + {0.0000f, 0.6414f, + 1.0690f, 1.9243f, + + 0.9978f, 1.4254f, + 1.4967f, 1.9956f}); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_1xSx1x1xC_Axis3) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 3); + + const int B = 1, S = 2, H = 2, W = 2, C = 3; + std::vector x(B * S * H * W * C); + for (int i = 0; i < (int)x.size(); ++i) x[i] = (float)i; + test.AddInput("X", {B, S, H, W, C}, x); + + test.AddInput("scale", {1, S, 1, 1, C}, + {1.0f, 1.1f, 1.2f, + 1.3f, 1.4f, 1.5f}, + true); + + test.AddOutput("Y", {B, S, H, W, C}, + { + 
0.0000f, + 0.3633f, + 0.7927f, + 0.9909f, + 1.4533f, + 1.9817f, + 0.6921f, + 0.8881f, + 1.1073f, + 1.0381f, + 1.2688f, + 1.5225f, + 1.0685f, + 1.2466f, + 1.4383f, + 1.3356f, + 1.5342f, + 1.7465f, + 1.1375f, + 1.2931f, + 1.4584f, + 1.3271f, + 1.4973f, + 1.6771f, + }); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} + +TEST(RMSNormalizationOpTest, RMSNorm_Scale_Float16_OuterInnerBroadcast_Axis1) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 1); + std::vector x_f(24); + for (int i = 0; i < 24; ++i) x_f[i] = static_cast(i); + + std::vector x_half(x_f.size()); + for (size_t i = 0; i < x_f.size(); ++i) + x_half[i] = MLFloat16(x_f[i]); + + test.AddInput("X", {2, 3, 4}, x_half); + + std::vector scale_f = {1.0f, 2.0f, 3.0f}; + std::vector scale_half(scale_f.size()); + for (size_t i = 0; i < scale_f.size(); ++i) + scale_half[i] = MLFloat16(scale_f[i]); + + test.AddInput("scale", {1, 3, 1}, scale_half, true); + + std::vector y_f = { + 0.0000f, 0.1540f, 0.3080f, 0.4620f, + 1.2320f, 1.5400f, 1.8480f, 2.1560f, + 3.6960f, 4.1579f, 4.6199f, 5.0819f, + 0.6728f, 0.7288f, 0.7849f, 0.8409f, + 1.7940f, 1.9061f, 2.0183f, 2.1304f, + 3.3638f, 3.5319f, 3.7001f, 3.8683f}; + + std::vector y_half(y_f.size()); + for (size_t i = 0; i < y_f.size(); ++i) + y_half[i] = MLFloat16(y_f[i]); + + test.AddOutput("Y", {2, 3, 4}, y_half); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} +TEST(RMSNormalizationOpTest, RMSNorm_Scale_Float16_OuterBroadcast_BxSx1_Axis2) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 2); + std::vector x_f(2 * 2 * 3); + for (int i = 0; i < static_cast(x_f.size()); ++i) { + x_f[static_cast(i)] = static_cast(i); + } + std::vector x_half(x_f.size()); + for (size_t i = 0; i < x_f.size(); ++i) { + x_half[i] = MLFloat16(x_f[i]); + } + test.AddInput("X", {2, 2, 3}, x_half); + std::vector scale_f = { + 1.0f, 2.0f, + 3.0f, 4.0f}; + std::vector scale_half(scale_f.size()); + for (size_t i = 0; i < scale_f.size(); ++i) { + scale_half[i] = MLFloat16(scale_f[i]); + } + test.AddInput("scale", {2, 2, 1}, scale_half, true); + std::vector y_f = { + 0.0000f, 0.7746f, 1.5492f, + 1.4697f, 1.9596f, 2.4495f, + + 2.5541f, 2.9798f, 3.4055f, + 3.5881f, 3.9867f, 4.3854f}; + + std::vector y_half(y_f.size()); + for (size_t i = 0; i < y_f.size(); ++i) { + y_half[i] = MLFloat16(y_f[i]); + } + + test.AddOutput("Y", {2, 2, 3}, y_half); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} } // namespace test } // namespace onnxruntime From c934be901e2b03f4133179d85694f06a36db80e9 Mon Sep 17 00:00:00 2001 From: naomiOvad Date: Wed, 19 Nov 2025 20:39:08 +0200 Subject: [PATCH 3/5] Update LayerNorm tests to run on CPU only --- .../test/contrib_ops/layer_norm_op_test.cc | 123 ++++++------------ 1 file changed, 42 insertions(+), 81 deletions(-) diff --git a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc index f7271f673cc49..50879a7c632a0 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc @@ -556,7 +556,7 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_Axis2) { 
test.ConfigEp(std::move(cpu)).RunWithConfig(); } -TEST(LayerNormTest, LayerNorm_Scale_Bias_4D_OuterInnerBroadcast_Axis3) { +TEST(LayerNormTest, LayerNorm_Scale_Bias_4D_OuterInnerBroadcast_Axis3) { OpTester test("LayerNormalization", 17); test.AddAttribute("epsilon", 1e-05f); test.AddAttribute("axis", 3); @@ -600,46 +600,16 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_4D_OuterInnerBroadcast_Axis3) { test.ConfigEp(std::move(cpu)).RunWithConfig(); } -#if defined(USE_DNNL) -TEST(LayerNormTest, LayerNorm17_Scale_Bias_bfloat16) { -#ifdef USE_DNNL - if (!DnnlHasBF16Support()) { - LOGS_DEFAULT(WARNING) << "Hardware does NOT support BF16"; - return; - } -#endif - OpTester test("LayerNormalization", 17); - test.AddAttribute("epsilon", 1e-05f); - - std::vector dims{1, 3, 2}; - test.AddInput("x", dims, MakeBFloat16({1.2416f, 0.946123f, 13.1685f, 0.36423f, 21.145f, 0.03941f})); - test.AddInput("gamma", {2}, MakeBFloat16({-0.6953f, 5.1824f})); - test.AddInput("bias", {2}, MakeBFloat16({0.6435f, -0.3964f})); - test.AddOutput("output", dims, MakeBFloat16({-0.0516f, -5.5776f, -0.0518f, -5.5788f, -0.0518f, -5.5788f})); - test.Run(); -} -// TEST(LayerNormTest, LayerNorm_Scale_Scalar_NoBias) { OpTester test("LayerNormalization", 17); test.AddAttribute("epsilon", 1e-05f); - test.AddAttribute("axis", 2); // normalize over last dim - - // X: shape = {2, 2, 2}, values 0..7 + test.AddAttribute("axis", 2); std::vector x(2 * 2 * 2); for (int i = 0; i < static_cast(x.size()); ++i) { x[static_cast(i)] = static_cast(i); } test.AddInput("X", {2, 2, 2}, x); - - // Scale: scalar - test.AddInput("Scale", {}, {1.5f}, /*is_initializer*/ true); - - // Expected Y ( לפי ONNX LayerNormalization: standardization + scale ) - // מחושב בפייתון בהתאם לספציפיקציה: - // - // Normalized = (X - Mean) / sqrt(Var + eps) - // Y = Normalized * Scale - // + test.AddInput("Scale", {}, {1.5f}, true); test.AddOutput("Y", {2, 2, 2}, { -1.5f, @@ -652,30 +622,23 @@ TEST(LayerNormTest, LayerNorm_Scale_Scalar_NoBias) { 1.5f, }); - // הפרש קטן בגלל חישובי float test.SetOutputAbsErr("Y", 1e-4f); - - test.Run(); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } TEST(LayerNormTest, LayerNorm_Scale_Bias_Scalar) { OpTester test("LayerNormalization", 17); test.AddAttribute("epsilon", 1e-05f); - test.AddAttribute("axis", 2); // normalize over last dim - - // X: shape = {2, 2, 2}, values 0..7 + test.AddAttribute("axis", 2); std::vector x(2 * 2 * 2); for (int i = 0; i < static_cast(x.size()); ++i) { x[static_cast(i)] = static_cast(i); } test.AddInput("X", {2, 2, 2}, x); + test.AddInput("Scale", {}, {1.5f}, true); + test.AddInput("B", {}, {0.1f}, true); - // Scale: scalar - test.AddInput("Scale", {}, {1.5f}, /*is_initializer*/ true); - - // Bias: scalar - test.AddInput("B", {}, {0.1f}, /*is_initializer*/ true); - - // Y = Normalized * 1.5 + 0.1 test.AddOutput("Y", {2, 2, 2}, { -1.4f, @@ -690,34 +653,26 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_Scalar) { test.SetOutputAbsErr("Y", 1e-4f); - test.Run(); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } + TEST(LayerNormTest, LayerNorm_Scale_Bias_PerLastDim) { OpTester test("LayerNormalization", 17); test.AddAttribute("epsilon", 1e-05f); - test.AddAttribute("axis", 2); // normalize over last dim + test.AddAttribute("axis", 2); - // X: shape = {2, 2, 2}, values 0..7 std::vector x(2 * 2 * 2); for 
(int i = 0; i < static_cast(x.size()); ++i) { x[static_cast(i)] = static_cast(i); } test.AddInput("X", {2, 2, 2}, x); - // Scale: shape = {2} -> broadcast לממד האחרון - // scale[0] לכל העמודות במיקום 0, scale[1] למיקום 1 - test.AddInput("Scale", {2}, {1.0f, 2.0f}, /*is_initializer*/ true); + test.AddInput("Scale", {2}, {1.0f, 2.0f}, true); - // Bias: shape = {2} -> broadcast זהה - test.AddInput("B", {2}, {0.0f, 0.5f}, /*is_initializer*/ true); + test.AddInput("B", {2}, {0.0f, 0.5f}, true); - // Y = Normalized * Scale + B - // מחושב לפי הספציפיקציה (Python reference): - // יוצא: - // [[[-1.0, 2.5], - // [-1.0, 2.5]], - // [[-1.0, 2.5], - // [-1.0, 2.5]]] test.AddOutput("Y", {2, 2, 2}, { -1.0f, @@ -732,23 +687,22 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_PerLastDim) { test.SetOutputAbsErr("Y", 1e-4f); - test.Run(); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } + TEST(LayerNormTest, LayerNorm_Scale_Bias_4D_OuterInnerBroadcast) { OpTester test("LayerNormalization", 17); test.AddAttribute("epsilon", 1e-05f); - test.AddAttribute("axis", 3); // normalize over last dim (W) + test.AddAttribute("axis", 3); - // X: shape = {1, 2, 2, 2}, values 0..7 std::vector x(1 * 2 * 2 * 2); for (int i = 0; i < static_cast(x.size()); ++i) { x[static_cast(i)] = static_cast(i); } test.AddInput("X", {1, 2, 2, 2}, x); - // Scale: shape = {1, 2, 1, 2} - // S=0: [1.0, 1.1] - // S=1: [1.2, 1.3] test.AddInput("Scale", {1, 2, 1, 2}, { 1.0f, @@ -756,11 +710,8 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_4D_OuterInnerBroadcast) { 1.2f, 1.3f, }, - /*is_initializer*/ true); + true); - // Bias: shape = {1, 2, 1, 2} - // S=0: [0.0, 0.5] - // S=1: [1.0, 1.5] test.AddInput("B", {1, 2, 1, 2}, { 0.0f, @@ -768,16 +719,7 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_4D_OuterInnerBroadcast) { 1.0f, 1.5f, }, - /*is_initializer*/ true); - - // Expected Y (מחושב בפייתון לפי ה־spec): - // - // [[[[ -1.0, 1.6], - // [ -1.0, 1.6]], - // - // [[ -0.2, 2.8], - // [ -0.2, 2.8]]]] - // + true); test.AddOutput("Y", {1, 2, 2, 2}, { -1.0f, @@ -792,7 +734,26 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_4D_OuterInnerBroadcast) { }); test.SetOutputAbsErr("Y", 1e-4f); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} +#if defined(USE_DNNL) +TEST(LayerNormTest, LayerNorm17_Scale_Bias_bfloat16) { +#ifdef USE_DNNL + if (!DnnlHasBF16Support()) { + LOGS_DEFAULT(WARNING) << "Hardware does NOT support BF16"; + return; + } +#endif + OpTester test("LayerNormalization", 17); + test.AddAttribute("epsilon", 1e-05f); + std::vector dims{1, 3, 2}; + test.AddInput("x", dims, MakeBFloat16({1.2416f, 0.946123f, 13.1685f, 0.36423f, 21.145f, 0.03941f})); + test.AddInput("gamma", {2}, MakeBFloat16({-0.6953f, 5.1824f})); + test.AddInput("bias", {2}, MakeBFloat16({0.6435f, -0.3964f})); + test.AddOutput("output", dims, MakeBFloat16({-0.0516f, -5.5776f, -0.0518f, -5.5788f, -0.0518f, -5.5788f})); test.Run(); } From b7992235a406d956b002163c3acf3eb131c16ce0 Mon Sep 17 00:00:00 2001 From: naomiOvad Date: Fri, 21 Nov 2025 01:52:06 +0200 Subject: [PATCH 4/5] LayerNorm: unify generic implementation, add mixed-broadcast test, and improve input validation --- .../core/providers/cpu/nn/layer_norm_helper.h | 115 ++++---- .../core/providers/cpu/nn/layer_norm_impl.cc | 279 +++++++++++------- .../test/contrib_ops/layer_norm_op_test.cc | 75 ++++- 
.../test/providers/cpu/nn/rms_norm_op_test.cc | 27 ++ 4 files changed, 318 insertions(+), 178 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_helper.h b/onnxruntime/core/providers/cpu/nn/layer_norm_helper.h index 5ce0024ef3bec..ac00580c98aff 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_helper.h +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_helper.h @@ -13,7 +13,7 @@ namespace onnxruntime { constexpr const char* kLayerNormInputShapeMismatchError = "Scale and (optional) bias must match X.shape[axis:] or be NumPy-broadcastable to it."; -constexpr const char* kLayerNormInvalidSize = "Size of X.shape[axis:] must be larger than 1, got "; +constexpr const char* kLayerNormInvalidSize = "Size of X.shape[axis:] must be at least 1, got "; constexpr int64_t kLayerNormInvalidInput = -1; @@ -34,8 +34,6 @@ struct LayerNormParams { int64_t last_rank{0}; onnxruntime::InlinedVector sc_inner_inc; // scale strides for inner dims [axis..] onnxruntime::InlinedVector bi_inner_inc; // bias strides for inner dims [axis..] - onnxruntime::InlinedVector sc_outer_inc; // how much the scale pointer moves (stride) when an outer-dimension index of X changes (dims 0..axis-1) - onnxruntime::InlinedVector bi_outer_inc; // how much the bias pointer moves (stride) when an outer-dimension index of X changes (dims 0..axis-1) onnxruntime::InlinedVector x_outer_strides; // X strides for outer dims [0..axis-1] }; @@ -74,7 +72,8 @@ class LayerNormHelper { params.broadcast_param = 0; params.axis = axis; - if (params.norm_size <= 1) { + // Allow norm_size == 1 (scalar normalization is valid according to ONNX spec). + if (params.norm_size < 1) { params.broadcast_param = kLayerNormInvalidInput; return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, kLayerNormInvalidSize, params.norm_size); } else if (params.scale_size != params.norm_size || (has_bias && params.bias_size != params.scale_size)) { @@ -83,22 +82,21 @@ class LayerNormHelper { // fast-path can be used. If this fails, broadcast_param will be set to // kLayerNormInvalidInput and we may fall back to generic broadcasting later. } - const size_t xr = x_shape.NumDimensions(); const size_t sr = scale_shape.NumDimensions(); const size_t br = has_bias ? bias_shape.NumDimensions() : 0; + params.x_dims.clear(); params.x_dims.reserve(xr); - for (size_t i = 0; i < xr; ++i) params.x_dims.push_back(x_shape.GetDims()[i]); + for (size_t i = 0; i < xr; ++i) { + params.x_dims.push_back(x_shape.GetDims()[i]); + } - // Right-align the scale (and bias) shape to match X's rank, filling leading - // dimensions with 1 so that NumPy-style broadcasting rules can be applied. + // Right-align scale and bias shapes params.sc_dims.clear(); params.sc_dims.resize(xr, 1); - { - for (size_t i = 0; i < sr; ++i) { - params.sc_dims[xr - 1 - i] = scale_shape.GetDims()[sr - 1 - i]; - } + for (size_t i = 0; i < sr; ++i) { + params.sc_dims[xr - 1 - i] = scale_shape.GetDims()[sr - 1 - i]; } params.bi_dims.clear(); @@ -108,8 +106,8 @@ class LayerNormHelper { params.bi_dims[xr - 1 - i] = bias_shape.GetDims()[br - 1 - i]; } } - // Validate that scale and bias shapes are NumPy-broadcastable to X. - // If not, we fail early with a clear shape mismatch error. 
+ + // Validate broadcastability const bool sc_ok = IsNumpyBroadcastable(params.sc_dims, params.x_dims); const bool bi_ok = !has_bias || IsNumpyBroadcastable(params.bi_dims, params.x_dims); if (!sc_ok || !bi_ok) { @@ -120,65 +118,66 @@ class LayerNormHelper { " bias.shape=", bias_shape, " and axis=", axis); } - // Cache the inner dimensions X.shape[axis:] that are normalized together - // for each logical row. - params.last_rank = onnxruntime::narrow(xr) - axis; - params.x_inner_dims.clear(); - params.x_inner_dims.reserve(params.last_rank > 0 ? static_cast(params.last_rank) : 0); - for (size_t i = static_cast(axis); i < xr; ++i) { - params.x_inner_dims.push_back(params.x_dims[i]); - } + // Compute strides for scale/bias once params.sc_strides = MakeStrides(params.sc_dims); params.bi_strides.clear(); if (has_bias) { params.bi_strides = MakeStrides(params.bi_dims); } - // Precompute how scale/bias advance along the inner dimensions [axis..]: - // these increments are used inside the per-row normalization loop. - params.sc_inner_inc.clear(); - params.bi_inner_inc.clear(); - for (size_t i = static_cast(axis); i < xr; ++i) { - params.sc_inner_inc.push_back(params.sc_strides[i]); - if (has_bias) { - params.bi_inner_inc.push_back(params.bi_strides[i]); - } - } - // Compute strides for X over the outer dimensions [0..axis-1], - // used to locate the base address of each logical row in X. - params.x_outer_strides.clear(); - params.x_outer_strides.resize(static_cast(axis), 1); - if (axis > 1) { - for (int64_t d = axis - 2; d >= 0; --d) { - const size_t du = static_cast(d); - params.x_outer_strides[du] = - params.x_outer_strides[du + 1] * params.x_dims[du + 1]; - } - } - // Detect whether scale/bias depend on any outer dimensions [0..axis-1]. - // If any outer stride is non-zero, scale/bias are not purely "inner-only" - // and the simple fast-path based on broadcast_param is not sufficient. - params.sc_outer_inc.clear(); - params.bi_outer_inc.clear(); - for (int64_t i = 0; i < axis; ++i) { - params.sc_outer_inc.push_back(params.sc_strides[static_cast(i)]); - params.bi_outer_inc.push_back(has_bias ? params.bi_strides[static_cast(i)] : 0); - } - + // Detect dependency on outer dimensions [0..axis-1] bool outer_dep = false; for (int64_t i = 0; i < axis; ++i) { - if (params.sc_outer_inc[static_cast(i)] != 0 || - (has_bias && params.bi_outer_inc[static_cast(i)] != 0)) { + const size_t idx = static_cast(i); + if (params.sc_strides[idx] != 0 || + (has_bias && params.bi_strides[idx] != 0)) { outer_dep = true; break; } } - // Enable the generic NumPy-style broadcasting path if either: - // - the fast-path cannot represent this shape (broadcast_param is invalid), or - // - scale/bias have any dependency on outer dimensions. + + // Decide if we need the generic NumPy-style broadcasting path params.use_generic_broadcast = outer_dep || (params.broadcast_param == kLayerNormInvalidInput); + if (params.use_generic_broadcast) { + // Cache inner dims X.shape[axis:] + params.last_rank = onnxruntime::narrow(xr) - axis; + params.x_inner_dims.clear(); + params.x_inner_dims.reserve(params.last_rank > 0 ? static_cast(params.last_rank) : 0); + for (size_t i = static_cast(axis); i < xr; ++i) { + params.x_inner_dims.push_back(params.x_dims[i]); + } + + // Precompute inner increments for scale/bias over [axis..] 
+ params.sc_inner_inc.clear(); + params.bi_inner_inc.clear(); + for (size_t i = static_cast(axis); i < xr; ++i) { + params.sc_inner_inc.push_back(params.sc_strides[i]); + if (has_bias) { + params.bi_inner_inc.push_back(params.bi_strides[i]); + } + } + + // X outer strides [0..axis-1], used only in generic path + params.x_outer_strides.clear(); + params.x_outer_strides.resize(static_cast(axis), 1); + if (axis > 1) { + for (int64_t d = axis - 2; d >= 0; --d) { + const size_t du = static_cast(d); + params.x_outer_strides[du] = + params.x_outer_strides[du + 1] * params.x_dims[du + 1]; + } + } + } else { + // Fast-path: we don't need inner/outer increments + params.last_rank = 0; + params.x_inner_dims.clear(); + params.sc_inner_inc.clear(); + params.bi_inner_inc.clear(); + params.x_outer_strides.clear(); + } + return Status::OK(); } diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 51e175da27caa..712dcaf9d7034 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -169,134 +169,108 @@ ORT_FORCEINLINE void WriteStat(U* dst, ptrdiff_t index, double v) { dst[index] = gsl::narrow_cast(v); } } -// Generic per-row LayerNorm computation that supports full NumPy-style -// broadcasting of scale and bias over both outer and inner dimensions. -// task_idx selects the logical row in X (and the matching slices of scale/bias). -template -void ComputeJobGeneric( - const T* X_data, - const T* scale_data, - const T* bias_data, - const ptrdiff_t task_idx, - const LayerNormParams& params, - const float* scale_float_ptr, - const float* bias_float_ptr, - float epsilon, - bool simplified, - T* Y_data, - U* mean_data, - U* inv_std_dev_data) { - ORT_UNUSED_PARAMETER(scale_float_ptr); - ORT_UNUSED_PARAMETER(bias_float_ptr); - const auto& sc_inner_inc = params.sc_inner_inc; - const auto& bi_inner_inc = params.bi_inner_inc; - const int64_t norm_size = params.norm_size; - const int64_t last_rank = params.last_rank; - const T* p_input = X_data + task_idx * norm_size; - T* p_output = Y_data + task_idx * norm_size; - double mean = 0.0; - double mean_sq = 0.0; - for (int64_t h = 0; h < norm_size; ++h) { - mean += p_input[h]; - mean_sq += static_cast(p_input[h]) * p_input[h]; +template +struct NormalizationMath { + static double LoadInput(const T* ptr, int64_t offset) { + return static_cast(ptr[offset]); } - mean /= static_cast(norm_size); - double denom = simplified - ? std::sqrt(mean_sq / norm_size + epsilon) - : std::sqrt(mean_sq / norm_size - mean * mean + epsilon); - - // Decode the outer-dimension indices for this task_idx and compute the - // base offsets into scale/bias for the current logical row of X. - int64_t off_sc_row = 0; - int64_t off_bi_row = 0; - - if (params.axis > 0) { - const auto& outer_strides = params.x_outer_strides; - for (int64_t d = 0; d < params.axis; ++d) { - const size_t du = static_cast(d); - const int64_t dim = params.x_dims[du]; - const int64_t idx_d = (dim == 0) - ? 
0 - : (task_idx / outer_strides[du]) % dim; + static double LoadScale(const T* scale_data, + const float* scale_float_ptr, + int64_t offset) { + ORT_UNUSED_PARAMETER(scale_float_ptr); + return static_cast(scale_data[offset]); + } - off_sc_row += idx_d * params.sc_strides[du]; - if (bias_data) { - off_bi_row += idx_d * params.bi_strides[du]; - } + static double LoadBias(const T* bias_data, + const float* bias_float_ptr, + int64_t offset) { + ORT_UNUSED_PARAMETER(bias_float_ptr); + if (!bias_data) { + return 0.0; } + return static_cast(bias_data[offset]); } - // Iterate over the inner dimensions using a small multi-dimensional index - // (idx), and use the precomputed inner increments to locate the correct - // scale/bias element for each position within the row. - onnxruntime::InlinedVector idx(static_cast(last_rank), 0); - for (int64_t h = 0; h < norm_size; ++h) { - int64_t off_sc = off_sc_row; - int64_t off_bi = off_bi_row; - for (size_t d = 0; d < static_cast(last_rank); ++d) { - off_sc += idx[d] * sc_inner_inc[d]; - if (bias_data) off_bi += idx[d] * bi_inner_inc[d]; - } + static void StoreOutput(T* dst, int64_t offset, double v) { + dst[offset] = static_cast(v); + } +}; - const double s = static_cast(scale_data[off_sc]); - const double b = (bias_data ? static_cast(bias_data[off_bi]) : 0.0); - const double x = static_cast(p_input[h]); - const double y = simplified ? (x / denom) * s - : ((x - mean) / denom) * s + b; - p_output[h] = static_cast(y); +struct HalfMath { + static double LoadInput(const MLFloat16* ptr, int64_t offset) { + return static_cast(static_cast(ptr[offset])); + } - for (int64_t d = last_rank - 1; d >= 0; --d) { - if (++idx[static_cast(d)] < params.x_inner_dims[static_cast(d)]) break; - idx[static_cast(d)] = 0; + static double LoadScale(const MLFloat16* scale_data, + const float* scale_float_ptr, + int64_t offset) { + if (scale_float_ptr) { + return static_cast(scale_float_ptr[offset]); } + return static_cast(static_cast(scale_data[offset])); } - if (mean_data) { - WriteStat(mean_data, task_idx, mean); + + static double LoadBias(const MLFloat16* bias_data, + const float* bias_float_ptr, + int64_t offset) { + if (bias_float_ptr) { + return static_cast(bias_float_ptr[offset]); + } + if (bias_data) { + return static_cast(static_cast(bias_data[offset])); + } + return 0.0; } - if (inv_std_dev_data) { - WriteStat(inv_std_dev_data, task_idx, 1.0 / denom); + + static void StoreOutput(MLFloat16* dst, int64_t offset, double v) { + dst[offset] = MLFloat16(static_cast(v)); } -} -// Specialization for MLFloat16 input/output: we compute statistics and the -// normalized values in float, optionally using pre-converted float scale/bias -// buffers (scale_float_ptr / bias_float_ptr) for better performance and reuse. -template -void ComputeJobGeneric( - const MLFloat16* X_data, - const MLFloat16* scale_data, - const MLFloat16* bias_data, +}; +// Shared generic implementation for LayerNorm with full NumPy-style broadcasting. +// DataT - storage type (float/double/MLFloat16) +// MathPolicy - policy that handles load/store/cast for DataT +// U - statistics output type (float, MLFloat16, etc.) 
+template +void ComputeJobGenericShared( + const DataT* X_data, + const DataT* scale_data, + const DataT* bias_data, const ptrdiff_t task_idx, const LayerNormParams& params, const float* scale_float_ptr, const float* bias_float_ptr, float epsilon, bool simplified, - MLFloat16* Y_data, + DataT* Y_data, U* mean_data, U* inv_std_dev_data) { - const auto& sc_inner_inc = params.sc_inner_inc; - const auto& bi_inner_inc = params.bi_inner_inc; const int64_t norm_size = params.norm_size; const int64_t last_rank = params.last_rank; - const MLFloat16* p_input = X_data + task_idx * norm_size; - MLFloat16* p_output = Y_data + task_idx * norm_size; + const DataT* p_input = X_data + task_idx * norm_size; + DataT* p_output = Y_data + task_idx * norm_size; + // Compute mean and denom (same for all types, via MathPolicy). double mean = 0.0; double mean_sq = 0.0; for (int64_t h = 0; h < norm_size; ++h) { - const float xv = static_cast(p_input[h]); + const double xv = MathPolicy::LoadInput(p_input, h); mean += xv; - mean_sq += static_cast(xv) * xv; + mean_sq += xv * xv; } + mean /= static_cast(norm_size); const double denom = simplified ? std::sqrt(mean_sq / norm_size + epsilon) : std::sqrt(mean_sq / norm_size - mean * mean + epsilon); + + // Compute outer offsets for this logical row (same as before). int64_t off_sc_row = 0; int64_t off_bi_row = 0; + const bool has_bias_any = (bias_data != nullptr) || (bias_float_ptr != nullptr); + if (params.axis > 0) { const auto& outer_strides = params.x_outer_strides; @@ -308,49 +282,78 @@ void ComputeJobGeneric( : (task_idx / outer_strides[du]) % dim; off_sc_row += idx_d * params.sc_strides[du]; - if (bias_data || bias_float_ptr) { + if (has_bias_any) { off_bi_row += idx_d * params.bi_strides[du]; } } } + // Prepare inner-dimension iteration (multi-dimensional idx for inner dims, + // plus optimized inner loop over the last dimension). + ORT_ENFORCE(last_rank > 0); onnxruntime::InlinedVector idx(static_cast(last_rank), 0); - for (int64_t h = 0; h < norm_size; ++h) { + const auto& x_inner_dims = params.x_inner_dims; + const auto& sc_inner_inc = params.sc_inner_inc; + const auto& bi_inner_inc = params.bi_inner_inc; + + const int64_t last_dim = x_inner_dims[static_cast(last_rank - 1)]; + ORT_ENFORCE(last_dim > 0); + ORT_ENFORCE(norm_size % last_dim == 0); + const int64_t num_chunks = norm_size / last_dim; + + const int64_t sc_last_stride = !sc_inner_inc.empty() ? sc_inner_inc.back() : 0; + const int64_t bi_last_stride = + (has_bias_any && !bi_inner_inc.empty()) ? bi_inner_inc.back() : 0; + + // Outer loop: iterate over "chunks" of the last dimension. + for (int64_t c = 0; c < num_chunks; ++c) { int64_t off_sc = off_sc_row; int64_t off_bi = off_bi_row; - for (size_t d = 0; d < static_cast(last_rank); ++d) { - off_sc += idx[d] * sc_inner_inc[d]; - if (bias_data || bias_float_ptr) { - off_bi += idx[d] * bi_inner_inc[d]; + // Base offsets for all inner dims except the last. + for (int64_t d = 0; d < last_rank - 1; ++d) { + const size_t du = static_cast(d); + off_sc += idx[du] * sc_inner_inc[du]; + if (has_bias_any) { + off_bi += idx[du] * bi_inner_inc[du]; } } - const float s = scale_float_ptr - ? scale_float_ptr[off_sc] - : static_cast(scale_data[off_sc]); + const int64_t base_h = c * last_dim; - const float b = bias_float_ptr - ? bias_float_ptr[off_bi] - : (bias_data ? static_cast(bias_data[off_bi]) : 0.0f); + // Tight inner loop over the last dimension: compiler can vectorize this. 
+ for (int64_t i = 0; i < last_dim; ++i) { + const int64_t h = base_h + i; - const float x = static_cast(p_input[h]); - const float y = simplified - ? (x / static_cast(denom)) * s - : ((x - static_cast(mean)) / static_cast(denom)) * s + b; + const int64_t sc_offset = off_sc + i * sc_last_stride; + const int64_t bi_offset = off_bi + i * bi_last_stride; - p_output[h] = MLFloat16(y); + const double x = MathPolicy::LoadInput(p_input, h); + const double s = MathPolicy::LoadScale(scale_data, scale_float_ptr, sc_offset); + const double b = MathPolicy::LoadBias(bias_data, bias_float_ptr, bi_offset); - for (int64_t d = last_rank - 1; d >= 0; --d) { - const size_t du = static_cast(d); - if (++idx[du] < params.x_inner_dims[du]) { - break; + const double y = simplified + ? (x / denom) * s + : ((x - mean) / denom) * s + b; + + MathPolicy::StoreOutput(p_output, h, y); + } + + // Update multi-dimensional index 'idx' for the next chunk + // (iterate backwards from the second-to-last dimension). + if (last_rank > 1) { + for (int64_t d = last_rank - 2; d >= 0; --d) { + const size_t du = static_cast(d); + if (++idx[du] < x_inner_dims[du]) { + break; + } + idx[du] = 0; } - idx[du] = 0; } } + // Write statistics outputs. if (mean_data) { WriteStat(mean_data, task_idx, mean); } @@ -358,6 +361,54 @@ void ComputeJobGeneric( WriteStat(inv_std_dev_data, task_idx, 1.0 / denom); } } +template +void ComputeJobGeneric( + const T* X_data, + const T* scale_data, + const T* bias_data, + const ptrdiff_t task_idx, + const LayerNormParams& params, + const float* scale_float_ptr, + const float* bias_float_ptr, + float epsilon, + bool simplified, + T* Y_data, + U* mean_data, + U* inv_std_dev_data) { + ORT_UNUSED_PARAMETER(scale_float_ptr); + ORT_UNUSED_PARAMETER(bias_float_ptr); + + using Policy = NormalizationMath; + ComputeJobGenericShared( + X_data, scale_data, bias_data, + task_idx, params, + nullptr, + nullptr, + epsilon, simplified, + Y_data, mean_data, inv_std_dev_data); +} +template +void ComputeJobGeneric( + const MLFloat16* X_data, + const MLFloat16* scale_data, + const MLFloat16* bias_data, + const ptrdiff_t task_idx, + const LayerNormParams& params, + const float* scale_float_ptr, + const float* bias_float_ptr, + float epsilon, + bool simplified, + MLFloat16* Y_data, + U* mean_data, + U* inv_std_dev_data) { + using Policy = HalfMath; + ComputeJobGenericShared( + X_data, scale_data, bias_data, + task_idx, params, + scale_float_ptr, bias_float_ptr, + epsilon, simplified, + Y_data, mean_data, inv_std_dev_data); +} void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr& dest, bool& is_packed) { if (tensor.GetElementType() == utils::ToTensorProtoElementType()) { diff --git a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc index 50879a7c632a0..c97e70a550730 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc @@ -428,19 +428,18 @@ TEST(LayerNormTest, LayerNorm17_double) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kDnnlExecutionProvider}); } -// Test normalize size shall be larger than 1. 
-TEST(LayerNormTest, LayerNorm_InvalidNormSize) { +TEST(LayerNormTest, LayerNorm_NormSize1_Valid) { OpTester test("LayerNormalization"); test.AddAttribute("epsilon", 1e-05f); - std::vector dims{1, 3, 1}; test.AddInput("x", dims, {1.2416f, 0.946123f, 13.1685f}); test.AddInput("gamma", {1}, {-0.6953f}); test.AddInput("bias", {1}, {0.6435f}); test.AddAttribute("axis", 2); - test.AddOutput("output", dims, {-0.0516f, -5.5776f, -0.0518f}); - - RunTestOnCpuAndCuda(test, kLayerNormInvalidSize); + test.AddOutput("output", dims, {0.6435f, 0.6435f, 0.6435f}); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); } TEST(LayerNormTest, LayerNorm_ValidScaleBias_Broadcast) { @@ -738,6 +737,70 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_4D_OuterInnerBroadcast) { if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; test.ConfigEp(std::move(cpu)).RunWithConfig(); } +TEST(LayerNormTest, LayerNorm_NormSize1_NoBias) { + OpTester test("LayerNormalization", 17); + test.AddAttribute("axis", 2); + test.AddAttribute("epsilon", 1e-5f); + + std::vector x = { + 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f}; + test.AddInput("X", {2, 3, 1}, x); + test.AddInput("Scale", {1}, {1.0f}); + std::vector expected = { + 0.0f, 0.0f, 0.0f, + 0.0f, 0.0f, 0.0f}; + + test.AddOutput("Y", {2, 3, 1}, expected); + test.AddOutput("Mean", {2, 3, 1}, + {1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f}); + + float inv_std = 1.0f / sqrtf(1e-5f); + test.AddOutput("InvStdDev", {2, 3, 1}, + {inv_std, inv_std, inv_std, + inv_std, inv_std, inv_std}); + + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} +TEST(LayerNormTest, LayerNorm_NormSize1_WithBiasScale) { + OpTester test("LayerNormalization", 17); + test.AddAttribute("axis", 2); + test.AddAttribute("epsilon", 1e-5f); + test.AddInput("X", {1, 2, 1}, {10.0f, 20.0f}); + test.AddInput("Scale", {1}, {2.0f}); + test.AddInput("Bias", {1}, {5.0f}); + test.AddOutput("Y", {1, 2, 1}, {5.0f, 5.0f}); + test.AddOutput("Mean", {1, 2, 1}, {10.0f, 20.0f}); + float inv_std = 1.0f / sqrtf(1e-5f); + test.AddOutput("InvStdDev", {1, 2, 1}, {inv_std, inv_std}); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} + +TEST(LayerNormTest, LayerNorm_Scale_Broadcast_Inner_Mixed) { + OpTester test("LayerNormalization", 17); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 1); + std::vector dims{1, 2, 4}; + std::vector x = { + 0.0f, 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, 7.0f}; + test.AddInput("X", dims, x); + std::vector scale = {1.0f, 0.5f, 1.0f, 0.5f}; + test.AddInput("Scale", {1, 4}, scale); + std::vector expected_y = { + -1.527524f, -0.545544f, -0.654653f, -0.109109f, + 0.218218f, 0.327327f, 1.091088f, 0.763762f}; + test.AddOutput("Y", dims, expected_y); + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} + #if defined(USE_DNNL) TEST(LayerNormTest, LayerNorm17_Scale_Bias_bfloat16) { #ifdef USE_DNNL diff --git a/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc index 74d5303b1e932..d16e5eee3b50d 100644 --- a/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc 
@@ -687,6 +687,33 @@ TEST(RMSNormalizationOpTest, RMSNorm_Scale_Float16_OuterBroadcast_BxSx1_Axis2) { if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; test.ConfigEp(std::move(cpu)).RunWithConfig(); } +TEST(RMSNormalizationOpTest, RMSNorm_Scale_Broadcast_Inner_Mixed) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 1); + std::vector dims{1, 2, 4}; + std::vector x = { + 0.0f, 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, 7.0f}; + test.AddInput("X", dims, x); + std::vector scale = {1.0f, 0.5f, 1.0f, 0.5f}; + test.AddInput("Scale", {1, 4}, scale); + std::vector expected = { + 0.0f, + 0.119527f, + 0.478108f, + 0.358581f, + 0.956216f, + 0.597635f, + 1.434324f, + 0.836689f}; + + test.AddOutput("Y", dims, expected); + + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) GTEST_SKIP() << "CPU EP not available in this build."; + test.ConfigEp(std::move(cpu)).RunWithConfig(); +} } // namespace test } // namespace onnxruntime From dc68d2be00be52b7e3a57c077792b2069ab4fdea Mon Sep 17 00:00:00 2001 From: naomiOvad Date: Tue, 25 Nov 2025 12:52:10 +0200 Subject: [PATCH 5/5] Added rank check, renamed sc_/bi_, and added a test for invalid Scale/Bias rank. --- .../core/providers/cpu/nn/layer_norm_helper.h | 56 ++++++++++--------- .../core/providers/cpu/nn/layer_norm_impl.cc | 16 +++--- .../test/providers/cpu/nn/rms_norm_op_test.cc | 29 ++++++++++ 3 files changed, 68 insertions(+), 33 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_helper.h b/onnxruntime/core/providers/cpu/nn/layer_norm_helper.h index ac00580c98aff..1060f51811654 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_helper.h +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_helper.h @@ -26,14 +26,14 @@ struct LayerNormParams { bool use_generic_broadcast{false}; // true: full NumPy-style broadcast; false: legacy broadcast_param path onnxruntime::InlinedVector x_dims; onnxruntime::InlinedVector x_inner_dims; // X.shape[axis:] - onnxruntime::InlinedVector sc_dims; - onnxruntime::InlinedVector bi_dims; - onnxruntime::InlinedVector sc_strides; - onnxruntime::InlinedVector bi_strides; + onnxruntime::InlinedVector scale_dims; + onnxruntime::InlinedVector bias_dims; + onnxruntime::InlinedVector scale_strides; + onnxruntime::InlinedVector bias_strides; int64_t axis{0}; int64_t last_rank{0}; - onnxruntime::InlinedVector sc_inner_inc; // scale strides for inner dims [axis..] - onnxruntime::InlinedVector bi_inner_inc; // bias strides for inner dims [axis..] + onnxruntime::InlinedVector scale_inner_inc; // scale strides for inner dims [axis..] + onnxruntime::InlinedVector bias_inner_inc; // bias strides for inner dims [axis..] onnxruntime::InlinedVector x_outer_strides; // X strides for outer dims [0..axis-1] }; @@ -86,6 +86,12 @@ class LayerNormHelper { const size_t sr = scale_shape.NumDimensions(); const size_t br = has_bias ? 
bias_shape.NumDimensions() : 0; + if (sr > xr || (has_bias && br > xr)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + kLayerNormInputShapeMismatchError, + " Scale/Bias rank cannot exceed Input rank."); + } + params.x_dims.clear(); params.x_dims.reserve(xr); for (size_t i = 0; i < xr; ++i) { @@ -93,23 +99,23 @@ class LayerNormHelper { } // Right-align scale and bias shapes - params.sc_dims.clear(); - params.sc_dims.resize(xr, 1); + params.scale_dims.clear(); + params.scale_dims.resize(xr, 1); for (size_t i = 0; i < sr; ++i) { - params.sc_dims[xr - 1 - i] = scale_shape.GetDims()[sr - 1 - i]; + params.scale_dims[xr - 1 - i] = scale_shape.GetDims()[sr - 1 - i]; } - params.bi_dims.clear(); + params.bias_dims.clear(); if (has_bias) { - params.bi_dims.resize(xr, 1); + params.bias_dims.resize(xr, 1); for (size_t i = 0; i < br; ++i) { - params.bi_dims[xr - 1 - i] = bias_shape.GetDims()[br - 1 - i]; + params.bias_dims[xr - 1 - i] = bias_shape.GetDims()[br - 1 - i]; } } // Validate broadcastability - const bool sc_ok = IsNumpyBroadcastable(params.sc_dims, params.x_dims); - const bool bi_ok = !has_bias || IsNumpyBroadcastable(params.bi_dims, params.x_dims); + const bool sc_ok = IsNumpyBroadcastable(params.scale_dims, params.x_dims); + const bool bi_ok = !has_bias || IsNumpyBroadcastable(params.bias_dims, params.x_dims); if (!sc_ok || !bi_ok) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, kLayerNormInputShapeMismatchError, @@ -120,18 +126,18 @@ class LayerNormHelper { } // Compute strides for scale/bias once - params.sc_strides = MakeStrides(params.sc_dims); - params.bi_strides.clear(); + params.scale_strides = MakeStrides(params.scale_dims); + params.bias_strides.clear(); if (has_bias) { - params.bi_strides = MakeStrides(params.bi_dims); + params.bias_strides = MakeStrides(params.bias_dims); } // Detect dependency on outer dimensions [0..axis-1] bool outer_dep = false; for (int64_t i = 0; i < axis; ++i) { const size_t idx = static_cast(i); - if (params.sc_strides[idx] != 0 || - (has_bias && params.bi_strides[idx] != 0)) { + if (params.scale_strides[idx] != 0 || + (has_bias && params.bias_strides[idx] != 0)) { outer_dep = true; break; } @@ -150,12 +156,12 @@ class LayerNormHelper { } // Precompute inner increments for scale/bias over [axis..] - params.sc_inner_inc.clear(); - params.bi_inner_inc.clear(); + params.scale_inner_inc.clear(); + params.bias_inner_inc.clear(); for (size_t i = static_cast(axis); i < xr; ++i) { - params.sc_inner_inc.push_back(params.sc_strides[i]); + params.scale_inner_inc.push_back(params.scale_strides[i]); if (has_bias) { - params.bi_inner_inc.push_back(params.bi_strides[i]); + params.bias_inner_inc.push_back(params.bias_strides[i]); } } @@ -173,8 +179,8 @@ class LayerNormHelper { // Fast-path: we don't need inner/outer increments params.last_rank = 0; params.x_inner_dims.clear(); - params.sc_inner_inc.clear(); - params.bi_inner_inc.clear(); + params.scale_inner_inc.clear(); + params.bias_inner_inc.clear(); params.x_outer_strides.clear(); } diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 712dcaf9d7034..7dd9d994e52b4 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -281,9 +281,9 @@ void ComputeJobGenericShared( ? 
0 : (task_idx / outer_strides[du]) % dim; - off_sc_row += idx_d * params.sc_strides[du]; + off_sc_row += idx_d * params.scale_strides[du]; if (has_bias_any) { - off_bi_row += idx_d * params.bi_strides[du]; + off_bi_row += idx_d * params.bias_strides[du]; } } } @@ -294,17 +294,17 @@ void ComputeJobGenericShared( onnxruntime::InlinedVector idx(static_cast(last_rank), 0); const auto& x_inner_dims = params.x_inner_dims; - const auto& sc_inner_inc = params.sc_inner_inc; - const auto& bi_inner_inc = params.bi_inner_inc; + const auto& scale_inner_inc = params.scale_inner_inc; + const auto& bias_inner_inc = params.bias_inner_inc; const int64_t last_dim = x_inner_dims[static_cast(last_rank - 1)]; ORT_ENFORCE(last_dim > 0); ORT_ENFORCE(norm_size % last_dim == 0); const int64_t num_chunks = norm_size / last_dim; - const int64_t sc_last_stride = !sc_inner_inc.empty() ? sc_inner_inc.back() : 0; + const int64_t sc_last_stride = !scale_inner_inc.empty() ? scale_inner_inc.back() : 0; const int64_t bi_last_stride = - (has_bias_any && !bi_inner_inc.empty()) ? bi_inner_inc.back() : 0; + (has_bias_any && !bias_inner_inc.empty()) ? bias_inner_inc.back() : 0; // Outer loop: iterate over "chunks" of the last dimension. for (int64_t c = 0; c < num_chunks; ++c) { @@ -314,9 +314,9 @@ void ComputeJobGenericShared( // Base offsets for all inner dims except the last. for (int64_t d = 0; d < last_rank - 1; ++d) { const size_t du = static_cast(d); - off_sc += idx[du] * sc_inner_inc[du]; + off_sc += idx[du] * scale_inner_inc[du]; if (has_bias_any) { - off_bi += idx[du] * bi_inner_inc[du]; + off_bi += idx[du] * bias_inner_inc[du]; } } diff --git a/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc index d16e5eee3b50d..25e2b7293ee30 100644 --- a/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/rms_norm_op_test.cc @@ -715,5 +715,34 @@ TEST(RMSNormalizationOpTest, RMSNorm_Scale_Broadcast_Inner_Mixed) { test.ConfigEp(std::move(cpu)).RunWithConfig(); } +TEST(RMSNormalizationOpTest, RMSNorm_InvalidScaleRank_GreaterThanInputRank_ShouldFail) { + OpTester test("RMSNormalization", 23); + test.AddAttribute("epsilon", 1e-05f); + test.AddAttribute("axis", 1); + + std::vector x_dims{2, 4}; + std::vector x = { + 0.0f, 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, 7.0f}; + test.AddInput("X", x_dims, x); + + std::vector scale_dims{1, 2, 4}; + std::vector scale(1 * 2 * 4, 1.0f); + test.AddInput("Scale", scale_dims, scale); + + // Dummy output so model builds; failure is expected during shape check. + std::vector dummy_y(x.size(), 0.0f); + test.AddOutput("Y", x_dims, dummy_y); + + auto cpu = DefaultCpuExecutionProvider(); + if (!cpu) { + GTEST_SKIP() << "CPU EP not available in this build."; + } + test.ConfigEp(std::move(cpu)); + + test.Run(OpTester::ExpectResult::kExpectFailure, + "Scale/Bias rank cannot exceed Input rank."); +} + } // namespace test } // namespace onnxruntime
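
Reviewer-facing sketch 1: the generic broadcasting path added in layer_norm_helper.h calls MakeStrides() and IsNumpyBroadcastable(), whose bodies are not part of this diff. The standalone sketch below shows the behaviour the offset math in ComputeJobGenericShared appears to assume: scale/bias shapes are right-aligned to X's rank, broadcastability follows the usual NumPy rule, and size-1 (broadcast) dimensions get stride 0 so that "offset += idx_d * stride[d]" reuses the same element along that dimension (this is also what the "any outer stride is non-zero" check relies on). The helper names and signatures here are illustrative only, not the ORT declarations.

// Minimal sketch, assuming right-aligned shapes and zero strides for
// broadcast (size-1) dimensions. Not the onnxruntime implementation.
#include <cstdint>
#include <vector>

// Assumed behaviour of IsNumpyBroadcastable(): 'from' and 'to' are already
// right-aligned to the same rank; every dim must match or be 1 in 'from'.
bool IsBroadcastableTo(const std::vector<int64_t>& from, const std::vector<int64_t>& to) {
  for (size_t i = 0; i < to.size(); ++i) {
    if (from[i] != 1 && from[i] != to[i]) return false;
  }
  return true;
}

// Assumed behaviour of MakeStrides(): row-major strides over the tensor's own
// layout, with stride 0 for size-1 dimensions so indexing simply reuses the
// single element stored there.
std::vector<int64_t> MakeBroadcastStrides(const std::vector<int64_t>& dims) {
  std::vector<int64_t> strides(dims.size(), 0);
  int64_t running = 1;
  for (size_t i = dims.size(); i-- > 0;) {
    strides[i] = (dims[i] == 1) ? 0 : running;  // 0 => broadcast dimension
    running *= dims[i];
  }
  return strides;
}

int main() {
  // Example mirroring the mixed-broadcast tests: X is {1, 2, 4}, Scale is
  // {1, 4} right-aligned to {1, 1, 4}.
  const std::vector<int64_t> x_dims = {1, 2, 4};
  const std::vector<int64_t> scale_dims = {1, 1, 4};
  if (!IsBroadcastableTo(scale_dims, x_dims)) return 1;
  const std::vector<int64_t> strides = MakeBroadcastStrides(scale_dims);
  // Expected strides == {0, 0, 1}: dims 0 and 1 are broadcast, last dim is dense.
  return strides == std::vector<int64_t>{0, 0, 1} ? 0 : 1;
}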
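
Reviewer-facing sketch 2: the LayerNorm_Scale_Broadcast_Inner_Mixed and RMSNorm_Scale_Broadcast_Inner_Mixed tests above hard-code their expected outputs. A small standalone program like the following, assuming the usual definitions (LayerNorm: Y = (X - mean) / sqrt(var + eps) * scale; RMSNorm, i.e. the "simplified" path: Y = X / sqrt(mean(X^2) + eps) * scale), reproduces those constants to within the tester's default float tolerance and can be used to regenerate them if the inputs change.

// Recomputes the expected values for X = {1,2,4} with values 0..7, axis = 1,
// and Scale = {1,4} = {1.0, 0.5, 1.0, 0.5} broadcast over the last dimension.
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
  const double eps = 1e-5;
  std::vector<double> x(8);
  for (size_t i = 0; i < x.size(); ++i) x[i] = static_cast<double>(i);
  const std::vector<double> scale = {1.0, 0.5, 1.0, 0.5};  // broadcast along last dim

  // Statistics over X.shape[axis:] (all 8 elements for axis = 1).
  double mean = 0.0, mean_sq = 0.0;
  for (double v : x) {
    mean += v;
    mean_sq += v * v;
  }
  mean /= static_cast<double>(x.size());
  mean_sq /= static_cast<double>(x.size());

  const double ln_denom = std::sqrt(mean_sq - mean * mean + eps);  // LayerNorm
  const double rms_denom = std::sqrt(mean_sq + eps);               // RMSNorm ("simplified")

  for (size_t i = 0; i < x.size(); ++i) {
    const double s = scale[i % scale.size()];
    std::printf("LayerNorm %.6f   RMSNorm %.6f\n",
                (x[i] - mean) / ln_denom * s,
                x[i] / rms_denom * s);
  }
  return 0;
}

The printed LayerNorm column corresponds to expected_y in the LayerNorm test and the RMSNorm column to expected in the RMSNorm test; both agree with the hard-coded constants to roughly four decimal places, which is within the default float tolerance used by the tester.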