From 5ec6d8c39cfb0b24d2383f134cba1dc11cab22af Mon Sep 17 00:00:00 2001
From: Aditi_Pandey <54734131+AditiThirdEye@users.noreply.github.com>
Date: Sun, 28 Dec 2025 22:16:39 +0530
Subject: [PATCH] Add ONNX opset 23 RMSNormalization operator support

Implements RMSNormalization operator for TensorRT ONNX parser, enabling
deployment of modern transformer architectures (LLaMA, Mistral, etc.)
that use RMSNorm instead of LayerNorm.

Implementation details:
- Computes Y = (X / sqrt(mean(X^2) + epsilon)) * scale
- Supports FP32, FP16, and BF16 data types
- Handles axis attribute for normalization dimensions
- Supports epsilon and stash_type attributes per ONNX spec

Changes:
- onnxOpImporters.cpp: Add RMSNormalization importer using TensorRT
  primitive operations (ElementWise, Reduce, Unary, Cast)
- onnxOpCheckers.cpp: Add empty checker for RMSNormalization
- docs/operators.md: Add RMSNormalization to supported operators matrix
- onnx_backend_test.py: Include RMSNormalization tests

Fixes onnx/onnx-tensorrt#4639 (via NVIDIA/TensorRT#4639)

Signed-off-by: Aditi_Pandey <54734131+AditiThirdEye@users.noreply.github.com>
---
 docs/operators.md    |   1 +
 onnxOpCheckers.cpp   |   2 ++
 onnxOpImporters.cpp  | 103 +++++++++++++++++++++++++++++++++++++++++++++++++++
 onnx_backend_test.py |   1 +
 4 files changed, 107 insertions(+)

diff --git a/docs/operators.md b/docs/operators.md
index 39034cd7..2b25fff1 100644
--- a/docs/operators.md
+++ b/docs/operators.md
@@ -161,6 +161,7 @@ TensorRT supports the following ONNX data types: DOUBLE, FLOAT32, FLOAT16, BFLOA
 | Reshape | Y | FP32, FP16, BF16, INT32, INT64, BOOL |
 | Resize | Y | FP32, FP16, BF16 | Supported resize transformation modes: `half_pixel`, `pytorch_half_pixel`, `tf_half_pixel_for_nn`, `asymmetric`, and `align_corners`.<br />Supported resize modes: `nearest`, `linear`.<br />Supported nearest modes: `floor`, `ceil`, `round_prefer_floor`, `round_prefer_ceil`.<br />Supported aspect ratio policy: `stretch`.<br />When `scales` is a tensor input, `axes` must be an iota vector of length rank(input).<br />Antialiasing is not supported.|
 | ReverseSequence | Y | FP32, FP16, BF16, INT32, INT64, BOOL |
+| RMSNormalization | Y | FP32, FP16, BF16 | Introduced in opset 23. |
 | RNN | Y | FP32, FP16, BF16| For bidirectional RNNs, activation functions must be the same for both the forward and reverse pass
 | RoiAlign | Y | FP32, FP16 |
 | Round | Y | FP32, FP16, BF16 |
diff --git a/onnxOpCheckers.cpp b/onnxOpCheckers.cpp
index d1dc6960..7369d6e9 100644
--- a/onnxOpCheckers.cpp
+++ b/onnxOpCheckers.cpp
@@ -599,6 +599,8 @@ DEFINE_OP_EMPTY_CHECKER(ReduceSum)
 
 DEFINE_OP_EMPTY_CHECKER(ReduceSumSquare)
 
+DEFINE_OP_EMPTY_CHECKER(RMSNormalization)
+
 DEFINE_OP_EMPTY_CHECKER(Relu)
 
 DEFINE_OP_EMPTY_CHECKER(Sign)
diff --git a/onnxOpImporters.cpp b/onnxOpImporters.cpp
index d3e644bb..4cc7f16e 100644
--- a/onnxOpImporters.cpp
+++ b/onnxOpImporters.cpp
@@ -4791,6 +4791,109 @@ DEFINE_BUILTIN_OP_IMPORTER(ReduceSumSquare)
         inputs.size() >= 2 ? inputs.at(1) : TensorOrWeights());
 }
 
+// RMSNormalization: Y = (X / sqrt(mean(X^2) + epsilon)) * scale
+// Introduced in ONNX opset 23.
+DEFINE_BUILTIN_OP_IMPORTER(RMSNormalization)
+{
+    using eOp = nvinfer1::ElementWiseOperation;
+    using uOp = nvinfer1::UnaryOperation;
+    using rOp = nvinfer1::ReduceOperation;
+
+    // Input data tensor.
+    nvinfer1::ITensor* input = &convertToTensor(inputs.at(0), ctx);
+    auto const nbDims = input->getDimensions().nbDims;
+    auto const dt = input->getType();
+
+    // Validate supported data types.
+    ONNXTRT_CHECK_NODE((dt == DataType::kFLOAT || dt == DataType::kHALF || dt == DataType::kBF16),
+        "Only float32/float16/bfloat16 inputs/outputs supported in RMSNormalization. The current data type = "
+            + getTrtDtypeName(dt) + ".",
+        node, nodeIdx, ErrorCode::kUNSUPPORTED_NODE_DATATYPE);
+
+    // Scale tensor, broadcastable to the shape of the normalized input.
+    nvinfer1::ITensor* scale = &convertToTensor(inputs.at(1), ctx);
+
+    // Attributes: epsilon, axis (first normalized dimension), stash_type (accumulation precision).
+    OnnxAttrs attrs(node, ctx);
+    float const epsilon = attrs.get<float>("epsilon", 1e-5f);
+    int32_t axis = attrs.get<int32_t>("axis", -1);
+    nvinfer1::DataType computeType = nvinfer1::DataType::kFLOAT;
+    convertDtype(attrs.get<int32_t>("stash_type", 1), &computeType);
+
+    // Convert negative axis to positive.
+    convertAxis(axis, nbDims, node, nodeIdx);
+
+    // Create axes mask for normalization (from axis to the last dimension).
+    uint32_t axesMask = 0;
+    for (int32_t i = axis; i < nbDims; i++)
+    {
+        axesMask |= 1U << i;
+    }
+
+    // Per the ONNX spec, the mean-square accumulation happens in stash_type precision.
+    nvinfer1::ITensor* x = input;
+    if (computeType != dt)
+    {
+        auto* castLayer = N_CHECK(ctx->network()->addCast(*input, computeType));
+        ctx->registerLayer(castLayer, node);
+        x = N_CHECK(castLayer->getOutput(0));
+    }
+
+    // Step 1: Square the input (X^2).
+    auto* sqrLayer = N_CHECK(ctx->network()->addElementWise(*x, *x, eOp::kPROD));
+    ctx->registerLayer(sqrLayer, node);
+    auto* xSquared = N_CHECK(sqrLayer->getOutput(0));
+
+    // Step 2: Mean of squared values (mean(X^2)), keeping dims for later broadcasting.
+    auto* meanLayer = N_CHECK(ctx->network()->addReduce(*xSquared, rOp::kAVG, axesMask, true));
+    ctx->registerLayer(meanLayer, node);
+    auto* meanSquared = N_CHECK(meanLayer->getOutput(0));
+
+    // Step 3: Add epsilon (mean(X^2) + epsilon); the constant matches the compute precision.
+    nvinfer1::IConstantLayer* epsilonLayer;
+    if (computeType == DataType::kHALF)
+    {
+        epsilonLayer = addConstantScalar(ctx, static_cast<half_float::half>(epsilon), ::ONNX_NAMESPACE::TensorProto::FLOAT16);
+    }
+    else if (computeType == DataType::kBF16)
+    {
+        epsilonLayer = addConstantScalar(ctx, static_cast<BFloat16>(epsilon), ::ONNX_NAMESPACE::TensorProto::BFLOAT16);
+    }
+    else
+    {
+        epsilonLayer = addConstantScalar(ctx, epsilon, ::ONNX_NAMESPACE::TensorProto::FLOAT);
+    }
+    auto* epsilonTensor = N_CHECK(epsilonLayer->getOutput(0));
+    auto* addEpsLayer = N_CHECK(ctx->network()->addElementWise(*meanSquared, *epsilonTensor, eOp::kSUM));
+    ctx->registerLayer(addEpsLayer, node);
+    auto* meanPlusEps = N_CHECK(addEpsLayer->getOutput(0));
+
+    // Step 4: Square root (sqrt(mean(X^2) + epsilon) = RMS).
+    auto* sqrtLayer = N_CHECK(ctx->network()->addUnary(*meanPlusEps, uOp::kSQRT));
+    ctx->registerLayer(sqrtLayer, node);
+    auto* rms = N_CHECK(sqrtLayer->getOutput(0));
+
+    // Step 5: Divide input by RMS (X / RMS = normalized).
+    auto* divLayer = N_CHECK(ctx->network()->addElementWise(*x, *rms, eOp::kDIV));
+    ctx->registerLayer(divLayer, node);
+    nvinfer1::ITensor* normalized = N_CHECK(divLayer->getOutput(0));
+
+    // Cast back to the input type before applying the scale.
+    if (computeType != dt)
+    {
+        auto* castBackLayer = N_CHECK(ctx->network()->addCast(*normalized, dt));
+        ctx->registerLayer(castBackLayer, node);
+        normalized = N_CHECK(castBackLayer->getOutput(0));
+    }
+
+    // Step 6: Broadcast scale to input rank and multiply (normalized * scale).
+    broadcastTensors(ctx, normalized, scale);
+    auto* scaleLayer = N_CHECK(ctx->network()->addElementWise(*normalized, *scale, eOp::kPROD));
+    ctx->registerLayer(scaleLayer, node);
+
+    RETURN_FIRST_OUTPUT(scaleLayer, node, nodeIdx);
+}
+
 DEFINE_BUILTIN_OP_IMPORTER(Relu)
 {
     return activationHelper(ctx, node, nodeIdx, inputs, nvinfer1::ActivationType::kRELU);
diff --git a/onnx_backend_test.py b/onnx_backend_test.py
index f62004ad..ecbf4692 100644
--- a/onnx_backend_test.py
+++ b/onnx_backend_test.py
@@ -107,6 +107,7 @@
 backend_test.include(r'.*test_reduce.*')
 backend_test.include(r'.*test_ReLU*')
 backend_test.include(r'.*test_relu.*')
+backend_test.include(r'.*test_rms_normalization.*')
 backend_test.include(r'.*test_selu.*')
 backend_test.include(r'.*test_shape.*')
 backend_test.include(r'.*test_Sigmoid*')