Skip to content

Commit 5ec6d8c

Browse files
committed
Add ONNX opset 23 RMSNormalization operator support
Implements the RMSNormalization operator for the TensorRT ONNX parser, enabling deployment of modern transformer architectures (LLaMA, Mistral, etc.) that use RMSNorm instead of LayerNorm.

Implementation details:
- Computes Y = (X / sqrt(mean(X^2) + epsilon)) * scale
- Supports FP32, FP16, and BF16 data types
- Handles the axis attribute for normalization dimensions
- Supports the epsilon and stash_type attributes per the ONNX spec

Changes:
- onnxOpImporters.cpp: Add RMSNormalization importer using TensorRT primitive operations (ElementWise, Reduce, Unary)
- onnxOpCheckers.cpp: Add empty checker for RMSNormalization
- docs/operators.md: Add RMSNormalization to the supported-operators matrix
- onnx_backend_test.py: Include RMSNormalization tests

Fixes onnx/onnx-tensorrt#4639 (via NVIDIA/TensorRT#4639)

Signed-off-by: Aditi_Pandey <[email protected]>
1 parent c727277 commit 5ec6d8c

File tree

4 files changed

+90
-0
lines changed

4 files changed

+90
-0
lines changed

docs/operators.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,7 @@ TensorRT supports the following ONNX data types: DOUBLE, FLOAT32, FLOAT16, BFLOA
161161
| Reshape | Y | FP32, FP16, BF16, INT32, INT64, BOOL |
162162
| Resize | Y | FP32, FP16, BF16 | Supported resize transformation modes: `half_pixel`, `pytorch_half_pixel`, `tf_half_pixel_for_nn`, `asymmetric`, and `align_corners`.<br />Supported resize modes: `nearest`, `linear`.<br />Supported nearest modes: `floor`, `ceil`, `round_prefer_floor`, `round_prefer_ceil`.<br />Supported aspect ratio policy: `stretch`.<br />When `scales` is a tensor input, `axes` must be an iota vector of length rank(input).<br />Antialiasing is not supported.|
163163
| ReverseSequence | Y | FP32, FP16, BF16, INT32, INT64, BOOL |
164+
| RMSNormalization | Y | FP32, FP16, BF16 | Only the first output `Y` is supported. Introduced in opset 23.
164165
| RNN | Y | FP32, FP16, BF16| For bidirectional RNNs, activation functions must be the same for both the forward and reverse pass
165166
| RoiAlign | Y | FP32, FP16 |
166167
| Round | Y | FP32, FP16, BF16 |

onnxOpCheckers.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -599,6 +599,8 @@ DEFINE_OP_EMPTY_CHECKER(ReduceSum)
599599

600600
DEFINE_OP_EMPTY_CHECKER(ReduceSumSquare)
601601

602+
DEFINE_OP_EMPTY_CHECKER(RMSNormalization)
603+
602604
DEFINE_OP_EMPTY_CHECKER(Relu)
603605

604606
DEFINE_OP_EMPTY_CHECKER(Sign)

onnxOpImporters.cpp

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4791,6 +4791,92 @@ DEFINE_BUILTIN_OP_IMPORTER(ReduceSumSquare)
47914791
inputs.size() >= 2 ? inputs.at(1) : TensorOrWeights());
47924792
}
47934793

4794+
// RMSNormalization: Y = (X / sqrt(mean(X^2) + epsilon)) * scale
4795+
// Introduced in ONNX opset 23
4796+
DEFINE_BUILTIN_OP_IMPORTER(RMSNormalization)
4797+
{
4798+
using eOp = nvinfer1::ElementWiseOperation;
4799+
using uOp = nvinfer1::UnaryOperation;
4800+
using rOp = nvinfer1::ReduceOperation;
4801+
4802+
// Get input tensor
4803+
nvinfer1::ITensor* input = &convertToTensor(inputs.at(0), ctx);
4804+
auto const nbDims = input->getDimensions().nbDims;
4805+
auto const dt = input->getType();
4806+
4807+
// Validate supported data types
4808+
ONNXTRT_CHECK_NODE((dt == DataType::kFLOAT || dt == DataType::kHALF || dt == DataType::kBF16),
4809+
"Only float32/float16/bfloat16 inputs/outputs supported in RMSNormalization. The current data type = "
4810+
+ getTrtDtypeName(dt) + ".",
4811+
node, nodeIdx, ErrorCode::kUNSUPPORTED_NODE_DATATYPE);
4812+
4813+
// Get scale tensor
4814+
nvinfer1::ITensor* scale = &convertToTensor(inputs.at(1), ctx);
4815+
4816+
// Parse attributes
4817+
OnnxAttrs attrs(node, ctx);
4818+
float const epsilon = attrs.get("epsilon", 1e-5f);
4819+
int32_t axis = attrs.get("axis", -1);
4820+
nvinfer1::DataType computeType = nvinfer1::DataType::kFLOAT;
4821+
convertDtype(attrs.get<int32_t>("stash_type", 1), &computeType);
4822+
4823+
// Convert negative axis to positive
4824+
convertAxis(axis, nbDims, node, nodeIdx);
4825+
4826+
// Create axes mask for normalization (from axis to end)
4827+
uint32_t axesMask = 0;
4828+
for (int32_t i = axis; i < nbDims; i++)
4829+
{
4830+
axesMask |= 1 << i;
4831+
}
4832+
4833+
// Step 1: Square the input (X^2)
4834+
auto* sqrLayer = N_CHECK(ctx->network()->addElementWise(*input, *input, eOp::kPROD));
4835+
ctx->registerLayer(sqrLayer, node);
4836+
auto* xSquared = N_CHECK(sqrLayer->getOutput(0));
4837+
4838+
// Step 2: Mean of squared values (mean(X^2))
4839+
auto* meanLayer = N_CHECK(ctx->network()->addReduce(*xSquared, rOp::kAVG, axesMask, true));
4840+
ctx->registerLayer(meanLayer, node);
4841+
auto* meanSquared = N_CHECK(meanLayer->getOutput(0));
4842+
4843+
// Step 3: Add epsilon (mean(X^2) + epsilon)
4844+
nvinfer1::IConstantLayer* epsilonLayer;
4845+
if (dt == DataType::kHALF)
4846+
{
4847+
epsilonLayer = addConstantScalar(ctx, static_cast<half_float::half>(epsilon), ::ONNX_NAMESPACE::TensorProto::FLOAT16);
4848+
}
4849+
else if (dt == DataType::kBF16)
4850+
{
4851+
epsilonLayer = addConstantScalar(ctx, static_cast<BFloat16>(epsilon), ::ONNX_NAMESPACE::TensorProto::BFLOAT16);
4852+
}
4853+
else
4854+
{
4855+
epsilonLayer = addConstantScalar(ctx, epsilon, ::ONNX_NAMESPACE::TensorProto::FLOAT);
4856+
}
4857+
auto* epsilonTensor = N_CHECK(epsilonLayer->getOutput(0));
4858+
auto* addEpsLayer = N_CHECK(ctx->network()->addElementWise(*meanSquared, *epsilonTensor, eOp::kSUM));
4859+
ctx->registerLayer(addEpsLayer, node);
4860+
auto* meanPlusEps = N_CHECK(addEpsLayer->getOutput(0));
4861+
4862+
// Step 4: Square root (sqrt(mean(X^2) + epsilon) = RMS)
4863+
auto* sqrtLayer = N_CHECK(ctx->network()->addUnary(*meanPlusEps, uOp::kSQRT));
4864+
ctx->registerLayer(sqrtLayer, node);
4865+
auto* rms = N_CHECK(sqrtLayer->getOutput(0));
4866+
4867+
// Step 5: Divide input by RMS (X / RMS = normalized)
4868+
auto* divLayer = N_CHECK(ctx->network()->addElementWise(*input, *rms, eOp::kDIV));
4869+
ctx->registerLayer(divLayer, node);
4870+
auto* normalized = N_CHECK(divLayer->getOutput(0));
4871+
4872+
// Step 6: Broadcast scale to input size and multiply (normalized * scale)
4873+
broadcastTensors(ctx, normalized, scale);
4874+
auto* scaleLayer = N_CHECK(ctx->network()->addElementWise(*normalized, *scale, eOp::kPROD));
4875+
ctx->registerLayer(scaleLayer, node);
4876+
4877+
RETURN_FIRST_OUTPUT(scaleLayer, node, nodeIdx);
4878+
}
4879+
47944880
DEFINE_BUILTIN_OP_IMPORTER(Relu)
47954881
{
47964882
return activationHelper(ctx, node, nodeIdx, inputs, nvinfer1::ActivationType::kRELU);

onnx_backend_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,7 @@
107107
backend_test.include(r'.*test_reduce.*')
108108
backend_test.include(r'.*test_ReLU*')
109109
backend_test.include(r'.*test_relu.*')
110+
backend_test.include(r'.*test_rms_normalization.*')
110111
backend_test.include(r'.*test_selu.*')
111112
backend_test.include(r'.*test_shape.*')
112113
backend_test.include(r'.*test_Sigmoid*')

0 commit comments

Comments
 (0)