
Commit c727277: 10.14-GA Release (#1043)

Signed-off-by: Kevin Chen <[email protected]>
Parent: 9a9f788

20 files changed: +669 −67 lines

AttentionHelpers.cpp

Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 */

#include "AttentionHelpers.hpp"
#include "ImporterContext.hpp"
#include "NvInfer.h"
#include "ShapeTensor.hpp"
#include "errorHelpers.hpp"
#include "importerUtils.hpp"
#include <cmath>
#include <numeric>
#include <string>
#include <vector>

namespace
{
//!
//! \brief Return true if `dividend` is divisible by `divisor`.
//!
bool isDivisible(int64_t const dividend, int64_t const divisor)
{
    return (divisor != 0) && ((dividend % divisor) == 0);
}
} // namespace

namespace onnx2trt
{

//!
//! \brief Reshape and return the Q, K, or V tensor from the input tensor.
//!
//! \param qkvInput The input tensor. This can either be a 4D tensor (batchSize, numHeads, sequenceLength, headSize)
//!     or a 3D tensor (batchSize, sequenceLength, hiddenSize=numHeads*headSize). If it is a 3D tensor, permute and
//!     reshape to the 4D shape before returning. Otherwise, return the input tensor.
//! \param attrs The ONNX node attributes.
//! \param ctx The importer context.
//! \param isQ True if the input tensor is the Q tensor, false if it is the K or V tensor.
//! \return nvinfer1::ITensor& The Q, K, or V tensor.
//!
nvinfer1::ITensor& reshapeQKVTensor(
    TensorOrWeights& qkvInput, OnnxAttrs const& attrs, ImporterContext* ctx, bool const isQ)
{
    if (qkvInput.shape().nbDims == 3)
    {
        // qkvInput is a 3D tensor (batchSize, sequenceLength, hiddenSize=numHeads * headSize).
        // Get relevant dimensions.
        int64_t const numHeadsValue
            = isQ ? attrs.get<int64_t>("q_num_heads", 0) : attrs.get<int64_t>("kv_num_heads", 0);
        ONNXTRT_CHECK(numHeadsValue != 0,
            "q_num_heads and kv_num_heads attributes are not specified, which are required for 3D Q/K/V tensors",
            ErrorCode::kINVALID_NODE);
        ShapeTensor numHeads = shapeVector(numHeadsValue);

        ShapeTensor hiddenSize = gather(ctx, shapeOf(qkvInput), shapeVector(2));
        if (hiddenSize.allValuesKnown())
        {
            // Perform static check for divisibility.
            ONNXTRT_CHECK(isDivisible(hiddenSize[0], numHeads[0]),
                "hidden_size must be divisible by num_heads. Received hidden_size=" << hiddenSize[0]
                    << " and num_heads=" << numHeads,
                ErrorCode::kINVALID_NODE);
        }

        ShapeTensor headSize = floorDiv(ctx, hiddenSize, numHeads);

        // == Transform (batchSize, sequenceLength, hiddenSize) -> (batchSize, numHeads, sequenceLength, headSize) by ==
        // 1. Reshape to (batchSize, sequenceLength, numHeads, headSize).
        //    Use (0, 0, numHeads, headSize) as a shorthand to propagate `batchSize` and `sequenceLength` from the
        //    input tensor without instantiating them. Set `zeroIsPlaceholder` to enable this shorthand.
        ShapeTensor newShape = concat(ctx, fillShapeVector(ctx, 0, shapeVector(2)), concat(ctx, numHeads, headSize));
        nvinfer1::IShuffleLayer* shuffle
            = addShuffle(ctx, convertToTensor(qkvInput, ctx), newShape, /*zeroIsPlaceholder*/ true);

        // 2. Permute to (batchSize, numHeads, sequenceLength, headSize)
        shuffle->setSecondTranspose({0, 2, 1, 3});

        return *N_CHECK(shuffle->getOutput(0));
    }
    else
    {
        return convertToTensor(qkvInput, ctx);
    }
}
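The reshape-then-transpose pattern above is easiest to see on a concrete buffer. Below is a standalone sketch (toy dimensions, not parser code) of what the IShuffleLayer computes: reinterpret (batchSize, sequenceLength, numHeads*headSize) as (batchSize, sequenceLength, numHeads, headSize), then permute the axes with {0, 2, 1, 3}:

```cpp
#include <cstdio>
#include <vector>

int main()
{
    int const B = 1, S = 2, H = 2, D = 3; // batch, sequence, numHeads, headSize (assumed toy values)

    // The (B, S, H*D) buffer already has (B, S, H, D) layout, so step 1 (the
    // reshape) moves no data; only the transpose to (B, H, S, D) reorders elements.
    std::vector<float> in(B * S * H * D);
    for (size_t i = 0; i < in.size(); ++i)
    {
        in[i] = static_cast<float>(i);
    }

    std::vector<float> out(in.size());
    for (int b = 0; b < B; ++b)
        for (int h = 0; h < H; ++h)
            for (int s = 0; s < S; ++s)
                for (int d = 0; d < D; ++d)
                    out[((b * H + h) * S + s) * D + d] = in[((b * S + s) * H + h) * D + d];

    // Head 0 now holds rows {0,1,2} and {6,7,8}: each head sees every sequence position.
    for (float v : out)
    {
        printf("%g ", v);
    }
    printf("\n");
    return 0;
}
```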
//!
//! \brief Scale the Q or K tensor by `sqrt(scale)`.
//!
//! `scale` is either provided as an attribute or set as the default value of `1/sqrt(headSize)`. `scale` is defined as
//! `QK^T -> QK^T * scale`, but we apply `Q -> Q * sqrt(scale)` and `K -> K * sqrt(scale)` for numerical stability.
//!
//! \param qkTensor The Q or K tensor to scale.
//! \param attrs The ONNX node attributes.
//! \param ctx The importer context.
//! \return nvinfer1::ITensor& The scaled Q or K tensor.
//!
nvinfer1::ITensor& scaleQKTensor(nvinfer1::ITensor& qkTensor, OnnxAttrs const& attrs, ImporterContext* ctx)
{
    nvinfer1::ITensor* sqrtScale = nullptr;

    if (attrs.count("scale"))
    {
        // Obtain the sqrt of scale as a constant (output of a constant layer).
        nvinfer1::IConstantLayer* constant = addConstantScalar(
            ctx, std::sqrt(attrs.get<float>("scale")), ::ONNX_NAMESPACE::TensorProto::FLOAT, {4, {1, 1, 1, 1}});
        sqrtScale = castHelper(ctx, N_CHECK(constant)->getOutput(0), qkTensor.getType());
    }
    else
    {
        ShapeTensor headSize = gather(ctx, shapeOf(qkTensor), shapeScalar(3));
        nvinfer1::ITensor* headSizeF = castHelper(ctx, &headSize.tensor(ctx), qkTensor.getType());

        // By default, scale := 1/sqrt(headSize)
        nvinfer1::ITensor* sqrtHeadSize = getUnaryResult(ctx, *headSizeF, nvinfer1::UnaryOperation::kSQRT);
        nvinfer1::ITensor* scale = getUnaryResult(ctx, *sqrtHeadSize, nvinfer1::UnaryOperation::kRECIP);

        sqrtScale = getUnaryResult(ctx, *scale, nvinfer1::UnaryOperation::kSQRT);
        sqrtScale = unsqueezeTensor(ctx, *sqrtScale, {0, 1, 2, 3});
    }

    // Scale Q or K tensor by `sqrt(scale)`.
    return *getElementWiseResult(ctx, qkTensor, *sqrtScale, nvinfer1::ElementWiseOperation::kPROD);
}
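Splitting the factor as `sqrt(scale)` on each of Q and K leaves the attention logits unchanged, since `(Q * sqrt(s)) (K * sqrt(s))^T = s * QK^T`. A minimal scalar check of that identity (standalone sketch with assumed values):

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

int main()
{
    float const q = 1.5f, k = -2.25f; // one Q element and one K element (assumed values)
    float const headSize = 64.0f;
    float const scale = 1.0f / std::sqrt(headSize); // the default when no "scale" attribute is given

    // Applying sqrt(scale) to each operand...
    float const split = (q * std::sqrt(scale)) * (k * std::sqrt(scale));
    // ...matches scaling the product once.
    float const direct = scale * (q * k);

    printf("split=%g direct=%g\n", split, direct);
    assert(std::fabs(split - direct) < 1e-6f);
    return 0;
}
```

The numerical-stability remark in the doc comment refers to the split form: neither operand of the matmul carries the full `1/sqrt(headSize)` factor, so intermediate magnitudes stay closer to the unscaled inputs.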
nvinfer1::ITensor& convertToQTensor(TensorOrWeights& qInput, OnnxAttrs const& attrs, ImporterContext* ctx)
{
    return scaleQKTensor(reshapeQKVTensor(qInput, attrs, ctx, true), attrs, ctx);
}

nvinfer1::ITensor& convertToKTensor(TensorOrWeights& kInput, OnnxAttrs const& attrs, ImporterContext* ctx)
{
    return scaleQKTensor(reshapeQKVTensor(kInput, attrs, ctx, false), attrs, ctx);
}

nvinfer1::ITensor& convertToVTensor(TensorOrWeights& vInput, OnnxAttrs const& attrs, ImporterContext* ctx)
{
    return reshapeQKVTensor(vInput, attrs, ctx, false);
}

nvinfer1::ITensor& convertToMaskTensor(TensorOrWeights& maskInput, ImporterContext* ctx)
{
    ONNXTRT_CHECK(maskInput.shape().nbDims <= 4,
        "Attention masks should have rank leq 4. Got mask with rank " << maskInput.shape().nbDims << ".",
        ErrorCode::kINVALID_NODE);

    if (maskInput.shape().nbDims == 4)
    {
        // Mask has rank 4. Directly return the mask tensor.
        return convertToTensor(maskInput, ctx);
    }
    else
    {
        // Mask has rank less than 4. Reshape to rank 4 by prepending dimensions.
        int32_t const numDimsToPrepend = 4 - maskInput.shape().nbDims;
        std::vector<int32_t> unsqueezeAxes(numDimsToPrepend);
        std::iota(unsqueezeAxes.begin(), unsqueezeAxes.end(), 0);

        return *unsqueezeTensor(ctx, convertToTensor(maskInput, ctx), unsqueezeAxes);
    }
}
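The `std::iota` call above builds the leading axes {0, ..., numDimsToPrepend-1} for the unsqueeze, so a rank-r mask gains 4-r leading length-1 dimensions. A standalone sketch of the same rank promotion (hypothetical helper name, not parser code):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Prepend 1s until the mask shape has rank 4, mirroring the unsqueeze above.
std::vector<int64_t> promoteMaskShape(std::vector<int64_t> dims)
{
    size_t const numDimsToPrepend = 4 - dims.size();
    dims.insert(dims.begin(), numDimsToPrepend, 1);
    return dims;
}

int main()
{
    // A (qSeqLen, kvSeqLen) mask becomes (1, 1, qSeqLen, kvSeqLen), which
    // TensorRT can then broadcast across the batch and head dimensions.
    for (int64_t d : promoteMaskShape({128, 128}))
    {
        printf("%lld ", static_cast<long long>(d));
    }
    printf("\n");
    return 0;
}
```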
nvinfer1::AttentionNormalizationOp parseNormalizationOp(OnnxAttrs const& attrs)
{
    std::string normalizationOp
        = attrs.get<std::string>("TRT_normalization_op", "softmax"); // Normalization op defaults to softmax.
    if (normalizationOp == "softmax")
    {
        return nvinfer1::AttentionNormalizationOp::kSOFTMAX;
    }
    else if (normalizationOp == "none")
    {
        return nvinfer1::AttentionNormalizationOp::kNONE;
    }
    else
    {
        ONNXTRT_CHECK(false, "Unsupported normalization op: " << normalizationOp, ErrorCode::kINVALID_NODE);
    }
}

} // namespace onnx2trt

AttentionHelpers.hpp

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * Helper functions used for importing the ONNX Attention operator.
 *
 */

#pragma once

#include "ImporterContext.hpp"
#include "OnnxAttrs.hpp"

namespace onnx2trt
{

//!
//! \brief Convert the input tensor to the Q (query) tensor accepted by TensorRT.
//!
//! This is a wrapper over \p convertToTensor with the following additional transformations:
//! 1) If the input is a 3D tensor with shape (batchSize, sequenceLength, hiddenSize=numHeads*headSize), permute and
//!    reshape to the 4D tensor (batchSize, numHeads, sequenceLength, headSize) expected by TensorRT.
//! 2) Obtain `scale` from the attribute if provided, otherwise use `1/sqrt(headSize)` as the default value. While
//!    `scale` is defined on the QK^T product, apply `sqrt(scale)` on the Q tensor for numerical stability.
//!
//! \param qInput The input tensor to convert.
//! \param attrs The attributes of the Attention node.
//! \param ctx The importer context.
//! \return nvinfer1::ITensor& The converted Q tensor with shape (batchSize, numHeads, sequenceLength, headSize).
//!
nvinfer1::ITensor& convertToQTensor(TensorOrWeights& qInput, OnnxAttrs const& attrs, ImporterContext* ctx);

//!
//! \brief Convert the input tensor to the K (key) tensor accepted by TensorRT.
//!
//! This is a wrapper over \p convertToTensor with the following additional transformations:
//! 1) If the input is a 3D tensor with shape (batchSize, sequenceLength, hiddenSize=numHeads*headSize), permute and
//!    reshape to the 4D tensor (batchSize, numHeads, sequenceLength, headSize) expected by TensorRT.
//! 2) Obtain `scale` from the attribute if provided, otherwise use `1/sqrt(headSize)` as the default value. While
//!    `scale` is defined on the QK^T product, apply `sqrt(scale)` on the K tensor for numerical stability.
//!
//! \param kInput The input tensor to convert.
//! \param attrs The attributes of the Attention node.
//! \param ctx The importer context.
//! \return nvinfer1::ITensor& The converted K tensor with shape (batchSize, numHeads, sequenceLength, headSize).
//!
nvinfer1::ITensor& convertToKTensor(TensorOrWeights& kInput, OnnxAttrs const& attrs, ImporterContext* ctx);

//!
//! \brief Convert the input tensor to the V (value) tensor accepted by TensorRT.
//!
//! This is a wrapper over \p convertToTensor with the following additional transformation:
//! 1) If the input is a 3D tensor with shape (batchSize, sequenceLength, hiddenSize=numHeads*headSize), permute and
//!    reshape to the 4D tensor (batchSize, numHeads, sequenceLength, headSize) expected by TensorRT.
//!
//! \param vInput The input tensor to convert.
//! \param attrs The attributes of the Attention node.
//! \param ctx The importer context.
//! \return nvinfer1::ITensor& The converted V tensor with shape (batchSize, numHeads, sequenceLength, headSize).
//!
nvinfer1::ITensor& convertToVTensor(TensorOrWeights& vInput, OnnxAttrs const& attrs, ImporterContext* ctx);

//!
//! \brief Convert the input tensor to the mask tensor accepted by TensorRT.
//!
//! \precondition: The input tensor shape is ONNX-broadcastable to (batchSize, qNumHeads, qSequenceLength,
//! kvSequenceLength), where ONNX-broadcastable is defined as satisfying any one of the following:
//! 1) The input tensor has exactly the same shape as the target shape.
//! 2) The input tensor has the same rank (number of dimensions) as the target shape, and each dimension is either the
//!    same as the target shape or 1.
//! 3) The input tensor has a lower rank than the target shape, but its shape can be prepended with dimensions of
//!    length 1 to satisfy 2).
//!
//! \param maskInput The input tensor to convert.
//! \param ctx The importer context.
//! \return nvinfer1::ITensor& The converted mask tensor that is TensorRT-broadcastable to (batchSize, qNumHeads,
//!     qSequenceLength, kvSequenceLength), where TensorRT-broadcastable is defined as satisfying properties 1) or
//!     2) above, but not 3).
//!
nvinfer1::ITensor& convertToMaskTensor(TensorOrWeights& maskInput, ImporterContext* ctx);
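The difference between the two broadcastability notions in the precondition is exactly rule 3): TensorRT requires ranks to already match, which is why the converter promotes the mask's rank. A standalone sketch (hypothetical helper) of the ONNX-side test:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Left-pad dims with 1s (rule 3), then require each dim to match the target or be 1 (rules 1/2).
bool isOnnxBroadcastable(std::vector<int64_t> dims, std::vector<int64_t> const& target)
{
    if (dims.size() > target.size())
    {
        return false;
    }
    dims.insert(dims.begin(), target.size() - dims.size(), 1); // rule 3): prepend length-1 dims
    for (size_t i = 0; i < target.size(); ++i)
    {
        if (dims[i] != target[i] && dims[i] != 1) // rules 1)/2): exact match or broadcast
        {
            return false;
        }
    }
    return true;
}

int main()
{
    std::vector<int64_t> const target{2, 8, 128, 128}; // (batchSize, qNumHeads, qSeqLen, kvSeqLen), assumed values
    printf("%d\n", isOnnxBroadcastable({128, 128}, target));       // 1: promoted to (1, 1, 128, 128)
    printf("%d\n", isOnnxBroadcastable({2, 1, 128, 128}, target)); // 1: broadcast over the head dimension
    printf("%d\n", isOnnxBroadcastable({3, 128, 128}, target));    // 0: 3 matches neither qNumHeads nor 1
    return 0;
}
```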
//!
//! \brief Parse the normalization op from the attributes.
//!
//! While ONNX does not support specifying the normalization op (it is always softmax), users can use the custom
//! attribute \p TRT_normalization_op to set it for TensorRT.
//!
//! \param attrs The attributes of the Attention node.
//! \return nvinfer1::AttentionNormalizationOp The parsed normalization op.
//!
nvinfer1::AttentionNormalizationOp parseNormalizationOp(OnnxAttrs const& attrs);

} // namespace onnx2trt

CMakeLists.txt

Lines changed: 4 additions & 3 deletions
@@ -28,7 +28,7 @@ add_definitions("-DSOURCE_LENGTH=${SOURCE_LENGTH}")
 # Version information
 #--------------------------------------------------
 set(ONNX2TRT_MAJOR 10)
-set(ONNX2TRT_MINOR 13)
+set(ONNX2TRT_MINOR 14)
 set(ONNX2TRT_PATCH 0)
 set(ONNX2TRT_VERSION "${ONNX2TRT_MAJOR}.${ONNX2TRT_MINOR}.${ONNX2TRT_PATCH}" CACHE STRING "ONNX2TRT version")

@@ -37,6 +37,7 @@ set(ONNX2TRT_VERSION "${ONNX2TRT_MAJOR}.${ONNX2TRT_MINOR}.${ONNX2TRT_PATCH}" CAC
 #--------------------------------------------------

 set(IMPORTER_SOURCES
+    AttentionHelpers.cpp
     NvOnnxParser.cpp
     ModelImporter.cpp
     ModelRefitter.cpp

@@ -114,11 +115,11 @@ MESSAGE(STATUS "Found TensorRT headers at ${TENSORRT_INCLUDE_DIR}")
 # TensorRT Python Headers
 find_path(TENSORRT_PYTHON_INCLUDE_DIR NvInferPythonPlugin.h
     HINTS ${TENSORRT_ROOT}
-    PATH_SUFFIXES python/include/impl)
+    PATH_SUFFIXES include/impl)

 # If header is not found, download it from open source release.
 if(NOT TENSORRT_PYTHON_INCLUDE_DIR)
-    set(PLUGIN_URL "https://raw.githubusercontent.com/NVIDIA/TensorRT/refs/heads/release/${ONNX2TRT_MAJOR}.${ONNX2TRT_MINOR}/python/include/impl/NvInferPythonPlugin.h")
+    set(PLUGIN_URL "https://raw.githubusercontent.com/NVIDIA/TensorRT/refs/heads/release/${ONNX2TRT_MAJOR}.${ONNX2TRT_MINOR}/include/impl/NvInferPythonPlugin.h")
     set(FILE_DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/NvInferPythonPlugin.h")

     message(NOTICE "Required header NvInferPythonPlugin.h not found. Downloading from ${PLUGIN_URL} to ${FILE_DESTINATION}")

ImporterContext.cpp

Lines changed: 3 additions & 0 deletions
@@ -26,6 +26,9 @@
     } \
     } while (0)

+#define STRINGIFY(x) #x
+#define LITERAL(x) STRINGIFY(x)
+
 namespace
 {
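The two added macros form the classic two-step stringification idiom: `STRINGIFY` quotes its argument verbatim, while `LITERAL` lets the preprocessor expand the argument first and then quotes the result. A standalone sketch (the `TRT_MAJOR` macro here is invented for illustration):

```cpp
#include <cstdio>

#define STRINGIFY(x) #x
#define LITERAL(x) STRINGIFY(x)

#define TRT_MAJOR 10 // hypothetical macro, for demonstration only

int main()
{
    printf("%s\n", STRINGIFY(TRT_MAJOR)); // prints "TRT_MAJOR": no expansion before #
    printf("%s\n", LITERAL(TRT_MAJOR));   // prints "10": expanded first, then stringified
    return 0;
}
```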

ImporterContext.hpp

Lines changed: 10 additions & 8 deletions
@@ -94,11 +94,13 @@ class ImporterContext
     std::set<std::string> mLayerNames;
     //! An increasing suffix counter used to uniquify layer names.
     int64_t mSuffixCounter{0};
-    //! Set to keep track of how many times a batch norm weight name shows up,
-    //! to avoid duplicate naming in TRT.
-    std::set<std::string> mBatchNormWeightNames;
-    //! An increasing suffix counter used to uniquify batch norm weight names.
-    int64_t mBatchNormWeightSuffixCounter{0};
+    //! Set to keep track of how many times a refittable name created by the parser shows up, to avoid duplicate
+    //! naming in TRT. Currently tracks the following nodes:
+    //! 1. BatchNorm - Parser pre-combines scales and bias weights for the IScaleLayer.
+    //! 2. ConstantOfShape - The value of the ConstantOfShape does not have a name, so the parser needs to create one
+    //!    for it.
+    std::set<std::string> mTempRefittableWeights;
+    //! An increasing suffix counter used to uniquify refittable weight names created by the parser.
+    int64_t mTempRefittableWeightsSuffixCounter{0};
     //! Set to hold output tensor names of layers that produce shape tensor outputs but do not
     //! natively support them.
     std::unordered_set<std::string> mUnsupportedShapeTensors;

@@ -221,12 +223,12 @@ class ImporterContext
     }

     // Register an unique name for the created weights
-    ShapedWeights createNamedTempWeights(ShapedWeights::DataType type, nvinfer1::Dims shape, bool batchNormNode = false)
+    ShapedWeights createNamedTempWeights(ShapedWeights::DataType type, nvinfer1::Dims shape, bool refittable = false)
     {
-        if (batchNormNode)
+        if (refittable)
         {
             return mWeightsContext.createNamedTempWeights(
-                type, shape, mBatchNormWeightNames, mBatchNormWeightSuffixCounter, /*batchNormNode=*/true);
+                type, shape, mTempRefittableWeights, mTempRefittableWeightsSuffixCounter, /*refittable=*/true);
         }
         return mWeightsContext.createNamedTempWeights(type, shape, mTensorNames, mSuffixCounter);
     }
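The renamed set/counter pair implements a simple name-uniquification scheme: the set remembers every name already handed out, and the counter supplies a fresh suffix on collision. A minimal standalone sketch of that scheme (hypothetical function, not the actual WeightsContext implementation):

```cpp
#include <cstdint>
#include <cstdio>
#include <set>
#include <string>

std::string uniquify(std::string name, std::set<std::string>& used, int64_t& suffixCounter)
{
    // Append increasing suffixes until insertion into the set succeeds.
    while (!used.insert(name).second)
    {
        name += "_" + std::to_string(suffixCounter++);
    }
    return name;
}

int main()
{
    std::set<std::string> used;
    int64_t counter = 0;
    printf("%s\n", uniquify("bn_scale", used, counter).c_str()); // bn_scale
    printf("%s\n", uniquify("bn_scale", used, counter).c_str()); // bn_scale_0
    return 0;
}
```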

ModelImporter.cpp

Lines changed: 1 addition & 1 deletion
@@ -509,7 +509,7 @@ void parseGraph(ImporterContext* ctx, ::ONNX_NAMESPACE::GraphProto const& graph,
     }
     catch (const std::exception& e)
     {
-        ONNXTRT_THROW(MAKE_ERROR("Failed to import initialzer", ErrorCode::kINVALID_GRAPH));
+        ONNXTRT_THROW(MAKE_ERROR(std::string("Failed to import initializer: ") + e.what(), ErrorCode::kINVALID_GRAPH));
     }

     // Keep track of graph outputs in the context to validate UINT8 nodes
