
Commit c727277: 10.14-GA Release (#1043)

Signed-off-by: Kevin Chen <[email protected]>
Parent: 9a9f788

20 files changed: +669 −67 lines

AttentionHelpers.cpp

Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 */

#include "AttentionHelpers.hpp"
#include "ImporterContext.hpp"
#include "NvInfer.h"
#include "ShapeTensor.hpp"
#include "errorHelpers.hpp"
#include "importerUtils.hpp"
#include <cmath>
#include <numeric>
#include <string>
#include <vector>

namespace
{
//!
//! \brief Return true if `dividend` is divisible by `divisor`.
//!
bool isDivisible(int64_t const dividend, int64_t const divisor)
{
    return (divisor != 0) && ((dividend % divisor) == 0);
}
} // namespace

namespace onnx2trt
{

//!
//! \brief Reshape and return the Q, K, or V tensor from the input tensor.
//!
//! \param qkvInput The input tensor. This can either be a 4D tensor (batchSize, numHeads, sequenceLength, headSize)
//!     or a 3D tensor (batchSize, sequenceLength, hiddenSize=numHeads*headSize). If it is a 3D tensor, permute and
//!     reshape to the 4D shape before returning. Otherwise, return the input tensor.
//! \param attrs The ONNX node attributes.
//! \param ctx The importer context.
//! \param isQ True if the input tensor is the Q tensor, false if it is the K or V tensor.
//! \return nvinfer1::ITensor& The Q, K, or V tensor.
//!
nvinfer1::ITensor& reshapeQKVTensor(
    TensorOrWeights& qkvInput, OnnxAttrs const& attrs, ImporterContext* ctx, bool const isQ)
{
    if (qkvInput.shape().nbDims == 3)
    {
        // qkvInput is a 3D tensor (batchSize, sequenceLength, hiddenSize=numHeads * headSize).
        // Get relevant dimensions.
        int64_t const numHeadsValue
            = isQ ? attrs.get<int64_t>("q_num_heads", 0) : attrs.get<int64_t>("kv_num_heads", 0);
        ONNXTRT_CHECK(numHeadsValue != 0,
            "q_num_heads and kv_num_heads attributes are not specified, which are required for 3D Q/K/V tensors",
            ErrorCode::kINVALID_NODE);
        ShapeTensor numHeads = shapeVector(numHeadsValue);

        ShapeTensor hiddenSize = gather(ctx, shapeOf(qkvInput), shapeVector(2));
        if (hiddenSize.allValuesKnown())
        {
            // Perform static check for divisibility.
            ONNXTRT_CHECK(isDivisible(hiddenSize[0], numHeads[0]),
                "hidden_size must be divisible by num_heads. Received hidden_size=" << hiddenSize[0]
                    << " and num_heads=" << numHeads,
                ErrorCode::kINVALID_NODE);
        }

        ShapeTensor headSize = floorDiv(ctx, hiddenSize, numHeads);

        // == Transform (batchSize, sequenceLength, hiddenSize) -> (batchSize, numHeads, sequenceLength, headSize) by ==
        // 1. Reshape to (batchSize, sequenceLength, numHeads, headSize).
        //    Use (0, 0, numHeads, headSize) as a shorthand to propagate `batchSize` and `sequenceLength` from the
        //    input tensor without instantiating them. Set `zeroIsPlaceholder` to enable this shorthand.
        ShapeTensor newShape = concat(ctx, fillShapeVector(ctx, 0, shapeVector(2)), concat(ctx, numHeads, headSize));
        nvinfer1::IShuffleLayer* shuffle
            = addShuffle(ctx, convertToTensor(qkvInput, ctx), newShape, /*zeroIsPlaceholder*/ true);

        // 2. Permute to (batchSize, numHeads, sequenceLength, headSize)
        shuffle->setSecondTranspose({0, 2, 1, 3});

        return *N_CHECK(shuffle->getOutput(0));
    }
    else
    {
        return convertToTensor(qkvInput, ctx);
    }
}
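The reshape-then-transpose pattern above is easiest to see on a concrete buffer. Below is a standalone sketch (toy dimensions, not parser code) of what the IShuffleLayer computes: reinterpret (batchSize, sequenceLength, numHeads*headSize) as (batchSize, sequenceLength, numHeads, headSize), then permute the axes with {0, 2, 1, 3}:

```cpp
#include <cstdio>
#include <vector>

int main()
{
    int const B = 1, S = 2, H = 2, D = 3; // batch, sequence, numHeads, headSize (assumed toy values)

    // The (B, S, H*D) buffer already has (B, S, H, D) layout, so step 1 (the
    // reshape) moves no data; only the transpose to (B, H, S, D) reorders elements.
    std::vector<float> in(B * S * H * D);
    for (size_t i = 0; i < in.size(); ++i)
    {
        in[i] = static_cast<float>(i);
    }

    std::vector<float> out(in.size());
    for (int b = 0; b < B; ++b)
        for (int h = 0; h < H; ++h)
            for (int s = 0; s < S; ++s)
                for (int d = 0; d < D; ++d)
                    out[((b * H + h) * S + s) * D + d] = in[((b * S + s) * H + h) * D + d];

    // Head 0 now holds rows {0,1,2} and {6,7,8}: each head sees every sequence position.
    for (float v : out)
    {
        printf("%g ", v);
    }
    printf("\n");
    return 0;
}
```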
//!
//! \brief Scale the Q or K tensor by `sqrt(scale)`.
//!
//! `scale` is either provided as an attribute or set as the default value of `1/sqrt(headSize)`. `scale` is defined as
//! `QK^T -> QK^T * scale`, but we apply `Q -> Q * sqrt(scale)` and `K -> K * sqrt(scale)` for numerical stability.
//!
//! \param qkTensor The Q or K tensor to scale.
//! \param attrs The ONNX node attributes.
//! \param ctx The importer context.
//! \return nvinfer1::ITensor& The scaled Q or K tensor.
//!
nvinfer1::ITensor& scaleQKTensor(nvinfer1::ITensor& qkTensor, OnnxAttrs const& attrs, ImporterContext* ctx)
{
    nvinfer1::ITensor* sqrtScale = nullptr;

    if (attrs.count("scale"))
    {
        // Obtain the sqrt of scale as a constant (output of a constant layer).
        nvinfer1::IConstantLayer* constant = addConstantScalar(
            ctx, std::sqrt(attrs.get<float>("scale")), ::ONNX_NAMESPACE::TensorProto::FLOAT, {4, {1, 1, 1, 1}});
        sqrtScale = castHelper(ctx, N_CHECK(constant)->getOutput(0), qkTensor.getType());
    }
    else
    {
        ShapeTensor headSize = gather(ctx, shapeOf(qkTensor), shapeScalar(3));
        nvinfer1::ITensor* headSizeF = castHelper(ctx, &headSize.tensor(ctx), qkTensor.getType());

        // By default, scale := 1/sqrt(headSize)
        nvinfer1::ITensor* sqrtHeadSize = getUnaryResult(ctx, *headSizeF, nvinfer1::UnaryOperation::kSQRT);
        nvinfer1::ITensor* scale = getUnaryResult(ctx, *sqrtHeadSize, nvinfer1::UnaryOperation::kRECIP);

        sqrtScale = getUnaryResult(ctx, *scale, nvinfer1::UnaryOperation::kSQRT);
        sqrtScale = unsqueezeTensor(ctx, *sqrtScale, {0, 1, 2, 3});
    }

    // Scale Q or K tensor by `sqrt(scale)`.
    return *getElementWiseResult(ctx, qkTensor, *sqrtScale, nvinfer1::ElementWiseOperation::kPROD);
}
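Splitting the factor as `sqrt(scale)` on each of Q and K leaves the attention logits unchanged, since `(Q * sqrt(s)) (K * sqrt(s))^T = s * QK^T`. A minimal scalar check of that identity (standalone sketch with assumed values):

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

int main()
{
    float const q = 1.5f, k = -2.25f; // one Q element and one K element (assumed values)
    float const headSize = 64.0f;
    float const scale = 1.0f / std::sqrt(headSize); // the default when no "scale" attribute is given

    // Applying sqrt(scale) to each operand...
    float const split = (q * std::sqrt(scale)) * (k * std::sqrt(scale));
    // ...matches scaling the product once.
    float const direct = scale * (q * k);

    printf("split=%g direct=%g\n", split, direct);
    assert(std::fabs(split - direct) < 1e-6f);
    return 0;
}
```

The numerical-stability remark in the doc comment refers to the split form: neither operand of the matmul carries the full `1/sqrt(headSize)` factor, so intermediate magnitudes stay closer to the unscaled inputs.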
nvinfer1::ITensor& convertToQTensor(TensorOrWeights& qInput, OnnxAttrs const& attrs, ImporterContext* ctx)
{
    return scaleQKTensor(reshapeQKVTensor(qInput, attrs, ctx, true), attrs, ctx);
}

nvinfer1::ITensor& convertToKTensor(TensorOrWeights& kInput, OnnxAttrs const& attrs, ImporterContext* ctx)
{
    return scaleQKTensor(reshapeQKVTensor(kInput, attrs, ctx, false), attrs, ctx);
}

nvinfer1::ITensor& convertToVTensor(TensorOrWeights& vInput, OnnxAttrs const& attrs, ImporterContext* ctx)
{
    return reshapeQKVTensor(vInput, attrs, ctx, false);
}

nvinfer1::ITensor& convertToMaskTensor(TensorOrWeights& maskInput, ImporterContext* ctx)
{
    ONNXTRT_CHECK(maskInput.shape().nbDims <= 4,
        "Attention masks should have rank leq 4. Got mask with rank " << maskInput.shape().nbDims << ".",
        ErrorCode::kINVALID_NODE);

    if (maskInput.shape().nbDims == 4)
    {
        // Mask has rank 4. Directly return the mask tensor.
        return convertToTensor(maskInput, ctx);
    }
    else
    {
        // Mask has rank less than 4. Reshape to rank 4 by prepending dimensions.
        int32_t const numDimsToPrepend = 4 - maskInput.shape().nbDims;
        std::vector<int32_t> unsqueezeAxes(numDimsToPrepend);
        std::iota(unsqueezeAxes.begin(), unsqueezeAxes.end(), 0);

        return *unsqueezeTensor(ctx, convertToTensor(maskInput, ctx), unsqueezeAxes);
    }
}
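The `std::iota` call above builds the leading axes {0, ..., numDimsToPrepend-1} for the unsqueeze, so a rank-r mask gains 4-r leading length-1 dimensions. A standalone sketch of the same rank promotion (hypothetical helper name, not parser code):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Prepend 1s until the mask shape has rank 4, mirroring the unsqueeze above.
std::vector<int64_t> promoteMaskShape(std::vector<int64_t> dims)
{
    size_t const numDimsToPrepend = 4 - dims.size();
    dims.insert(dims.begin(), numDimsToPrepend, 1);
    return dims;
}

int main()
{
    // A (qSeqLen, kvSeqLen) mask becomes (1, 1, qSeqLen, kvSeqLen), which
    // TensorRT can then broadcast across the batch and head dimensions.
    for (int64_t d : promoteMaskShape({128, 128}))
    {
        printf("%lld ", static_cast<long long>(d));
    }
    printf("\n");
    return 0;
}
```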
nvinfer1::AttentionNormalizationOp parseNormalizationOp(OnnxAttrs const& attrs)
{
    std::string normalizationOp
        = attrs.get<std::string>("TRT_normalization_op", "softmax"); // Normalization op defaults to softmax.
    if (normalizationOp == "softmax")
    {
        return nvinfer1::AttentionNormalizationOp::kSOFTMAX;
    }
    else if (normalizationOp == "none")
    {
        return nvinfer1::AttentionNormalizationOp::kNONE;
    }
    else
    {
        ONNXTRT_CHECK(false, "Unsupported normalization op: " << normalizationOp, ErrorCode::kINVALID_NODE);
    }
}

} // namespace onnx2trt

AttentionHelpers.hpp

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * Helper functions used for importing the ONNX Attention operator.
 *
 */

#pragma once

#include "ImporterContext.hpp"
#include "OnnxAttrs.hpp"

namespace onnx2trt
{

//!
//! \brief Convert the input tensor to the Q (query) tensor accepted by TensorRT.
//!
//! This is a wrapper over \p convertToTensor with the following additional transformations:
//! 1) If the input is a 3D tensor with shape (batchSize, sequenceLength, hiddenSize=numHeads*headSize), permute and
//!    reshape to the 4D tensor (batchSize, numHeads, sequenceLength, headSize) expected by TensorRT.
//! 2) Obtain `scale` from the attribute if provided, otherwise use `1/sqrt(headSize)` as the default value. While
//!    `scale` is defined on the QK^T product, apply `sqrt(scale)` on the Q tensor for numerical stability.
//!
//! \param qInput The input tensor to convert.
//! \param attrs The attributes of the Attention node.
//! \param ctx The importer context.
//! \return nvinfer1::ITensor& The converted Q tensor with shape (batchSize, numHeads, sequenceLength, headSize).
//!
nvinfer1::ITensor& convertToQTensor(TensorOrWeights& qInput, OnnxAttrs const& attrs, ImporterContext* ctx);

//!
//! \brief Convert the input tensor to the K (key) tensor accepted by TensorRT.
//!
//! This is a wrapper over \p convertToTensor with the following additional transformations:
//! 1) If the input is a 3D tensor with shape (batchSize, sequenceLength, hiddenSize=numHeads*headSize), permute and
//!    reshape to the 4D tensor (batchSize, numHeads, sequenceLength, headSize) expected by TensorRT.
//! 2) Obtain `scale` from the attribute if provided, otherwise use `1/sqrt(headSize)` as the default value. While
//!    `scale` is defined on the QK^T product, apply `sqrt(scale)` on the K tensor for numerical stability.
//!
//! \param kInput The input tensor to convert.
//! \param attrs The attributes of the Attention node.
//! \param ctx The importer context.
//! \return nvinfer1::ITensor& The converted K tensor with shape (batchSize, numHeads, sequenceLength, headSize).
//!
nvinfer1::ITensor& convertToKTensor(TensorOrWeights& kInput, OnnxAttrs const& attrs, ImporterContext* ctx);

//!
//! \brief Convert the input tensor to the V (value) tensor accepted by TensorRT.
//!
//! This is a wrapper over \p convertToTensor with the following additional transformation:
//! 1) If the input is a 3D tensor with shape (batchSize, sequenceLength, hiddenSize=numHeads*headSize), permute and
//!    reshape to the 4D tensor (batchSize, numHeads, sequenceLength, headSize) expected by TensorRT.
//!
//! \param vInput The input tensor to convert.
//! \param attrs The attributes of the Attention node.
//! \param ctx The importer context.
//! \return nvinfer1::ITensor& The converted V tensor with shape (batchSize, numHeads, sequenceLength, headSize).
//!
nvinfer1::ITensor& convertToVTensor(TensorOrWeights& vInput, OnnxAttrs const& attrs, ImporterContext* ctx);

//!
//! \brief Convert the input tensor to the mask tensor accepted by TensorRT.
//!
//! \precondition: The input tensor shape is ONNX-broadcastable to (batchSize, qNumHeads, qSequenceLength,
//! kvSequenceLength), where ONNX-broadcastable is defined as satisfying any one of the following:
//! 1) The input tensor has exactly the same shape as the target shape.
//! 2) The input tensor has the same rank (number of dimensions) as the target shape, and each dimension is either the
//!    same as the target shape or 1.
//! 3) The input tensor has a lower rank than the target shape, but its shape can be prepended with dimensions of
//!    length 1 to satisfy 2).
//!
//! \param maskInput The input tensor to convert.
//! \param ctx The importer context.
//! \return nvinfer1::ITensor& The converted mask tensor that is TensorRT-broadcastable to (batchSize, qNumHeads,
//!     qSequenceLength, kvSequenceLength), where TensorRT-broadcastable is defined as satisfying properties 1) or
//!     2) above, but not 3).
//!
nvinfer1::ITensor& convertToMaskTensor(TensorOrWeights& maskInput, ImporterContext* ctx);
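The difference between the two broadcastability notions in the precondition is exactly rule 3): TensorRT requires ranks to already match, which is why the converter promotes the mask's rank. A standalone sketch (hypothetical helper) of the ONNX-side test:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Left-pad dims with 1s (rule 3), then require each dim to match the target or be 1 (rules 1/2).
bool isOnnxBroadcastable(std::vector<int64_t> dims, std::vector<int64_t> const& target)
{
    if (dims.size() > target.size())
    {
        return false;
    }
    dims.insert(dims.begin(), target.size() - dims.size(), 1); // rule 3): prepend length-1 dims
    for (size_t i = 0; i < target.size(); ++i)
    {
        if (dims[i] != target[i] && dims[i] != 1) // rules 1)/2): exact match or broadcast
        {
            return false;
        }
    }
    return true;
}

int main()
{
    std::vector<int64_t> const target{2, 8, 128, 128}; // (batchSize, qNumHeads, qSeqLen, kvSeqLen), assumed values
    printf("%d\n", isOnnxBroadcastable({128, 128}, target));       // 1: promoted to (1, 1, 128, 128)
    printf("%d\n", isOnnxBroadcastable({2, 1, 128, 128}, target)); // 1: broadcast over the head dimension
    printf("%d\n", isOnnxBroadcastable({3, 128, 128}, target));    // 0: 3 matches neither qNumHeads nor 1
    return 0;
}
```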
//!
//! \brief Parse the normalization op from the attributes.
//!
//! While ONNX does not support specifying the normalization op (it is always softmax), users can use the custom
//! attribute \p TRT_normalization_op to set it for TensorRT.
//!
//! \param attrs The attributes of the Attention node.
//! \return nvinfer1::AttentionNormalizationOp The parsed normalization op.
//!
nvinfer1::AttentionNormalizationOp parseNormalizationOp(OnnxAttrs const& attrs);

} // namespace onnx2trt

CMakeLists.txt

Lines changed: 4 additions & 3 deletions
@@ -28,7 +28,7 @@ add_definitions("-DSOURCE_LENGTH=${SOURCE_LENGTH}")
 # Version information
 #--------------------------------------------------
 set(ONNX2TRT_MAJOR 10)
-set(ONNX2TRT_MINOR 13)
+set(ONNX2TRT_MINOR 14)
 set(ONNX2TRT_PATCH 0)
 set(ONNX2TRT_VERSION "${ONNX2TRT_MAJOR}.${ONNX2TRT_MINOR}.${ONNX2TRT_PATCH}" CACHE STRING "ONNX2TRT version")

@@ -37,6 +37,7 @@ set(ONNX2TRT_VERSION "${ONNX2TRT_MAJOR}.${ONNX2TRT_MINOR}.${ONNX2TRT_PATCH}" CAC
 #--------------------------------------------------

 set(IMPORTER_SOURCES
+    AttentionHelpers.cpp
     NvOnnxParser.cpp
     ModelImporter.cpp
     ModelRefitter.cpp

@@ -114,11 +115,11 @@ MESSAGE(STATUS "Found TensorRT headers at ${TENSORRT_INCLUDE_DIR}")
 # TensorRT Python Headers
 find_path(TENSORRT_PYTHON_INCLUDE_DIR NvInferPythonPlugin.h
     HINTS ${TENSORRT_ROOT}
-    PATH_SUFFIXES python/include/impl)
+    PATH_SUFFIXES include/impl)

 # If header is not found, download it from open source release.
 if(NOT TENSORRT_PYTHON_INCLUDE_DIR)
-    set(PLUGIN_URL "https://raw.githubusercontent.com/NVIDIA/TensorRT/refs/heads/release/${ONNX2TRT_MAJOR}.${ONNX2TRT_MINOR}/python/include/impl/NvInferPythonPlugin.h")
+    set(PLUGIN_URL "https://raw.githubusercontent.com/NVIDIA/TensorRT/refs/heads/release/${ONNX2TRT_MAJOR}.${ONNX2TRT_MINOR}/include/impl/NvInferPythonPlugin.h")
     set(FILE_DESTINATION "${CMAKE_CURRENT_BINARY_DIR}/NvInferPythonPlugin.h")

     message(NOTICE "Required header NvInferPythonPlugin.h not found. Downloading from ${PLUGIN_URL} to ${FILE_DESTINATION}")

ImporterContext.cpp

Lines changed: 3 additions & 0 deletions
@@ -26,6 +26,9 @@
     } \
     } while (0)

+#define STRINGIFY(x) #x
+#define LITERAL(x) STRINGIFY(x)
+
 namespace
 {
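The two added macros form the classic two-step stringification idiom: `STRINGIFY` quotes its argument verbatim, while `LITERAL` lets the preprocessor expand the argument first and then quotes the result. A standalone sketch (the `TRT_MAJOR` macro here is invented for illustration):

```cpp
#include <cstdio>

#define STRINGIFY(x) #x
#define LITERAL(x) STRINGIFY(x)

#define TRT_MAJOR 10 // hypothetical macro, for demonstration only

int main()
{
    printf("%s\n", STRINGIFY(TRT_MAJOR)); // prints "TRT_MAJOR": no expansion before #
    printf("%s\n", LITERAL(TRT_MAJOR));   // prints "10": expanded first, then stringified
    return 0;
}
```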

ImporterContext.hpp

Lines changed: 10 additions & 8 deletions
@@ -94,11 +94,13 @@ class ImporterContext
     std::set<std::string> mLayerNames;
     //! An increasing suffix counter used to uniquify layer names.
     int64_t mSuffixCounter{0};
-    //! Set to keep track of how many times a batch norm weight name shows up,
-    //! to avoid duplicate naming in TRT.
-    std::set<std::string> mBatchNormWeightNames;
-    //! An increasing suffix counter used to uniquify batch norm weight names.
-    int64_t mBatchNormWeightSuffixCounter{0};
+    //! Set to keep track of how many times a refittable name created by the parser shows up, to avoid duplicate
+    //! naming in TRT. Currently tracks the following nodes:
+    //! 1. BatchNorm - Parser pre-combines scales and bias weights for the IScaleLayer.
+    //! 2. ConstantOfShape - The value of the ConstantOfShape does not have a name, so the parser needs to create one
+    //!    for it.
+    std::set<std::string> mTempRefittableWeights;
+    //! An increasing suffix counter used to uniquify refittable weight names created by the parser.
+    int64_t mTempRefittableWeightsSuffixCounter{0};
     //! Set to hold output tensor names of layers that produce shape tensor outputs but do not
     //! natively support them.
     std::unordered_set<std::string> mUnsupportedShapeTensors;

@@ -221,12 +223,12 @@ class ImporterContext
     }

     // Register an unique name for the created weights
-    ShapedWeights createNamedTempWeights(ShapedWeights::DataType type, nvinfer1::Dims shape, bool batchNormNode = false)
+    ShapedWeights createNamedTempWeights(ShapedWeights::DataType type, nvinfer1::Dims shape, bool refittable = false)
     {
-        if (batchNormNode)
+        if (refittable)
         {
             return mWeightsContext.createNamedTempWeights(
-                type, shape, mBatchNormWeightNames, mBatchNormWeightSuffixCounter, /*batchNormNode=*/true);
+                type, shape, mTempRefittableWeights, mTempRefittableWeightsSuffixCounter, /*refittable=*/true);
         }
         return mWeightsContext.createNamedTempWeights(type, shape, mTensorNames, mSuffixCounter);
     }
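The renamed set/counter pair implements a simple name-uniquification scheme: the set remembers every name already handed out, and the counter supplies a fresh suffix on collision. A minimal standalone sketch of that scheme (hypothetical function, not the actual WeightsContext implementation):

```cpp
#include <cstdint>
#include <cstdio>
#include <set>
#include <string>

std::string uniquify(std::string name, std::set<std::string>& used, int64_t& suffixCounter)
{
    // Append increasing suffixes until insertion into the set succeeds.
    while (!used.insert(name).second)
    {
        name += "_" + std::to_string(suffixCounter++);
    }
    return name;
}

int main()
{
    std::set<std::string> used;
    int64_t counter = 0;
    printf("%s\n", uniquify("bn_scale", used, counter).c_str()); // bn_scale
    printf("%s\n", uniquify("bn_scale", used, counter).c_str()); // bn_scale_0
    return 0;
}
```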

ModelImporter.cpp

Lines changed: 1 addition & 1 deletion
@@ -509,7 +509,7 @@ void parseGraph(ImporterContext* ctx, ::ONNX_NAMESPACE::GraphProto const& graph,
     }
     catch (const std::exception& e)
     {
-        ONNXTRT_THROW(MAKE_ERROR("Failed to import initialzer", ErrorCode::kINVALID_GRAPH));
+        ONNXTRT_THROW(MAKE_ERROR(std::string("Failed to import initializer: ") + e.what(), ErrorCode::kINVALID_GRAPH));
     }

     // Keep track of graph outputs in the context to validate UINT8 nodes
