-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Open
Description
Description
The outputs of the ONNX model and the TensorRT engine built from it are different.
Docker image: nvcr.io/nvidia/tensorrt:25.10-py3
NVIDIA GPU: L20
NVIDIA driver version: 535.161.08
Steps To Reproduce
Step 1. Generate the ONNX model
import json
import onnx
import time
import torch
import numpy as np
import torch.nn as nn
import onnxruntime as ort
import onnx_graphsurgeon as gs
import torch.nn.functional as F
import onnxruntime.quantization.preprocess
from transformers import AutoModelForImageTextToText, AutoProcessor
# Efficient implementation equivalent to the following:
def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0,
        is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
    """Plain (non-fused) scaled dot-product attention.

    Computes ``softmax(query @ key^T * scale + bias) @ value`` where the last
    two dimensions of each tensor are (sequence, feature).

    Args:
        query: tensor whose last two dims are (L, E).
        key: tensor whose last two dims are (S, E).
        value: tensor whose last two dims are (S, Ev).
        attn_mask: optional mask broadcastable to the attention weights.
            A boolean mask keeps positions that are ``True``; any other dtype
            is added to the attention scores as-is.
        dropout_p: dropout probability handed to ``torch.dropout``; the call
            uses ``train=False``, so it is not applied in this implementation.
        is_causal: when true, position ``i`` may only attend to ``j <= i``.
            Must not be combined with ``attn_mask``.
        scale: multiplier for the scores; defaults to ``1 / sqrt(E)``.
        enable_gqa: when true, key/value are repeated along dim ``-3`` so that
            their size there matches ``query``.

    Returns:
        The attention output, ``attn_weight @ value``.
    """
    # The surrounding script does not import ``math``; import it here so the
    # default ``scale=None`` path does not raise NameError.
    import math

    L, S = query.size(-2), key.size(-2)
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
    attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
    if is_causal:
        assert attn_mask is None
        # Build the mask on the same device as the bias it is applied to.
        temp_mask = torch.ones(
            L, S, dtype=torch.bool, device=query.device
        ).tril(diagonal=0)
        attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            # Convert the boolean mask to an additive bias of its own shape and
            # add it out of place, so masks with leading batch/head dimensions
            # broadcast instead of failing on an in-place fill of the (L, S) bias.
            mask_bias = torch.zeros(
                attn_mask.shape, dtype=query.dtype, device=query.device
            )
            mask_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
            attn_bias = attn_bias + mask_bias
        else:
            attn_bias = attn_mask + attn_bias
    if enable_gqa:
        key = key.repeat_interleave(query.size(-3) // key.size(-3), -3)
        value = value.repeat_interleave(query.size(-3) // value.size(-3), -3)
    attn_weight = query @ key.transpose(-2, -1) * scale_factor
    attn_weight += attn_bias
    attn_weight = torch.softmax(attn_weight, dim=-1)
    attn_weight = torch.dropout(attn_weight, dropout_p, train=False)
    return attn_weight @ value
class SDPAModel(torch.nn.Module):
def __init__(self, dropout_p=0.0, is_causal=False, scale=None, enable_gqa=False):
super(SDPAModel, self).__init__()
self.dropout_p = dropout_p
self.is_causal = is_causal
self.scale = scale
self.enable_gqa = enable_gqa
self.proj = torch.nn.Linear(1152, 1152)
def forward(self, query, key, value):
grid_t, batch_size, _, grid_hw, _ = query.shape
seq_len = grid_t * batch_size * grid_hw
# enable_gqa=True
key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)
attn_output = F.scaled_dot_product_attention(query, key, value, None,
dropout_p=self.dropout_p, is_causal=self.is_causal, scale=self.scale, enable_gqa=self.enable_gqa)
#attn_output = scaled_dot_product_attention(query, key, value, None,
# self.dropout_p, self.is_causal, self.scale, self.enable_gqa)
attn_output = attn_output.transpose(2, 3).contiguous()
attn_output = attn_output.reshape(seq_len, -1).contiguous()
attn_output = self.proj(attn_output)
return attn_output
def export_pt2onnx():
    """Build an SDPAModel on CPU, run it once, and export it to ONNX.

    Writes ``sdpa_visal.onnx`` in the current directory (opset 20). Axes 0, 1
    and 3 of every input are declared dynamic.
    """
    onnx_path = 'sdpa_visal.onnx'
    device = torch.device('cpu:0')

    # Model in inference mode; positional args are
    # (dropout_p, is_causal, scale, enable_gqa).
    model = SDPAModel(0.0, False, 1.0, False).to(device)
    model.eval()

    # Random example inputs used for the sanity run and for the export.
    input_shape = (8, 1, 16, 124, 72)  # (grid_t, 1, 16, grid_hw, 72) for visual model
    query, key, value = (
        torch.rand(input_shape, dtype=torch.float32, device=device)
        for _ in range(3)
    )

    with torch.no_grad():
        out = model(query, key, value)
    print(query.shape, key.shape, value.shape, out.shape)

    input_names = ['query', 'key', 'value']
    # All three inputs share the same dynamic axes.
    dynamic_axes = {
        name: {0: 'grid_t', 1: 'batch_size', 3: 'grid_hw'}
        for name in input_names
    }

    torch.onnx.export(
        model,
        (query, key, value),
        onnx_path,
        export_params=True,
        opset_version=20,
        do_constant_folding=True,
        input_names=input_names,
        output_names=['output'],
        dynamic_axes=dynamic_axes,
        external_data=False,
    )
# Run the export only when this file is executed as a script.
if __name__ == "__main__":
    export_pt2onnx()
Step 2. Compare the TensorRT and ONNX Runtime outputs
# Run the exported model with both ONNX Runtime and TensorRT and compare the outputs. Inputs are generated in [-100, 100]; the TensorRT profile covers shapes from [1,1,16,4,72] to [8,1,16,1008,72] and inference uses the max shape.
polygraphy run \
./sdpa_visal.onnx \
--model-type onnx \
--onnxrt \
--trt \
--validate \
--val-range [-100,100] \
--trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] \
--trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] \
--trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] \
--input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] \
--log-file pg_bf16.log \
--save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine \
--infinities-compare-equal \
--verbose
Q1: The ONNX and TensorRT outputs differ by a large margin. The verbose log is:
[I] RUNNING | Command: /usr/local/bin/polygraphy run ./sdpa_visal.onnx --model-type onnx --onnxrt --trt --validate --val-range [-100,100] --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --log-file pg_bf16.log --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine --infinities-compare-equal --verbose
[V] Loaded Module: polygraphy | Version: 0.49.26 | Path: ['/usr/local/lib/python3.12/dist-packages/polygraphy']
[V] Loaded extension modules: []
[I] Will generate inference input data according to provided TensorMetadata: {query [shape=(8, 1, 16, 1008, 72)],
key [shape=(8, 1, 16, 1008, 72)],
value [shape=(8, 1, 16, 1008, 72)]}
[I] TF32 is disabled by default. Turn on TF32 for better performance with minor accuracy differences.
[I] onnxrt-runner-N0-11/24/25-11:14:23 | Activating and starting inference
[V] Loaded Module: onnxruntime | Version: 1.23.1 | Path: ['/usr/local/lib/python3.12/dist-packages/onnxruntime']
[I] Creating ONNX-Runtime Inference Session with providers: ['CPUExecutionProvider']
[V] Loading inputs from data loader
[V] Generating data using numpy seed: 1
[V] Loaded Module: numpy | Version: 1.26.4 | Path: ['/usr/local/lib/python3.12/dist-packages/numpy']
[V] Input tensor: query | Generating input data in range: [-100.0, 100.0]
[V] Input tensor: key | Generating input data in range: [-100.0, 100.0]
[V] Input tensor: value | Generating input data in range: [-100.0, 100.0]
[I] onnxrt-runner-N0-11/24/25-11:14:23
---- Inference Input(s) ----
{query [dtype=float32, shape=(8, 1, 16, 1008, 72)],
key [dtype=float32, shape=(8, 1, 16, 1008, 72)],
value [dtype=float32, shape=(8, 1, 16, 1008, 72)]}
[V] onnxrt-runner-N0-11/24/25-11:14:23 | Input metadata is: {query [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)],
key [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)],
value [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)]}
[I] onnxrt-runner-N0-11/24/25-11:14:23
---- Inference Output(s) ----
{output [dtype=float32, shape=(8064, 1152)]}
[I] onnxrt-runner-N0-11/24/25-11:14:23 | Completed 1 iteration(s) in 119.6 ms | Average inference time: 119.6 ms.
[I] trt-runner-N0-11/24/25-11:14:23 | Activating and starting inference
[V] Loaded Module: tensorrt | Version: 10.13.3.9 | Path: ['/usr/local/lib/python3.12/dist-packages/tensorrt']
[V] [MemUsageChange] Init CUDA: CPU +39, GPU +0, now: CPU 1326, GPU 1892 (MiB)
[V] [MemUsageChange] Init builder kernel library: CPU +1012, GPU +8, now: CPU 2535, GPU 1900 (MiB)
[V] ----------------------------------------------------------------
[V] Input filename: sdpa_visal.onnx
[V] ONNX IR version: 0.0.9
[V] Opset version: 20
[V] Producer name: pytorch
[V] Producer version: 2.8.0
[V] Domain:
[V] Model version: 0
[V] Doc string:
[V] ----------------------------------------------------------------
[V] Setting TensorRT Optimization Profiles
[V] Input tensor: query (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[V] Input tensor: key (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[V] Input tensor: value (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[I] Configuring with profiles:[
Profile 0:
{query [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]],
key [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]],
value [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]]}
]
[W] profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect.
[I] Building engine with configuration:
Flags | []
Engine Capability | EngineCapability.STANDARD
Memory Pools | [WORKSPACE: 45596.06 MiB, TACTIC_DRAM: 45596.06 MiB, TACTIC_SHARED_MEMORY: 1024.00 MiB]
Tactic Sources | [EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
Profiling Verbosity | ProfilingVerbosity.DETAILED
Preview Features | [PROFILE_SHARING_0806]
[V] Global timing cache in use. Profiling results in this builder pass will be stored.
[V] Compiler backend is used during engine build.
[V] Detected 3 inputs and 1 output network tensors.
[V] Total Host Persistent Memory: 80 bytes
[V] Total Device Persistent Memory: 0 bytes
[V] Max Scratch Memory: 1077608448 bytes
[V] [BlockAssignment] Started assigning block shifts. This will take 1 steps to complete.
[V] [BlockAssignment] Algorithm ShiftNTopDown took 0.070255ms to assign 1 blocks to 1 nodes requiring 1077608448 bytes.
[V] Total Activation Memory: 1077608448 bytes
[V] Total Weights Memory: 5313280 bytes
[V] Compiler backend is used during engine execution.
[V] Engine generation completed in 7.23125 seconds.
[V] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 0 MiB, GPU 1175 MiB
[I] Finished engine building in 7.280 seconds
[V] Loaded engine size: 5 MiB
[V] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +1027, now: CPU 0, GPU 1032 (MiB)
[V] Found candidate CUDA libraries: ['/usr/local/cuda/lib64/libcudart.so.13', '/usr/local/cuda/lib64/libcudart.so.13.0.96', '/usr/local/cuda/lib64/libcudart.so']
[I] trt-runner-N0-11/24/25-11:14:23
---- Inference Input(s) ----
{query [dtype=float32, shape=(8, 1, 16, 1008, 72)],
key [dtype=float32, shape=(8, 1, 16, 1008, 72)],
value [dtype=float32, shape=(8, 1, 16, 1008, 72)]}
[V] trt-runner-N0-11/24/25-11:14:23 | Input metadata is: {query [dtype=float32, shape=(-1, 1, 16, -1, 72)],
key [dtype=float32, shape=(-1, 1, 16, -1, 72)],
value [dtype=float32, shape=(-1, 1, 16, -1, 72)]}
[I] trt-runner-N0-11/24/25-11:14:23
---- Inference Output(s) ----
{output [dtype=float32, shape=(8064, 1152)]}
[I] trt-runner-N0-11/24/25-11:14:23 | Completed 1 iteration(s) in 52.44 ms | Average inference time: 52.44 ms.
[V] Successfully ran: ['onnxrt-runner-N0-11/24/25-11:14:23', 'trt-runner-N0-11/24/25-11:14:23']
[I] Accuracy Comparison | onnxrt-runner-N0-11/24/25-11:14:23 vs. trt-runner-N0-11/24/25-11:14:23
[I] Comparing Output: 'output' (dtype=float32, shape=(8064, 1152)) with 'output' (dtype=float32, shape=(8064, 1152))
[I] Tolerance: [abs=1e-05, rel=1e-05] | Checking elemwise error
[I] onnxrt-runner-N0-11/24/25-11:14:23: output | Stats: mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027323, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[I] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-183 , -148 ) | 56 |
(-148 , -113 ) | 3417 |
(-113 , -77.6) | 89680 | #
(-77.6, -42.3) | 857766 | #########
(-42.3, -7.02) | 2919742 | ################################
(-7.02, 28.3 ) | 3577105 | ########################################
(28.3 , 63.5 ) | 1578264 | #################
(63.5 , 98.8 ) | 249245 | ##
(98.8 , 134 ) | 14161 |
(134 , 169 ) | 292 |
[I] trt-runner-N0-11/24/25-11:14:23: output | Stats: mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027359, min=-183.4 at (7065, 945), max=169.36 at (6455, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[I] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-183 , -148 ) | 56 |
(-148 , -113 ) | 3417 |
(-113 , -77.6) | 89680 | #
(-77.6, -42.3) | 857768 | #########
(-42.3, -7.02) | 2919742 | ################################
(-7.02, 28.3 ) | 3577103 | ########################################
(28.3 , 63.5 ) | 1578264 | #################
(63.5 , 98.8 ) | 249245 | ##
(98.8 , 134 ) | 14161 |
(134 , 169 ) | 292 |
[I] Error Metrics: output
[I] Minimum Required Tolerance: elemwise error | [abs=257.55] OR [rel=1.7841e+07] (requirements may be lower if both abs/rel tolerances are set)
[I] Absolute Difference | Stats: mean=37.626, std-dev=28.434, var=808.52, median=31.813, min=0 at (0, 34), max=257.55 at (3266, 476), avg-magnitude=37.626, p90=77.559, p95=92.425, p99=121.51
[I] ---- Histogram ----
Bin Range | Num Elems | Visualization
(0 , 25.8) | 3855289 | ########################################
(25.8, 51.5) | 2882361 | #############################
(51.5, 77.3) | 1611140 | ################
(77.3, 103 ) | 672056 | ######
(103 , 129 ) | 209901 | ##
(129 , 155 ) | 49154 |
(155 , 180 ) | 8586 |
(180 , 206 ) | 1118 |
(206 , 232 ) | 116 |
(232 , 258 ) | 7 |
[I] Relative Difference | Stats: mean=11.63, std-dev=6013.3, var=3.6159e+07, median=1.4141, min=0 at (0, 34), max=1.7841e+07 at (6921, 21), avg-magnitude=11.63, p90=6.4694, p95=12.812, p99=63.647
[I] ---- Histogram ----
Bin Range | Num Elems | Visualization
(0 , 1.78e+06) | 9289726 | ########################################
(1.78e+06, 3.57e+06) | 1 |
(3.57e+06, 5.35e+06) | 0 |
(5.35e+06, 7.14e+06) | 0 |
(7.14e+06, 8.92e+06) | 0 |
(8.92e+06, 1.07e+07) | 0 |
(1.07e+07, 1.25e+07) | 0 |
(1.25e+07, 1.43e+07) | 0 |
(1.43e+07, 1.61e+07) | 0 |
(1.61e+07, 1.78e+07) | 1 |
[E] FAILED | Output: 'output' | Difference exceeds tolerance (rel=1e-05, abs=1e-05)
[E] FAILED | Mismatched outputs: ['output']
[E] Accuracy Summary | onnxrt-runner-N0-11/24/25-11:14:23 vs. trt-runner-N0-11/24/25-11:14:23 | Passed: 0/1 iterations | Pass Rate: 0.0%
[I] Output Validation | Runners: ['onnxrt-runner-N0-11/24/25-11:14:23', 'trt-runner-N0-11/24/25-11:14:23']
[I] onnxrt-runner-N0-11/24/25-11:14:23 | Validating output: output (check_inf=True, check_nan=True)
[I] mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027323, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[V] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-183 , -148 ) | 56 |
(-148 , -113 ) | 3417 |
(-113 , -77.6) | 89680 | #
(-77.6, -42.3) | 857766 | #########
(-42.3, -7.02) | 2919744 | ################################
(-7.02, 28.3 ) | 3577103 | ########################################
(28.3 , 63.5 ) | 1578264 | #################
(63.5 , 98.8 ) | 249245 | ##
(98.8 , 134 ) | 14161 |
(134 , 169 ) | 292 |
[I] PASSED | Output: output is valid
[I] trt-runner-N0-11/24/25-11:14:23 | Validating output: output (check_inf=True, check_nan=True)
[I] mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027359, min=-183.4 at (7065, 945), max=169.36 at (6455, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[V] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-183 , -148 ) | 56 |
(-148 , -113 ) | 3417 |
(-113 , -77.6) | 89680 | #
(-77.6, -42.3) | 857766 | #########
(-42.3, -7.02) | 2919742 | ################################
(-7.02, 28.3 ) | 3577104 | ########################################
(28.3 , 63.5 ) | 1578265 | #################
(63.5 , 98.8 ) | 249245 | ##
(98.8 , 134 ) | 14161 |
(134 , 169 ) | 292 |
[I] PASSED | Output: output is valid
[I] PASSED | Output Validation
[E] FAILED | Runtime: 15.272s | Command: /usr/local/bin/polygraphy run ./sdpa_visal.onnx --model-type onnx --onnxrt --trt --validate --val-range [-100,100] --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --log-file pg_bf16.log --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine --infinities-compare-equal --verbose
A workaround is to mark a debug tensor with the option below, which suggests the cause may be a layer-fusion error:
--mark-debug /Reshape_6_output_0 \
Q2: With BF16 precision, the difference is sometimes small and sometimes large. This happens after adding the option:
--bf16 \
Verbose log of a run in which the ONNX and TensorRT outputs differ by a large margin:
[I] RUNNING | Command: /usr/local/bin/polygraphy run ./sdpa_visal.onnx --model-type onnx --onnxrt --trt --validate --val-range [-100,100] --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --bf16 --log-file pg_bf16.log --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine --infinities-compare-equal --mark-debug /Reshape_6_output_0 --verbose
[V] Loaded Module: polygraphy | Version: 0.49.26 | Path: ['/usr/local/lib/python3.12/dist-packages/polygraphy']
[V] Loaded extension modules: []
[I] Will generate inference input data according to provided TensorMetadata: {query [shape=(8, 1, 16, 1008, 72)],
key [shape=(8, 1, 16, 1008, 72)],
value [shape=(8, 1, 16, 1008, 72)]}
[I] TF32 is disabled by default. Turn on TF32 for better performance with minor accuracy differences.
[I] onnxrt-runner-N0-11/24/25-10:59:43 | Activating and starting inference
[V] Loaded Module: onnxruntime | Version: 1.23.1 | Path: ['/usr/local/lib/python3.12/dist-packages/onnxruntime']
[I] Creating ONNX-Runtime Inference Session with providers: ['CPUExecutionProvider']
[V] Loading inputs from data loader
[V] Generating data using numpy seed: 1
[V] Loaded Module: numpy | Version: 1.26.4 | Path: ['/usr/local/lib/python3.12/dist-packages/numpy']
[V] Input tensor: query | Generating input data in range: [-100.0, 100.0]
[V] Input tensor: key | Generating input data in range: [-100.0, 100.0]
[V] Input tensor: value | Generating input data in range: [-100.0, 100.0]
[I] onnxrt-runner-N0-11/24/25-10:59:43
---- Inference Input(s) ----
{query [dtype=float32, shape=(8, 1, 16, 1008, 72)],
key [dtype=float32, shape=(8, 1, 16, 1008, 72)],
value [dtype=float32, shape=(8, 1, 16, 1008, 72)]}
[V] onnxrt-runner-N0-11/24/25-10:59:43 | Input metadata is: {query [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)],
key [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)],
value [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)]}
[I] onnxrt-runner-N0-11/24/25-10:59:43
---- Inference Output(s) ----
{output [dtype=float32, shape=(8064, 1152)]}
[I] onnxrt-runner-N0-11/24/25-10:59:43 | Completed 1 iteration(s) in 107.1 ms | Average inference time: 107.1 ms.
[I] trt-runner-N0-11/24/25-10:59:43 | Activating and starting inference
[V] Loaded Module: tensorrt | Version: 10.13.3.9 | Path: ['/usr/local/lib/python3.12/dist-packages/tensorrt']
[V] [MemUsageChange] Init CUDA: CPU +39, GPU +0, now: CPU 1326, GPU 1892 (MiB)
[V] [MemUsageChange] Init builder kernel library: CPU +1012, GPU +8, now: CPU 2535, GPU 1900 (MiB)
[V] ----------------------------------------------------------------
[V] Input filename: sdpa_visal.onnx
[V] ONNX IR version: 0.0.9
[V] Opset version: 20
[V] Producer name: pytorch
[V] Producer version: 2.8.0
[V] Domain:
[V] Model version: 0
[V] Doc string:
[V] ----------------------------------------------------------------
[V] Executing postprocessing step [MarkDebug]
[V] Setting TensorRT Optimization Profiles
[V] Input tensor: query (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[V] Input tensor: key (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[V] Input tensor: value (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[I] Configuring with profiles:[
Profile 0:
{query [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]],
key [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]],
value [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]]}
]
[W] profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect.
[I] Building engine with configuration:
Flags | [BF16]
Engine Capability | EngineCapability.STANDARD
Memory Pools | [WORKSPACE: 45596.06 MiB, TACTIC_DRAM: 45596.06 MiB, TACTIC_SHARED_MEMORY: 1024.00 MiB]
Tactic Sources | [EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
Profiling Verbosity | ProfilingVerbosity.DETAILED
Preview Features | [PROFILE_SHARING_0806]
[V] Global timing cache in use. Profiling results in this builder pass will be stored.
[V] Compiler backend is used during engine build.
[V] Detected 3 inputs and 1 output network tensors.
[V] Total Host Persistent Memory: 400 bytes
[V] Total Device Persistent Memory: 0 bytes
[V] Max Scratch Memory: 74317824 bytes
[V] [BlockAssignment] Started assigning block shifts. This will take 5 steps to complete.
[V] [BlockAssignment] Algorithm ShiftNTopDown took 0.065404ms to assign 5 blocks to 5 nodes requiring 130056704 bytes.
[V] Total Activation Memory: 130056192 bytes
[V] Total Weights Memory: 2656896 bytes
[V] Compiler backend is used during engine execution.
[V] Engine generation completed in 11.3117 seconds.
[V] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 2 MiB, GPU 1175 MiB
[I] Finished engine building in 11.419 seconds
[V] Loaded engine size: 3 MiB
[V] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +124, now: CPU 0, GPU 126 (MiB)
[V] Found candidate CUDA libraries: ['/usr/local/cuda/lib64/libcudart.so.13', '/usr/local/cuda/lib64/libcudart.so.13.0.96', '/usr/local/cuda/lib64/libcudart.so']
[I] trt-runner-N0-11/24/25-10:59:43
---- Inference Input(s) ----
{query [dtype=float32, shape=(8, 1, 16, 1008, 72)],
key [dtype=float32, shape=(8, 1, 16, 1008, 72)],
value [dtype=float32, shape=(8, 1, 16, 1008, 72)]}
[V] trt-runner-N0-11/24/25-10:59:43 | Input metadata is: {query [dtype=float32, shape=(-1, 1, 16, -1, 72)],
key [dtype=float32, shape=(-1, 1, 16, -1, 72)],
value [dtype=float32, shape=(-1, 1, 16, -1, 72)]}
[W] Not supported datatype for debug tensor in polygraphy: DataType.BF16
[I] trt-runner-N0-11/24/25-10:59:43
---- Inference Output(s) ----
{output [dtype=float32, shape=(8064, 1152)]}
[I] trt-runner-N0-11/24/25-10:59:43 | Completed 1 iteration(s) in 39.67 ms | Average inference time: 39.67 ms.
[V] Successfully ran: ['onnxrt-runner-N0-11/24/25-10:59:43', 'trt-runner-N0-11/24/25-10:59:43']
[I] Accuracy Comparison | onnxrt-runner-N0-11/24/25-10:59:43 vs. trt-runner-N0-11/24/25-10:59:43
[I] Comparing Output: 'output' (dtype=float32, shape=(8064, 1152)) with 'output' (dtype=float32, shape=(8064, 1152))
[I] Tolerance: [abs=1e-05, rel=1e-05] | Checking elemwise error
[I] onnxrt-runner-N0-11/24/25-10:59:43: output | Stats: mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027323, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[I] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-183 , -148 ) | 56 |
(-148 , -113 ) | 3417 |
(-113 , -77.6) | 89680 | #
(-77.6, -42.3) | 857766 | #########
(-42.3, -7.02) | 2919744 | ################################
(-7.02, 28.3 ) | 3577103 | ########################################
(28.3 , 63.5 ) | 1578264 | #################
(63.5 , 98.8 ) | 249245 | ##
(98.8 , 134 ) | 14161 |
(134 , 169 ) | 292 |
[I] trt-runner-N0-11/24/25-10:59:43: output | Stats: mean=-0.003805, std-dev=33.353, var=1112.4, median=-0.0041504, min=-183 at (1891, 945), max=169 at (7862, 168), avg-magnitude=26.606, p90=42.75, p95=54.75, p99=77.5
[I] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-183 , -148 ) | 51 |
(-148 , -113 ) | 3433 |
(-113 , -77.6) | 88270 |
(-77.6, -42.3) | 854851 | #########
(-42.3, -7.02) | 2924445 | ################################
(-7.02, 28.3 ) | 3580637 | ########################################
(28.3 , 63.5 ) | 1575883 | #################
(63.5 , 98.8 ) | 247656 | ##
(98.8 , 134 ) | 14216 |
(134 , 169 ) | 286 |
[I] Error Metrics: output
[I] Minimum Required Tolerance: elemwise error | [abs=73.42] OR [rel=1.8111e+05] (requirements may be lower if both abs/rel tolerances are set)
[I] Absolute Difference | Stats: mean=0.75056, std-dev=3.0463, var=9.28, median=0.07798, min=0 at (7, 1010), max=73.42 at (4784, 235), avg-magnitude=0.75056, p90=0.24767, p95=4.4866, p99=17.254
[I] ---- Histogram ----
Bin Range | Num Elems | Visualization
(0 , 7.34) | 8942788 | ########################################
(7.34, 14.7) | 209530 |
(14.7, 22 ) | 96767 |
(22 , 29.4) | 31558 |
(29.4, 36.7) | 7466 |
(36.7, 44.1) | 1332 |
(44.1, 51.4) | 232 |
(51.4, 58.7) | 38 |
(58.7, 66.1) | 14 |
(66.1, 73.4) | 3 |
[I] Relative Difference | Stats: mean=0.25063, std-dev=70.229, var=4932.1, median=0.0036124, min=0 at (7, 1010), max=1.8111e+05 at (7545, 1136), avg-magnitude=0.25063, p90=0.050332, p95=0.25349, p99=1.7265
[I] ---- Histogram ----
Bin Range | Num Elems | Visualization
(0 , 1.81e+04) | 9289718 | ########################################
(1.81e+04, 3.62e+04) | 7 |
(3.62e+04, 5.43e+04) | 1 |
(5.43e+04, 7.24e+04) | 1 |
(7.24e+04, 9.06e+04) | 0 |
(9.06e+04, 1.09e+05) | 0 |
(1.09e+05, 1.27e+05) | 0 |
(1.27e+05, 1.45e+05) | 0 |
(1.45e+05, 1.63e+05) | 0 |
(1.63e+05, 1.81e+05) | 1 |
[E] FAILED | Output: 'output' | Difference exceeds tolerance (rel=1e-05, abs=1e-05)
[E] FAILED | Mismatched outputs: ['output']
[E] Accuracy Summary | onnxrt-runner-N0-11/24/25-10:59:43 vs. trt-runner-N0-11/24/25-10:59:43 | Passed: 0/1 iterations | Pass Rate: 0.0%
[I] Output Validation | Runners: ['onnxrt-runner-N0-11/24/25-10:59:43', 'trt-runner-N0-11/24/25-10:59:43']
[I] onnxrt-runner-N0-11/24/25-10:59:43 | Validating output: output (check_inf=True, check_nan=True)
[I] mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027323, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[V] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-183 , -148 ) | 56 |
(-148 , -113 ) | 3417 |
(-113 , -77.6) | 89680 | #
(-77.6, -42.3) | 857766 | #########
(-42.3, -7.02) | 2919744 | ################################
(-7.02, 28.3 ) | 3577103 | ########################################
(28.3 , 63.5 ) | 1578264 | #################
(63.5 , 98.8 ) | 249245 | ##
(98.8 , 134 ) | 14161 |
(134 , 169 ) | 292 |
[I] PASSED | Output: output is valid
[I] trt-runner-N0-11/24/25-10:59:43 | Validating output: output (check_inf=True, check_nan=True)
[I] mean=-0.003805, std-dev=33.353, var=1112.4, median=-0.0041504, min=-183 at (1891, 945), max=169 at (7862, 168), avg-magnitude=26.606, p90=42.75, p95=54.75, p99=77.5
[V] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-183 , -148 ) | 58 |
(-148 , -113 ) | 3426 |
(-113 , -77.4) | 92092 | #
(-77.4, -42.2) | 863549 | #########
(-42.2, -7 ) | 2911925 | ################################
(-7 , 28.2 ) | 3570946 | ########################################
(28.2 , 63.4 ) | 1581023 | #################
(63.4 , 98.6 ) | 252207 | ##
(98.6 , 134 ) | 14191 |
(134 , 169 ) | 311 |
[I] PASSED | Output: output is valid
[I] PASSED | Output Validation
[E] FAILED | Runtime: 19.281s | Command: /usr/local/bin/polygraphy run ./sdpa_visal.onnx --model-type onnx --onnxrt --trt --validate --val-range [-100,100] --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --bf16 --log-file pg_bf16.log --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine --infinities-compare-equal --mark-debug /Reshape_6_output_0 --verbose
The trtexec --dumpProfile log is:
[11/24/2025-11:40:51] [I] === Profile (19 iterations ) ===
[11/24/2025-11:40:51] [I] Time(ms) Avg.(ms) Median(ms) Time(%) Layer
[11/24/2025-11:40:51] [I] 0.03 0.0016 0.0016 0.8 __mye31_0_myl0_0
[11/24/2025-11:40:51] [I] 0.09 0.0047 0.0047 2.4 __myl_Cast_myl0_1
[11/24/2025-11:40:51] [I] 0.03 0.0016 0.0016 0.8 __mye31_0_myl1_0
[11/24/2025-11:40:51] [I] 0.09 0.0049 0.0045 2.5 __myl_Cast_myl1_1
[11/24/2025-11:40:51] [I] 0.03 0.0017 0.0017 0.9 __mye31_0_myl2_0
[11/24/2025-11:40:51] [I] 0.11 0.0056 0.0045 2.9 __myl_Cast_myl2_1
[11/24/2025-11:40:51] [I] 0.03 0.0017 0.0016 0.9 __mye8562_0_myl3_0
[11/24/2025-11:40:51] [I] 0.04 0.0019 0.0016 1.0 __mye29289_hc_init_myl3_1
[11/24/2025-11:40:51] [I] 0.55 0.0291 0.0287 15.0 __myl_ReplSlicReplReshTranReshMove_myl3_2
[11/24/2025-11:40:51] [I] 0.09 0.0045 0.0046 2.3 __myl_Move_myl3_3
[11/24/2025-11:40:51] [I] 0.11 0.0056 0.0057 2.9 __myl_ReplSlicReplReshMove_myl3_4
[11/24/2025-11:40:51] [I] 0.47 0.0246 0.0245 12.7 _gemm_mha_v2_myl3_5
[11/24/2025-11:40:51] [I] 0.09 0.0046 0.0046 2.4 __myl_MoveReshTranReshSlic_myl3_6
[11/24/2025-11:40:51] [I] 0.06 0.0032 0.0032 1.6 __myl_Slic_myl3_7
[11/24/2025-11:40:51] [I] 0.25 0.0131 0.0131 6.8 copy_d2h___mye27687_myl3_8
[11/24/2025-11:40:51] [I] 0.04 0.0021 0.0021 1.1 __mye2678cbr_myl3_9
[11/24/2025-11:40:51] [I] 0.05 0.0029 0.0025 1.5 __mye8576_2_myl3_13
[11/24/2025-11:40:51] [I] 0.04 0.0019 0.0017 1.0 jmp__mye2690_myl3_14
[11/24/2025-11:40:51] [I] 1.32 0.0695 0.0637 35.8 /proj/Gemm_myl3_15
[11/24/2025-11:40:51] [I] 0.03 0.0017 0.0017 0.9 __mye45_0_myl4_0
[11/24/2025-11:40:51] [I] 0.14 0.0072 0.0070 3.7 __myl_Cast_myl4_1
[11/24/2025-11:40:51] [I] 3.68 0.1938 0.1865 100.0 Total
Verbose log of a run in which the ONNX and TensorRT outputs differ by only a small margin:
[I] RUNNING | Command: /usr/local/bin/polygraphy run ./sdpa_visal.onnx --model-type onnx --onnxrt --trt --validate --val-range [-100,100] --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --bf16 --log-file pg_bf16.log --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine --infinities-compare-equal --mark-debug /Reshape_6_output_0 --verbose
[V] Loaded Module: polygraphy | Version: 0.49.26 | Path: ['/usr/local/lib/python3.12/dist-packages/polygraphy']
[V] Loaded extension modules: []
[I] Will generate inference input data according to provided TensorMetadata: {query [shape=(8, 1, 16, 1008, 72)],
key [shape=(8, 1, 16, 1008, 72)],
value [shape=(8, 1, 16, 1008, 72)]}
[I] TF32 is disabled by default. Turn on TF32 for better performance with minor accuracy differences.
[I] onnxrt-runner-N0-11/24/25-10:57:58 | Activating and starting inference
[V] Loaded Module: onnxruntime | Version: 1.23.1 | Path: ['/usr/local/lib/python3.12/dist-packages/onnxruntime']
[I] Creating ONNX-Runtime Inference Session with providers: ['CPUExecutionProvider']
[V] Loading inputs from data loader
[V] Generating data using numpy seed: 1
[V] Loaded Module: numpy | Version: 1.26.4 | Path: ['/usr/local/lib/python3.12/dist-packages/numpy']
[V] Input tensor: query | Generating input data in range: [-100.0, 100.0]
[V] Input tensor: key | Generating input data in range: [-100.0, 100.0]
[V] Input tensor: value | Generating input data in range: [-100.0, 100.0]
[I] onnxrt-runner-N0-11/24/25-10:57:58
---- Inference Input(s) ----
{query [dtype=float32, shape=(8, 1, 16, 1008, 72)],
key [dtype=float32, shape=(8, 1, 16, 1008, 72)],
value [dtype=float32, shape=(8, 1, 16, 1008, 72)]}
[V] onnxrt-runner-N0-11/24/25-10:57:58 | Input metadata is: {query [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)],
key [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)],
value [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)]}
[I] onnxrt-runner-N0-11/24/25-10:57:58
---- Inference Output(s) ----
{output [dtype=float32, shape=(8064, 1152)]}
[I] onnxrt-runner-N0-11/24/25-10:57:58 | Completed 1 iteration(s) in 116.1 ms | Average inference time: 116.1 ms.
[I] trt-runner-N0-11/24/25-10:57:58 | Activating and starting inference
[V] Loaded Module: tensorrt | Version: 10.13.3.9 | Path: ['/usr/local/lib/python3.12/dist-packages/tensorrt']
[V] [MemUsageChange] Init CUDA: CPU +39, GPU +0, now: CPU 1326, GPU 1892 (MiB)
[V] [MemUsageChange] Init builder kernel library: CPU +1012, GPU +8, now: CPU 2535, GPU 1900 (MiB)
[V] ----------------------------------------------------------------
[V] Input filename: sdpa_visal.onnx
[V] ONNX IR version: 0.0.9
[V] Opset version: 20
[V] Producer name: pytorch
[V] Producer version: 2.8.0
[V] Domain:
[V] Model version: 0
[V] Doc string:
[V] ----------------------------------------------------------------
[V] Executing postprocessing step [MarkDebug]
[V] Setting TensorRT Optimization Profiles
[V] Input tensor: query (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[V] Input tensor: key (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[V] Input tensor: value (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[I] Configuring with profiles:[
Profile 0:
{query [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]],
key [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]],
value [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]]}
]
[W] profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect.
[I] Building engine with configuration:
Flags | [BF16]
Engine Capability | EngineCapability.STANDARD
Memory Pools | [WORKSPACE: 45596.06 MiB, TACTIC_DRAM: 45596.06 MiB, TACTIC_SHARED_MEMORY: 1024.00 MiB]
Tactic Sources | [EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
Profiling Verbosity | ProfilingVerbosity.DETAILED
Preview Features | [PROFILE_SHARING_0806]
[V] Global timing cache in use. Profiling results in this builder pass will be stored.
[V] Compiler backend is used during engine build.
[V] Detected 3 inputs and 1 output network tensors.
[V] Total Host Persistent Memory: 80 bytes
[V] Total Device Persistent Memory: 0 bytes
[V] Max Scratch Memory: 1077608448 bytes
[V] [BlockAssignment] Started assigning block shifts. This will take 1 steps to complete.
[V] [BlockAssignment] Algorithm ShiftNTopDown took 0.116556ms to assign 1 blocks to 1 nodes requiring 1077608448 bytes.
[V] Total Activation Memory: 1077608448 bytes
[V] Total Weights Memory: 5313282 bytes
[V] Compiler backend is used during engine execution.
[V] Engine generation completed in 11.2242 seconds.
[V] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 2 MiB, GPU 1175 MiB
[I] Finished engine building in 11.322 seconds
[V] Loaded engine size: 5 MiB
[V] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +1027, now: CPU 0, GPU 1032 (MiB)
[V] Found candidate CUDA libraries: ['/usr/local/cuda/lib64/libcudart.so.13', '/usr/local/cuda/lib64/libcudart.so.13.0.96', '/usr/local/cuda/lib64/libcudart.so']
[I] trt-runner-N0-11/24/25-10:57:58
---- Inference Input(s) ----
{query [dtype=float32, shape=(8, 1, 16, 1008, 72)],
key [dtype=float32, shape=(8, 1, 16, 1008, 72)],
value [dtype=float32, shape=(8, 1, 16, 1008, 72)]}
[V] trt-runner-N0-11/24/25-10:57:58 | Input metadata is: {query [dtype=float32, shape=(-1, 1, 16, -1, 72)],
key [dtype=float32, shape=(-1, 1, 16, -1, 72)],
value [dtype=float32, shape=(-1, 1, 16, -1, 72)]}
[V] Loaded Module: onnx | Version: 1.19.1 | Path: ['/usr/local/lib/python3.12/dist-packages/onnx']
[I] trt-runner-N0-11/24/25-10:57:58
---- Inference Output(s) ----
{output [dtype=float32, shape=(8064, 1152)],
/Reshape_6_output_0 [dtype=float32, shape=(8064, 1152)]}
[I] trt-runner-N0-11/24/25-10:57:58 | Completed 1 iteration(s) in 115.4 ms | Average inference time: 115.4 ms.
[V] Successfully ran: ['onnxrt-runner-N0-11/24/25-10:57:58', 'trt-runner-N0-11/24/25-10:57:58']
[I] Accuracy Comparison | onnxrt-runner-N0-11/24/25-10:57:58 vs. trt-runner-N0-11/24/25-10:57:58
[I] Comparing Output: 'output' (dtype=float32, shape=(8064, 1152)) with 'output' (dtype=float32, shape=(8064, 1152))
[I] Tolerance: [abs=1e-05, rel=1e-05] | Checking elemwise error
[I] onnxrt-runner-N0-11/24/25-10:57:58: output | Stats: mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027323, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[I] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-183 , -148 ) | 56 |
(-148 , -113 ) | 3417 |
(-113 , -77.6) | 89680 | #
(-77.6, -42.3) | 857766 | #########
(-42.3, -7.02) | 2919742 | ################################
(-7.02, 28.3 ) | 3577105 | ########################################
(28.3 , 63.5 ) | 1578264 | #################
(63.5 , 98.8 ) | 249245 | ##
(98.8 , 134 ) | 14161 |
(134 , 169 ) | 292 |
[I] trt-runner-N0-11/24/25-10:57:58: output | Stats: mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027359, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[I] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-183 , -148 ) | 56 |
(-148 , -113 ) | 3417 |
(-113 , -77.6) | 89680 | #
(-77.6, -42.3) | 857768 | #########
(-42.3, -7.02) | 2919742 | ################################
(-7.02, 28.3 ) | 3577103 | ########################################
(28.3 , 63.5 ) | 1578264 | #################
(63.5 , 98.8 ) | 249245 | ##
(98.8 , 134 ) | 14161 |
(134 , 169 ) | 292 |
[I] Error Metrics: output
[I] Minimum Required Tolerance: elemwise error | [abs=0.00010681] OR [rel=7.3592] (requirements may be lower if both abs/rel tolerances are set)
[I] Absolute Difference | Stats: mean=1.0341e-05, std-dev=8.4552e-06, var=7.149e-11, median=7.6294e-06, min=0 at (0, 34), max=0.00010681 at (697, 420), avg-magnitude=1.0341e-05, p90=2.2888e-05, p95=2.6703e-05, p99=3.8147e-05
[I] ---- Histogram ----
Bin Range | Num Elems | Visualization
(0 , 1.07e-05) | 5499357 | ########################################
(1.07e-05, 2.14e-05) | 2785803 | ####################
(2.14e-05, 3.2e-05 ) | 804906 | #####
(3.2e-05 , 4.27e-05) | 162051 | #
(4.27e-05, 5.34e-05) | 27775 |
(5.34e-05, 6.41e-05) | 8595 |
(6.41e-05, 7.48e-05) | 983 |
(7.48e-05, 8.54e-05) | 232 |
(8.54e-05, 9.61e-05) | 22 |
(9.61e-05, 0.000107) | 4 |
[I] Relative Difference | Stats: mean=4.4547e-06, std-dev=0.0026289, var=6.9112e-06, median=3.8218e-07, min=0 at (0, 34), max=7.3592 at (4703, 130), avg-magnitude=4.4547e-06, p90=2.2527e-06, p95=4.5171e-06, p99=2.2579e-05
[I] ---- Histogram ----
Bin Range | Num Elems | Visualization
(0 , 0.736) | 9289725 | ########################################
(0.736, 1.47 ) | 1 |
(1.47 , 2.21 ) | 0 |
(2.21 , 2.94 ) | 1 |
(2.94 , 3.68 ) | 0 |
(3.68 , 4.42 ) | 0 |
(4.42 , 5.15 ) | 0 |
(5.15 , 5.89 ) | 0 |
(5.89 , 6.62 ) | 0 |
(6.62 , 7.36 ) | 1 |
[E] FAILED | Output: 'output' | Difference exceeds tolerance (rel=1e-05, abs=1e-05)
[E] FAILED | Mismatched outputs: ['output']
[E] Accuracy Summary | onnxrt-runner-N0-11/24/25-10:57:58 vs. trt-runner-N0-11/24/25-10:57:58 | Passed: 0/1 iterations | Pass Rate: 0.0%
[I] Output Validation | Runners: ['onnxrt-runner-N0-11/24/25-10:57:58', 'trt-runner-N0-11/24/25-10:57:58']
[I] onnxrt-runner-N0-11/24/25-10:57:58 | Validating output: output (check_inf=True, check_nan=True)
[I] mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027323, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[V] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-183 , -148 ) | 56 |
(-148 , -113 ) | 3417 |
(-113 , -77.6) | 89680 | #
(-77.6, -42.3) | 857766 | #########
(-42.3, -7.02) | 2919744 | ################################
(-7.02, 28.3 ) | 3577103 | ########################################
(28.3 , 63.5 ) | 1578264 | #################
(63.5 , 98.8 ) | 249245 | ##
(98.8 , 134 ) | 14161 |
(134 , 169 ) | 292 |
[I] PASSED | Output: output is valid
[I] trt-runner-N0-11/24/25-10:57:58 | Validating output: output (check_inf=True, check_nan=True)
[I] mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027359, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[V] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-183 , -148 ) | 56 |
(-148 , -113 ) | 3417 |
(-113 , -77.6) | 89680 | #
(-77.6, -42.3) | 857766 | #########
(-42.3, -7.02) | 2919742 | ################################
(-7.02, 28.3 ) | 3577104 | ########################################
(28.3 , 63.5 ) | 1578265 | #################
(63.5 , 98.8 ) | 249245 | ##
(98.8 , 134 ) | 14161 |
(134 , 169 ) | 292 |
[I] PASSED | Output: output is valid
[I] trt-runner-N0-11/24/25-10:57:58 | Validating output: /Reshape_6_output_0 (check_inf=True, check_nan=True)
[I] mean=0.01887, std-dev=57.743, var=3334.3, median=0.044686, min=-100 at (1035, 453), max=100 at (7729, 947), avg-magnitude=50.008, p90=80.029, p95=90.007, p99=98.006
[V] ---- Histogram ----
Bin Range | Num Elems | Visualization
(-100 , -80 ) | 928973 | #######################################
(-80 , -60 ) | 927919 | #######################################
(-60 , -40 ) | 930346 | ########################################
(-40 , -20 ) | 928158 | #######################################
(-20 , -1.53e-05) | 927496 | #######################################
(-1.53e-05, 20 ) | 930290 | #######################################
(20 , 40 ) | 928940 | #######################################
(40 , 60 ) | 927104 | #######################################
(60 , 80 ) | 930178 | #######################################
(80 , 100 ) | 930324 | #######################################
[I] PASSED | Output: /Reshape_6_output_0 is valid
[I] PASSED | Output Validation
[E] FAILED | Runtime: 19.750s | Command: /usr/local/bin/polygraphy run ./sdpa_visal.onnx --model-type onnx --onnxrt --trt --validate --val-range [-100,100] --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --bf16 --log-file pg_bf16.log --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine --infinities-compare-equal --mark-debug /Reshape_6_output_0 --verbose
The per-layer profile reported by `trtexec --dumpProfile` is:
[11/24/2025-11:42:18] [I] === Profile (19 iterations ) ===
[11/24/2025-11:42:18] [I] Time(ms) Avg.(ms) Median(ms) Time(%) Layer
[11/24/2025-11:42:18] [I] 0.03 0.0018 0.0016 0.6 __mye8562_0_myl0_0
[11/24/2025-11:42:18] [I] 0.04 0.0019 0.0015 0.6 __mye30067_hc_init_myl0_1
[11/24/2025-11:42:18] [I] 0.44 0.0234 0.0220 7.5 __myl_ReplSlicReplReshTran_myl0_2
[11/24/2025-11:42:18] [I] 0.52 0.0276 0.0219 8.9 /MatMul_myl0_3
[11/24/2025-11:42:18] [I] 0.67 0.0354 0.0354 11.4 __myl_MaxrSubExpSumDivMul_myl0_4
[11/24/2025-11:42:18] [I] 0.14 0.0075 0.0072 2.4 __myl_ReplSlicRepl_myl0_5
[11/24/2025-11:42:18] [I] 0.51 0.0266 0.0265 8.6 /MatMul_1_myl0_6
[11/24/2025-11:42:18] [I] 0.10 0.0054 0.0054 1.7 __myl_TranReshSlic_myl0_7
[11/24/2025-11:42:18] [I] 0.06 0.0032 0.0032 1.0 __myl_Slic_myl0_8
[11/24/2025-11:42:18] [I] 0.21 0.0112 0.0106 3.6 copy_d2h___mye27831_myl0_9
[11/24/2025-11:42:18] [I] 0.05 0.0024 0.0024 0.8 __mye2678cbr_myl0_10
[11/24/2025-11:42:18] [I] 0.06 0.0032 0.0031 1.0 __mye8576_2_myl0_14
[11/24/2025-11:42:18] [I] 0.05 0.0024 0.0023 0.8 jmp__mye2690_myl0_15
[11/24/2025-11:42:18] [I] 3.01 0.1584 0.1575 51.1 /proj/Gemm_myl0_16
[11/24/2025-11:42:18] [I] 5.90 0.3103 0.2998 100.0 Total
[11/24/2025-11:42:18] [I]
The only difference between the verbose logs for the small and large input shapes appears to be this warning:
[W] Not supported datatype for debug tensor in polygraphy: DataType.BF16
Metadata
Metadata
Assignees
Labels
No labels