
ONNX and TensorRT output different #4647

@godbaz

Description

The ONNX and TensorRT model outputs are different.

Docker Image: nvcr.io/nvidia/tensorrt:25.10-py3

NVIDIA GPU: L20

NVIDIA Driver Version: 535.161.08

Steps To Reproduce

Step 1. Generate the ONNX model

import json
import math  # needed by the reference scaled_dot_product_attention below
import onnx
import time
import torch
import numpy as np
import torch.nn as nn
import onnxruntime as ort
import onnx_graphsurgeon as gs
import torch.nn.functional as F
import onnxruntime.quantization.preprocess
from transformers import AutoModelForImageTextToText, AutoProcessor

# Reference implementation equivalent to torch.nn.functional.scaled_dot_product_attention (from the PyTorch docs):
def scaled_dot_product_attention(query, key, value, attn_mask=None, dropout_p=0.0,
        is_causal=False, scale=None, enable_gqa=False) -> torch.Tensor:
    L, S = query.size(-2), key.size(-2)
    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
    attn_bias = torch.zeros(L, S, dtype=query.dtype, device=query.device)
    if is_causal:
        assert attn_mask is None
        temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
        attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))

    if attn_mask is not None:
        if attn_mask.dtype == torch.bool:
            attn_bias.masked_fill_(attn_mask.logical_not(), float("-inf"))
        else:
            attn_bias = attn_mask + attn_bias

    if enable_gqa:
        key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
        value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)

    attn_weight = query @ key.transpose(-2, -1) * scale_factor
    attn_weight += attn_bias
    attn_weight = torch.softmax(attn_weight, dim=-1)
    attn_weight = torch.dropout(attn_weight, dropout_p, train=False)
    return attn_weight @ value


class SDPAModel(torch.nn.Module):
    def __init__(self, dropout_p=0.0, is_causal=False, scale=None, enable_gqa=False):
        super(SDPAModel, self).__init__()
        self.dropout_p = dropout_p
        self.is_causal = is_causal
        self.scale = scale
        self.enable_gqa = enable_gqa
        self.proj = torch.nn.Linear(1152, 1152)

    def forward(self, query, key, value):
        grid_t, batch_size, _, grid_hw, _ = query.shape
        seq_len = grid_t * batch_size * grid_hw

        # enable_gqa=True
        key = key.repeat_interleave(query.size(-3)//key.size(-3), -3)
        value = value.repeat_interleave(query.size(-3)//value.size(-3), -3)

        attn_output = F.scaled_dot_product_attention(query, key, value, None,
                dropout_p=self.dropout_p, is_causal=self.is_causal, scale=self.scale, enable_gqa=self.enable_gqa)

        #attn_output = scaled_dot_product_attention(query, key, value, None,
        #        self.dropout_p, self.is_causal, self.scale, self.enable_gqa)

        attn_output = attn_output.transpose(2, 3).contiguous()
        attn_output = attn_output.reshape(seq_len, -1).contiguous()
        attn_output = self.proj(attn_output)
        return attn_output


def export_pt2onnx():
    onnx_path = 'sdpa_visal.onnx'
    device = torch.device('cpu:0')

    # create model
    sdpa_model = SDPAModel(0.0, False, 1.0, False)
    sdpa_model = sdpa_model.to(device)
    sdpa_model.eval()

    # random input
    input_shape = (8, 1, 16, 124, 72) # (grid_t, 1, 16, grid_hw, 72) for visual model
    query = torch.rand(input_shape, dtype=torch.float32, device=device)
    key = torch.rand(input_shape, dtype=torch.float32, device=device)
    value = torch.rand(input_shape, dtype=torch.float32, device=device)

    with torch.no_grad():
        out = sdpa_model(query, key, value)

    print(query.shape, key.shape, value.shape, out.shape)

    # export
    torch.onnx.export(
            sdpa_model,
            (query, key, value),
            onnx_path,
            export_params=True,
            opset_version=20,
            do_constant_folding=True,
            input_names=['query', 'key', 'value'],
            output_names=['output'],
            dynamic_axes={
                'query': {0: 'grid_t', 1: 'batch_size', 3: 'grid_hw'},
                'key': {0: 'grid_t', 1: 'batch_size', 3: 'grid_hw'},
                'value': {0: 'grid_t', 1: 'batch_size', 3: 'grid_hw'},
                },
            external_data=False,
            )


if __name__ == '__main__':
    export_pt2onnx()
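
As a sanity check (a sketch only, not part of the repro above), the exported ONNX can be compared against the PyTorch module with ONNX Runtime before TensorRT is involved. It assumes it is called with the same sdpa_model, query, key, and value created inside export_pt2onnx(), so the weights match the exported file:

def check_onnx_vs_torch(onnx_path, model, query, key, value):
    # Sketch: compare ONNX Runtime (CPU) output against the exported PyTorch module.
    # Must be called with the same model instance and inputs used for the export,
    # otherwise the Linear weights in the ONNX file will not match.
    with torch.no_grad():
        torch_out = model(query, key, value).numpy()
    sess = ort.InferenceSession(onnx_path, providers=['CPUExecutionProvider'])
    ort_out = sess.run(None, {'query': query.numpy(),
                              'key': key.numpy(),
                              'value': value.numpy()})[0]
    print('max abs diff (PyTorch vs ONNX Runtime):', np.abs(torch_out - ort_out).max())

Calling check_onnx_vs_torch(onnx_path, sdpa_model, query, key, value) at the end of export_pt2onnx() would show whether the export itself already introduces any difference.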

Step 2. Compare the TensorRT and ONNX outputs

polygraphy run \
  ./sdpa_visal.onnx \
  --model-type onnx \
  --onnxrt \
  --trt \
  --validate \
  --val-range [-100,100] \
  --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] \
  --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] \
  --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] \
  --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] \
  --log-file pg_bf16.log \
  --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine \
  --infinities-compare-equal \
  --verbose
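
If it helps triage, Polygraphy can also compare every intermediate tensor so the first diverging layer can be located. This is a sketch only; note that marking all TensorRT tensors as outputs can itself prevent fusions, similar to what --mark-debug does further below:

polygraphy run \
  ./sdpa_visal.onnx \
  --onnxrt \
  --trt \
  --onnx-outputs mark all \
  --trt-outputs mark all \
  --val-range [-100,100] \
  --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] \
  --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] \
  --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] \
  --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72]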

Q1: The ONNX and TensorRT outputs differ by a large margin. The verbose log is:

[I] RUNNING | Command: /usr/local/bin/polygraphy run ./sdpa_visal.onnx --model-type onnx --onnxrt --trt --validate --val-range [-100,100] --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --log-file pg_bf16.log --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine --infinities-compare-equal --verbose
[V] Loaded Module: polygraphy | Version: 0.49.26 | Path: ['/usr/local/lib/python3.12/dist-packages/polygraphy']
[V] Loaded extension modules: []
[I] Will generate inference input data according to provided TensorMetadata: {query [shape=(8, 1, 16, 1008, 72)],
     key [shape=(8, 1, 16, 1008, 72)],
     value [shape=(8, 1, 16, 1008, 72)]}
[I] TF32 is disabled by default. Turn on TF32 for better performance with minor accuracy differences.
[I] onnxrt-runner-N0-11/24/25-11:14:23  | Activating and starting inference
[V] Loaded Module: onnxruntime | Version: 1.23.1 | Path: ['/usr/local/lib/python3.12/dist-packages/onnxruntime']
[I] Creating ONNX-Runtime Inference Session with providers: ['CPUExecutionProvider']
[V] Loading inputs from data loader
[V] Generating data using numpy seed: 1
[V] Loaded Module: numpy | Version: 1.26.4 | Path: ['/usr/local/lib/python3.12/dist-packages/numpy']
[V] Input tensor: query | Generating input data in range: [-100.0, 100.0]
[V] Input tensor: key | Generating input data in range: [-100.0, 100.0]
[V] Input tensor: value | Generating input data in range: [-100.0, 100.0]
[I] onnxrt-runner-N0-11/24/25-11:14:23
    ---- Inference Input(s) ----
    {query [dtype=float32, shape=(8, 1, 16, 1008, 72)],
     key [dtype=float32, shape=(8, 1, 16, 1008, 72)],
     value [dtype=float32, shape=(8, 1, 16, 1008, 72)]}
[V] onnxrt-runner-N0-11/24/25-11:14:23  | Input metadata is: {query [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)],
     key [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)],
     value [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)]}
[I] onnxrt-runner-N0-11/24/25-11:14:23
    ---- Inference Output(s) ----
    {output [dtype=float32, shape=(8064, 1152)]}
[I] onnxrt-runner-N0-11/24/25-11:14:23  | Completed 1 iteration(s) in 119.6 ms | Average inference time: 119.6 ms.
[I] trt-runner-N0-11/24/25-11:14:23     | Activating and starting inference
[V] Loaded Module: tensorrt | Version: 10.13.3.9 | Path: ['/usr/local/lib/python3.12/dist-packages/tensorrt']
[V] [MemUsageChange] Init CUDA: CPU +39, GPU +0, now: CPU 1326, GPU 1892 (MiB)
[V] [MemUsageChange] Init builder kernel library: CPU +1012, GPU +8, now: CPU 2535, GPU 1900 (MiB)
[V] ----------------------------------------------------------------
[V] Input filename:   sdpa_visal.onnx
[V] ONNX IR version:  0.0.9
[V] Opset version:    20
[V] Producer name:    pytorch
[V] Producer version: 2.8.0
[V] Domain:
[V] Model version:    0
[V] Doc string:
[V] ----------------------------------------------------------------
[V] Setting TensorRT Optimization Profiles
[V] Input tensor: query (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[V] Input tensor: key (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[V] Input tensor: value (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[I] Configuring with profiles:[
        Profile 0:
            {query [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]],
             key [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]],
             value [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]]}
    ]
[W] profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect.
[I] Building engine with configuration:
    Flags                  | []
    Engine Capability      | EngineCapability.STANDARD
    Memory Pools           | [WORKSPACE: 45596.06 MiB, TACTIC_DRAM: 45596.06 MiB, TACTIC_SHARED_MEMORY: 1024.00 MiB]
    Tactic Sources         | [EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
    Profiling Verbosity    | ProfilingVerbosity.DETAILED
    Preview Features       | [PROFILE_SHARING_0806]
[V] Global timing cache in use. Profiling results in this builder pass will be stored.
[V] Compiler backend is used during engine build.
[V] Detected 3 inputs and 1 output network tensors.
[V] Total Host Persistent Memory: 80 bytes
[V] Total Device Persistent Memory: 0 bytes
[V] Max Scratch Memory: 1077608448 bytes
[V] [BlockAssignment] Started assigning block shifts. This will take 1 steps to complete.
[V] [BlockAssignment] Algorithm ShiftNTopDown took 0.070255ms to assign 1 blocks to 1 nodes requiring 1077608448 bytes.
[V] Total Activation Memory: 1077608448 bytes
[V] Total Weights Memory: 5313280 bytes
[V] Compiler backend is used during engine execution.
[V] Engine generation completed in 7.23125 seconds.
[V] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 0 MiB, GPU 1175 MiB
[I] Finished engine building in 7.280 seconds
[V] Loaded engine size: 5 MiB
[V] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +1027, now: CPU 0, GPU 1032 (MiB)
[V] Found candidate CUDA libraries: ['/usr/local/cuda/lib64/libcudart.so.13', '/usr/local/cuda/lib64/libcudart.so.13.0.96', '/usr/local/cuda/lib64/libcudart.so']
[I] trt-runner-N0-11/24/25-11:14:23
    ---- Inference Input(s) ----
    {query [dtype=float32, shape=(8, 1, 16, 1008, 72)],
     key [dtype=float32, shape=(8, 1, 16, 1008, 72)],
     value [dtype=float32, shape=(8, 1, 16, 1008, 72)]}
[V] trt-runner-N0-11/24/25-11:14:23     | Input metadata is: {query [dtype=float32, shape=(-1, 1, 16, -1, 72)],
     key [dtype=float32, shape=(-1, 1, 16, -1, 72)],
     value [dtype=float32, shape=(-1, 1, 16, -1, 72)]}
[I] trt-runner-N0-11/24/25-11:14:23
    ---- Inference Output(s) ----
    {output [dtype=float32, shape=(8064, 1152)]}
[I] trt-runner-N0-11/24/25-11:14:23     | Completed 1 iteration(s) in 52.44 ms | Average inference time: 52.44 ms.
[V] Successfully ran: ['onnxrt-runner-N0-11/24/25-11:14:23', 'trt-runner-N0-11/24/25-11:14:23']
[I] Accuracy Comparison | onnxrt-runner-N0-11/24/25-11:14:23 vs. trt-runner-N0-11/24/25-11:14:23
[I]     Comparing Output: 'output' (dtype=float32, shape=(8064, 1152)) with 'output' (dtype=float32, shape=(8064, 1152))
[I]         Tolerance: [abs=1e-05, rel=1e-05] | Checking elemwise error
[I]         onnxrt-runner-N0-11/24/25-11:14:23: output | Stats: mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027323, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[I]             ---- Histogram ----
                Bin Range      |  Num Elems | Visualization
                (-183 , -148 ) |         56 |
                (-148 , -113 ) |       3417 |
                (-113 , -77.6) |      89680 | #
                (-77.6, -42.3) |     857766 | #########
                (-42.3, -7.02) |    2919742 | ################################
                (-7.02, 28.3 ) |    3577105 | ########################################
                (28.3 , 63.5 ) |    1578264 | #################
                (63.5 , 98.8 ) |     249245 | ##
                (98.8 , 134  ) |      14161 |
                (134  , 169  ) |        292 |
[I]         trt-runner-N0-11/24/25-11:14:23: output | Stats: mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027359, min=-183.4 at (7065, 945), max=169.36 at (6455, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[I]             ---- Histogram ----
                Bin Range      |  Num Elems | Visualization
                (-183 , -148 ) |         56 |
                (-148 , -113 ) |       3417 |
                (-113 , -77.6) |      89680 | #
                (-77.6, -42.3) |     857768 | #########
                (-42.3, -7.02) |    2919742 | ################################
                (-7.02, 28.3 ) |    3577103 | ########################################
                (28.3 , 63.5 ) |    1578264 | #################
                (63.5 , 98.8 ) |     249245 | ##
                (98.8 , 134  ) |      14161 |
                (134  , 169  ) |        292 |
[I]         Error Metrics: output
[I]             Minimum Required Tolerance: elemwise error | [abs=257.55] OR [rel=1.7841e+07] (requirements may be lower if both abs/rel tolerances are set)
[I]             Absolute Difference | Stats: mean=37.626, std-dev=28.434, var=808.52, median=31.813, min=0 at (0, 34), max=257.55 at (3266, 476), avg-magnitude=37.626, p90=77.559, p95=92.425, p99=121.51
[I]                 ---- Histogram ----
                    Bin Range    |  Num Elems | Visualization
                    (0   , 25.8) |    3855289 | ########################################
                    (25.8, 51.5) |    2882361 | #############################
                    (51.5, 77.3) |    1611140 | ################
                    (77.3, 103 ) |     672056 | ######
                    (103 , 129 ) |     209901 | ##
                    (129 , 155 ) |      49154 |
                    (155 , 180 ) |       8586 |
                    (180 , 206 ) |       1118 |
                    (206 , 232 ) |        116 |
                    (232 , 258 ) |          7 |
[I]             Relative Difference | Stats: mean=11.63, std-dev=6013.3, var=3.6159e+07, median=1.4141, min=0 at (0, 34), max=1.7841e+07 at (6921, 21), avg-magnitude=11.63, p90=6.4694, p95=12.812, p99=63.647
[I]                 ---- Histogram ----
                    Bin Range            |  Num Elems | Visualization
                    (0       , 1.78e+06) |    9289726 | ########################################
                    (1.78e+06, 3.57e+06) |          1 |
                    (3.57e+06, 5.35e+06) |          0 |
                    (5.35e+06, 7.14e+06) |          0 |
                    (7.14e+06, 8.92e+06) |          0 |
                    (8.92e+06, 1.07e+07) |          0 |
                    (1.07e+07, 1.25e+07) |          0 |
                    (1.25e+07, 1.43e+07) |          0 |
                    (1.43e+07, 1.61e+07) |          0 |
                    (1.61e+07, 1.78e+07) |          1 |
[E]         FAILED | Output: 'output' | Difference exceeds tolerance (rel=1e-05, abs=1e-05)
[E]     FAILED | Mismatched outputs: ['output']
[E] Accuracy Summary | onnxrt-runner-N0-11/24/25-11:14:23 vs. trt-runner-N0-11/24/25-11:14:23 | Passed: 0/1 iterations | Pass Rate: 0.0%
[I] Output Validation | Runners: ['onnxrt-runner-N0-11/24/25-11:14:23', 'trt-runner-N0-11/24/25-11:14:23']
[I]     onnxrt-runner-N0-11/24/25-11:14:23  | Validating output: output (check_inf=True, check_nan=True)
[I]         mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027323, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[V]             ---- Histogram ----
                Bin Range      |  Num Elems | Visualization
                (-183 , -148 ) |         56 |
                (-148 , -113 ) |       3417 |
                (-113 , -77.6) |      89680 | #
                (-77.6, -42.3) |     857766 | #########
                (-42.3, -7.02) |    2919744 | ################################
                (-7.02, 28.3 ) |    3577103 | ########################################
                (28.3 , 63.5 ) |    1578264 | #################
                (63.5 , 98.8 ) |     249245 | ##
                (98.8 , 134  ) |      14161 |
                (134  , 169  ) |        292 |
[I]         PASSED | Output: output is valid
[I]     trt-runner-N0-11/24/25-11:14:23     | Validating output: output (check_inf=True, check_nan=True)
[I]         mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027359, min=-183.4 at (7065, 945), max=169.36 at (6455, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[V]             ---- Histogram ----
                Bin Range      |  Num Elems | Visualization
                (-183 , -148 ) |         56 |
                (-148 , -113 ) |       3417 |
                (-113 , -77.6) |      89680 | #
                (-77.6, -42.3) |     857766 | #########
                (-42.3, -7.02) |    2919742 | ################################
                (-7.02, 28.3 ) |    3577104 | ########################################
                (28.3 , 63.5 ) |    1578265 | #################
                (63.5 , 98.8 ) |     249245 | ##
                (98.8 , 134  ) |      14161 |
                (134  , 169  ) |        292 |
[I]         PASSED | Output: output is valid
[I]     PASSED | Output Validation
[E] FAILED | Runtime: 15.272s | Command: /usr/local/bin/polygraphy run ./sdpa_visal.onnx --model-type onnx --onnxrt --trt --validate --val-range [-100,100] --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --log-file pg_bf16.log --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine --infinities-compare-equal --verbose

A workaround is to mark a debug tensor; could this be a layer-fusion error?

--mark-debug /Reshape_6_output_0 \
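
For reference, the layer fusions the builder actually applied can be dumped from the saved engine with standard trtexec options (a sketch; the shapes simply mirror the ones used above), to check whether marking the debug tensor changes which layers end up in the fused attention kernel:

trtexec --loadEngine=./sdpa_visal_tensorrtx13_L20_BF16.engine \
        --shapes=query:8x1x16x1008x72,key:8x1x16x1008x72,value:8x1x16x1008x72 \
        --dumpLayerInfo \
        --exportLayerInfo=./sdpa_visal_layerinfo.json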

Q2: With BF16, the precision difference is sometimes small and sometimes large after adding

--bf16 \
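
For context on the BF16 case, the error that plain bfloat16 rounding introduces on inputs in [-100, 100] can be estimated directly. A rough sketch, which only bounds the per-element cast error, not the error accumulated through the matmuls and the softmax over a 1008-long sequence:

import torch

# bfloat16 stores 7 mantissa bits, so the relative rounding error is at most about 2**-8 (~0.4%).
x = torch.rand(1_000_000, dtype=torch.float32) * 200 - 100
err = (x.to(torch.bfloat16).to(torch.float32) - x).abs()
print('max abs cast error:', err.max().item())                       # roughly 0.25 for |x| near 100
print('max rel cast error:', (err / x.abs().clamp_min(1e-6)).max().item())  # clamp guards against x ~ 0

This only bounds the cast error of individual elements; error accumulated through the two matmuls and the softmax can be larger, which is relevant when judging the "large" BF16 differences shown below.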

The verbose log for the run where the ONNX and TensorRT outputs differ by a large margin is:

[I] RUNNING | Command: /usr/local/bin/polygraphy run ./sdpa_visal.onnx --model-type onnx --onnxrt --trt --validate --val-range [-100,100] --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --bf16 --log-file pg_bf16.log --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine --infinities-compare-equal --mark-debug /Reshape_6_output_0 --verbose
[V] Loaded Module: polygraphy | Version: 0.49.26 | Path: ['/usr/local/lib/python3.12/dist-packages/polygraphy']
[V] Loaded extension modules: []
[I] Will generate inference input data according to provided TensorMetadata: {query [shape=(8, 1, 16, 1008, 72)],
     key [shape=(8, 1, 16, 1008, 72)],
     value [shape=(8, 1, 16, 1008, 72)]}
[I] TF32 is disabled by default. Turn on TF32 for better performance with minor accuracy differences.
[I] onnxrt-runner-N0-11/24/25-10:59:43  | Activating and starting inference
[V] Loaded Module: onnxruntime | Version: 1.23.1 | Path: ['/usr/local/lib/python3.12/dist-packages/onnxruntime']
[I] Creating ONNX-Runtime Inference Session with providers: ['CPUExecutionProvider']
[V] Loading inputs from data loader
[V] Generating data using numpy seed: 1
[V] Loaded Module: numpy | Version: 1.26.4 | Path: ['/usr/local/lib/python3.12/dist-packages/numpy']
[V] Input tensor: query | Generating input data in range: [-100.0, 100.0]
[V] Input tensor: key | Generating input data in range: [-100.0, 100.0]
[V] Input tensor: value | Generating input data in range: [-100.0, 100.0]
[I] onnxrt-runner-N0-11/24/25-10:59:43
    ---- Inference Input(s) ----
    {query [dtype=float32, shape=(8, 1, 16, 1008, 72)],
     key [dtype=float32, shape=(8, 1, 16, 1008, 72)],
     value [dtype=float32, shape=(8, 1, 16, 1008, 72)]}
[V] onnxrt-runner-N0-11/24/25-10:59:43  | Input metadata is: {query [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)],
     key [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)],
     value [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)]}
[I] onnxrt-runner-N0-11/24/25-10:59:43
    ---- Inference Output(s) ----
    {output [dtype=float32, shape=(8064, 1152)]}
[I] onnxrt-runner-N0-11/24/25-10:59:43  | Completed 1 iteration(s) in 107.1 ms | Average inference time: 107.1 ms.
[I] trt-runner-N0-11/24/25-10:59:43     | Activating and starting inference
[V] Loaded Module: tensorrt | Version: 10.13.3.9 | Path: ['/usr/local/lib/python3.12/dist-packages/tensorrt']
[V] [MemUsageChange] Init CUDA: CPU +39, GPU +0, now: CPU 1326, GPU 1892 (MiB)
[V] [MemUsageChange] Init builder kernel library: CPU +1012, GPU +8, now: CPU 2535, GPU 1900 (MiB)
[V] ----------------------------------------------------------------
[V] Input filename:   sdpa_visal.onnx
[V] ONNX IR version:  0.0.9
[V] Opset version:    20
[V] Producer name:    pytorch
[V] Producer version: 2.8.0
[V] Domain:
[V] Model version:    0
[V] Doc string:
[V] ----------------------------------------------------------------
[V] Executing postprocessing step [MarkDebug]
[V] Setting TensorRT Optimization Profiles
[V] Input tensor: query (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[V] Input tensor: key (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[V] Input tensor: value (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[I] Configuring with profiles:[
        Profile 0:
            {query [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]],
             key [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]],
             value [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]]}
    ]
[W] profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect.
[I] Building engine with configuration:
    Flags                  | [BF16]
    Engine Capability      | EngineCapability.STANDARD
    Memory Pools           | [WORKSPACE: 45596.06 MiB, TACTIC_DRAM: 45596.06 MiB, TACTIC_SHARED_MEMORY: 1024.00 MiB]
    Tactic Sources         | [EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
    Profiling Verbosity    | ProfilingVerbosity.DETAILED
    Preview Features       | [PROFILE_SHARING_0806]
[V] Global timing cache in use. Profiling results in this builder pass will be stored.
[V] Compiler backend is used during engine build.
[V] Detected 3 inputs and 1 output network tensors.
[V] Total Host Persistent Memory: 400 bytes
[V] Total Device Persistent Memory: 0 bytes
[V] Max Scratch Memory: 74317824 bytes
[V] [BlockAssignment] Started assigning block shifts. This will take 5 steps to complete.
[V] [BlockAssignment] Algorithm ShiftNTopDown took 0.065404ms to assign 5 blocks to 5 nodes requiring 130056704 bytes.
[V] Total Activation Memory: 130056192 bytes
[V] Total Weights Memory: 2656896 bytes
[V] Compiler backend is used during engine execution.
[V] Engine generation completed in 11.3117 seconds.
[V] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 2 MiB, GPU 1175 MiB
[I] Finished engine building in 11.419 seconds
[V] Loaded engine size: 3 MiB
[V] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +124, now: CPU 0, GPU 126 (MiB)
[V] Found candidate CUDA libraries: ['/usr/local/cuda/lib64/libcudart.so.13', '/usr/local/cuda/lib64/libcudart.so.13.0.96', '/usr/local/cuda/lib64/libcudart.so']
[I] trt-runner-N0-11/24/25-10:59:43
    ---- Inference Input(s) ----
    {query [dtype=float32, shape=(8, 1, 16, 1008, 72)],
     key [dtype=float32, shape=(8, 1, 16, 1008, 72)],
     value [dtype=float32, shape=(8, 1, 16, 1008, 72)]}
[V] trt-runner-N0-11/24/25-10:59:43     | Input metadata is: {query [dtype=float32, shape=(-1, 1, 16, -1, 72)],
     key [dtype=float32, shape=(-1, 1, 16, -1, 72)],
     value [dtype=float32, shape=(-1, 1, 16, -1, 72)]}
[W] Not supported datatype for debug tensor in polygraphy: DataType.BF16
[I] trt-runner-N0-11/24/25-10:59:43
    ---- Inference Output(s) ----
    {output [dtype=float32, shape=(8064, 1152)]}
[I] trt-runner-N0-11/24/25-10:59:43     | Completed 1 iteration(s) in 39.67 ms | Average inference time: 39.67 ms.
[V] Successfully ran: ['onnxrt-runner-N0-11/24/25-10:59:43', 'trt-runner-N0-11/24/25-10:59:43']
[I] Accuracy Comparison | onnxrt-runner-N0-11/24/25-10:59:43 vs. trt-runner-N0-11/24/25-10:59:43
[I]     Comparing Output: 'output' (dtype=float32, shape=(8064, 1152)) with 'output' (dtype=float32, shape=(8064, 1152))
[I]         Tolerance: [abs=1e-05, rel=1e-05] | Checking elemwise error
[I]         onnxrt-runner-N0-11/24/25-10:59:43: output | Stats: mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027323, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[I]             ---- Histogram ----
                Bin Range      |  Num Elems | Visualization
                (-183 , -148 ) |         56 |
                (-148 , -113 ) |       3417 |
                (-113 , -77.6) |      89680 | #
                (-77.6, -42.3) |     857766 | #########
                (-42.3, -7.02) |    2919744 | ################################
                (-7.02, 28.3 ) |    3577103 | ########################################
                (28.3 , 63.5 ) |    1578264 | #################
                (63.5 , 98.8 ) |     249245 | ##
                (98.8 , 134  ) |      14161 |
                (134  , 169  ) |        292 |
[I]         trt-runner-N0-11/24/25-10:59:43: output | Stats: mean=-0.003805, std-dev=33.353, var=1112.4, median=-0.0041504, min=-183 at (1891, 945), max=169 at (7862, 168), avg-magnitude=26.606, p90=42.75, p95=54.75, p99=77.5
[I]             ---- Histogram ----
                Bin Range      |  Num Elems | Visualization
                (-183 , -148 ) |         51 |
                (-148 , -113 ) |       3433 |
                (-113 , -77.6) |      88270 |
                (-77.6, -42.3) |     854851 | #########
                (-42.3, -7.02) |    2924445 | ################################
                (-7.02, 28.3 ) |    3580637 | ########################################
                (28.3 , 63.5 ) |    1575883 | #################
                (63.5 , 98.8 ) |     247656 | ##
                (98.8 , 134  ) |      14216 |
                (134  , 169  ) |        286 |
[I]         Error Metrics: output
[I]             Minimum Required Tolerance: elemwise error | [abs=73.42] OR [rel=1.8111e+05] (requirements may be lower if both abs/rel tolerances are set)
[I]             Absolute Difference | Stats: mean=0.75056, std-dev=3.0463, var=9.28, median=0.07798, min=0 at (7, 1010), max=73.42 at (4784, 235), avg-magnitude=0.75056, p90=0.24767, p95=4.4866, p99=17.254
[I]                 ---- Histogram ----
                    Bin Range    |  Num Elems | Visualization
                    (0   , 7.34) |    8942788 | ########################################
                    (7.34, 14.7) |     209530 |
                    (14.7, 22  ) |      96767 |
                    (22  , 29.4) |      31558 |
                    (29.4, 36.7) |       7466 |
                    (36.7, 44.1) |       1332 |
                    (44.1, 51.4) |        232 |
                    (51.4, 58.7) |         38 |
                    (58.7, 66.1) |         14 |
                    (66.1, 73.4) |          3 |
[I]             Relative Difference | Stats: mean=0.25063, std-dev=70.229, var=4932.1, median=0.0036124, min=0 at (7, 1010), max=1.8111e+05 at (7545, 1136), avg-magnitude=0.25063, p90=0.050332, p95=0.25349, p99=1.7265
[I]                 ---- Histogram ----
                    Bin Range            |  Num Elems | Visualization
                    (0       , 1.81e+04) |    9289718 | ########################################
                    (1.81e+04, 3.62e+04) |          7 |
                    (3.62e+04, 5.43e+04) |          1 |
                    (5.43e+04, 7.24e+04) |          1 |
                    (7.24e+04, 9.06e+04) |          0 |
                    (9.06e+04, 1.09e+05) |          0 |
                    (1.09e+05, 1.27e+05) |          0 |
                    (1.27e+05, 1.45e+05) |          0 |
                    (1.45e+05, 1.63e+05) |          0 |
                    (1.63e+05, 1.81e+05) |          1 |
[E]         FAILED | Output: 'output' | Difference exceeds tolerance (rel=1e-05, abs=1e-05)
[E]     FAILED | Mismatched outputs: ['output']
[E] Accuracy Summary | onnxrt-runner-N0-11/24/25-10:59:43 vs. trt-runner-N0-11/24/25-10:59:43 | Passed: 0/1 iterations | Pass Rate: 0.0%
[I] Output Validation | Runners: ['onnxrt-runner-N0-11/24/25-10:59:43', 'trt-runner-N0-11/24/25-10:59:43']
[I]     onnxrt-runner-N0-11/24/25-10:59:43  | Validating output: output (check_inf=True, check_nan=True)
[I]         mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027323, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[V]             ---- Histogram ----
                Bin Range      |  Num Elems | Visualization
                (-183 , -148 ) |         56 |
                (-148 , -113 ) |       3417 |
                (-113 , -77.6) |      89680 | #
                (-77.6, -42.3) |     857766 | #########
                (-42.3, -7.02) |    2919744 | ################################
                (-7.02, 28.3 ) |    3577103 | ########################################
                (28.3 , 63.5 ) |    1578264 | #################
                (63.5 , 98.8 ) |     249245 | ##
                (98.8 , 134  ) |      14161 |
                (134  , 169  ) |        292 |
[I]         PASSED | Output: output is valid
[I]     trt-runner-N0-11/24/25-10:59:43     | Validating output: output (check_inf=True, check_nan=True)
[I]         mean=-0.003805, std-dev=33.353, var=1112.4, median=-0.0041504, min=-183 at (1891, 945), max=169 at (7862, 168), avg-magnitude=26.606, p90=42.75, p95=54.75, p99=77.5
[V]             ---- Histogram ----
                Bin Range      |  Num Elems | Visualization
                (-183 , -148 ) |         58 |
                (-148 , -113 ) |       3426 |
                (-113 , -77.4) |      92092 | #
                (-77.4, -42.2) |     863549 | #########
                (-42.2, -7   ) |    2911925 | ################################
                (-7   , 28.2 ) |    3570946 | ########################################
                (28.2 , 63.4 ) |    1581023 | #################
                (63.4 , 98.6 ) |     252207 | ##
                (98.6 , 134  ) |      14191 |
                (134  , 169  ) |        311 |
[I]         PASSED | Output: output is valid
[I]     PASSED | Output Validation
[E] FAILED | Runtime: 19.281s | Command: /usr/local/bin/polygraphy run ./sdpa_visal.onnx --model-type onnx --onnxrt --trt --validate --val-range [-100,100] --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --bf16 --log-file pg_bf16.log --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine --infinities-compare-equal --mark-debug /Reshape_6_output_0 --verbose

The corresponding trtexec --dumpProfile log is:

[11/24/2025-11:40:51] [I] === Profile (19 iterations ) ===
[11/24/2025-11:40:51] [I]    Time(ms)     Avg.(ms)   Median(ms)   Time(%)   Layer
[11/24/2025-11:40:51] [I]        0.03       0.0016       0.0016       0.8   __mye31_0_myl0_0
[11/24/2025-11:40:51] [I]        0.09       0.0047       0.0047       2.4   __myl_Cast_myl0_1
[11/24/2025-11:40:51] [I]        0.03       0.0016       0.0016       0.8   __mye31_0_myl1_0
[11/24/2025-11:40:51] [I]        0.09       0.0049       0.0045       2.5   __myl_Cast_myl1_1
[11/24/2025-11:40:51] [I]        0.03       0.0017       0.0017       0.9   __mye31_0_myl2_0
[11/24/2025-11:40:51] [I]        0.11       0.0056       0.0045       2.9   __myl_Cast_myl2_1
[11/24/2025-11:40:51] [I]        0.03       0.0017       0.0016       0.9   __mye8562_0_myl3_0
[11/24/2025-11:40:51] [I]        0.04       0.0019       0.0016       1.0   __mye29289_hc_init_myl3_1
[11/24/2025-11:40:51] [I]        0.55       0.0291       0.0287      15.0   __myl_ReplSlicReplReshTranReshMove_myl3_2
[11/24/2025-11:40:51] [I]        0.09       0.0045       0.0046       2.3   __myl_Move_myl3_3
[11/24/2025-11:40:51] [I]        0.11       0.0056       0.0057       2.9   __myl_ReplSlicReplReshMove_myl3_4
[11/24/2025-11:40:51] [I]        0.47       0.0246       0.0245      12.7   _gemm_mha_v2_myl3_5
[11/24/2025-11:40:51] [I]        0.09       0.0046       0.0046       2.4   __myl_MoveReshTranReshSlic_myl3_6
[11/24/2025-11:40:51] [I]        0.06       0.0032       0.0032       1.6   __myl_Slic_myl3_7
[11/24/2025-11:40:51] [I]        0.25       0.0131       0.0131       6.8   copy_d2h___mye27687_myl3_8
[11/24/2025-11:40:51] [I]        0.04       0.0021       0.0021       1.1   __mye2678cbr_myl3_9
[11/24/2025-11:40:51] [I]        0.05       0.0029       0.0025       1.5   __mye8576_2_myl3_13
[11/24/2025-11:40:51] [I]        0.04       0.0019       0.0017       1.0   jmp__mye2690_myl3_14
[11/24/2025-11:40:51] [I]        1.32       0.0695       0.0637      35.8   /proj/Gemm_myl3_15
[11/24/2025-11:40:51] [I]        0.03       0.0017       0.0017       0.9   __mye45_0_myl4_0
[11/24/2025-11:40:51] [I]        0.14       0.0072       0.0070       3.7   __myl_Cast_myl4_1
[11/24/2025-11:40:51] [I]        3.68       0.1938       0.1865     100.0   Total

The verbose log for the run where the ONNX and TensorRT outputs differ only slightly is:

[I] RUNNING | Command: /usr/local/bin/polygraphy run ./sdpa_visal.onnx --model-type onnx --onnxrt --trt --validate --val-range [-100,100] --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --bf16 --log-file pg_bf16.log --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine --infinities-compare-equal --mark-debug /Reshape_6_output_0 --verbose
[V] Loaded Module: polygraphy | Version: 0.49.26 | Path: ['/usr/local/lib/python3.12/dist-packages/polygraphy']
[V] Loaded extension modules: []
[I] Will generate inference input data according to provided TensorMetadata: {query [shape=(8, 1, 16, 1008, 72)],
     key [shape=(8, 1, 16, 1008, 72)],
     value [shape=(8, 1, 16, 1008, 72)]}
[I] TF32 is disabled by default. Turn on TF32 for better performance with minor accuracy differences.
[I] onnxrt-runner-N0-11/24/25-10:57:58  | Activating and starting inference
[V] Loaded Module: onnxruntime | Version: 1.23.1 | Path: ['/usr/local/lib/python3.12/dist-packages/onnxruntime']
[I] Creating ONNX-Runtime Inference Session with providers: ['CPUExecutionProvider']
[V] Loading inputs from data loader
[V] Generating data using numpy seed: 1
[V] Loaded Module: numpy | Version: 1.26.4 | Path: ['/usr/local/lib/python3.12/dist-packages/numpy']
[V] Input tensor: query | Generating input data in range: [-100.0, 100.0]
[V] Input tensor: key | Generating input data in range: [-100.0, 100.0]
[V] Input tensor: value | Generating input data in range: [-100.0, 100.0]
[I] onnxrt-runner-N0-11/24/25-10:57:58
    ---- Inference Input(s) ----
    {query [dtype=float32, shape=(8, 1, 16, 1008, 72)],
     key [dtype=float32, shape=(8, 1, 16, 1008, 72)],
     value [dtype=float32, shape=(8, 1, 16, 1008, 72)]}
[V] onnxrt-runner-N0-11/24/25-10:57:58  | Input metadata is: {query [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)],
     key [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)],
     value [dtype=float32, shape=('grid_t', 'batch_size', 16, 'grid_hw', 72)]}
[I] onnxrt-runner-N0-11/24/25-10:57:58
    ---- Inference Output(s) ----
    {output [dtype=float32, shape=(8064, 1152)]}
[I] onnxrt-runner-N0-11/24/25-10:57:58  | Completed 1 iteration(s) in 116.1 ms | Average inference time: 116.1 ms.
[I] trt-runner-N0-11/24/25-10:57:58     | Activating and starting inference
[V] Loaded Module: tensorrt | Version: 10.13.3.9 | Path: ['/usr/local/lib/python3.12/dist-packages/tensorrt']
[V] [MemUsageChange] Init CUDA: CPU +39, GPU +0, now: CPU 1326, GPU 1892 (MiB)
[V] [MemUsageChange] Init builder kernel library: CPU +1012, GPU +8, now: CPU 2535, GPU 1900 (MiB)
[V] ----------------------------------------------------------------
[V] Input filename:   sdpa_visal.onnx
[V] ONNX IR version:  0.0.9
[V] Opset version:    20
[V] Producer name:    pytorch
[V] Producer version: 2.8.0
[V] Domain:
[V] Model version:    0
[V] Doc string:
[V] ----------------------------------------------------------------
[V] Executing postprocessing step [MarkDebug]
[V] Setting TensorRT Optimization Profiles
[V] Input tensor: query (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[V] Input tensor: key (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[V] Input tensor: value (dtype=DataType.FLOAT, shape=(-1, -1, 16, -1, 72)) | Setting input tensor shapes to: (min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72])
[I] Configuring with profiles:[
        Profile 0:
            {query [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]],
             key [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]],
             value [min=[1, 1, 16, 4, 72], opt=[1, 1, 16, 4, 72], max=[8, 1, 16, 1008, 72]]}
    ]
[W] profileSharing0806 is on by default in TensorRT 10.0. This flag is deprecated and has no effect.
[I] Building engine with configuration:
    Flags                  | [BF16]
    Engine Capability      | EngineCapability.STANDARD
    Memory Pools           | [WORKSPACE: 45596.06 MiB, TACTIC_DRAM: 45596.06 MiB, TACTIC_SHARED_MEMORY: 1024.00 MiB]
    Tactic Sources         | [EDGE_MASK_CONVOLUTIONS, JIT_CONVOLUTIONS]
    Profiling Verbosity    | ProfilingVerbosity.DETAILED
    Preview Features       | [PROFILE_SHARING_0806]
[V] Global timing cache in use. Profiling results in this builder pass will be stored.
[V] Compiler backend is used during engine build.
[V] Detected 3 inputs and 1 output network tensors.
[V] Total Host Persistent Memory: 80 bytes
[V] Total Device Persistent Memory: 0 bytes
[V] Max Scratch Memory: 1077608448 bytes
[V] [BlockAssignment] Started assigning block shifts. This will take 1 steps to complete.
[V] [BlockAssignment] Algorithm ShiftNTopDown took 0.116556ms to assign 1 blocks to 1 nodes requiring 1077608448 bytes.
[V] Total Activation Memory: 1077608448 bytes
[V] Total Weights Memory: 5313282 bytes
[V] Compiler backend is used during engine execution.
[V] Engine generation completed in 11.2242 seconds.
[V] [MemUsageStats] Peak memory usage of TRT CPU/GPU memory allocators: CPU 2 MiB, GPU 1175 MiB
[I] Finished engine building in 11.322 seconds
[V] Loaded engine size: 5 MiB
[V] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +1027, now: CPU 0, GPU 1032 (MiB)
[V] Found candidate CUDA libraries: ['/usr/local/cuda/lib64/libcudart.so.13', '/usr/local/cuda/lib64/libcudart.so.13.0.96', '/usr/local/cuda/lib64/libcudart.so']
[I] trt-runner-N0-11/24/25-10:57:58
    ---- Inference Input(s) ----
    {query [dtype=float32, shape=(8, 1, 16, 1008, 72)],
     key [dtype=float32, shape=(8, 1, 16, 1008, 72)],
     value [dtype=float32, shape=(8, 1, 16, 1008, 72)]}
[V] trt-runner-N0-11/24/25-10:57:58     | Input metadata is: {query [dtype=float32, shape=(-1, 1, 16, -1, 72)],
     key [dtype=float32, shape=(-1, 1, 16, -1, 72)],
     value [dtype=float32, shape=(-1, 1, 16, -1, 72)]}
[V] Loaded Module: onnx | Version: 1.19.1 | Path: ['/usr/local/lib/python3.12/dist-packages/onnx']
[I] trt-runner-N0-11/24/25-10:57:58
    ---- Inference Output(s) ----
    {output [dtype=float32, shape=(8064, 1152)],
     /Reshape_6_output_0 [dtype=float32, shape=(8064, 1152)]}
[I] trt-runner-N0-11/24/25-10:57:58     | Completed 1 iteration(s) in 115.4 ms | Average inference time: 115.4 ms.
[V] Successfully ran: ['onnxrt-runner-N0-11/24/25-10:57:58', 'trt-runner-N0-11/24/25-10:57:58']
[I] Accuracy Comparison | onnxrt-runner-N0-11/24/25-10:57:58 vs. trt-runner-N0-11/24/25-10:57:58
[I]     Comparing Output: 'output' (dtype=float32, shape=(8064, 1152)) with 'output' (dtype=float32, shape=(8064, 1152))
[I]         Tolerance: [abs=1e-05, rel=1e-05] | Checking elemwise error
[I]         onnxrt-runner-N0-11/24/25-10:57:58: output | Stats: mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027323, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[I]             ---- Histogram ----
                Bin Range      |  Num Elems | Visualization
                (-183 , -148 ) |         56 |
                (-148 , -113 ) |       3417 |
                (-113 , -77.6) |      89680 | #
                (-77.6, -42.3) |     857766 | #########
                (-42.3, -7.02) |    2919742 | ################################
                (-7.02, 28.3 ) |    3577105 | ########################################
                (28.3 , 63.5 ) |    1578264 | #################
                (63.5 , 98.8 ) |     249245 | ##
                (98.8 , 134  ) |      14161 |
                (134  , 169  ) |        292 |
[I]         trt-runner-N0-11/24/25-10:57:58: output | Stats: mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027359, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[I]             ---- Histogram ----
                Bin Range      |  Num Elems | Visualization
                (-183 , -148 ) |         56 |
                (-148 , -113 ) |       3417 |
                (-113 , -77.6) |      89680 | #
                (-77.6, -42.3) |     857768 | #########
                (-42.3, -7.02) |    2919742 | ################################
                (-7.02, 28.3 ) |    3577103 | ########################################
                (28.3 , 63.5 ) |    1578264 | #################
                (63.5 , 98.8 ) |     249245 | ##
                (98.8 , 134  ) |      14161 |
                (134  , 169  ) |        292 |
[I]         Error Metrics: output
[I]             Minimum Required Tolerance: elemwise error | [abs=0.00010681] OR [rel=7.3592] (requirements may be lower if both abs/rel tolerances are set)
[I]             Absolute Difference | Stats: mean=1.0341e-05, std-dev=8.4552e-06, var=7.149e-11, median=7.6294e-06, min=0 at (0, 34), max=0.00010681 at (697, 420), avg-magnitude=1.0341e-05, p90=2.2888e-05, p95=2.6703e-05, p99=3.8147e-05
[I]                 ---- Histogram ----
                    Bin Range            |  Num Elems | Visualization
                    (0       , 1.07e-05) |    5499357 | ########################################
                    (1.07e-05, 2.14e-05) |    2785803 | ####################
                    (2.14e-05, 3.2e-05 ) |     804906 | #####
                    (3.2e-05 , 4.27e-05) |     162051 | #
                    (4.27e-05, 5.34e-05) |      27775 |
                    (5.34e-05, 6.41e-05) |       8595 |
                    (6.41e-05, 7.48e-05) |        983 |
                    (7.48e-05, 8.54e-05) |        232 |
                    (8.54e-05, 9.61e-05) |         22 |
                    (9.61e-05, 0.000107) |          4 |
[I]             Relative Difference | Stats: mean=4.4547e-06, std-dev=0.0026289, var=6.9112e-06, median=3.8218e-07, min=0 at (0, 34), max=7.3592 at (4703, 130), avg-magnitude=4.4547e-06, p90=2.2527e-06, p95=4.5171e-06, p99=2.2579e-05
[I]                 ---- Histogram ----
                    Bin Range      |  Num Elems | Visualization
                    (0    , 0.736) |    9289725 | ########################################
                    (0.736, 1.47 ) |          1 |
                    (1.47 , 2.21 ) |          0 |
                    (2.21 , 2.94 ) |          1 |
                    (2.94 , 3.68 ) |          0 |
                    (3.68 , 4.42 ) |          0 |
                    (4.42 , 5.15 ) |          0 |
                    (5.15 , 5.89 ) |          0 |
                    (5.89 , 6.62 ) |          0 |
                    (6.62 , 7.36 ) |          1 |
[E]         FAILED | Output: 'output' | Difference exceeds tolerance (rel=1e-05, abs=1e-05)
[E]     FAILED | Mismatched outputs: ['output']
[E] Accuracy Summary | onnxrt-runner-N0-11/24/25-10:57:58 vs. trt-runner-N0-11/24/25-10:57:58 | Passed: 0/1 iterations | Pass Rate: 0.0%
[I] Output Validation | Runners: ['onnxrt-runner-N0-11/24/25-10:57:58', 'trt-runner-N0-11/24/25-10:57:58']
[I]     onnxrt-runner-N0-11/24/25-10:57:58  | Validating output: output (check_inf=True, check_nan=True)
[I]         mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027323, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[V]             ---- Histogram ----
                Bin Range      |  Num Elems | Visualization
                (-183 , -148 ) |         56 |
                (-148 , -113 ) |       3417 |
                (-113 , -77.6) |      89680 | #
                (-77.6, -42.3) |     857766 | #########
                (-42.3, -7.02) |    2919744 | ################################
                (-7.02, 28.3 ) |    3577103 | ########################################
                (28.3 , 63.5 ) |    1578264 | #################
                (63.5 , 98.8 ) |     249245 | ##
                (98.8 , 134  ) |      14161 |
                (134  , 169  ) |        292 |
[I]         PASSED | Output: output is valid
[I]     trt-runner-N0-11/24/25-10:57:58     | Validating output: output (check_inf=True, check_nan=True)
[I]         mean=-0.0052821, std-dev=33.354, var=1112.5, median=-0.0027359, min=-183.4 at (1891, 945), max=169.36 at (7862, 168), avg-magnitude=26.606, p90=42.716, p95=54.842, p99=77.604
[V]             ---- Histogram ----
                Bin Range      |  Num Elems | Visualization
                (-183 , -148 ) |         56 |
                (-148 , -113 ) |       3417 |
                (-113 , -77.6) |      89680 | #
                (-77.6, -42.3) |     857766 | #########
                (-42.3, -7.02) |    2919742 | ################################
                (-7.02, 28.3 ) |    3577104 | ########################################
                (28.3 , 63.5 ) |    1578265 | #################
                (63.5 , 98.8 ) |     249245 | ##
                (98.8 , 134  ) |      14161 |
                (134  , 169  ) |        292 |
[I]         PASSED | Output: output is valid
[I]     trt-runner-N0-11/24/25-10:57:58     | Validating output: /Reshape_6_output_0 (check_inf=True, check_nan=True)
[I]         mean=0.01887, std-dev=57.743, var=3334.3, median=0.044686, min=-100 at (1035, 453), max=100 at (7729, 947), avg-magnitude=50.008, p90=80.029, p95=90.007, p99=98.006
[V]             ---- Histogram ----
                Bin Range              |  Num Elems | Visualization
                (-100     , -80      ) |     928973 | #######################################
                (-80      , -60      ) |     927919 | #######################################
                (-60      , -40      ) |     930346 | ########################################
                (-40      , -20      ) |     928158 | #######################################
                (-20      , -1.53e-05) |     927496 | #######################################
                (-1.53e-05, 20       ) |     930290 | #######################################
                (20       , 40       ) |     928940 | #######################################
                (40       , 60       ) |     927104 | #######################################
                (60       , 80       ) |     930178 | #######################################
                (80       , 100      ) |     930324 | #######################################
[I]         PASSED | Output: /Reshape_6_output_0 is valid
[I]     PASSED | Output Validation
[E] FAILED | Runtime: 19.750s | Command: /usr/local/bin/polygraphy run ./sdpa_visal.onnx --model-type onnx --onnxrt --trt --validate --val-range [-100,100] --trt-min-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-opt-shapes query:[1,1,16,4,72] key:[1,1,16,4,72] value:[1,1,16,4,72] --trt-max-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --input-shapes query:[8,1,16,1008,72] key:[8,1,16,1008,72] value:[8,1,16,1008,72] --bf16 --log-file pg_bf16.log --save-engine ./sdpa_visal_tensorrtx13_L20_BF16.engine --infinities-compare-equal --mark-debug /Reshape_6_output_0 --verbose

The corresponding trtexec --dumpProfile log is:

[11/24/2025-11:42:18] [I] === Profile (19 iterations ) ===
[11/24/2025-11:42:18] [I]    Time(ms)     Avg.(ms)   Median(ms)   Time(%)   Layer
[11/24/2025-11:42:18] [I]        0.03       0.0018       0.0016       0.6   __mye8562_0_myl0_0
[11/24/2025-11:42:18] [I]        0.04       0.0019       0.0015       0.6   __mye30067_hc_init_myl0_1
[11/24/2025-11:42:18] [I]        0.44       0.0234       0.0220       7.5   __myl_ReplSlicReplReshTran_myl0_2
[11/24/2025-11:42:18] [I]        0.52       0.0276       0.0219       8.9   /MatMul_myl0_3
[11/24/2025-11:42:18] [I]        0.67       0.0354       0.0354      11.4   __myl_MaxrSubExpSumDivMul_myl0_4
[11/24/2025-11:42:18] [I]        0.14       0.0075       0.0072       2.4   __myl_ReplSlicRepl_myl0_5
[11/24/2025-11:42:18] [I]        0.51       0.0266       0.0265       8.6   /MatMul_1_myl0_6
[11/24/2025-11:42:18] [I]        0.10       0.0054       0.0054       1.7   __myl_TranReshSlic_myl0_7
[11/24/2025-11:42:18] [I]        0.06       0.0032       0.0032       1.0   __myl_Slic_myl0_8
[11/24/2025-11:42:18] [I]        0.21       0.0112       0.0106       3.6   copy_d2h___mye27831_myl0_9
[11/24/2025-11:42:18] [I]        0.05       0.0024       0.0024       0.8   __mye2678cbr_myl0_10
[11/24/2025-11:42:18] [I]        0.06       0.0032       0.0031       1.0   __mye8576_2_myl0_14
[11/24/2025-11:42:18] [I]        0.05       0.0024       0.0023       0.8   jmp__mye2690_myl0_15
[11/24/2025-11:42:18] [I]        3.01       0.1584       0.1575      51.1   /proj/Gemm_myl0_16
[11/24/2025-11:42:18] [I]        5.90       0.3103       0.2998     100.0   Total
[11/24/2025-11:42:18] [I]

The difference between the small-difference and large-difference verbose logs seems to be this warning:

[W] Not supported datatype for debug tensor in polygraphy: DataType.BF16
