Add CMake option to enable saturation checker for ConvSymKernelAvx2 (microsoft#24220)

yihonglyu · web-flow · commit 7e0ee2bbe5c7 · 2025-04-27T23:46:40.000-07:00
### Description
<!-- Describe your changes. -->

This PR adds a new CMake option:
onnxruntime_ENABLE_CONVSYMKERNELAVX2_SAT_CHECKER. When enabled, this
option activates a saturation checker for the VPMADDUBSW instruction
used in the ConvSymKernelAvx2 path.

The checker works by calling a helper function before each VPMADDUBSW
instruction. This function simulates the computation using C++ and
intrinsics with higher-precision types (int32_t) to detect whether the
result exceeds the bounds of int16_t (i.e., greater than INT16_MAX or
less than INT16_MIN).

By default, the checker logs a warning only once per inference run (the counter is reset at the start of each `InferenceSession::Run` call).
However, the logic can be easily extended to print more frequently if
needed. Developers can also reuse this pattern to implement similar
saturation checks for other instructions.

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->

On some models running with AVX2 (instead of AVX-VNNI), we've observed
accuracy degradation due to saturation in vectorized instructions. This
saturation checker provides a way to debug and detect those cases by
reporting potential overflow in intermediate computations.
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -65,6 +65,7 @@ option(onnxruntime_REDIRECT_STATIC_ANALYSIS_OUTPUTS_TO_FILE "Use a custom SDL Ru
 option(onnxruntime_ENABLE_PYTHON "Enable python bindings" OFF)
 # Enable it may cause LNK1169 error
 option(onnxruntime_ENABLE_MEMLEAK_CHECKER "Experimental: Enable memory leak checker in Windows debug build" OFF)
+# Debugging aid: when ON, onnxruntime_mlas.cmake assembles ConvSymKernelAvx2 with
+# -DENABLE_CONVSYMKERNELAVX2_SAT_CHECKER, which makes the kernel call a C++ helper before each VPMADDUBSW
+# to report operands whose pairwise sum would saturate int16. Leave OFF for normal builds.
+option(onnxruntime_ENABLE_CONVSYMKERNELAVX2_SAT_CHECKER "Experimental: Enable ConvSymKernelAvx2 assembly saturation checker in build" OFF)
 option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
 # Enable ONNX Runtime CUDA EP's internal unit tests that directly access the EP's internal functions instead of through
 # OpKernels. When the option is ON, we will have two copies of GTest library in the same process. It is not a typical
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
@@ -46,6 +46,7 @@ onnxruntime_add_static_library(onnxruntime_mlas
   ${MLAS_SRC_DIR}/rotary_embedding.h
   ${MLAS_SRC_DIR}/rotary_embedding.cpp
   ${MLAS_SRC_DIR}/softmax.h
+  ${MLAS_SRC_DIR}/saturation_check.cpp
 )
 
 target_sources(onnxruntime_mlas PRIVATE
@@ -239,6 +240,10 @@ function(setup_mlas_source_for_windows)
       ${MLAS_SRC_DIR}/amd64/ErfKernelFma3.asm
     )
 
+    # Opt-in saturation checker: pass the ENABLE_CONVSYMKERNELAVX2_SAT_CHECKER define to the assembler for
+    # the AVX2 ConvSym kernel so that its IFDEF-guarded CheckSaturation calls are emitted.
+    if(onnxruntime_ENABLE_CONVSYMKERNELAVX2_SAT_CHECKER)
+      set_property(
+        SOURCE ${MLAS_SRC_DIR}/amd64/ConvSymKernelAvx2.asm
+        PROPERTY COMPILE_FLAGS "-DENABLE_CONVSYMKERNELAVX2_SAT_CHECKER"
+      )
+    endif()
+
     if(MSVC_VERSION GREATER_EQUAL 1933)
       target_sources(onnxruntime_mlas PRIVATE
         ${MLAS_SRC_DIR}/amd64/cvtfp16Avx.asm
@@ -637,6 +642,7 @@ else()
           ${MLAS_SRC_DIR}/x86_64/ErfKernelFma3.S
           ${MLAS_SRC_DIR}/intrinsics/avx2/qladd_avx2.cpp
           ${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
+          ${MLAS_SRC_DIR}/intrinsics/avx2/saturation_check_avx2.cpp
           ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
           ${MLAS_SRC_DIR}/rotary_embedding_kernel_avx2.h
           ${MLAS_SRC_DIR}/rotary_embedding_kernel_avx2.cpp
@@ -716,6 +722,10 @@ endif()
           set_source_files_properties(${MLAS_SRC_DIR}/x86_64/QgemmU8S8KernelAmx.S PROPERTIES COMPILE_FLAGS "-mavx2 -mavx512bw -mavx512dq -mavx512vl -mavx512f")
         endif()
 
+        # Opt-in saturation checker: assemble the AVX2 ConvSym kernel with
+        # ENABLE_CONVSYMKERNELAVX2_SAT_CHECKER defined. This assignment replaces the file's COMPILE_FLAGS,
+        # which is why the ISA flags are spelled out again next to the define.
+        if(onnxruntime_ENABLE_CONVSYMKERNELAVX2_SAT_CHECKER)
+          set_property(
+            SOURCE ${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx2.S
+            PROPERTY COMPILE_FLAGS "-mavx2 -mfma -mf16c -DENABLE_CONVSYMKERNELAVX2_SAT_CHECKER"
+          )
+        endif()
+
         if(ONNXRUNTIME_MLAS_MULTI_ARCH)
           onnxruntime_add_static_library(onnxruntime_mlas_x86_64 ${mlas_platform_srcs})
           set_target_properties(onnxruntime_mlas_x86_64 PROPERTIES OSX_ARCHITECTURES "x86_64")
diff --git a/onnxruntime/core/mlas/lib/amd64/ConvSymKernelAvx2.asm b/onnxruntime/core/mlas/lib/amd64/ConvSymKernelAvx2.asm
@@ -23,6 +23,87 @@ INCLUDE ConvSymKernelCommon.inc
 INCLUDE AssembleAvxVnni.inc
         .list
 
+extern CheckSaturationForVPMADDUBSW:proc
+
+CheckSaturation MACRO VecReg1Num, VecReg2Num
+
+;
+; Macro Description:
+;
+;   This macro snapshots YMM0-YMM15 to the stack and calls
+;   CheckSaturationForVPMADDUBSW with pointers to the saved copies of the two
+;   registers that are about to be multiplied by VPMADDUBSW.
+;
+; Arguments:
+;
+;   VecReg1Num - Supplies the number (0-15) of the YMM register that holds the
+;       unsigned byte operand.
+;
+;   VecReg2Num - Supplies the number (0-15) of the YMM register that holds the
+;       signed byte operand.
+;
+; Notes:
+;
+;   The integer registers saved below and YMM0-YMM15 are restored before the
+;   macro ends. RFLAGS is modified by the stack pointer adjustments.
+;
+;   Stack layout while the call is in progress:
+;
+;       [rsp+0  .. rsp+31 ]     home (shadow) space owned by the callee
+;       [rsp+32 .. rsp+543]     saved YMM0-YMM15, 32 bytes each
+;
+;   The home space is required by the Windows x64 calling convention. Without
+;   it the callee is free to spill its register arguments over the saved YMM0,
+;   which corrupts both an operand being checked and the value restored after
+;   the call.
+;
+;   NOTE(review): the total stack adjustment is 7*8 + 544 = 600 bytes, so rsp
+;   modulo 16 at the call is shifted by 8 relative to the expansion site (the
+;   same parity as before the home space was added). Confirm that the kernel
+;   frame leaves rsp 16-byte aligned at the call.
+;
+
+;
+; Save the volatile integer registers (RAX, RCX, RDX, R8, R9, R10, R11). RSI
+; and RDI are non-volatile in the Windows x64 calling convention and are
+; preserved by the callee.
+;
+; Plain push is used instead of the push_reg prologue helper: this code runs in
+; the function body, where unwind directives must not be emitted.
+;
+
+        push    rax
+        push    rcx
+        push    rdx
+        push    r8
+        push    r9
+        push    r10
+        push    r11
+
+        sub     rsp, 32+512                     ; 32 bytes of home space + 16 YMM registers (32 bytes each)
+
+;
+; Save YMM registers (YMM0 to YMM15) above the home space.
+;
+
+        vmovdqu  YMMWORD PTR [rsp+32], ymm0
+        vmovdqu  YMMWORD PTR [rsp+64], ymm1
+        vmovdqu  YMMWORD PTR [rsp+96], ymm2
+        vmovdqu  YMMWORD PTR [rsp+128], ymm3
+        vmovdqu  YMMWORD PTR [rsp+160], ymm4
+        vmovdqu  YMMWORD PTR [rsp+192], ymm5
+        vmovdqu  YMMWORD PTR [rsp+224], ymm6
+        vmovdqu  YMMWORD PTR [rsp+256], ymm7
+        vmovdqu  YMMWORD PTR [rsp+288], ymm8
+        vmovdqu  YMMWORD PTR [rsp+320], ymm9
+        vmovdqu  YMMWORD PTR [rsp+352], ymm10
+        vmovdqu  YMMWORD PTR [rsp+384], ymm11
+        vmovdqu  YMMWORD PTR [rsp+416], ymm12
+        vmovdqu  YMMWORD PTR [rsp+448], ymm13
+        vmovdqu  YMMWORD PTR [rsp+480], ymm14
+        vmovdqu  YMMWORD PTR [rsp+512], ymm15
+
+        lea rcx, [rsp+32+32*VecReg1Num]         ; first operand (unsigned)
+        lea rdx, [rsp+32+32*VecReg2Num]         ; second operand (signed)
+
+        call    CheckSaturationForVPMADDUBSW
+
+;
+; Restore YMM registers.
+;
+
+        vmovdqu  ymm0, YMMWORD PTR [rsp+32]
+        vmovdqu  ymm1, YMMWORD PTR [rsp+64]
+        vmovdqu  ymm2, YMMWORD PTR [rsp+96]
+        vmovdqu  ymm3, YMMWORD PTR [rsp+128]
+        vmovdqu  ymm4, YMMWORD PTR [rsp+160]
+        vmovdqu  ymm5, YMMWORD PTR [rsp+192]
+        vmovdqu  ymm6, YMMWORD PTR [rsp+224]
+        vmovdqu  ymm7, YMMWORD PTR [rsp+256]
+        vmovdqu  ymm8, YMMWORD PTR [rsp+288]
+        vmovdqu  ymm9, YMMWORD PTR [rsp+320]
+        vmovdqu  ymm10, YMMWORD PTR [rsp+352]
+        vmovdqu  ymm11, YMMWORD PTR [rsp+384]
+        vmovdqu  ymm12, YMMWORD PTR [rsp+416]
+        vmovdqu  ymm13, YMMWORD PTR [rsp+448]
+        vmovdqu  ymm14, YMMWORD PTR [rsp+480]
+        vmovdqu  ymm15, YMMWORD PTR [rsp+512]
+
+        add     rsp, 32+512                     ; release the home space and the YMM save area
+
+;
+; Restore the volatile integer registers in reverse order of the pushes.
+;
+
+        pop     r11
+        pop     r10
+        pop     r9
+        pop     r8
+        pop     rdx
+        pop     rcx
+        pop     rax
+
+        ENDM
+
 ;
 ; Macro Description:
 ;
@@ -50,9 +131,15 @@ INCLUDE AssembleAvxVnni.inc
 
 MultiplyAccumulateRowAvx2 MACRO Vec1Reg, Vec2Reg
 
+IFDEF ENABLE_CONVSYMKERNELAVX2_SAT_CHECKER
+        CheckSaturation 2,0
+ENDIF
         vpmaddubsw ymm3,ymm2,ymm0
         vpmaddwd ymm3,ymm3,ymm12
         vpaddd Vec1Reg,Vec1Reg,ymm3
+IFDEF ENABLE_CONVSYMKERNELAVX2_SAT_CHECKER
+        CheckSaturation 2,1
+ENDIF
         vpmaddubsw ymm2,ymm2,ymm1
         vpmaddwd ymm2,ymm2,ymm12
         vpaddd Vec2Reg,Vec2Reg,ymm2
diff --git a/onnxruntime/core/mlas/lib/intrinsics/avx2/saturation_check_avx2.cpp b/onnxruntime/core/mlas/lib/intrinsics/avx2/saturation_check_avx2.cpp
@@ -0,0 +1,62 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    saturation_check_avx2.cpp
+
+Abstract:
+
+    This module implements logic to check saturation of the VPMADDUBSW
+    instruction.
+
+--*/
+
+#include <immintrin.h>
+
+#include <atomic>
+#include <cstdint>
+#include <iostream>
+
+namespace onnxruntime
+{
+extern std::atomic<int> saturation_count;
+}
+
+/*++
+
+Routine Description:
+
+    This routine is called from the ConvSymKernelAvx2 assembly immediately
+    before a VPMADDUBSW instruction. It recomputes each of the 16 pairwise
+    sums of products in int32_t precision and reports when any of them falls
+    outside the int16_t range, i.e. when the instruction would saturate.
+
+Arguments:
+
+    unsigned_ptr - Supplies the address of the 32 unsigned byte operand values.
+        The address does not need to be 32-byte aligned.
+
+    signed_ptr - Supplies the address of the 32 signed byte operand values.
+        The address does not need to be 32-byte aligned.
+
+Return Value:
+
+    None. A warning is written to std::cerr for the first saturation observed
+    while onnxruntime::saturation_count is zero.
+
+--*/
+extern "C" void
+CheckSaturationForVPMADDUBSW(const __m256i* unsigned_ptr, const __m256i* signed_ptr)
+{
+    // Load data from memory (unaligned load)
+    __m256i unsigned_data = _mm256_loadu_si256(unsigned_ptr);
+    __m256i signed_data = _mm256_loadu_si256(signed_ptr);
+
+    alignas(32) uint8_t unsigned_bytes[32];  // Unsigned input values
+    alignas(32) int8_t signed_bytes[32];     // Signed input values
+
+    // Store the data into the byte arrays
+    _mm256_store_si256(reinterpret_cast<__m256i*>(unsigned_bytes), unsigned_data);
+    _mm256_store_si256(reinterpret_cast<__m256i*>(signed_bytes), signed_data);
+
+    bool saturation_detected = false;
+
+    // Iterate through the 16 pairs of 8-bit unsigned and signed values
+    for (int i = 0; i < 16; ++i) {
+        // Perform the VPMADDUBSW operation in higher precision (int32_t)
+        const int32_t even_product =
+            static_cast<int32_t>(signed_bytes[2 * i]) * static_cast<int32_t>(unsigned_bytes[2 * i]);
+        const int32_t odd_product =
+            static_cast<int32_t>(signed_bytes[2 * i + 1]) * static_cast<int32_t>(unsigned_bytes[2 * i + 1]);
+        const int32_t computed_value = even_product + odd_product;
+
+        // If the computed value exceeds the 16-bit signed integer range, saturation occurred
+        if (computed_value > INT16_MAX || computed_value < INT16_MIN) {
+            saturation_detected = true;
+            break;
+        }
+    }
+
+    if (!saturation_detected) {
+        return;
+    }
+
+    // Log only for the first detection after the counter was reset. The counter is incremented only
+    // while it still reads zero: an unconditional increment on every saturated call would eventually
+    // wrap around and re-enable the warning, and would also be a contended atomic read-modify-write
+    // on the kernel's hot path. Threads racing on the first detection may each increment, but only the
+    // one that observes the previous value zero logs. Relaxed ordering suffices because the counter
+    // guards nothing but this message.
+    if (onnxruntime::saturation_count.load(std::memory_order_relaxed) == 0 &&
+        onnxruntime::saturation_count.fetch_add(1, std::memory_order_relaxed) == 0) {
+        std::cerr << "Warning: saturation detected in VPMADDUBSW instruction." << std::endl;
+    }
+}
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
@@ -18,6 +18,7 @@ Module Name:
 #pragma once
 
 #include <algorithm>
+#include <atomic>
 #include <cmath>
 #include <functional>
 #include <limits>
diff --git a/onnxruntime/core/mlas/lib/saturation_check.cpp b/onnxruntime/core/mlas/lib/saturation_check.cpp
@@ -0,0 +1,42 @@
+/*++
+
+Copyright (c) Microsoft Corporation. All rights reserved.
+
+Licensed under the MIT License.
+
+Module Name:
+
+    saturation_check.cpp
+
+Abstract:
+
+    This module implements logic to check saturation of the VPMADDUBSW
+    instruction.
+
+--*/
+
+#include "mlasi.h"
+
+namespace onnxruntime
+{
+
+#if defined(MLAS_TARGET_AMD64)
+
+//
+// Counter incremented by CheckSaturationForVPMADDUBSW when it detects a
+// saturating VPMADDUBSW; only defined for AMD64 targets.
+//
+
+std::atomic<int> saturation_count{0};
+
+#endif
+
+//
+// Clears the saturation counter so that the next detected saturation is
+// reported again. This is a no-op on targets other than AMD64.
+//
+
+void
+reset_saturation_count()
+{
+#if defined(MLAS_TARGET_AMD64)
+    saturation_count.store(0);
+#endif
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/mlas/lib/x86_64/ConvSymKernelAvx2.S b/onnxruntime/core/mlas/lib/x86_64/ConvSymKernelAvx2.S
@@ -23,6 +23,91 @@ Abstract:
 
         .intel_syntax noprefix
 
+        .extern CheckSaturationForVPMADDUBSW
+
+        .macro CheckSaturation VecReg1Num, VecReg2Num
+
+//
+// Macro Description:
+//
+//   This macro snapshots YMM0-YMM15 to the stack and calls
+//   CheckSaturationForVPMADDUBSW with pointers to the saved copies of the two
+//   registers that are about to be multiplied by VPMADDUBSW.
+//
+// Arguments:
+//
+//   VecReg1Num - Supplies the number (0-15) of the YMM register that holds the
+//       unsigned byte operand.
+//
+//   VecReg2Num - Supplies the number (0-15) of the YMM register that holds the
+//       signed byte operand.
+//
+// Notes:
+//
+//   The integer registers saved below and YMM0-YMM15 are restored before the
+//   macro ends. RFLAGS is modified by the stack pointer adjustments.
+//
+//   NOTE(review): the total stack adjustment is 9*8 + 512 = 584 bytes, so rsp
+//   modulo 16 at the call is shifted by 8 relative to the expansion site.
+//   Confirm that the kernel frame leaves rsp 16-byte aligned at the call and
+//   that the kernel keeps no live data below rsp.
+//
+
+//
+// Save all caller-saved registers (RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11)
+//
+
+        push    rax
+        push    rcx
+        push    rdx
+        push    rsi
+        push    rdi
+        push    r8
+        push    r9
+        push    r10
+        push    r11
+
+        sub     rsp, 512                        # reserve space for 16 YMM registers (32 bytes each)
+
+//
+// Save YMM registers (YMM0 to YMM15)
+//
+
+        vmovdqu  [rsp], ymm0
+        vmovdqu  [rsp+32], ymm1
+        vmovdqu  [rsp+64], ymm2
+        vmovdqu  [rsp+96], ymm3
+        vmovdqu  [rsp+128], ymm4
+        vmovdqu  [rsp+160], ymm5
+        vmovdqu  [rsp+192], ymm6
+        vmovdqu  [rsp+224], ymm7
+        vmovdqu  [rsp+256], ymm8
+        vmovdqu  [rsp+288], ymm9
+        vmovdqu  [rsp+320], ymm10
+        vmovdqu  [rsp+352], ymm11
+        vmovdqu  [rsp+384], ymm12
+        vmovdqu  [rsp+416], ymm13
+        vmovdqu  [rsp+448], ymm14
+        vmovdqu  [rsp+480], ymm15
+
+        lea rdi, [rsp+32*\VecReg1Num\()]        # first operand (unsigned)
+        lea rsi, [rsp+32*\VecReg2Num\()]        # second operand (signed)
+
+//
+// Call the C++ checker. Mach-O targets prefix C symbols with an underscore, so
+// the bare name would not resolve when this file is assembled for macOS.
+//
+
+#if defined(__APPLE__)
+        call    _CheckSaturationForVPMADDUBSW
+#else
+        call    CheckSaturationForVPMADDUBSW
+#endif
+
+//
+// Restore YMM registers
+//
+
+        vmovdqu  ymm0, [rsp]
+        vmovdqu  ymm1, [rsp+32]
+        vmovdqu  ymm2, [rsp+64]
+        vmovdqu  ymm3, [rsp+96]
+        vmovdqu  ymm4, [rsp+128]
+        vmovdqu  ymm5, [rsp+160]
+        vmovdqu  ymm6, [rsp+192]
+        vmovdqu  ymm7, [rsp+224]
+        vmovdqu  ymm8, [rsp+256]
+        vmovdqu  ymm9, [rsp+288]
+        vmovdqu  ymm10, [rsp+320]
+        vmovdqu  ymm11, [rsp+352]
+        vmovdqu  ymm12, [rsp+384]
+        vmovdqu  ymm13, [rsp+416]
+        vmovdqu  ymm14, [rsp+448]
+        vmovdqu  ymm15, [rsp+480]
+
+        add     rsp, 512                        # clean up the reserved stack space
+
+//
+// Restore all caller-saved registers (RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11)
+// in reverse order of the pushes
+//
+
+        pop     r11
+        pop     r10
+        pop     r9
+        pop     r8
+        pop     rdi
+        pop     rsi
+        pop     rdx
+        pop     rcx
+        pop     rax
+
+        .endm
+
 /*++
 
 Macro Description:
@@ -52,9 +137,15 @@ Implicit Arguments:
 
         .macro MultiplyAccumulateRowAvx2 Vec1Reg, Vec2Reg
 
+#if defined(ENABLE_CONVSYMKERNELAVX2_SAT_CHECKER)
+        CheckSaturation 2,0
+#endif
         vpmaddubsw ymm3,ymm2,ymm0
         vpmaddwd ymm3,ymm3,ymm12
         vpaddd \Vec1Reg\(),\Vec1Reg\(),ymm3
+#if defined(ENABLE_CONVSYMKERNELAVX2_SAT_CHECKER)
+        CheckSaturation 2,1
+#endif
         vpmaddubsw ymm2,ymm2,ymm1
         vpmaddwd ymm2,ymm2,ymm12
         vpaddd \Vec2Reg\(),\Vec2Reg\(),ymm2
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
@@ -2865,6 +2865,8 @@ Status InferenceSession::Run(const RunOptions& run_options,
   }
 #endif
 
+  reset_saturation_count();
+
   // As N+1 inference runs (N for memory allocation and 1 for graph capturing)
   // are needed before replaying the captured graph, here run N inference runs recursively until graph captured,
   // so that users just need one session run to capture the graph.
diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h
@@ -58,6 +58,8 @@ class IExecutionProvider;
 class IOBinding;
 struct Notification;
 
+// Clears the global VPMADDUBSW saturation counter used by the optional ConvSymKernelAvx2 saturation checker.
+// Defined in core/mlas/lib/saturation_check.cpp, where it is a no-op on targets other than AMD64.
+// InferenceSession::Run calls it on each invocation so that the saturation warning can be emitted again.
+void reset_saturation_count();
+
 #ifdef ENABLE_TRAINING
 struct PartialGraphExecutionState;
 using OrtValueCache = InlinedHashMap<std::string, OrtValue>;

Original file line number	Diff line number	Diff line change
`@@ -2865,6 +2865,8 @@ Status InferenceSession::Run(const RunOptions& run_options,`
`2865`	`2865`	`}`
`2866`	`2866`	`#endif`
`2867`	`2867`
	`2868`	`+ reset_saturation_count();`
	`2869`	`+`
`2868`	`2870`	`// As N+1 inference runs (N for memory allocation and 1 for graph capturing)`
`2869`	`2871`	`// are needed before replaying the captured graph, here run N inference runs recursively until graph captured,`
`2870`	`2872`	`// so that users just need one session run to capture the graph.`