Skip to content

Commit 65f3fbf

Browse files
authored
[ORT 1.18.0 Release] Cherry pick 1st round (#20585)
1 parent 204f1f5 commit 65f3fbf

File tree

211 files changed

+12508
-1429
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

211 files changed

+12508
-1429
lines changed

cmake/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ option(onnxruntime_USE_QNN "Build with QNN support" OFF)
8888
option(onnxruntime_USE_SNPE "Build with SNPE support" OFF)
8989
option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
9090
option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
91-
option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" ON)
91+
option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" OFF)
9292
option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
9393
option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON)
9494
option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)

cmake/deps.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee
3737
mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
3838
neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851
3939
onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.0.zip;a6d8b619459fb4657f8bec7d1c6d95ad6d4c069d
40-
#use the commit of Final DDS removal. DDS output is now supported by ORT TRT.
41-
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bacfaaa951653cd4e72efe727a543567cb38f7de.zip;26434329612e804164ab7baa6ae629ada56c1b26
40+
#use the latest commit of 10.0-GA
41+
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/eb43908b02a296ea0594432f06e9d3fac288d672.zip;94d07871810a36a5bc70a1def5c50504101c9bd1
4242
protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
4343
protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
4444
protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874

cmake/onnxruntime_compile_triton_kernel.cmake

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@
44
find_package(Python3 COMPONENTS Interpreter REQUIRED)
55

66
# set all triton kernel ops that need to be compiled
7-
set(triton_kernel_scripts
8-
"onnxruntime/core/providers/rocm/math/softmax_triton.py"
9-
"onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py"
10-
)
7+
if(onnxruntime_USE_ROCM)
8+
set(triton_kernel_scripts
9+
"onnxruntime/core/providers/rocm/math/softmax_triton.py"
10+
"onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py"
11+
)
12+
endif()
1113

1214
function(compile_triton_kernel out_triton_kernel_obj_file out_triton_kernel_header_dir)
1315
# compile triton kernel, generate .a and .h files

cmake/onnxruntime_mlas.cmake

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,9 @@ function(setup_mlas_source_for_windows)
167167
${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
168168
${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp
169169
${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
170+
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
171+
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
172+
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
170173
${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAmx.asm
171174
${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAvx2.asm
172175
${MLAS_SRC_DIR}/amd64/QgemmU8U8KernelAvx2.asm
@@ -530,6 +533,7 @@ else()
530533
${MLAS_SRC_DIR}/x86_64/ErfKernelFma3.S
531534
${MLAS_SRC_DIR}/intrinsics/avx2/qladd_avx2.cpp
532535
${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
536+
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
533537
)
534538
set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
535539

@@ -549,9 +553,15 @@ else()
549553
${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx512Vnni.S
550554
${MLAS_SRC_DIR}/x86_64/QgemmU8X8KernelAvx512Core.S
551555
${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx512Core.S
556+
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
552557
)
553558
set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl")
554559

560+
set(mlas_platform_srcs_avx512vnni
561+
${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
562+
)
563+
set_source_files_properties(${mlas_platform_srcs_avx512vnni} PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl -mavx512f")
564+
555565
set(mlas_platform_srcs
556566
${MLAS_SRC_DIR}/activate_fp16.cpp
557567
${MLAS_SRC_DIR}/dwconv.cpp
@@ -563,6 +573,7 @@ else()
563573
${mlas_platform_srcs_avx2}
564574
${mlas_platform_srcs_avx512f}
565575
${mlas_platform_srcs_avx512core}
576+
${mlas_platform_srcs_avx512vnni}
566577
)
567578

568579
if (NOT onnxruntime_ORT_MINIMAL_BUILD)

cmake/onnxruntime_python.cmake

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1015,6 +1015,14 @@ if (onnxruntime_USE_QNN)
10151015
${QNN_LIB_FILES}
10161016
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
10171017
)
1018+
# Copy the "Qualcomm AI Hub Proprietary License.pdf" from onnxruntime_QNN_HOME into the
# onnxruntime/ directory next to the build output after onnxruntime_pybind11_state is built.
# The EXISTS check is evaluated at configure time, so the copy step is only registered
# when the PDF is already present under onnxruntime_QNN_HOME at that point.
if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf")
1019+
add_custom_command(
1020+
TARGET onnxruntime_pybind11_state POST_BUILD
1021+
COMMAND ${CMAKE_COMMAND} -E copy
1022+
"${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf"
1023+
$<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/
1024+
)
1025+
endif()
10181026
endif()
10191027

10201028
endif()

cmake/onnxruntime_rocm_hipify.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ set(contrib_ops_excluded_files
4646
"math/gemm_float8.cu"
4747
"math/gemm_float8.h"
4848
"moe/*"
49+
"sparse/*"
4950
"quantization/attention_quantization.cc"
5051
"quantization/attention_quantization.h"
5152
"quantization/attention_quantization_impl.cu"

cmake/onnxruntime_unittests.cmake

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -876,6 +876,11 @@ if (MSVC)
876876
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26451>")
877877
target_compile_options(onnxruntime_test_all PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd4244>"
878878
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd4244>")
879+
# Win arm64 Release builds of onnxruntime_test_all fail with
# "error C1128: number of sections exceeded object file format limit: compile with /bigobj",
# so pass /bigobj to that target for this platform/configuration.
880+
# Quote the expansion: an empty/unset onnxruntime_target_platform would otherwise drop the
# input argument and make string(TOLOWER) fail at configure time.
string(TOLOWER "${onnxruntime_target_platform}" GEN_PLATFORM)
881+
# Quote the left operand so an empty value is not a syntax error and the value is compared as a
# string rather than being dereferenced again as a variable name.
# NOTE(review): CMAKE_BUILD_TYPE may be empty under multi-config generators unless it is passed
# explicitly - confirm the build scripts always set it, or switch to $<CONFIG:Release>.
if ("${GEN_PLATFORM}" STREQUAL "arm64" AND "${CMAKE_BUILD_TYPE}" STREQUAL "Release")
882+
target_compile_options(onnxruntime_test_all PRIVATE "/bigobj")
883+
endif()
879884
else()
880885
target_compile_options(onnxruntime_test_all PRIVATE "-Wno-parentheses")
881886
endif()
@@ -994,6 +999,12 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
994999
COMMAND ${CMAKE_COMMAND} -E copy ${QNN_LIB_FILES} $<TARGET_FILE_DIR:${test_data_target}>
9951000
)
9961001
endif()
1002+
# Copy the "Qualcomm AI Hub Proprietary License.pdf" from onnxruntime_QNN_HOME into the output
# directory of ${test_data_target} after that target is built.
# The EXISTS check is evaluated at configure time, so the copy step is only registered
# when the PDF is already present under onnxruntime_QNN_HOME at that point.
if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf")
1003+
add_custom_command(
1004+
TARGET ${test_data_target} POST_BUILD
1005+
COMMAND ${CMAKE_COMMAND} -E copy "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf" $<TARGET_FILE_DIR:${test_data_target}>
1006+
)
1007+
endif()
9971008
endif()
9981009

9991010
if (onnxruntime_USE_DNNL)

cmake/patches/protobuf/protobuf_cmake.patch

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,3 +29,27 @@ index 04cb3303a..4025805cf 100644
2929
# When building with "make", "lib" prefix will be added automatically by
3030
# the build tool.
3131
set(LIB_PREFIX)
32+
diff --git a/src/google/protobuf/map.h b/src/google/protobuf/map.h
33+
index 008c19225..cbab108c2 100644
34+
--- a/src/google/protobuf/map.h
35+
+++ b/src/google/protobuf/map.h
36+
@@ -52,7 +52,8 @@
37+
#endif // defined(__cpp_lib_string_view)
38+
39+
#if !defined(GOOGLE_PROTOBUF_NO_RDTSC) && defined(__APPLE__)
40+
-#include <mach/mach_time.h>
41+
+// apply update from https://github.com/protocolbuffers/protobuf/pull/15662/
42+
+#include <time.h>
43+
#endif
44+
45+
#include <google/protobuf/stubs/common.h>
46+
@@ -1154,7 +1155,8 @@ class Map {
47+
#if defined(__APPLE__)
48+
// Use a commpage-based fast time function on Apple environments (MacOS,
49+
// iOS, tvOS, watchOS, etc).
50+
- s += mach_absolute_time();
51+
+ // apply update from https://github.com/protocolbuffers/protobuf/pull/15662/
52+
+ s += clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
53+
#elif defined(__x86_64__) && defined(__GNUC__)
54+
uint32_t hi, lo;
55+
asm volatile("rdtsc" : "=a"(lo), "=d"(hi));

csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
<PropertyGroup Condition="('$(OrtPackageId)' == 'Microsoft.ML.OnnxRuntime' OR
4040
'$(OrtPackageId)' == 'Microsoft.ML.OnnxRuntime.Azure') AND
4141
'$(IncludeMobileTargets)' == 'true'">
42-
<MobileTargets>$(MobileTargets);net6.0-android;net6.0-ios</MobileTargets>
42+
<MobileTargets>$(MobileTargets);net6.0-android;net6.0-ios;net6.0-maccatalyst</MobileTargets>
4343
</PropertyGroup>
4444

4545
<PropertyGroup Condition="'$(OrtPackageId)' == 'Microsoft.ML.OnnxRuntime.Training' AND
@@ -121,9 +121,9 @@
121121
<IsAndroidTarget Condition="$([MSBuild]::GetTargetPlatformIdentifier('$(TargetFramework)')) == 'android' OR
122122
$(TargetFramework.StartsWith('monoandroid'))">true</IsAndroidTarget>
123123
<IsIOSTarget Condition="$([MSBuild]::GetTargetPlatformIdentifier('$(TargetFramework)')) == 'ios' OR
124+
$([MSBuild]::GetTargetPlatformIdentifier('$(TargetFramework)')) == 'maccatalyst' OR
124125
$(TargetFramework.StartsWith('xamarinios'))">true</IsIOSTarget>
125-
<IsMacTarget Condition="$([MSBuild]::GetTargetPlatformIdentifier('$(TargetFramework)')) == 'macos' OR
126-
$([MSBuild]::GetTargetPlatformIdentifier('$(TargetFramework)')) == 'maccatalyst'">true</IsMacTarget>
126+
<IsMacTarget Condition="$([MSBuild]::GetTargetPlatformIdentifier('$(TargetFramework)')) == 'macos'">true</IsMacTarget>
127127
</PropertyGroup>
128128

129129
<!-- Enable training APIs for the build. The native package must be
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
### Notes for maccatalyst .NET targets:
2+
3+
We only add a blank file to the target framework folder here, so the NuGet package will include a blank TFM under build/ and buildTransitive/. The reason is that, for the Mac Catalyst platform, the xcframework is resolved directly from the runtimes/native/ios folder based on this [RuntimeIdentifierGraph](https://github.com/dotnet/sdk/blob/main/src/Layout/redist/PortableRuntimeIdentifierGraph.json#L300-L304).

0 commit comments

Comments
 (0)