microsoft
diff --git a/‎cmake/CMakeLists.txt
Lines changed: 1 addition & 1 deletion b/‎cmake/CMakeLists.txt
Lines changed: 1 addition & 1 deletion
diff --git a/‎cmake/deps.txt
Lines changed: 2 additions & 2 deletions b/‎cmake/deps.txt
Lines changed: 2 additions & 2 deletions
diff --git a/‎cmake/onnxruntime_compile_triton_kernel.cmake
Lines changed: 6 additions & 4 deletions b/‎cmake/onnxruntime_compile_triton_kernel.cmake
Lines changed: 6 additions & 4 deletions
diff --git a/‎cmake/onnxruntime_mlas.cmake
Lines changed: 11 additions & 0 deletions b/‎cmake/onnxruntime_mlas.cmake
Lines changed: 11 additions & 0 deletions
diff --git a/‎cmake/onnxruntime_python.cmake
Lines changed: 8 additions & 0 deletions b/‎cmake/onnxruntime_python.cmake
Lines changed: 8 additions & 0 deletions
diff --git a/‎cmake/onnxruntime_rocm_hipify.cmake
Lines changed: 1 addition & 0 deletions b/‎cmake/onnxruntime_rocm_hipify.cmake
Lines changed: 1 addition & 0 deletions
diff --git a/‎cmake/onnxruntime_unittests.cmake
Lines changed: 11 additions & 0 deletions b/‎cmake/onnxruntime_unittests.cmake
Lines changed: 11 additions & 0 deletions
diff --git a/‎cmake/patches/protobuf/protobuf_cmake.patch
Lines changed: 24 additions & 0 deletions b/‎cmake/patches/protobuf/protobuf_cmake.patch
Lines changed: 24 additions & 0 deletions
diff --git a/‎csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
Lines changed: 3 additions & 3 deletions b/‎csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj
Lines changed: 3 additions & 3 deletions
diff --git a/‎csharp/src/Microsoft.ML.OnnxRuntime/targets/net6.0-maccatalyst/README.md
Lines changed: 3 additions & 0 deletions b/‎csharp/src/Microsoft.ML.OnnxRuntime/targets/net6.0-maccatalyst/README.md
Lines changed: 3 additions & 0 deletions
@@ -88,7 +88,7 @@ option(onnxruntime_USE_QNN "Build with QNN support" OFF)
 option(onnxruntime_USE_SNPE "Build with SNPE support" OFF)
 option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF)
 option(onnxruntime_USE_DNNL "Build with DNNL support" OFF)
-option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" ON)
+option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" OFF)
 option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF)
 option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON)
 option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
 
@@ -37,8 +37,8 @@ mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee
 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
 neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851
 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.0.zip;a6d8b619459fb4657f8bec7d1c6d95ad6d4c069d
-#use the commit of Final DDS removal. DDS output is now supported by ORT TRT.
-onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bacfaaa951653cd4e72efe727a543567cb38f7de.zip;26434329612e804164ab7baa6ae629ada56c1b26
+# Use the latest commit of onnx-tensorrt 10.0-GA.
+onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/eb43908b02a296ea0594432f06e9d3fac288d672.zip;94d07871810a36a5bc70a1def5c50504101c9bd1
 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa
 protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a
 protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874
 
@@ -4,10 +4,12 @@
 find_package(Python3 COMPONENTS Interpreter REQUIRED)
 
 # set all triton kernel ops that need to be compiled
-set(triton_kernel_scripts
-    "onnxruntime/core/providers/rocm/math/softmax_triton.py"
-    "onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py"
-)
+if(onnxruntime_USE_ROCM)  # both scripts live under rocm/ directories, so they are only listed for ROCm builds
+  set(triton_kernel_scripts
+      "onnxruntime/core/providers/rocm/math/softmax_triton.py"
+      "onnxruntime/contrib_ops/rocm/diffusion/group_norm_triton.py"
+  )
+endif()  # when onnxruntime_USE_ROCM is off, this block does not set triton_kernel_scripts
 
 function(compile_triton_kernel out_triton_kernel_obj_file out_triton_kernel_header_dir)
   # compile triton kernel, generate .a and .h files
 
@@ -167,6 +167,9 @@ function(setup_mlas_source_for_windows)
       ${MLAS_SRC_DIR}/qgemm_kernel_sse.cpp
       ${MLAS_SRC_DIR}/qgemm_kernel_sse41.cpp
       ${MLAS_SRC_DIR}/intrinsics/avx512/quantize_avx512f.cpp
+      ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
+      ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
+      ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
       ${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAmx.asm
       ${MLAS_SRC_DIR}/amd64/QgemmU8S8KernelAvx2.asm
       ${MLAS_SRC_DIR}/amd64/QgemmU8U8KernelAvx2.asm
@@ -530,6 +533,7 @@ else()
           ${MLAS_SRC_DIR}/x86_64/ErfKernelFma3.S
           ${MLAS_SRC_DIR}/intrinsics/avx2/qladd_avx2.cpp
           ${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
+          ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
         )
         set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
 
@@ -549,9 +553,15 @@ else()
           ${MLAS_SRC_DIR}/x86_64/QgemvU8S8KernelAvx512Vnni.S
           ${MLAS_SRC_DIR}/x86_64/QgemmU8X8KernelAvx512Core.S
           ${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx512Core.S
+          ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
         )
         set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl")
 
+        set(mlas_platform_srcs_avx512vnni
+          ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
+        )  # kept in its own list so the -mavx512vnni flag set below applies only to this source
+        set_source_files_properties(${mlas_platform_srcs_avx512vnni} PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl -mavx512f")
+
         set(mlas_platform_srcs
           ${MLAS_SRC_DIR}/activate_fp16.cpp
           ${MLAS_SRC_DIR}/dwconv.cpp
@@ -563,6 +573,7 @@ else()
           ${mlas_platform_srcs_avx2}
           ${mlas_platform_srcs_avx512f}
           ${mlas_platform_srcs_avx512core}
+          ${mlas_platform_srcs_avx512vnni}
         )
 
         if (NOT onnxruntime_ORT_MINIMAL_BUILD)
 
@@ -1015,6 +1015,14 @@ if (onnxruntime_USE_QNN)
         ${QNN_LIB_FILES}
         $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/capi/
   )
+  # If the QNN SDK ships the Qualcomm AI Hub license PDF (checked at configure time),
+  # copy it into the built onnxruntime/ directory after onnxruntime_pybind11_state is built.
+  if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf")
+    add_custom_command(
+      TARGET onnxruntime_pybind11_state POST_BUILD
+      COMMAND ${CMAKE_COMMAND} -E copy
+          "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf"
+          $<TARGET_FILE_DIR:${build_output_target}>/onnxruntime/
+      # The file name contains spaces; VERBATIM makes argument escaping generator-independent.
+      VERBATIM
+    )
+  endif()
 endif()
 
 endif()
 
@@ -46,6 +46,7 @@ set(contrib_ops_excluded_files
   "math/gemm_float8.cu"
   "math/gemm_float8.h"
   "moe/*"
+  "sparse/*"
   "quantization/attention_quantization.cc"
   "quantization/attention_quantization.h"
   "quantization/attention_quantization_impl.cu"
 
@@ -876,6 +876,11 @@ if (MSVC)
                 "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd26451>")
   target_compile_options(onnxruntime_test_all PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd4244>"
                 "$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd4244>")
+  # Avoid error C1128 (number of sections exceeded object file format limit) in Win arm64 Release builds by compiling with /bigobj.
+  string(TOLOWER "${onnxruntime_target_platform}" GEN_PLATFORM)  # quoted so an empty platform is not a configure error
+  if ("${GEN_PLATFORM}" STREQUAL "arm64")
+    target_compile_options(onnxruntime_test_all PRIVATE "$<$<CONFIG:Release>:/bigobj>")  # genex also works with multi-config generators
+  endif()
 else()
   target_compile_options(onnxruntime_test_all PRIVATE "-Wno-parentheses")
 endif()
@@ -994,6 +999,12 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP)
         COMMAND ${CMAKE_COMMAND} -E copy ${QNN_LIB_FILES} $<TARGET_FILE_DIR:${test_data_target}>
         )
     endif()
+    # If the QNN SDK ships the Qualcomm AI Hub license PDF (checked at configure time),
+    # copy it next to the ${test_data_target} output after that target is built.
+    if (EXISTS "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf")
+      add_custom_command(
+        TARGET ${test_data_target} POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E copy "${onnxruntime_QNN_HOME}/Qualcomm AI Hub Proprietary License.pdf" $<TARGET_FILE_DIR:${test_data_target}>
+        # The file name contains spaces; VERBATIM makes argument escaping generator-independent.
+        VERBATIM
+        )
+    endif()
   endif()
 
   if (onnxruntime_USE_DNNL)
 
@@ -29,3 +29,27 @@ index 04cb3303a..4025805cf 100644
    # When building with "make", "lib" prefix will be added automatically by
    # the build tool.
    set(LIB_PREFIX)
+diff --git a/src/google/protobuf/map.h b/src/google/protobuf/map.h
+index 008c19225..cbab108c2 100644
+--- a/src/google/protobuf/map.h
++++ b/src/google/protobuf/map.h
+@@ -52,7 +52,8 @@
+ #endif  // defined(__cpp_lib_string_view)
+ 
+ #if !defined(GOOGLE_PROTOBUF_NO_RDTSC) && defined(__APPLE__)
+-#include <mach/mach_time.h>
++// apply update from https://github.com/protocolbuffers/protobuf/pull/15662/
++#include <time.h>
+ #endif
+ 
+ #include <google/protobuf/stubs/common.h>
+@@ -1154,7 +1155,8 @@ class Map {
+ #if defined(__APPLE__)
+       // Use a commpage-based fast time function on Apple environments (MacOS,
+       // iOS, tvOS, watchOS, etc).
+-      s += mach_absolute_time();
++      // apply update from https://github.com/protocolbuffers/protobuf/pull/15662/
++      s += clock_gettime_nsec_np(CLOCK_UPTIME_RAW);
+ #elif defined(__x86_64__) && defined(__GNUC__)
+       uint32_t hi, lo;
+       asm volatile("rdtsc" : "=a"(lo), "=d"(hi));
@@ -39,7 +39,7 @@
   <PropertyGroup Condition="('$(OrtPackageId)' == 'Microsoft.ML.OnnxRuntime' OR
                              '$(OrtPackageId)' == 'Microsoft.ML.OnnxRuntime.Azure') AND
                             '$(IncludeMobileTargets)' == 'true'">
-    <MobileTargets>$(MobileTargets);net6.0-android;net6.0-ios</MobileTargets>
+    <MobileTargets>$(MobileTargets);net6.0-android;net6.0-ios;net6.0-maccatalyst</MobileTargets>
   </PropertyGroup>
 
   <PropertyGroup Condition="'$(OrtPackageId)' == 'Microsoft.ML.OnnxRuntime.Training' AND
@@ -121,9 +121,9 @@
     <IsAndroidTarget Condition="$([MSBuild]::GetTargetPlatformIdentifier('$(TargetFramework)')) == 'android' OR
                                 $(TargetFramework.StartsWith('monoandroid'))">true</IsAndroidTarget>
     <IsIOSTarget Condition="$([MSBuild]::GetTargetPlatformIdentifier('$(TargetFramework)')) == 'ios' OR
+                            $([MSBuild]::GetTargetPlatformIdentifier('$(TargetFramework)')) == 'maccatalyst' OR
                             $(TargetFramework.StartsWith('xamarinios'))">true</IsIOSTarget>
-    <IsMacTarget Condition="$([MSBuild]::GetTargetPlatformIdentifier('$(TargetFramework)')) == 'macos' OR
-                            $([MSBuild]::GetTargetPlatformIdentifier('$(TargetFramework)')) == 'maccatalyst'">true</IsMacTarget>
+    <IsMacTarget Condition="$([MSBuild]::GetTargetPlatformIdentifier('$(TargetFramework)')) == 'macos'">true</IsMacTarget>
   </PropertyGroup>
 
   <!-- Enable training APIs for the build. The native package must be
 
@@ -0,0 +1,3 @@
+### Notes for maccatalyst .NET targets:
+
+We only add a blank file to this target framework folder, so the NuGet package will include an empty TFM folder under build/ and buildTransitive/. The reason is that on the Mac Catalyst platform, the xcframework is resolved directly from the runtimes/native/ios folder based on this [RuntimeIdentifierGraph](https://github.com/dotnet/sdk/blob/main/src/Layout/redist/PortableRuntimeIdentifierGraph.json#L300-L304).
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+### Notes for maccatalyst .NET targets:`
	`2`	`+`
	`3`	`+We only add a blank file to this target framework folder, so the NuGet package will include an empty TFM folder under build/ and buildTransitive/. The reason is that on the Mac Catalyst platform, the xcframework is resolved directly from the runtimes/native/ios folder based on this [RuntimeIdentifierGraph](https://github.com/dotnet/sdk/blob/main/src/Layout/redist/PortableRuntimeIdentifierGraph.json#L300-L304).`