Skip to content

Commit ffa8bc3

Browse files
qti-kromeroGitHub Enterprise
authored and committed
[MERGE-UPSTREAM] Automated upstream merge for 20251117
* From PR
2 parents fd4e7f1 + 67da8aa commit ffa8bc3

File tree

73 files changed

+2100
-347
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

73 files changed

+2100
-347
lines changed

.github/workflows/windows_x86.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ jobs:
6161
working-directory: ${{ github.workspace }}
6262

6363
- name: Use .NET 8.x
64-
uses: actions/setup-dotnet@v5
64+
uses: actions/setup-dotnet@v3
6565
with:
6666
dotnet-version: '8.x'
6767
env:

cmake/CMakeLists.txt

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1456,7 +1456,12 @@ if (onnxruntime_USE_CUDA)
14561456
message(STATUS "CUDA Toolkit version is greater or equal than 12.8, enable -DENABLE_FP4 flag")
14571457
endif()
14581458

1459-
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all")
1459+
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "13.0")
1460+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all -compress-mode=size")
1461+
else()
1462+
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all")
1463+
endif()
1464+
14601465
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
14611466
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror default-stream-launch")
14621467

cmake/deps.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,5 +56,5 @@ extensions;https://github.com/microsoft/onnxruntime-extensions/archive/c24b7bab0
5656
directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
5757
cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96
5858
dawn;https://github.com/google/dawn/archive/13c1635a14574ebb7116b56a69f5519301417fda.zip;0aadd28fc385cf7d657d5fc70a352372d2d3c76a
59-
kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.10.0.tar.gz;11b62149cb2514b3b9069cc435c3aa7a4e82b97a
59+
kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.15.0.tar.gz;62ccd24ab60bcef68766440fb42d79071ac2a5d2
6060
duktape;https://github.com/svaarala/duktape/releases/download/v2.7.0/duktape-2.7.0.tar.xz;8200c8e417dbab7adcc12c4dbdef7651cfc55794

js/common/lib/inference-session.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,20 @@ export declare namespace InferenceSession {
258258
*/
259259
forceCpuNodeNames?: readonly string[];
260260

261+
/**
262+
* Specify the validation mode for WebGPU execution provider.
263+
* - 'disabled': Disable all validation.
264+
* When used in Node.js, disable validation may cause process crash if WebGPU errors occur. Be cautious when using
265+
* this mode.
266+
* When used in web, this mode is equivalent to 'wgpuOnly'.
267+
* - 'wgpuOnly': Perform WebGPU internal validation only.
268+
* - 'basic': Perform basic validation including WebGPU internal validation. This is the default mode.
269+
* - 'full': Perform full validation. This mode may have performance impact. Use it for debugging purpose.
270+
*
271+
* @default 'basic'
272+
*/
273+
validationMode?: 'disabled' | 'wgpuOnly' | 'basic' | 'full';
274+
261275
/**
262276
* Specify an optional WebGPU device to be used by the WebGPU execution provider.
263277
*/

js/package-lock.json

Lines changed: 6 additions & 6 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

js/web/lib/wasm/session-options.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,11 @@ const setExecutionProviders = async (
129129

130130
appendEpOption(epOptions, 'forceCpuNodeNames', names.join('\n'), allocs);
131131
}
132+
133+
// set validation mode
134+
if (webgpuOptions.validationMode) {
135+
appendEpOption(epOptions, 'validationMode', webgpuOptions.validationMode, allocs);
136+
}
132137
}
133138

134139
const info = getInstance().webgpuRegisterDevice!(customDevice);

onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// Copyright (c) Microsoft Corporation. All rights reserved.
22
// Licensed under the MIT License.
33

4-
#include "core/common/cpuid_info.h" // for CPUIDInfo::GetCPUIDInfo().HasArm_SME()
4+
#include "core/common/cpuid_info.h" // for CPUIDInfo::GetCPUIDInfo().HasArm_SME2()
55
#include "core/common/narrow.h"
66
#include "core/common/safeint.h"
77
#include "core/mlas/inc/mlas.h"
@@ -213,9 +213,9 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {
213213
}
214214
}
215215

216-
// Currently, MlasDynamicQGemmBatch() and associated functions require SME or else they are no-ops.
216+
// Currently, MlasDynamicQGemmBatch() and associated functions require SME2 or else they are no-ops.
217217
// We check that here too before attempting to use them.
218-
if (!CPUIDInfo::GetCPUIDInfo().HasArm_SME()) {
218+
if (!CPUIDInfo::GetCPUIDInfo().HasArm_SME2()) {
219219
can_use_dynamic_quant_mlas_ = false;
220220
}
221221

onnxruntime/contrib_ops/webgpu/bert/attention.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ Status InPlaceSoftmaxProgram::GenerateShaderCode(ShaderHelper& shader) const {
284284

285285
if (has_head_sink_) {
286286
// Handle head sink
287-
shader.MainFunctionBody() << "let sink_value: f32 = head_sink[head_idx];\n"
287+
shader.MainFunctionBody() << "let sink_value: f32 = f32(head_sink[head_idx]);\n"
288288
<< "var max_value = sink_value;\n";
289289
} else if (use_smooth_softmax_) {
290290
shader.MainFunctionBody() << "var max_value: f32 = 0.0;\n";
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#include "core/providers/webgpu/shader_helper.h"
#include "core/providers/webgpu/webgpu_supported_types.h"
#include "core/providers/webgpu/math/unary_elementwise_ops.h"
#include "contrib_ops/webgpu/bert/bias_gelu.h"
#include "contrib_ops/webgpu/webgpu_contrib_kernels.h"

namespace onnxruntime {
namespace contrib {
namespace webgpu {

ONNX_OPERATOR_KERNEL_EX(
    BiasGelu,
    kMSDomain,
    1,
    kWebGpuExecutionProvider,
    (*KernelDefBuilder::Create())
        .TypeConstraint("T", WebGpuSupportedFloatTypes()),
    BiasGelu);

Status BiasGeluProgram::GenerateShaderCode(ShaderHelper& shader) const {
  // Bind the shader variables: vectorized input/output plus the 1-D bias.
  const auto& input_var = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseValueTypeAlias);
  const auto& bias_var = shader.AddInput("bias", ShaderUsage::UseUniform | ShaderUsage::UseShapeAndStride);
  const auto& output_var = shader.AddOutput("y", ShaderUsage::UseUniform);

  // Emit the erf() helper required by the GELU expression below.
  shader.AdditionalImplementation() << onnxruntime::webgpu::ErfImpl;
  shader.MainFunctionBody() << shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.vec_size")
                            << " var a = " << input_var.GetByOffset("global_idx") << ";\n";

  // Add the bias to the input. When the bias could not be vectorized
  // (bias_components_ == 1), gather its four scalar lanes individually;
  // otherwise a single vectorized load matches the input's vec4 layout.
  if (bias_components_ == 1) {
    shader.MainFunctionBody() << " let bias_offset = global_idx * 4;\n"
                                 " a += x_value_t("
                              << bias_var.GetByOffset("bias_offset % uniforms.bias_shape") << ", "
                              << bias_var.GetByOffset("(bias_offset + 1) % uniforms.bias_shape") << ", "
                              << bias_var.GetByOffset("(bias_offset + 2) % uniforms.bias_shape") << ", "
                              << bias_var.GetByOffset("(bias_offset + 3) % uniforms.bias_shape") << ");\n";
  } else {
    shader.MainFunctionBody() << " a += " << bias_var.GetByOffset("global_idx % uniforms.bias_shape") + ";\n";
  }

  // GELU activation: 0.5 * a * (1.0 + erf(a * 0.7071067811865475))
  shader.MainFunctionBody() << output_var.SetByOffset("global_idx", onnxruntime::webgpu::GeluExpr);

  return Status::OK();
}

Status BiasGelu::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const {
  const auto* x_tensor = context.Input(0);
  const auto* bias_tensor = context.Input(1);
  auto* y_tensor = context.Output(0, x_tensor->Shape());

  const uint32_t total_size = onnxruntime::narrow<uint32_t>(y_tensor->Shape().Size());
  // Nothing to dispatch for an empty tensor.
  if (total_size == 0) {
    return Status::OK();
  }

  const auto& x_shape = x_tensor->Shape();
  const auto& b_shape = bias_tensor->Shape();

  // Shape checks: bias is 1-D and spans the innermost dimension of the input.
  if (x_shape.NumDimensions() < 1 || b_shape.NumDimensions() != 1) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "BiasGelu: input must have at least 1 dimension and bias must be 1-dimensional.");
  }

  if (x_shape.GetDims().back() != b_shape.GetDims().back()) {
    return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT,
                           "BiasGelu: bias must match the last dimension of input.");
  }

  // The shader walks the flattened input as vec4 elements.
  const uint32_t vec4_count = (total_size + 3) / 4;

  // Vectorize the bias as well whenever its length is a multiple of 4.
  uint32_t bias_length = onnxruntime::narrow<uint32_t>(bias_tensor->Shape().Size());
  const bool bias_is_vec4 = (bias_length % 4 == 0);
  const int bias_components = bias_is_vec4 ? 4 : 1;
  if (bias_is_vec4) {
    bias_length /= 4;
  }

  BiasGeluProgram program{bias_components};
  program.AddInput({x_tensor, ProgramTensorMetadataDependency::Type, {vec4_count}, 4})
      .AddInput({bias_tensor, ProgramTensorMetadataDependency::TypeAndRank, {bias_length}, bias_components})
      .AddOutput({y_tensor, ProgramTensorMetadataDependency::None, {vec4_count}, 4})
      .SetDispatchGroupSize((vec4_count + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE)
      .AddUniformVariable({vec4_count});

  return context.RunProgram(program);
}

}  // namespace webgpu
}  // namespace contrib
}  // namespace onnxruntime
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/webgpu/program.h"
#include "core/providers/webgpu/webgpu_kernel.h"

namespace onnxruntime {
namespace contrib {
namespace webgpu {

// Explicit using-declarations instead of a blanket `using namespace`:
// a using-directive at header scope injects every onnxruntime::webgpu name
// into all translation units that include this header (C++ Core Guidelines
// SF.7). Only the names used by this header and its companion .cc are pulled in.
using onnxruntime::webgpu::ComputeContext;
using onnxruntime::webgpu::Program;
using onnxruntime::webgpu::ProgramTensorMetadataDependency;
using onnxruntime::webgpu::ProgramUniformVariableDataType;
using onnxruntime::webgpu::ShaderHelper;
using onnxruntime::webgpu::ShaderUsage;
using onnxruntime::webgpu::WebGpuKernel;
using onnxruntime::webgpu::WORKGROUP_SIZE;

// Shader program that adds a 1-D bias to the input and applies the GELU
// activation in a single fused pass.
class BiasGeluProgram final : public Program<BiasGeluProgram> {
 public:
  // `bias_components` is the vectorization width of the bias binding (1 or 4);
  // it selects which bias-load path the generated WGSL uses.
  explicit BiasGeluProgram(int bias_components) : Program{"BiasGelu"}, bias_components_{bias_components} {
  }

  Status GenerateShaderCode(ShaderHelper& sh) const override;

  // `vec_size`: number of vec4 elements the shader iterates over.
  WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"vec_size", ProgramUniformVariableDataType::Uint32});

 private:
  int bias_components_;
};

// com.microsoft.BiasGelu WebGPU kernel: y = GELU(x + bias).
class BiasGelu final : public WebGpuKernel {
 public:
  explicit BiasGelu(const OpKernelInfo& info) : WebGpuKernel(info) {}

  Status ComputeInternal(ComputeContext& context) const override;
};

}  // namespace webgpu
}  // namespace contrib
}  // namespace onnxruntime

0 commit comments

Comments (0)