Commit 0333a2a

Merge remote-tracking branch 'origin/main' into snnn/rn
2 parents: 6b9724c + d251f3a

File tree: 18 files changed, +468 −227 lines
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+name: Windows x64 QNN CI Pipeline
+
+on:
+  push:
+    branches:
+      - main
+      - rel-*
+  pull_request:
+    branches:
+      - main
+      - rel-*
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.ref || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  build_test_qnn_ep:
+    name: Windows x64 QNN CI Pipeline (${{ matrix.QnnLibKind }})
+    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
+    timeout-minutes: 120
+    strategy:
+      matrix:
+        QnnLibKind: [shared_lib, static_lib]
+    env:
+      AZCOPY_AUTO_LOGIN_TYPE: MSI
+      AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
+      DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
+      ALLOW_RELEASED_ONNX_OPSET_ONLY: '1'
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+          architecture: x64
+
+      - name: Locate vcvarsall and Setup Env
+        uses: ./.github/actions/locate-vcvarsall-and-setup-env
+        with:
+          architecture: x64
+
+      - name: Download QNN SDK
+        working-directory: ${{ runner.temp }}
+        run: |
+          azcopy.exe cp --recursive https://lotusscus.blob.core.windows.net/models/qnnsdk/qnn-v2.38.0.250901 .
+          dir
+        shell: pwsh
+
+      - name: Set QNN_SDK_ROOT environment variable
+        shell: pwsh
+        run: |
+          $qnn_sdk_path = Join-Path $env:RUNNER_TEMP "qnn-v2.38.0.250901"
+          echo "QNN_SDK_ROOT=$qnn_sdk_path" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "QNN SDK Root: $qnn_sdk_path"
+          dir $qnn_sdk_path
+
+      - name: Build and Test
+        shell: cmd
+        run: |
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --build_dir ${{ runner.temp }}\build --cmake_generator "Visual Studio 17 2022" --build_java --build_shared_lib --use_qnn ${{ matrix.QnnLibKind }} --qnn_home %QNN_SDK_ROOT% --use_binskim_compliant_compile_flags --update --build --test --enable_onnx_tests --parallel
+
+      - name: Run ONNX Tests
+        shell: cmd
+        working-directory: ${{ runner.temp }}\build\RelWithDebInfo\RelWithDebInfo
+        run: |
+          .\onnx_test_runner -j 1 -e qnn -i "backend_path|%QNN_SDK_ROOT%\lib\x86_64-windows-msvc\QnnCpu.dll" ${{ github.workspace }}\cmake\external\onnx\onnx\backend\test\data\node
+
+      - name: Run float32 model tests
+        shell: cmd
+        working-directory: ${{ runner.temp }}\build\RelWithDebInfo\RelWithDebInfo
+        run: |
+          rem This step assumes the model data exists at C:\data\float32_models on the runner
+          if exist C:\data\float32_models (
+            .\onnx_test_runner -j 1 -e qnn -i "backend_path|%QNN_SDK_ROOT%\lib\x86_64-windows-msvc\QnnCpu.dll" C:\data\float32_models
+          ) else (
+            echo "Skipping float32 model tests: C:\data\float32_models not found."
+          )

include/onnxruntime/core/session/onnxruntime_cxx_inline.h

Lines changed: 18 additions & 14 deletions
@@ -1582,11 +1582,13 @@ inline std::vector<ConstMemoryInfo> ConstSessionImpl<T>::GetMemoryInfoForInputs(

   auto num_inputs = GetInputCount();
   std::vector<ConstMemoryInfo> mem_infos;
-  mem_infos.resize(num_inputs);
+  if (num_inputs > 0) {
+    mem_infos.resize(num_inputs);

-  ThrowOnError(GetApi().SessionGetMemoryInfoForInputs(this->p_,
-                                                      reinterpret_cast<const OrtMemoryInfo**>(mem_infos.data()),
-                                                      num_inputs));
+    ThrowOnError(GetApi().SessionGetMemoryInfoForInputs(this->p_,
+                                                        reinterpret_cast<const OrtMemoryInfo**>(mem_infos.data()),
+                                                        num_inputs));
+  }

   return mem_infos;
 }
@@ -1598,11 +1600,13 @@ inline std::vector<ConstMemoryInfo> ConstSessionImpl<T>::GetMemoryInfoForOutputs

   auto num_outputs = GetOutputCount();
   std::vector<ConstMemoryInfo> mem_infos;
-  mem_infos.resize(num_outputs);
+  if (num_outputs > 0) {
+    mem_infos.resize(num_outputs);

-  ThrowOnError(GetApi().SessionGetMemoryInfoForOutputs(this->p_,
-                                                       reinterpret_cast<const OrtMemoryInfo**>(mem_infos.data()),
-                                                       num_outputs));
+    ThrowOnError(GetApi().SessionGetMemoryInfoForOutputs(this->p_,
+                                                         reinterpret_cast<const OrtMemoryInfo**>(mem_infos.data()),
+                                                         num_outputs));
+  }
   return mem_infos;
 }

@@ -1631,12 +1635,12 @@ template <typename T>
 inline std::vector<ConstEpDevice> ConstSessionImpl<T>::GetEpDeviceForInputs() const {
   auto num_inputs = GetInputCount();
   std::vector<ConstEpDevice> input_devices;
-  input_devices.resize(num_inputs);
-
-  ThrowOnError(GetApi().SessionGetEpDeviceForInputs(this->p_,
-                                                    reinterpret_cast<const OrtEpDevice**>(input_devices.data()),
-                                                    num_inputs));
-
+  if (num_inputs > 0) {
+    input_devices.resize(num_inputs);
+    ThrowOnError(GetApi().SessionGetEpDeviceForInputs(this->p_,
+                                                      reinterpret_cast<const OrtEpDevice**>(input_devices.data()),
+                                                      num_inputs));
+  }
   return input_devices;
 }
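
All three guards address the same edge case: for a model with zero inputs or outputs, std::vector::data() may legitimately return nullptr, and handing a null output buffer to the underlying C API would turn a valid query into an error. A minimal standalone sketch of the pitfall and the guard (illustrative only; fill_pointers is a stand-in, not an onnxruntime function):

#include <cstdio>
#include <vector>

// Stand-in for a C API call that validates its output buffer.
static bool fill_pointers(int** out, size_t n) {
  if (out == nullptr) return false;  // many C APIs reject a null buffer outright, even for n == 0
  for (size_t i = 0; i < n; ++i) out[i] = nullptr;
  return true;
}

int main() {
  std::vector<int*> items;  // num_inputs == 0
  // data() on an empty vector may be nullptr (implementation-defined).
  std::printf("empty data() is null: %d\n", items.data() == nullptr);
  if (!items.empty()) {  // the guard these hunks add, in miniature
    fill_pointers(items.data(), items.size());
  }
  return 0;
}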

java/build.gradle

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ plugins {
   id 'maven-publish'
   id 'signing'
   id 'jacoco'
-  id "com.diffplug.spotless" version "6.25.0"
+  id "com.diffplug.spotless" version "7.2.1"
 }

 allprojects {

onnxruntime/core/providers/cpu/llm/rotary_embedding.cc

Lines changed: 9 additions & 4 deletions
@@ -30,10 +30,6 @@ RotaryEmbedding<T>::RotaryEmbedding(const OpKernelInfo& info) : OpKernel(info) {
   num_heads = static_cast<int>(info.GetAttrOrDefault<int64_t>("num_heads", 0));
   rotary_embedding_dim = static_cast<int>(info.GetAttrOrDefault<int64_t>("rotary_embedding_dim", 0));
   interleaved = (info.GetAttrOrDefault<int64_t>("interleaved", 0) == 1);  // Turn 0/1 into bool
-
-  if (rotary_embedding_dim > 0) {
-    ORT_ENFORCE(num_heads > 0, "num_heads must be provided if rotary_embedding_dim is specified");
-  }
 }

 // TODO: rotary embedding in place
@@ -111,6 +107,15 @@ Status RotaryEmbedding<T>::Compute(OpKernelContext* context) const {
   // Optional position_ids input, can be nullptr
   const Tensor* position_ids = context->Input<Tensor>(3);

+  // If rotary_embedding_dim is set (>0) and num_heads attribute not provided (==0),
+  // we can only proceed if input is 4D (B, num_heads, S, head_size) so num_heads can be inferred.
+  if (rotary_embedding_dim > 0 && num_heads <= 0) {
+    const auto& dims = X->Shape().GetDims();
+    ORT_ENFORCE(dims.size() == 4,
+                "Attribute 'num_heads' must be provided when 'rotary_embedding_dim' is specified "
+                "and input is not rank-4 (batch, num_heads, sequence, head).");
+  }
+
   RotaryParameters parameters = {};
   ORT_RETURN_IF_ERROR(rotary_embedding_helper::CheckInputs<Tensor>(X,
                                                                    position_ids,
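
Taken together, the two hunks move this validation out of the constructor, where the input shape is not yet known, into Compute(): num_heads becomes optional whenever it can be read off a rank-4 input. A standalone sketch of the inference rule (ResolveNumHeads is a hypothetical helper, not the kernel's code):

#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Prefer the attribute; otherwise infer num_heads from a
// (batch, num_heads, sequence, head_size) input, mirroring the relaxed check.
int ResolveNumHeads(const std::vector<int64_t>& dims, int num_heads_attr,
                    int rotary_embedding_dim) {
  if (num_heads_attr > 0) return num_heads_attr;
  if (rotary_embedding_dim > 0 && dims.size() != 4) {
    throw std::runtime_error("num_heads required when rotary_embedding_dim is set and input is not rank-4");
  }
  return dims.size() == 4 ? static_cast<int>(dims[1]) : 0;  // 0: left for CheckInputs to derive
}

int main() {
  std::printf("%d\n", ResolveNumHeads({2, 8, 128, 64}, /*num_heads_attr*/ 0, /*rotary_embedding_dim*/ 32));  // prints 8
  return 0;
}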

onnxruntime/core/providers/cuda/llm/rotary_embedding.cc

Lines changed: 9 additions & 0 deletions
@@ -44,6 +44,15 @@ Status RotaryEmbedding<T>::ComputeInternal(OpKernelContext* context) const {
   const Tensor* sin_cache = context->Input<Tensor>(2);
   const Tensor* position_ids = context->Input<Tensor>(3);  // Optional, can be nullptr

+  // If rotary_embedding_dim is set (>0) and num_heads attribute not provided (==0),
+  // we can only proceed if input is 4D (B, num_heads, S, head_size) so num_heads can be inferred.
+  if (rotary_embedding_dim > 0 && num_heads <= 0) {
+    const auto& dims = input->Shape().GetDims();
+    ORT_ENFORCE(dims.size() == 4,
+                "Attribute 'num_heads' must be provided when 'rotary_embedding_dim' is specified "
+                "and input is not rank-4 (batch, num_heads, sequence, head).");
+  }
+
   RotaryParameters parameters = {};
   ORT_RETURN_IF_ERROR(rotary_embedding_helper::CheckInputs<Tensor>(input,
                                                                    position_ids,

onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc

Lines changed: 4 additions & 0 deletions
@@ -114,6 +114,10 @@ Status ExpandOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
       FillShapeInputData(shape_data, shape_size, static_cast<float>(1.0));
       break;
     }
+    case QNN_DATATYPE_FLOAT_16: {
+      FillShapeInputData(shape_data, shape_size, static_cast<MLFloat16>(1.0f));
+      break;
+    }
     case QNN_DATATYPE_INT_64: {
       // QNN-EP doesn't support INT64 shape input.
       qnn_data_type = QNN_DATATYPE_INT_32;

onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc

Lines changed: 27 additions & 23 deletions
@@ -173,33 +173,37 @@ Status ResizeOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
                     "QNN EP: Resize does not support nearest_mode ", nearest_mode.c_str());

   if (is_npu_backend) {
-    // QNN only supports the following nearest_mode values on HTP:
-    // - QNN 2.19: "round_prefer_floor" via QNN's Resize operator
-    // - QNN 2.20 (API version 2.14): "round_prefer_ceil" via QNN's Resize operator
-    // - "floor" via QNN's ResizeNearestNeighbor operator
-#if QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 14
-    ORT_RETURN_IF_NOT(nearest_mode == "round_prefer_ceil" || nearest_mode == "floor",
-                      "QNN EP: Resize on the NPU does not support nearest_mode ", nearest_mode.c_str());
-#else
-    ORT_RETURN_IF_NOT(nearest_mode == "round_prefer_floor" || nearest_mode == "floor",
-                      "QNN EP: Resize on the NPU does not support nearest_mode ", nearest_mode.c_str());
-#endif
-
-    // Use ResizeNearestNeighbor for rank-4 inputs.
+    // For better performance with the HTP backend, use QNN's ResizeNearestNeighbor for rank-4 input.
     const bool use_resize_nn_op = input_rank == 4;

-    // If HTP uses ResizeNearestNeighbor ("floor"), then the "pytorch_half_pixel" coordinate_transformation_mode
-    // is not supported.
-    ORT_RETURN_IF(!use_resize_nn_op && nearest_mode == "floor" && transformation_mode == "pytorch_half_pixel",
-                  "QNN EP: Resize on the NPU does not support the combination of nearest_mode == 'floor' ",
-                  " and coordinate_transformation_mode == 'pytorch_half_pixel'.");
-
+    if (!use_resize_nn_op) {
+      // QNN only supports the following nearest_mode values on HTP:
+      // - QNN 2.19: "round_prefer_floor" via QNN's Resize operator
+      // - QNN 2.20 (API version 2.14): "round_prefer_ceil" via QNN's Resize operator
 #if QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 14
-    // QNN's Resize only supports "round_prefer_ceil" if transformation_mode is "align_corners".
-    ORT_RETURN_IF(!use_resize_nn_op && transformation_mode != "align_corners",
-                  "QNN EP: Resize on the NPU only supports 'round_prefer_ceil' if "
-                  "transformation mode is 'align_corners'");
+      ORT_RETURN_IF_NOT(nearest_mode == "round_prefer_ceil" || nearest_mode == "floor",
+                        "QNN EP: Resize on the NPU does not support nearest_mode ", nearest_mode.c_str());
+
+      // QNN HTP Resize only supports "round_prefer_ceil" if transformation_mode is "align_corners".
+      ORT_RETURN_IF(nearest_mode == "round_prefer_ceil" && transformation_mode != "align_corners",
+                    "QNN EP: Resize on the NPU only supports 'round_prefer_ceil' if "
+                    "transformation mode is 'align_corners'");
+#else
+      ORT_RETURN_IF_NOT(nearest_mode == "round_prefer_floor" || nearest_mode == "floor",
+                        "QNN EP: Resize on the NPU does not support nearest_mode ", nearest_mode.c_str());
 #endif
+      // If HTP uses Resize ("floor"), then the transformation_mode "pytorch_half_pixel" is not supported.
+      ORT_RETURN_IF(nearest_mode == "floor" && transformation_mode == "pytorch_half_pixel",
+                    "QNN EP: Resize on the NPU does not support the combination of nearest_mode == 'floor' ",
+                    " and transformation_mode == 'pytorch_half_pixel'.");
+    } else {
+      // If HTP uses ResizeNearestNeighbor with "ceil" or "round_prefer_floor", then the
+      // transformation_mode "asymmetric" is not supported.
+      // This is verified by unit tests but not documented in the QNN SDK.
+      ORT_RETURN_IF((nearest_mode == "ceil" || nearest_mode == "round_prefer_floor") && transformation_mode == "asymmetric",
+                    "QNN EP: ResizeNearestNeighbor on the NPU does not support the combination of ",
+                    "nearest_mode == 'ceil' or 'round_prefer_floor' and transformation_mode == 'asymmetric'.");
+    }
   }
 }
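
For reference, the HTP-specific rules above condense to a small support predicate. A sketch of the QNN API >= 2.14 branch (htp_resize_supported is a hypothetical helper, not EP code):

#include <cstdio>
#include <string>

bool htp_resize_supported(int input_rank, const std::string& nearest_mode,
                          const std::string& transformation_mode) {
  const bool use_resize_nn_op = (input_rank == 4);  // ResizeNearestNeighbor path
  if (!use_resize_nn_op) {                          // QNN Resize path
    if (nearest_mode != "round_prefer_ceil" && nearest_mode != "floor") return false;
    if (nearest_mode == "round_prefer_ceil" && transformation_mode != "align_corners") return false;
    if (nearest_mode == "floor" && transformation_mode == "pytorch_half_pixel") return false;
    return true;
  }
  // ResizeNearestNeighbor path: "asymmetric" clashes with these two modes.
  return !((nearest_mode == "ceil" || nearest_mode == "round_prefer_floor") &&
           transformation_mode == "asymmetric");
}

int main() {
  std::printf("%d\n", htp_resize_supported(4, "round_prefer_floor", "asymmetric"));  // 0: rejected
  std::printf("%d\n", htp_resize_supported(3, "floor", "align_corners"));            // 1: allowed
  return 0;
}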

onnxruntime/core/session/inference_session.cc

Lines changed: 61 additions & 16 deletions
@@ -3383,17 +3383,58 @@ common::Status InferenceSession::GetInputOutputMemoryInfo(SessionInputOutputType

   for (const auto* def : def_list) {
     InlinedVector<SessionState::NodeInfo> node_info_vec;
+    Status status;
     if (type == SessionInputOutputType::kOutput) {
-      ORT_RETURN_IF_ERROR(session_state_->GetOutputNodeInfo(def->Name(), node_info_vec));
+      status = session_state_->GetOutputNodeInfo(def->Name(), node_info_vec);
     } else {
-      ORT_RETURN_IF_ERROR(session_state_->GetInputNodeInfo(def->Name(), node_info_vec));
+      status = session_state_->GetInputNodeInfo(def->Name(), node_info_vec);
     }

-    // all entries are for the same OrtDevice so use the first one.
-    // we need to get an OrtMemoryInfo* that will remain valid, so we get the allocator for the OrtDevice
-    // from the session state and use its OrtMemoryInfo.
-    auto allocator = session_state_->GetAllocator(*node_info_vec.front().device);
-    memory_info.push_back(&allocator->Info());
+    if (!status.IsOK()) {
+      if (type == SessionInputOutputType::kInput) {
+        return status;
+      }
+
+      // Check first if this output is produced by an input that directly
+      // propagates to output with the same name.
+      status = session_state_->GetInputNodeInfo(def->Name(), node_info_vec);
+      if (status.IsOK()) {
+        // all entries are for the same OrtDevice so use the first one.
+        // we need to get an OrtMemoryInfo* that will remain valid, so we get the allocator for the OrtDevice
+        // from the session state and use its OrtMemoryInfo.
+        auto allocator = session_state_->GetAllocator(*node_info_vec.front().device);
+        memory_info.push_back(&allocator->Info());
+      } else {
+        // Check if this output is produced by a constant initializer
+        // Pick the MemoryInfo from the initializer's OrtValue
+        const auto& ort_value_map = session_state_->GetOrtValueNameIdxMap();
+
+        OrtValueIndex ort_value_index;
+        status = ort_value_map.GetIdx(def->Name(), ort_value_index);
+        if (!status.IsOK()) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                                 "Failed to find node output or a constant initializer producing output: ",
+                                 def->Name(), ".");
+        }
+
+        const auto& idx_to_ort_value = session_state_->GetInitializedTensors();
+        auto it = idx_to_ort_value.find(ort_value_index);
+        if (it == idx_to_ort_value.end()) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                                 "Failed to find node output or a constant initializer producing output: ",
+                                 def->Name(), ".");
+        }
+        const auto& tensor = it->second.Get<Tensor>();
+        auto allocator = session_state_->GetAllocator(tensor.Location());
+        memory_info.push_back(&allocator->Info());
+      }
+    } else {
+      // all entries are for the same OrtDevice so use the first one.
+      // we need to get an OrtMemoryInfo* that will remain valid, so we get the allocator for the OrtDevice
+      // from the session state and use its OrtMemoryInfo.
+      auto allocator = session_state_->GetAllocator(*node_info_vec.front().device);
+      memory_info.push_back(&allocator->Info());
+    }
   }

   return Status::OK();
@@ -3422,15 +3463,19 @@ common::Status InferenceSession::GetEpDeviceForInputs(InlinedVector<const OrtEpD
   for (const auto* def : def_list) {
     InlinedVector<SessionState::NodeInfo> node_info_vec;
     ORT_RETURN_IF_ERROR(session_state_->GetInputNodeInfo(def->Name(), node_info_vec));
-
-    // if we have a lot of inputs or there are a lot of execution providers it may be worth creating a map
-    // instead of doing a linear search each time.
-    const auto& ep_name = node_info_vec.front().p_node->GetExecutionProviderType();
-    auto it = std::find_if(available_eps.begin(), available_eps.end(), [&ep_name](const OrtEpDevice* entry) {
-      return entry->ep_name == ep_name;
-    });
-
-    ep_devices.push_back(it != available_eps.end() ? *it : nullptr);
+    assert(!node_info_vec.empty());
+    // If we have an input that is not consumed by any node,
+    // including nodes in subgraphs, then we return nullptr.
+    const auto* p_node = node_info_vec.front().p_node;
+    if (p_node != nullptr) {
+      const auto ep_name = p_node->GetExecutionProviderType();
+      auto it = std::find_if(available_eps.begin(), available_eps.end(), [&ep_name](const OrtEpDevice* entry) {
+        return entry->ep_name == ep_name;
+      });
+      ep_devices.push_back(it != available_eps.end() ? *it : nullptr);
+    } else {
+      ep_devices.push_back(nullptr);
+    }
   }

   return Status::OK();
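
The output branch of GetInputOutputMemoryInfo now resolves an output's memory info through a three-stage fallback: the producing node, a graph input that propagates straight through to an output of the same name, and finally a constant initializer. A minimal first-hit-wins sketch of that pattern (the Source type and all names here are hypothetical, not onnxruntime's):

#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Each source returns a device name if it knows the given output, else nullopt.
using Source = std::function<std::optional<std::string>(const std::string&)>;

std::optional<std::string> ResolveOutputDevice(const std::string& name,
                                               const std::vector<Source>& sources) {
  for (const auto& source : sources) {
    if (auto device = source(name)) return device;  // first hit wins
  }
  return std::nullopt;  // maps to "Failed to find node output or a constant initializer ..."
}

int main() {
  std::vector<Source> sources = {
      [](const std::string&) -> std::optional<std::string> { return std::nullopt; },  // node outputs: miss
      [](const std::string&) -> std::optional<std::string> { return std::nullopt; },  // pass-through inputs: miss
      [](const std::string& n) -> std::optional<std::string> {                        // constant initializers: hit
        return n == "bias" ? std::optional<std::string>("CPU") : std::nullopt;
      },
  };
  std::cout << ResolveOutputDevice("bias", sources).value_or("<error>") << "\n";  // prints CPU
  return 0;
}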

onnxruntime/test/framework/inference_session_test.cc

Lines changed: 2 additions & 1 deletion
@@ -630,7 +630,8 @@ TEST(InferenceSessionTests, CheckRunLogger) {
 }

 // WebAssembly will emit profiling data into console
-#if !defined(__wasm__)
+// TODO(hasesh): Investigate why this test fails on Windows CUDA builds
+#if (!defined(__wasm__) && !defined(_WIN32))
 TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
   SessionOptions so;

onnxruntime/test/onnx/TestCase.cc

Lines changed: 3 additions & 0 deletions
@@ -1408,6 +1408,9 @@ std::unique_ptr<std::set<BrokenTest>> GetBrokenTests(const std::string& provider
   broken_tests->insert({"gridsample_volumetric_nearest_align_corners_1", "unknown version"});
   broken_tests->insert({"rotary_embedding_no_position_ids_expanded", "unknown version"});
   broken_tests->insert({"rotary_embedding_no_position_ids_interleaved_expanded", "unknown version"});
+  broken_tests->insert({"rotary_embedding_no_position_ids_rotary_dim", "unknown version"});
+  broken_tests->insert({"rotary_embedding_with_interleaved_rotary_dim", "unknown version"});
+  broken_tests->insert({"rotary_embedding_with_rotary_dim", "unknown version"});
   // Fails since QNN SDK 2.17.0:
   // expected 7.70947 (40f6b3f3), got 7.84096 (40fae920), diff: 0.131491, tol=0.00870947 idx=419. 100 of 1715 differ
   broken_tests->insert({"facedetection_op8_qdq", "result differs"});
