Commit 0333a2a

Merge remote-tracking branch 'origin/main' into snnn/rn
2 parents: 6b9724c + d251f3a

File tree: 18 files changed, +468 −227 lines
Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+name: Windows x64 QNN CI Pipeline
+
+on:
+  push:
+    branches:
+      - main
+      - rel-*
+  pull_request:
+    branches:
+      - main
+      - rel-*
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.ref || github.sha }}
+  cancel-in-progress: true
+
+jobs:
+  build_test_qnn_ep:
+    name: Windows x64 QNN CI Pipeline (${{ matrix.QnnLibKind }})
+    runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"]
+    timeout-minutes: 120
+    strategy:
+      matrix:
+        QnnLibKind: [shared_lib, static_lib]
+    env:
+      AZCOPY_AUTO_LOGIN_TYPE: MSI
+      AZCOPY_MSI_CLIENT_ID: 63b63039-6328-442f-954b-5a64d124e5b4
+      DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
+      ALLOW_RELEASED_ONNX_OPSET_ONLY: '1'
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v5
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.12'
+          architecture: x64
+
+      - name: Locate vcvarsall and Setup Env
+        uses: ./.github/actions/locate-vcvarsall-and-setup-env
+        with:
+          architecture: x64
+
+      - name: Download QNN SDK
+        working-directory: ${{ runner.temp }}
+        run: |
+          azcopy.exe cp --recursive https://lotusscus.blob.core.windows.net/models/qnnsdk/qnn-v2.38.0.250901 .
+          dir
+        shell: pwsh
+
+      - name: Set QNN_SDK_ROOT environment variable
+        shell: pwsh
+        run: |
+          $qnn_sdk_path = Join-Path $env:RUNNER_TEMP "qnn-v2.38.0.250901"
+          echo "QNN_SDK_ROOT=$qnn_sdk_path" | Out-File -FilePath $env:GITHUB_ENV -Encoding utf8 -Append
+          echo "QNN SDK Root: $qnn_sdk_path"
+          dir $qnn_sdk_path
+
+      - name: Build and Test
+        shell: cmd
+        run: |
+          python ${{ github.workspace }}\tools\ci_build\build.py --config RelWithDebInfo --build_dir ${{ runner.temp }}\build --cmake_generator "Visual Studio 17 2022" --build_java --build_shared_lib --use_qnn ${{ matrix.QnnLibKind }} --qnn_home %QNN_SDK_ROOT% --use_binskim_compliant_compile_flags --update --build --test --enable_onnx_tests --parallel
+
+      - name: Run ONNX Tests
+        shell: cmd
+        working-directory: ${{ runner.temp }}\build\RelWithDebInfo\RelWithDebInfo
+        run: |
+          .\onnx_test_runner -j 1 -e qnn -i "backend_path|%QNN_SDK_ROOT%\lib\x86_64-windows-msvc\QnnCpu.dll" ${{ github.workspace }}\cmake\external\onnx\onnx\backend\test\data\node
+
+      - name: Run float32 model tests
+        shell: cmd
+        working-directory: ${{ runner.temp }}\build\RelWithDebInfo\RelWithDebInfo
+        run: |
+          rem This step assumes the model data exists at C:\data\float32_models on the runner
+          if exist C:\data\float32_models (
+            .\onnx_test_runner -j 1 -e qnn -i "backend_path|%QNN_SDK_ROOT%\lib\x86_64-windows-msvc\QnnCpu.dll" C:\data\float32_models
+          ) else (
+            echo "Skipping float32 model tests: C:\data\float32_models not found."
+          )

include/onnxruntime/core/session/onnxruntime_cxx_inline.h

Lines changed: 18 additions & 14 deletions
@@ -1582,11 +1582,13 @@ inline std::vector<ConstMemoryInfo> ConstSessionImpl<T>::GetMemoryInfoForInputs(

   auto num_inputs = GetInputCount();
   std::vector<ConstMemoryInfo> mem_infos;
-  mem_infos.resize(num_inputs);
+  if (num_inputs > 0) {
+    mem_infos.resize(num_inputs);

-  ThrowOnError(GetApi().SessionGetMemoryInfoForInputs(this->p_,
-                                                      reinterpret_cast<const OrtMemoryInfo**>(mem_infos.data()),
-                                                      num_inputs));
+    ThrowOnError(GetApi().SessionGetMemoryInfoForInputs(this->p_,
+                                                        reinterpret_cast<const OrtMemoryInfo**>(mem_infos.data()),
+                                                        num_inputs));
+  }

   return mem_infos;
 }
@@ -1598,11 +1600,13 @@ inline std::vector<ConstMemoryInfo> ConstSessionImpl<T>::GetMemoryInfoForOutputs

   auto num_outputs = GetOutputCount();
   std::vector<ConstMemoryInfo> mem_infos;
-  mem_infos.resize(num_outputs);
+  if (num_outputs > 0) {
+    mem_infos.resize(num_outputs);

-  ThrowOnError(GetApi().SessionGetMemoryInfoForOutputs(this->p_,
-                                                       reinterpret_cast<const OrtMemoryInfo**>(mem_infos.data()),
-                                                       num_outputs));
+    ThrowOnError(GetApi().SessionGetMemoryInfoForOutputs(this->p_,
+                                                         reinterpret_cast<const OrtMemoryInfo**>(mem_infos.data()),
+                                                         num_outputs));
+  }
   return mem_infos;
 }

@@ -1631,12 +1635,12 @@ template <typename T>
 inline std::vector<ConstEpDevice> ConstSessionImpl<T>::GetEpDeviceForInputs() const {
   auto num_inputs = GetInputCount();
   std::vector<ConstEpDevice> input_devices;
-  input_devices.resize(num_inputs);
-
-  ThrowOnError(GetApi().SessionGetEpDeviceForInputs(this->p_,
-                                                    reinterpret_cast<const OrtEpDevice**>(input_devices.data()),
-                                                    num_inputs));
-
+  if (num_inputs > 0) {
+    input_devices.resize(num_inputs);
+    ThrowOnError(GetApi().SessionGetEpDeviceForInputs(this->p_,
+                                                      reinterpret_cast<const OrtEpDevice**>(input_devices.data()),
+                                                      num_inputs));
+  }
   return input_devices;
 }
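
All three guards address the same edge case: for a model with zero inputs or outputs, std::vector::data() may legitimately return nullptr, and handing a null output buffer to the underlying C API would turn a valid query into an error. A minimal standalone sketch of the pitfall and the guard (illustrative only; fill_pointers is a stand-in, not an onnxruntime function):

#include <cstdio>
#include <vector>

// Stand-in for a C API call that validates its output buffer.
static bool fill_pointers(int** out, size_t n) {
  if (out == nullptr) return false;  // many C APIs reject a null buffer outright, even for n == 0
  for (size_t i = 0; i < n; ++i) out[i] = nullptr;
  return true;
}

int main() {
  std::vector<int*> items;  // num_inputs == 0
  // data() on an empty vector may be nullptr (implementation-defined).
  std::printf("empty data() is null: %d\n", items.data() == nullptr);
  if (!items.empty()) {  // the guard these hunks add, in miniature
    fill_pointers(items.data(), items.size());
  }
  return 0;
}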

java/build.gradle

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@ plugins {
   id 'maven-publish'
   id 'signing'
   id 'jacoco'
-  id "com.diffplug.spotless" version "6.25.0"
+  id "com.diffplug.spotless" version "7.2.1"
 }

 allprojects {

onnxruntime/core/providers/cpu/llm/rotary_embedding.cc

Lines changed: 9 additions & 4 deletions
@@ -30,10 +30,6 @@ RotaryEmbedding<T>::RotaryEmbedding(const OpKernelInfo& info) : OpKernel(info) {
   num_heads = static_cast<int>(info.GetAttrOrDefault<int64_t>("num_heads", 0));
   rotary_embedding_dim = static_cast<int>(info.GetAttrOrDefault<int64_t>("rotary_embedding_dim", 0));
   interleaved = (info.GetAttrOrDefault<int64_t>("interleaved", 0) == 1);  // Turn 0/1 into bool
-
-  if (rotary_embedding_dim > 0) {
-    ORT_ENFORCE(num_heads > 0, "num_heads must be provided if rotary_embedding_dim is specified");
-  }
 }

 // TODO: rotary embedding in place
@@ -111,6 +107,15 @@ Status RotaryEmbedding<T>::Compute(OpKernelContext* context) const {
   // Optional position_ids input, can be nullptr
   const Tensor* position_ids = context->Input<Tensor>(3);

+  // If rotary_embedding_dim is set (>0) and num_heads attribute not provided (==0),
+  // we can only proceed if input is 4D (B, num_heads, S, head_size) so num_heads can be inferred.
+  if (rotary_embedding_dim > 0 && num_heads <= 0) {
+    const auto& dims = X->Shape().GetDims();
+    ORT_ENFORCE(dims.size() == 4,
+                "Attribute 'num_heads' must be provided when 'rotary_embedding_dim' is specified "
+                "and input is not rank-4 (batch, num_heads, sequence, head).");
+  }
+
   RotaryParameters parameters = {};
   ORT_RETURN_IF_ERROR(rotary_embedding_helper::CheckInputs<Tensor>(X,
                                                                    position_ids,
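
Taken together, the two hunks move this validation out of the constructor, where the input shape is not yet known, into Compute(): num_heads becomes optional whenever it can be read off a rank-4 input. A standalone sketch of the inference rule (ResolveNumHeads is a hypothetical helper, not the kernel's code):

#include <cstdint>
#include <cstdio>
#include <stdexcept>
#include <vector>

// Prefer the attribute; otherwise infer num_heads from a
// (batch, num_heads, sequence, head_size) input, mirroring the relaxed check.
int ResolveNumHeads(const std::vector<int64_t>& dims, int num_heads_attr,
                    int rotary_embedding_dim) {
  if (num_heads_attr > 0) return num_heads_attr;
  if (rotary_embedding_dim > 0 && dims.size() != 4) {
    throw std::runtime_error("num_heads required when rotary_embedding_dim is set and input is not rank-4");
  }
  return dims.size() == 4 ? static_cast<int>(dims[1]) : 0;  // 0: left for CheckInputs to derive
}

int main() {
  std::printf("%d\n", ResolveNumHeads({2, 8, 128, 64}, /*num_heads_attr*/ 0, /*rotary_embedding_dim*/ 32));  // prints 8
  return 0;
}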

onnxruntime/core/providers/cuda/llm/rotary_embedding.cc

Lines changed: 9 additions & 0 deletions
@@ -44,6 +44,15 @@ Status RotaryEmbedding<T>::ComputeInternal(OpKernelContext* context) const {
   const Tensor* sin_cache = context->Input<Tensor>(2);
   const Tensor* position_ids = context->Input<Tensor>(3);  // Optional, can be nullptr

+  // If rotary_embedding_dim is set (>0) and num_heads attribute not provided (==0),
+  // we can only proceed if input is 4D (B, num_heads, S, head_size) so num_heads can be inferred.
+  if (rotary_embedding_dim > 0 && num_heads <= 0) {
+    const auto& dims = input->Shape().GetDims();
+    ORT_ENFORCE(dims.size() == 4,
+                "Attribute 'num_heads' must be provided when 'rotary_embedding_dim' is specified "
+                "and input is not rank-4 (batch, num_heads, sequence, head).");
+  }
+
   RotaryParameters parameters = {};
   ORT_RETURN_IF_ERROR(rotary_embedding_helper::CheckInputs<Tensor>(input,
                                                                    position_ids,

onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc

Lines changed: 4 additions & 0 deletions
@@ -114,6 +114,10 @@ Status ExpandOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper,
       FillShapeInputData(shape_data, shape_size, static_cast<float>(1.0));
       break;
     }
+    case QNN_DATATYPE_FLOAT_16: {
+      FillShapeInputData(shape_data, shape_size, static_cast<MLFloat16>(1.0f));
+      break;
+    }
     case QNN_DATATYPE_INT_64: {
       // QNN-EP doesn't support INT64 shape input.
       qnn_data_type = QNN_DATATYPE_INT_32;

onnxruntime/core/providers/qnn/builder/opbuilder/resize_op_builder.cc

Lines changed: 27 additions & 23 deletions
@@ -173,33 +173,37 @@ Status ResizeOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper,
                     "QNN EP: Resize does not support nearest_mode ", nearest_mode.c_str());

   if (is_npu_backend) {
-    // QNN only supports the following nearest_mode values on HTP:
-    // - QNN 2.19: "round_prefer_floor" via QNN's Resize operator
-    // - QNN 2.20 (API version 2.14): "round_prefer_ceil" via QNN's Resize operator
-    // - "floor" via QNN's ResizeNearestNeighbor operator
-#if QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 14
-    ORT_RETURN_IF_NOT(nearest_mode == "round_prefer_ceil" || nearest_mode == "floor",
-                      "QNN EP: Resize on the NPU does not support nearest_mode ", nearest_mode.c_str());
-#else
-    ORT_RETURN_IF_NOT(nearest_mode == "round_prefer_floor" || nearest_mode == "floor",
-                      "QNN EP: Resize on the NPU does not support nearest_mode ", nearest_mode.c_str());
-#endif
-
-    // Use ResizeNearestNeighbor for rank-4 inputs.
+    // For better performance with the HTP backend, use QNN's ResizeNearestNeighbor for rank-4 input.
     const bool use_resize_nn_op = input_rank == 4;

-    // If HTP uses ResizeNearestNeighbor ("floor"), then the "pytorch_half_pixel" coordinate_transformation_mode
-    // is not supported.
-    ORT_RETURN_IF(!use_resize_nn_op && nearest_mode == "floor" && transformation_mode == "pytorch_half_pixel",
-                  "QNN EP: Resize on the NPU does not support the combination of nearest_mode == 'floor' ",
-                  " and coordinate_transformation_mode == 'pytorch_half_pixel'.");
-
+    if (!use_resize_nn_op) {
+      // QNN only supports the following nearest_mode values on HTP:
+      // - QNN 2.19: "round_prefer_floor" via QNN's Resize operator
+      // - QNN 2.20 (API version 2.14): "round_prefer_ceil" via QNN's Resize operator
 #if QNN_API_VERSION_MAJOR >= 2 && QNN_API_VERSION_MINOR >= 14
-    // QNN's Resize only supports "round_prefer_ceil" if transformation_mode is "align_corners".
-    ORT_RETURN_IF(!use_resize_nn_op && transformation_mode != "align_corners",
-                  "QNN EP: Resize on the NPU only supports 'round_prefer_ceil' if "
-                  "transformation mode is 'align_corners'");
+      ORT_RETURN_IF_NOT(nearest_mode == "round_prefer_ceil" || nearest_mode == "floor",
+                        "QNN EP: Resize on the NPU does not support nearest_mode ", nearest_mode.c_str());
+
+      // QNN HTP Resize only supports "round_prefer_ceil" if transformation_mode is "align_corners".
+      ORT_RETURN_IF(nearest_mode == "round_prefer_ceil" && transformation_mode != "align_corners",
+                    "QNN EP: Resize on the NPU only supports 'round_prefer_ceil' if "
+                    "transformation mode is 'align_corners'");
+#else
+      ORT_RETURN_IF_NOT(nearest_mode == "round_prefer_floor" || nearest_mode == "floor",
+                        "QNN EP: Resize on the NPU does not support nearest_mode ", nearest_mode.c_str());
 #endif
+      // If HTP uses Resize ("floor"), then the transformation_mode "pytorch_half_pixel" is not supported.
+      ORT_RETURN_IF(nearest_mode == "floor" && transformation_mode == "pytorch_half_pixel",
+                    "QNN EP: Resize on the NPU does not support the combination of nearest_mode == 'floor' ",
+                    " and transformation_mode == 'pytorch_half_pixel'.");
+    } else {
+      // If HTP uses ResizeNearestNeighbor with "ceil" or "round_prefer_floor", then the
+      // transformation_mode "asymmetric" is not supported.
+      // This is verified by unit tests but not documented in the QNN SDK.
+      ORT_RETURN_IF((nearest_mode == "ceil" || nearest_mode == "round_prefer_floor") && transformation_mode == "asymmetric",
+                    "QNN EP: ResizeNearestNeighbor on the NPU does not support the combination of ",
+                    "nearest_mode == 'ceil' or 'round_prefer_floor' and transformation_mode == 'asymmetric'.");
+    }
   }
 }
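
For reference, the HTP-specific rules above condense to a small support predicate. A sketch of the QNN API >= 2.14 branch (htp_resize_supported is a hypothetical helper, not EP code):

#include <cstdio>
#include <string>

bool htp_resize_supported(int input_rank, const std::string& nearest_mode,
                          const std::string& transformation_mode) {
  const bool use_resize_nn_op = (input_rank == 4);  // ResizeNearestNeighbor path
  if (!use_resize_nn_op) {                          // QNN Resize path
    if (nearest_mode != "round_prefer_ceil" && nearest_mode != "floor") return false;
    if (nearest_mode == "round_prefer_ceil" && transformation_mode != "align_corners") return false;
    if (nearest_mode == "floor" && transformation_mode == "pytorch_half_pixel") return false;
    return true;
  }
  // ResizeNearestNeighbor path: "asymmetric" clashes with these two modes.
  return !((nearest_mode == "ceil" || nearest_mode == "round_prefer_floor") &&
           transformation_mode == "asymmetric");
}

int main() {
  std::printf("%d\n", htp_resize_supported(4, "round_prefer_floor", "asymmetric"));  // 0: rejected
  std::printf("%d\n", htp_resize_supported(3, "floor", "align_corners"));            // 1: allowed
  return 0;
}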

onnxruntime/core/session/inference_session.cc

Lines changed: 61 additions & 16 deletions
@@ -3383,17 +3383,58 @@ common::Status InferenceSession::GetInputOutputMemoryInfo(SessionInputOutputType

   for (const auto* def : def_list) {
     InlinedVector<SessionState::NodeInfo> node_info_vec;
+    Status status;
     if (type == SessionInputOutputType::kOutput) {
-      ORT_RETURN_IF_ERROR(session_state_->GetOutputNodeInfo(def->Name(), node_info_vec));
+      status = session_state_->GetOutputNodeInfo(def->Name(), node_info_vec);
     } else {
-      ORT_RETURN_IF_ERROR(session_state_->GetInputNodeInfo(def->Name(), node_info_vec));
+      status = session_state_->GetInputNodeInfo(def->Name(), node_info_vec);
     }

-    // all entries are for the same OrtDevice so use the first one.
-    // we need to get an OrtMemoryInfo* that will remain valid, so we get the allocator for the OrtDevice
-    // from the session state and use its OrtMemoryInfo.
-    auto allocator = session_state_->GetAllocator(*node_info_vec.front().device);
-    memory_info.push_back(&allocator->Info());
+    if (!status.IsOK()) {
+      if (type == SessionInputOutputType::kInput) {
+        return status;
+      }
+
+      // Check first if this output is produced by an input that directly
+      // propagates to output with the same name.
+      status = session_state_->GetInputNodeInfo(def->Name(), node_info_vec);
+      if (status.IsOK()) {
+        // all entries are for the same OrtDevice so use the first one.
+        // we need to get an OrtMemoryInfo* that will remain valid, so we get the allocator for the OrtDevice
+        // from the session state and use its OrtMemoryInfo.
+        auto allocator = session_state_->GetAllocator(*node_info_vec.front().device);
+        memory_info.push_back(&allocator->Info());
+      } else {
+        // Check if this output is produced by a constant initializer
+        // Pick the MemoryInfo from the initializer's OrtValue
+        const auto& ort_value_map = session_state_->GetOrtValueNameIdxMap();
+
+        OrtValueIndex ort_value_index;
+        status = ort_value_map.GetIdx(def->Name(), ort_value_index);
+        if (!status.IsOK()) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                                 "Failed to find node output or a constant initializer producing output: ",
+                                 def->Name(), ".");
+        }
+
+        const auto& idx_to_ort_value = session_state_->GetInitializedTensors();
+        auto it = idx_to_ort_value.find(ort_value_index);
+        if (it == idx_to_ort_value.end()) {
+          return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
+                                 "Failed to find node output or a constant initializer producing output: ",
+                                 def->Name(), ".");
+        }
+        const auto& tensor = it->second.Get<Tensor>();
+        auto allocator = session_state_->GetAllocator(tensor.Location());
+        memory_info.push_back(&allocator->Info());
+      }
+    } else {
+      // all entries are for the same OrtDevice so use the first one.
+      // we need to get an OrtMemoryInfo* that will remain valid, so we get the allocator for the OrtDevice
+      // from the session state and use its OrtMemoryInfo.
+      auto allocator = session_state_->GetAllocator(*node_info_vec.front().device);
+      memory_info.push_back(&allocator->Info());
+    }
   }

   return Status::OK();
@@ -3422,15 +3463,19 @@ common::Status InferenceSession::GetEpDeviceForInputs(InlinedVector<const OrtEpD
   for (const auto* def : def_list) {
     InlinedVector<SessionState::NodeInfo> node_info_vec;
     ORT_RETURN_IF_ERROR(session_state_->GetInputNodeInfo(def->Name(), node_info_vec));
-
-    // if we have a lot of inputs or there are a lot of execution providers it may be worth creating a map
-    // instead of doing a linear search each time.
-    const auto& ep_name = node_info_vec.front().p_node->GetExecutionProviderType();
-    auto it = std::find_if(available_eps.begin(), available_eps.end(), [&ep_name](const OrtEpDevice* entry) {
-      return entry->ep_name == ep_name;
-    });
-
-    ep_devices.push_back(it != available_eps.end() ? *it : nullptr);
+    assert(!node_info_vec.empty());
+    // If we have an input that is not consumed by any node,
+    // including nodes in subgraphs, then we return nullptr.
+    const auto* p_node = node_info_vec.front().p_node;
+    if (p_node != nullptr) {
+      const auto ep_name = p_node->GetExecutionProviderType();
+      auto it = std::find_if(available_eps.begin(), available_eps.end(), [&ep_name](const OrtEpDevice* entry) {
+        return entry->ep_name == ep_name;
+      });
+      ep_devices.push_back(it != available_eps.end() ? *it : nullptr);
+    } else {
+      ep_devices.push_back(nullptr);
+    }
   }

   return Status::OK();
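
The output branch of GetInputOutputMemoryInfo now resolves an output's memory info through a three-stage fallback: the producing node, a graph input that propagates straight through to an output of the same name, and finally a constant initializer. A minimal first-hit-wins sketch of that pattern (the Source type and all names here are hypothetical, not onnxruntime's):

#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Each source returns a device name if it knows the given output, else nullopt.
using Source = std::function<std::optional<std::string>(const std::string&)>;

std::optional<std::string> ResolveOutputDevice(const std::string& name,
                                               const std::vector<Source>& sources) {
  for (const auto& source : sources) {
    if (auto device = source(name)) return device;  // first hit wins
  }
  return std::nullopt;  // maps to "Failed to find node output or a constant initializer ..."
}

int main() {
  std::vector<Source> sources = {
      [](const std::string&) -> std::optional<std::string> { return std::nullopt; },  // node outputs: miss
      [](const std::string&) -> std::optional<std::string> { return std::nullopt; },  // pass-through inputs: miss
      [](const std::string& n) -> std::optional<std::string> {                        // constant initializers: hit
        return n == "bias" ? std::optional<std::string>("CPU") : std::nullopt;
      },
  };
  std::cout << ResolveOutputDevice("bias", sources).value_or("<error>") << "\n";  // prints CPU
  return 0;
}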

onnxruntime/test/framework/inference_session_test.cc

Lines changed: 2 additions & 1 deletion
@@ -630,7 +630,8 @@ TEST(InferenceSessionTests, CheckRunLogger) {
 }

 // WebAssembly will emit profiling data into console
-#if !defined(__wasm__)
+// TODO(hasesh): Investigate why this test fails on Windows CUDA builds
+#if (!defined(__wasm__) && !defined(_WIN32))
 TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) {
   SessionOptions so;

onnxruntime/test/onnx/TestCase.cc

Lines changed: 3 additions & 0 deletions
@@ -1408,6 +1408,9 @@ std::unique_ptr<std::set<BrokenTest>> GetBrokenTests(const std::string& provider
   broken_tests->insert({"gridsample_volumetric_nearest_align_corners_1", "unknown version"});
   broken_tests->insert({"rotary_embedding_no_position_ids_expanded", "unknown version"});
   broken_tests->insert({"rotary_embedding_no_position_ids_interleaved_expanded", "unknown version"});
+  broken_tests->insert({"rotary_embedding_no_position_ids_rotary_dim", "unknown version"});
+  broken_tests->insert({"rotary_embedding_with_interleaved_rotary_dim", "unknown version"});
+  broken_tests->insert({"rotary_embedding_with_rotary_dim", "unknown version"});
   // Fails since QNN SDK 2.17.0:
   // expected 7.70947 (40f6b3f3), got 7.84096 (40fae920), diff: 0.131491, tol=0.00870947 idx=419. 100 of 1715 differ
   broken_tests->insert({"facedetection_op8_qdq", "result differs"});
