From c18625a820b98275f673db6e57a96f7952529d7d Mon Sep 17 00:00:00 2001
From: minfhong
Date: Fri, 27 Jun 2025 13:59:10 +0800
Subject: [PATCH] [QNN EP] Support NonZero.

- Implement NonZero op builder and register the QDQ selector.
- Implement ShapeNonZero QNN preprocess to fix the shape.

Test: UTs.
---
 .../selectors_actions/shared/utils.cc         |   3 +-
 .../qnn/builder/op_builder_factory.cc         |   4 +
 .../qnn/builder/op_builder_factory.h          |   2 +
 .../qnn/builder/opbuilder/base_op_builder.h   |   1 +
 .../builder/opbuilder/nonzero_op_builder.cc   | 117 +++++++++++++
 .../execution_providers/qnn/preprocess.py     |   6 +-
 .../execution_providers/qnn/shape_nonzero.py  |  85 +++++++++
 .../test/providers/qnn/nonzero_op_test.cc     | 161 ++++++++++++++++++
 .../quantization/test_qnn_preprocess_model.py |  51 ++++++
 9 files changed, 428 insertions(+), 2 deletions(-)
 create mode 100644 onnxruntime/core/providers/qnn/builder/opbuilder/nonzero_op_builder.cc
 create mode 100644 onnxruntime/python/tools/quantization/execution_providers/qnn/shape_nonzero.py
 create mode 100644 onnxruntime/test/providers/qnn/nonzero_op_test.cc

diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
index 0116dec5170f0..fd3d43f21c8f2 100644
--- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
+++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/shared/utils.cc
@@ -48,7 +48,8 @@ static const OpVersionsAndSelector::OpVersionsMap GetMiscOpVersionsMap() {
 // These produce int64 indices output, which can't be quantized, so there's no downstream Q node.
 static const OpVersionsAndSelector::OpVersionsMap GetDropDQOpVersionsMap() {
   return {{"ArgMax", {}},
-          {"ArgMin", {}}};
+          {"ArgMin", {}},
+          {"NonZero", {}}};
 }
 
 static const OpVersionsAndSelector::OpVersionsMap GetUnaryOpVersionsMap() {
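Illustration (not part of the patch): the DropDQ selector matches node groups where a DequantizeLinear feeds an op whose output cannot be quantized, so there is no trailing QuantizeLinear. A minimal sketch of such a group for NonZero; all tensor names and quantization values below are chosen arbitrarily.

    import onnx
    from onnx import TensorProto, helper

    # DQ -> NonZero with no downstream Q node: the int64 indices output is not quantizable.
    inp = helper.make_tensor_value_info("x_q", TensorProto.UINT8, [1, 3, 4, 4])
    out = helper.make_tensor_value_info("indices", TensorProto.INT64, None)
    scale = helper.make_tensor("x_scale", TensorProto.FLOAT, [], [0.078])
    zero_point = helper.make_tensor("x_zp", TensorProto.UINT8, [], [128])

    dq = helper.make_node("DequantizeLinear", ["x_q", "x_scale", "x_zp"], ["x_f32"])
    nonzero = helper.make_node("NonZero", ["x_f32"], ["indices"])

    graph = helper.make_graph([dq, nonzero], "dq_nonzero", [inp], [out],
                              initializer=[scale, zero_point])
    onnx.checker.check_model(helper.make_model(graph))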
diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
index cdc7c401ba25e..bc701050c5526 100644
--- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
+++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.cc
@@ -223,6 +223,10 @@ OpBuilderRegistrations::OpBuilderRegistrations() {
   {
     CreateInverseOpBuilder("Inverse", *this);
   }
+
+  {
+    CreateNonZeroOpBuilder("NonZero", *this);
+  }
 }
 
 const IOpBuilder* GetOpBuilder(const std::string& onnx_op_type) {

diff --git a/onnxruntime/core/providers/qnn/builder/op_builder_factory.h b/onnxruntime/core/providers/qnn/builder/op_builder_factory.h
index 0c12474c784eb..4fc2d09332530 100644
--- a/onnxruntime/core/providers/qnn/builder/op_builder_factory.h
+++ b/onnxruntime/core/providers/qnn/builder/op_builder_factory.h
@@ -125,5 +125,7 @@ void CreateSTFTOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_
 
 void CreateInverseOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
 
+void CreateNonZeroOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations);
+
 }  // namespace qnn
 }  // namespace onnxruntime

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
index 83c226115aa84..de6f44f55917a 100644
--- a/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/base_op_builder.h
@@ -156,6 +156,7 @@ class BaseOpBuilder : public IOpBuilder {
         {"Max", QNN_OP_ELEMENT_WISE_MAXIMUM},
         {"Min", QNN_OP_ELEMENT_WISE_MINIMUM},
         {"Neg", QNN_OP_ELEMENT_WISE_NEG},
+        {"NonZero", QNN_OP_NON_ZERO},
         {"Not", QNN_OP_ELEMENT_WISE_NOT},
         {"Or", QNN_OP_ELEMENT_WISE_OR},
         {"Pow", QNN_OP_ELEMENT_WISE_POWER},

diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/nonzero_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/nonzero_op_builder.cc
new file mode 100644
index 0000000000000..b18de47f79b61
--- /dev/null
+++ b/onnxruntime/core/providers/qnn/builder/opbuilder/nonzero_op_builder.cc
@@ -0,0 +1,117 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <memory>
+#include <utility>
+
+#include "core/providers/qnn/builder/opbuilder/base_op_builder.h"
+#include "core/providers/qnn/builder/op_builder_factory.h"
+#include "core/providers/qnn/builder/qnn_utils.h"
+
+namespace onnxruntime {
+namespace qnn {
+
+class NonZeroOpBuilder : public BaseOpBuilder {
+ public:
+  NonZeroOpBuilder() : BaseOpBuilder("NonZeroOpBuilder") {}
+
+ protected:
+  Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
+                                     const NodeUnit& node_unit,
+                                     std::vector<std::string>&& input_names,
+                                     const logging::Logger& logger,
+                                     bool do_op_validation) const override ORT_MUST_USE_RESULT;
+};
+
+Status NonZeroOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper,
+                                                     const NodeUnit& node_unit,
+                                                     std::vector<std::string>&& input_names,
+                                                     const logging::Logger& logger,
+                                                     bool do_op_validation) const {
+  // Handle a corner case explicitly: an empty input can pass backend validation but is in fact not executable.
+  const std::vector<uint32_t>& input_shape = qnn_model_wrapper.GetQnnTensorWrapper(input_names[0]).GetTensorDims();
+  for (const uint32_t& dim : input_shape) {
+    ORT_RETURN_IF(dim == 0, "QNN does not support NonZero with empty input.");
+  }
+
+  const auto& output = node_unit.Outputs()[0];
+  const std::string& output_name = output.node_arg.Name();
+
+  TensorInfo output_info = {};
+  Status status = qnn_model_wrapper.GetTensorInfo(output, output_info);
+  if (!status.IsOK()) {
+    LOGS(logger, ERROR) << "Encountered NonZero node " << node_unit.Name()
+                        << ", which has a dynamically shaped output tensor. "
+                        << "QNN supports NonZero by allocating the maximum possible size (i.e., all elements != 0) "
+                        << "and filling only the detected nonzero elements in the output tensor. "
+                        << "The model must be preprocessed to eliminate the dynamic shapes before QNN can support it.";
+    return status;
+  }
+
+  // The (shape-fixed) ONNX NonZero output has shape [input_rank, #input_elements].
+  uint32_t rank = output_info.shape[0];
+  uint32_t num_elements = output_info.shape[1];
+
+  // QNN NonZero has shape [#input_elements, input_rank], and thus an extra Transpose must be inserted afterwards.
+  const std::string transpose_input_name = utils::GetUniqueName(output_name, "_transpose");
+  const std::vector<uint32_t> transpose_input_shape{num_elements, rank};
+  QnnTensorWrapper output_tensorwrapper(transpose_input_name,
+                                        QNN_TENSOR_TYPE_NATIVE,
+                                        output_info.qnn_data_type,
+                                        output_info.quant_param.Copy(),
+                                        std::vector<uint32_t>(transpose_input_shape));
+  ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(output_tensorwrapper)), "Failed to add tensor.");
+  ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(utils::GetUniqueName(node_unit),
+                                                    QNN_OP_PACKAGE_NAME_QTI_AISW,
+                                                    GetQnnOpType(node_unit.OpType()),
+                                                    std::move(input_names),
+                                                    {transpose_input_name},
+                                                    {},
+                                                    do_op_validation),
+                    "Failed to add NonZero node.");
+
+  // NonZero outputs indices with INT64 dtype. If the output is also a graph output, add a Cast node to
+  // cast the dtype back to INT64, since wrapper construction implicitly changes the dtype to INT32.
+  const bool is_cast_required = output_info.qnn_data_type == QNN_DATATYPE_INT_64 &&
+                                qnn_model_wrapper.IsGraphOutput(output_name);
+  const std::string transpose_output_name = is_cast_required ? utils::GetUniqueName(output_name, "_cast") : output_name;
+
+  std::vector<uint32_t> transpose_perm{1, 0};
+  ORT_RETURN_IF_ERROR(qnn_model_wrapper.AddTransposeNode(node_unit.Index(),
+                                                         transpose_input_name,
+                                                         transpose_output_name,
+                                                         transpose_input_shape,
+                                                         transpose_perm,
+                                                         output_info.shape,
+                                                         output_info.qnn_data_type,
+                                                         output_info.quant_param,
+                                                         do_op_validation,
+                                                         false,
+                                                         false));
+
+  if (is_cast_required) {
+    QnnTensorWrapper cast_output_tensorwrapper(output_name,
+                                               QNN_TENSOR_TYPE_APP_READ,
+                                               output_info.qnn_data_type,
+                                               output_info.quant_param.Copy(),
+                                               std::vector<uint32_t>(output_info.shape));
+    ORT_RETURN_IF_NOT(qnn_model_wrapper.AddTensorWrapper(std::move(cast_output_tensorwrapper)),
+                      "Failed to add tensor.");
+    ORT_RETURN_IF_NOT(qnn_model_wrapper.CreateQnnNode(utils::GetUniqueName(node_unit, QNN_OP_CAST),
+                                                      QNN_OP_PACKAGE_NAME_QTI_AISW,
+                                                      QNN_OP_CAST,
+                                                      {transpose_output_name},
+                                                      {output_name},
+                                                      {},
+                                                      do_op_validation),
+                      "Failed to add Cast node.");
+  }
+
+  return Status::OK();
+}
+
+void CreateNonZeroOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) {
+  op_registrations.AddOpBuilder(op_type, std::make_unique<NonZeroOpBuilder>());
+}
+
+}  // namespace qnn
+}  // namespace onnxruntime
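Illustration (not part of the patch): a numpy sketch of the layout mismatch the op builder resolves. ONNX NonZero returns coordinates shaped [rank, n], while QNN NonZero writes [max_n, rank], where max_n is the total input element count and the unused tail is padding; hence the inserted Transpose. The padding value below is assumed for illustration only.

    import numpy as np

    x = np.array([[0.0, 1.5], [2.0, 0.0]])
    onnx_out = np.stack(np.nonzero(x))  # ONNX layout: [rank, #nonzero] == [2, 2]

    max_n = x.size                      # QNN allocates the maximum possible count
    coords = np.argwhere(x)             # QNN layout: [#nonzero, rank]
    qnn_out = np.zeros((max_n, x.ndim), dtype=np.int64)  # padding value assumed
    qnn_out[: coords.shape[0]] = coords

    # The Transpose added by the builder restores the ONNX layout.
    assert np.array_equal(qnn_out.T[:, : coords.shape[0]], onnx_out)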
diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py
index a12aca47f5b65..584463b3bc44c 100644
--- a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py
+++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py
@@ -17,6 +17,7 @@
 from ...onnx_model import ONNXModel
 from .fusion_lpnorm import FusionLpNormalization
 from .fusion_spacetodepth import FusionSpaceToDepth
+from .shape_nonzero import ShapeNonZero
 
 
 def qnn_preprocess_model(
@@ -108,6 +109,9 @@ def qnn_preprocess_model(
     if exclude_initializer_from_input:
         modified |= remove_initializer_from_input(onnx_model.model)
 
+    # Fix the dynamic output shape of NonZero nodes.
+    modified |= ShapeNonZero(onnx_model).apply()
+
     # Fuse Erf sequence into a single Gelu
     fusion_gelu = FusionGelu(onnx_model)
     if fusion_gelu.apply():
@@ -166,7 +170,7 @@ def qnn_preprocess_model(
     if modified:
         onnx_model.topological_sort()
         onnx.save_model(
-            model,
+            onnx_model.model,
             model_output,
             save_as_external_data=save_as_external_data,
             all_tensors_to_one_file=all_tensors_to_one_file,

diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/shape_nonzero.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/shape_nonzero.py
new file mode 100644
index 0000000000000..8733da618f361
--- /dev/null
+++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/shape_nonzero.py
@@ -0,0 +1,85 @@
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+"""Define NonZero shape inference."""
+
+import logging
+
+import numpy as np
+import onnx
+
+from ... import fusions, onnx_model
+
+
+class ShapeNonZero(fusions.Fusion):
+    """Shape inference for NonZero.
+
+    A NonZero node produces a dynamically shaped output tensor, which leaves the tensor shapes of the following
+    nodes undetermined as well. QNN expects NonZero to have its output shape set to the maximum possible size
+    (i.e., the total number of input elements) and lets the runtime handle the dynamic shape later.
+    """
+
+    def __init__(self, model: onnx_model.ONNXModel):
+        """Initialize.
+
+        Args:
+            model: An onnx_model.ONNXModel instance.
+        """
+        super().__init__(model, "", "NonZero")
+
+    def fuse(
+        self,
+        node: onnx.NodeProto,
+        input_name_to_nodes: dict[str, list[onnx.NodeProto]],
+        output_name_to_node: dict[str, onnx.NodeProto],
+    ) -> bool:
+        """Infer shape for NonZero.
+
+        Args:
+            node: An onnx.NodeProto matching the specified search type (i.e., NonZero).
+            input_name_to_nodes: A dict mapping each tensor name to its consuming nodes.
+            output_name_to_node: A dict mapping each tensor name to its producing node.
+
+        Returns:
+            A bool indicating whether the node is updated.
+        """
+        logging.warning(
+            "The model contains a NonZero node, which produces a dynamically shaped output tensor. "
+            "Following QNN requirements, its output shape will be deliberately set to the maximum size."
+        )
+
+        if (input_tensor_type := self.model.get_tensor_type(node.input[0])) is None or (
+            output_tensor_type := self.model.get_tensor_type(node.output[0])
+        ) is None:
+            return False
+
+        if not (input_tensor_shape := self.tensor_shape_to_list(input_tensor_type)):
+            return False
+
+        if not all(isinstance(dim, int) for dim in input_tensor_shape):
+            return False
+
+        output_tensor_type.shape.dim[1].dim_value = int(np.prod(input_tensor_shape))
+        return True
+
+    def apply(self) -> bool:
+        """Apply fusion.
+
+        This method is overridden to execute shape inference again, since NonZero will now have a fixed shape.
+
+        Returns:
+            A bool indicating whether the model is updated.
+        """
+        input_name_to_nodes = self.model.input_name_to_nodes()
+        output_name_to_node = self.model.output_name_to_node()
+
+        updated = False
+        for node in self.model.nodes():
+            if node.op_type == self.search_op_type:
+                updated |= self.fuse(node, input_name_to_nodes, output_name_to_node)
+
+        if updated:
+            self.model.model = onnx.shape_inference.infer_shapes(self.model.model)
+
+        return updated
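Illustration (not part of the patch): exercising the new pass through qnn_preprocess_model. For a model whose graph output derives from NonZero over a [1, 3, 4, 4] input, the output shape becomes [input_rank, total_input_elements] = [4, 48] after preprocessing. File names below are placeholders.

    import onnx
    from onnxruntime.quantization.execution_providers.qnn import qnn_preprocess_model

    modified = qnn_preprocess_model("model.onnx", "model_preprocessed.onnx")
    if modified:
        fixed = onnx.load("model_preprocessed.onnx")
        # Expected [4, 48] for a [1, 3, 4, 4] input feeding NonZero.
        print([d.dim_value for d in fixed.graph.output[0].type.tensor_type.shape.dim])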
diff --git a/onnxruntime/test/providers/qnn/nonzero_op_test.cc b/onnxruntime/test/providers/qnn/nonzero_op_test.cc
new file mode 100644
index 0000000000000..4ef246045bb90
--- /dev/null
+++ b/onnxruntime/test/providers/qnn/nonzero_op_test.cc
@@ -0,0 +1,161 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#if !defined(ORT_MINIMAL_BUILD)
+
+#include <numeric>
+
+#include "gtest/gtest.h"
+
+#include "core/graph/node_attr_utils.h"
+#include "core/graph/onnx_protobuf.h"
+#include "test/providers/qnn/qnn_test_utils.h"
+
+namespace onnxruntime {
+namespace test {
+
+template <typename DataType>
+inline GetTestModelFn BuildNonZeroTestCase(const TestInputDef<DataType>& input_def, const bool fix_shape) {
+  return [input_def, fix_shape](ModelTestBuilder& builder) {
+    NodeArg* input = MakeTestInput(builder, input_def);
+
+    NodeArg* output;
+    if (fix_shape) {
+      // Fix NonZero output shape to the maximum possible size.
+      const std::vector<int64_t>& shape = input_def.GetShape();
+      std::vector<int64_t> output_shape;
+      output_shape.push_back(static_cast<int64_t>(shape.size()));
+      output_shape.push_back(std::accumulate(shape.begin(),
+                                             shape.end(),
+                                             static_cast<int64_t>(1),
+                                             std::multiplies<int64_t>()));
+
+      output = builder.MakeOutput(output_shape);
+    } else {
+      output = builder.MakeOutput();
+    }
+
+    builder.AddNode("NonZero", {input}, {output});
+  };
+}
+
+template <typename DataType>
+static void RunNonZeroTestOnCPU(const TestInputDef<DataType>& input_def,
+                                const bool fix_shape,
+                                ExpectedEPNodeAssignment expected_ep_assignment) {
+  ProviderOptions provider_options;
+  provider_options["backend_type"] = "cpu";
+
+  // Note that the fixed-shape NonZero supported by QNN does not align with the ONNX op definition,
+  // so it cannot be executed by the CPU EP.
+  RunQnnModelTest(BuildNonZeroTestCase(input_def, fix_shape),
+                  provider_options,
+                  13,
+                  expected_ep_assignment,
+                  /*fp32_abs_err*/ 1e-5f,
+                  /*log_severity*/ logging::Severity::kERROR,
+                  /*verify_outputs*/ false);
+}
+
+// Test NonZero having static shape, which is supported by QNN.
+TEST_F(QnnCPUBackendTests, NonZero_StaticShape) {
+  RunNonZeroTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                      true,
+                      ExpectedEPNodeAssignment::All);
+}
+
+// Test NonZero having dynamic shape, which is not supported by QNN.
+TEST_F(QnnCPUBackendTests, NonZero_DynamicShape) {
+  RunNonZeroTestOnCPU(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                      false,
+                      ExpectedEPNodeAssignment::None);
+}
+
+#if defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+//
+// HTP tests:
+//
+
+template <typename QType>
+inline GetTestModelFn BuildQDQNonZeroTestCase(const TestInputDef<float>& input_def, const bool fix_shape) {
+  return [input_def, fix_shape](ModelTestBuilder& builder) {
+    NodeArg* input = MakeTestInput(builder, input_def);
+    QuantParams<QType> input_qparams = GetTestInputQuantParams<QType>(input_def);
+    NodeArg* input_qdq = AddQDQNodePair<QType>(builder,
+                                               input,
+                                               input_qparams.scale,
+                                               input_qparams.zero_point);
+
+    NodeArg* output;
+    if (fix_shape) {
+      // Fix NonZero output shape to the maximum possible size.
+      const std::vector<int64_t>& shape = input_def.GetShape();
+      std::vector<int64_t> output_shape;
+      output_shape.push_back(static_cast<int64_t>(shape.size()));
+      output_shape.push_back(std::accumulate(shape.begin(),
+                                             shape.end(),
+                                             static_cast<int64_t>(1),
+                                             std::multiplies<int64_t>()));
+
+      output = builder.MakeOutput(output_shape);
+    } else {
+      output = builder.MakeOutput();
+    }
+
+    builder.AddNode("NonZero", {input_qdq}, {output});
+  };
+}
+
+template <typename QType>
+static void RunQDQNonZeroTestOnHTP(const TestInputDef<float>& input_def,
+                                   const bool fix_shape,
+                                   ExpectedEPNodeAssignment expected_ep_assignment,
+                                   int opset = 13) {
+  ProviderOptions provider_options;
+  provider_options["backend_type"] = "htp";
+  provider_options["offload_graph_io_quantization"] = "0";
+
+  // Note that the fixed-shape NonZero supported by QNN does not align with the ONNX op definition,
+  // so it cannot be executed by the CPU EP.
+  RunQnnModelTestHTPNoVerify(BuildQDQNonZeroTestCase<QType>(input_def, fix_shape),
+                             provider_options,
+                             opset,
+                             expected_ep_assignment);
+}
+
+// Test 8-bit NonZero having static shape, which is supported by QNN.
+TEST_F(QnnHTPBackendTests, NonZero_U8_StaticShape) {
+  RunQDQNonZeroTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                                  true,
+                                  ExpectedEPNodeAssignment::All);
+}
+
+// Test 8-bit NonZero having dynamic shape, which is not supported by QNN.
+TEST_F(QnnHTPBackendTests, NonZero_U8_DynamicShape) {
+  RunQDQNonZeroTestOnHTP<uint8_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                                  false,
+                                  ExpectedEPNodeAssignment::None);
+}
+
+// Test 16-bit NonZero having static shape, which is supported by QNN.
+TEST_F(QnnHTPBackendTests, NonZero_U16_StaticShape) {
+  RunQDQNonZeroTestOnHTP<uint16_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                                   true,
+                                   ExpectedEPNodeAssignment::All,
+                                   21);
+}
+
+// Test 16-bit NonZero having dynamic shape, which is not supported by QNN.
+TEST_F(QnnHTPBackendTests, NonZero_U16_DynamicShape) {
+  RunQDQNonZeroTestOnHTP<uint16_t>(TestInputDef<float>({1, 3, 4, 4}, false, GetFloatDataInRange(-10.0f, 10.0f, 48)),
+                                   false,
+                                   ExpectedEPNodeAssignment::None,
+                                   21);
+}
+
+#endif  // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__)
+
+}  // namespace test
+}  // namespace onnxruntime
+
+#endif  // !defined(ORT_MINIMAL_BUILD)

diff --git a/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py b/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py
index 7e0a8496b8bfb..da3170ebdcb14 100644
--- a/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py
+++ b/onnxruntime/test/python/quantization/test_qnn_preprocess_model.py
@@ -266,5 +266,56 @@ def test_make_io_channel_last_rank_error(self):
         self.assertIn("to be of rank >= 3", str(context.exception))
 
 
+class TestQNNPreprocessBase(unittest.TestCase):
+    """Test base class for QNN preprocess."""
+
+    __test__ = False
+
+    def setUp(self):
+        """Set up."""
+        self._model_path = Path("model.onnx")
+        self._preprocessed_model_path = Path("model_preprocessed.onnx")
+
+    def tearDown(self):
+        """Tear down."""
+        if self._model_path.exists():
+            self._model_path.unlink()
+        if self._preprocessed_model_path.exists():
+            self._preprocessed_model_path.unlink()
+
+
+class TestShapeNonZero(TestQNNPreprocessBase):
+    """Test the ShapeNonZero preprocess."""
+
+    def test_basic(self):
+        """Test the basic case."""
+
+        def build_model():
+            """Build the test model."""
+            input_ = onnx.helper.make_tensor_value_info("input", onnx.TensorProto.FLOAT, [1, 3, 4, 4])
+            output = onnx.helper.make_tensor_value_info("output", onnx.TensorProto.INT64, None)
+
+            nonzero_node = onnx.helper.make_node("NonZero", ["input"], ["nonzero_output"])
+            transpose_node = onnx.helper.make_node("Transpose", ["nonzero_output"], ["output"], perm=[0, 1])
+
+            graph = onnx.helper.make_graph([nonzero_node, transpose_node], "model", inputs=[input_], outputs=[output])
+            model = onnx.helper.make_model(graph)
+            return onnx.shape_inference.infer_shapes(model)
+
+        onnx.save_model(build_model(), self._model_path)
+
+        modified = qnn_preprocess_model(self._model_path, self._preprocessed_model_path)
+        self.assertTrue(modified)
+
+        preprocessed_model = onnx.load(self._preprocessed_model_path)
+
+        def get_shape(vi):
+            """Get the shape of a value info."""
+            return [dim.dim_value for dim in vi.type.tensor_type.shape.dim]
+
+        self.assertEqual(get_shape(preprocessed_model.graph.value_info[0]), [4, 48])
+        self.assertEqual(get_shape(preprocessed_model.graph.output[0]), [4, 48])
+
+
 if __name__ == "__main__":
     unittest.main()
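Illustration (not part of the patch): a sketch of running the preprocessed model on the QNN EP, assuming a QNN-enabled onnxruntime build. The "backend_type" provider option mirrors the one used by the tests above; model, input, and output names are placeholders.

    import numpy as np
    import onnxruntime as ort

    # Assumes a QNN-enabled onnxruntime build and an HTP-capable device.
    session = ort.InferenceSession(
        "model_preprocessed.onnx",
        providers=[("QNNExecutionProvider", {"backend_type": "htp"})],
    )
    x = np.random.randn(1, 3, 4, 4).astype(np.float32)
    (indices,) = session.run(None, {"input": x})
    # (4, 48): the maximum possible size; only the detected nonzero entries are meaningful.
    print(indices.shape)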