diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
index 6df77cb132..33394eb924 100644
--- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
+++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc
@@ -83,6 +83,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int16_t, QuantizeLinear);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, UInt4x2, QuantizeLinear);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Int4x2, QuantizeLinear);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLUT);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QLUT);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLinearLeakyRelu);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QLinearLeakyRelu);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLinearSigmoid);
diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_binary_op.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_binary_op.cc
index c4c738960b..80460e77e7 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_binary_op.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_binary_op.cc
@@ -6,6 +6,7 @@
 #include "core/providers/common.h"
 #include "core/mlas/inc/mlas.h"
 #include "core/platform/threadpool.h"
+#include "core/framework/op_kernel_context_internal.h"
 
 using onnxruntime::concurrency::ThreadPool;
 
@@ -95,45 +96,95 @@ void QLinearImpl(OpKernelContext& context, double unit_cost, const ProcessBroadc
 
 template <typename T>
 Status QLinearAdd<T>::Compute(OpKernelContext* context) const {
-  const ProcessBroadcastSpanFuncs functors = {
-      [](BroadcastHelper& per_iter_bh) {
-        QLinearBroadcastHelper& qlbh = static_cast<QLinearBroadcastHelper&>(per_iter_bh);
-        const T input0 = per_iter_bh.ScalarInput0<T>();
-        auto input1 = per_iter_bh.SpanInput1<T>();
-        auto output = per_iter_bh.OutputSpan<T>();
-
-        MlasQLinearAdd(input1.data(),
-                       qlbh.B_scale, static_cast<T>(qlbh.B_zero_point),
-                       &input0,
-                       qlbh.A_scale, static_cast<T>(qlbh.A_zero_point),
-                       qlbh.C_scale, static_cast<T>(qlbh.C_zero_point),
-                       output.data(), output.size(), true);
-      },
-      [](BroadcastHelper& per_iter_bh) {
-        QLinearBroadcastHelper& qlbh = static_cast<QLinearBroadcastHelper&>(per_iter_bh);
-        auto input0 = per_iter_bh.SpanInput0<T>();
-        const T input1 = per_iter_bh.ScalarInput1<T>();
-        auto output = per_iter_bh.OutputSpan<T>();
-        MlasQLinearAdd(input0.data(),
-                       qlbh.A_scale, static_cast<T>(qlbh.A_zero_point),
-                       &input1,
-                       qlbh.B_scale, static_cast<T>(qlbh.B_zero_point),
-                       qlbh.C_scale, static_cast<T>(qlbh.C_zero_point),
-                       output.data(), output.size(), true);
-      },
-      [](BroadcastHelper& per_iter_bh) {
-        QLinearBroadcastHelper& qlbh = static_cast<QLinearBroadcastHelper&>(per_iter_bh);
-        auto input0 = per_iter_bh.SpanInput0<T>();
-        auto input1 = per_iter_bh.SpanInput1<T>();
-        auto output = per_iter_bh.OutputSpan<T>();
-
-        MlasQLinearAdd(input0.data(),
-                       qlbh.A_scale, static_cast<T>(qlbh.A_zero_point),
-                       input1.data(),
-                       qlbh.B_scale, static_cast<T>(qlbh.B_zero_point),
-                       qlbh.C_scale, static_cast<T>(qlbh.C_zero_point),
-                       output.data(), output.size(), false);
-      }};
+  auto* internal_context = dynamic_cast<OpKernelContextInternal*>(context);
+  if (!internal_context) {
+      return Status(common::ONNXRUNTIME, common::FAIL, "Failed to cast OpKernelContext to OpKernelContextInternal");
+  }
+  const auto& session_options = internal_context->GetSessionState().GetSessionOptions();
+  // Test to see if we have access to enable_gpnpu flag
+  const bool gpnpu_flag = session_options.enable_gpnpu;
+
+  const ProcessBroadcastSpanFuncs functors = gpnpu_flag ? ProcessBroadcastSpanFuncs{
+        [](BroadcastHelper& per_iter_bh) {
+            QLinearBroadcastHelper& qlbh = static_cast<QLinearBroadcastHelper&>(per_iter_bh);
+            const T input0 = per_iter_bh.ScalarInput0<T>();
+            auto input1 = per_iter_bh.SpanInput1<T>();
+            auto output = per_iter_bh.OutputSpan<T>();
+
+            MlasQLinearAddFixedPoint(input1.data(),
+                                     qlbh.B_scale, static_cast<T>(qlbh.B_zero_point),
+                                     &input0,
+                                     qlbh.A_scale, static_cast<T>(qlbh.A_zero_point),
+                                     qlbh.C_scale, static_cast<T>(qlbh.C_zero_point),
+                                     output.data(), output.size(), true);
+        },
+        [](BroadcastHelper& per_iter_bh) {
+            QLinearBroadcastHelper& qlbh = static_cast<QLinearBroadcastHelper&>(per_iter_bh);
+            auto input0 = per_iter_bh.SpanInput0<T>();
+            const T input1 = per_iter_bh.ScalarInput1<T>();
+            auto output = per_iter_bh.OutputSpan<T>();
+
+            MlasQLinearAddFixedPoint(input0.data(),
+                                     qlbh.A_scale, static_cast<T>(qlbh.A_zero_point),
+                                     &input1,
+                                     qlbh.B_scale, static_cast<T>(qlbh.B_zero_point),
+                                     qlbh.C_scale, static_cast<T>(qlbh.C_zero_point),
+                                     output.data(), output.size(), true);
+        },
+        [](BroadcastHelper& per_iter_bh) {
+            QLinearBroadcastHelper& qlbh = static_cast<QLinearBroadcastHelper&>(per_iter_bh);
+            auto input0 = per_iter_bh.SpanInput0<T>();
+            auto input1 = per_iter_bh.SpanInput1<T>();
+            auto output = per_iter_bh.OutputSpan<T>();
+
+            MlasQLinearAddFixedPoint(input0.data(),
+                                     qlbh.A_scale, static_cast<T>(qlbh.A_zero_point),
+                                     input1.data(),
+                                     qlbh.B_scale, static_cast<T>(qlbh.B_zero_point),
+                                     qlbh.C_scale, static_cast<T>(qlbh.C_zero_point),
+                                     output.data(), output.size(), false);
+        }
+    } : ProcessBroadcastSpanFuncs{
+        [](BroadcastHelper& per_iter_bh) {
+            QLinearBroadcastHelper& qlbh = static_cast<QLinearBroadcastHelper&>(per_iter_bh);
+            const T input0 = per_iter_bh.ScalarInput0<T>();
+            auto input1 = per_iter_bh.SpanInput1<T>();
+            auto output = per_iter_bh.OutputSpan<T>();
+
+            MlasQLinearAdd(input1.data(),
+                           qlbh.B_scale, static_cast<T>(qlbh.B_zero_point),
+                           &input0,
+                           qlbh.A_scale, static_cast<T>(qlbh.A_zero_point),
+                           qlbh.C_scale, static_cast<T>(qlbh.C_zero_point),
+                           output.data(), output.size(), true);
+        },
+        [](BroadcastHelper& per_iter_bh) {
+            QLinearBroadcastHelper& qlbh = static_cast<QLinearBroadcastHelper&>(per_iter_bh);
+            auto input0 = per_iter_bh.SpanInput0<T>();
+            const T input1 = per_iter_bh.ScalarInput1<T>();
+            auto output = per_iter_bh.OutputSpan<T>();
+
+            MlasQLinearAdd(input0.data(),
+                           qlbh.A_scale, static_cast<T>(qlbh.A_zero_point),
+                           &input1,
+                           qlbh.B_scale, static_cast<T>(qlbh.B_zero_point),
+                           qlbh.C_scale, static_cast<T>(qlbh.C_zero_point),
+                           output.data(), output.size(), true);
+        },
+        [](BroadcastHelper& per_iter_bh) {
+            QLinearBroadcastHelper& qlbh = static_cast<QLinearBroadcastHelper&>(per_iter_bh);
+            auto input0 = per_iter_bh.SpanInput0<T>();
+            auto input1 = per_iter_bh.SpanInput1<T>();
+            auto output = per_iter_bh.OutputSpan<T>();
+
+            MlasQLinearAdd(input0.data(),
+                           qlbh.A_scale, static_cast<T>(qlbh.A_zero_point),
+                           input1.data(),
+                           qlbh.B_scale, static_cast<T>(qlbh.B_zero_point),
+                           qlbh.C_scale, static_cast<T>(qlbh.C_zero_point),
+                           output.data(), output.size(), false);
+        }
+    };
 
   QLinearImpl<T>(*context, 1.0, functors);
 
diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc
index e9924bf616..8f3ebcec7c 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc
@@ -9,6 +9,7 @@
 #include "core/util/math.h"
 #include "core/mlas/inc/mlas.h"
 #include <functional>
+#include "core/framework/op_kernel_context_internal.h"
 
 using onnxruntime::concurrency::ThreadPool;
 
@@ -55,6 +56,46 @@ Status ComputeQLinearGlobalAvgPool(
   return Status::OK();
 }
 
+template <typename T8Bits>
+Status ComputeQLinearGlobalAvgPoolFixedPoint(
+    const T8Bits* x,
+    float x_scale,
+    T8Bits x_zero_point,
+    T8Bits* y,
+    float y_scale,
+    T8Bits y_zero_point,
+    int64_t N,
+    int64_t C,
+    int64_t image_size,
+    bool channels_last,
+    concurrency::ThreadPool* tp) {
+  if (channels_last && C != 1) {
+    // NHWC: one work item per batch; every task pools all C channels of its batches.
+    auto pool_batches = [=](std::ptrdiff_t begin, std::ptrdiff_t end) {
+      const size_t channels = narrow<size_t>(C);
+      std::vector<int32_t> sums(MlasQLinearSafePaddingElementCount(sizeof(int32_t), channels));
+      std::vector<T8Bits> zeros(MlasQLinearSafePaddingElementCount(sizeof(T8Bits), channels), 0);
+      MlasQLinearGlobalAveragePoolNhwcFixedPoint(
+          x + begin * C * image_size, x_scale, x_zero_point, y + begin * C, y_scale, y_zero_point,
+          end - begin, narrow<size_t>(image_size), channels, channels, sums.data(), zeros.data());
+    };
+    concurrency::ThreadPool::TryParallelFor(
+        tp, static_cast<std::ptrdiff_t>(N), {1.0 * image_size * C, 1.0 * C, 8.0 * image_size * C}, pool_batches);
+  } else {
+    // NCHW, or a single channel (handled by the same kernel): one work item per (n, c) plane.
+    auto pool_planes = [=](std::ptrdiff_t begin, std::ptrdiff_t end) {
+      const std::ptrdiff_t planes = end - begin;
+      std::vector<int32_t> sums(MlasQLinearSafePaddingElementCount(sizeof(int32_t), planes));
+      MlasQLinearGlobalAveragePoolNchwFixedPoint(
+          x + begin * image_size, x_scale, x_zero_point, y + begin, y_scale, y_zero_point,
+          planes, narrow<size_t>(image_size), sums.data());
+    };
+    concurrency::ThreadPool::TryParallelFor(
+        tp, static_cast<std::ptrdiff_t>(N * C), {1.0 * image_size, 1.0, 8.0 * image_size}, pool_planes);
+  }
+  return Status::OK();
+}
+
 // GCC's unexplained behavior:
 // GCC wouldn't generate corresponding symbols versus function instances below when "--disable-exceptions"
 // and "--minimal-build" are combined on linux build.
@@ -87,6 +128,32 @@ template Status ComputeQLinearGlobalAvgPool<uint8_t>(
     bool channels_last,
     concurrency::ThreadPool* tp);
 
+template Status ComputeQLinearGlobalAvgPoolFixedPoint<int8_t>(  // explicit instantiation for signed 8-bit tensors
+    const int8_t* x,
+    float x_scale,
+    int8_t x_zero_point,
+    int8_t* y,
+    float y_scale,
+    int8_t y_zero_point,
+    int64_t N,
+    int64_t C,
+    int64_t image_size,
+    bool channels_last,
+    concurrency::ThreadPool* tp);
+
+template Status ComputeQLinearGlobalAvgPoolFixedPoint<uint8_t>(  // explicit instantiation for unsigned 8-bit tensors
+    const uint8_t* x,
+    float x_scale,
+    uint8_t x_zero_point,
+    uint8_t* y,
+    float y_scale,
+    uint8_t y_zero_point,
+    int64_t N,
+    int64_t C,
+    int64_t image_size,
+    bool channels_last,
+    concurrency::ThreadPool* tp);
+
 Status QLinearGlobalAveragePool::Compute(OpKernelContext* context) const {
   const auto tensor_x_scale = context->Input<Tensor>(1);
   const auto tensor_x_zero_point = context->Input<Tensor>(2);
@@ -124,14 +191,35 @@ Status QLinearGlobalAveragePool::Compute(OpKernelContext* context) const {
   const float y_scale = *(tensor_y_scale->Data<float>());
 
   auto dtype = X.GetElementType();
-  if (dtype == ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
-    return ComputeQLinearGlobalAvgPool(X.Data<uint8_t>(), x_scale, *(tensor_x_zero_point->Data<uint8_t>()),
-                                       Y.MutableData<uint8_t>(), y_scale, *(tensor_y_zero_point->Data<uint8_t>()),
-                                       N, C, image_size, channels_last_, tp);
+
+  auto* internal_context = dynamic_cast<OpKernelContextInternal*>(context);
+  if (!internal_context) {
+      return Status(common::ONNXRUNTIME, common::FAIL, "Failed to cast OpKernelContext to OpKernelContextInternal");
+  }
+  const auto& session_options = internal_context->GetSessionState().GetSessionOptions();
+  // Test to see if we have access to enable_gpnpu flag
+  const bool gpnpu_flag = session_options.enable_gpnpu;
+
+  if (gpnpu_flag) {
+    if (dtype == ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
+      return ComputeQLinearGlobalAvgPoolFixedPoint(X.Data<uint8_t>(), x_scale, *(tensor_x_zero_point->Data<uint8_t>()),
+                                        Y.MutableData<uint8_t>(), y_scale, *(tensor_y_zero_point->Data<uint8_t>()),
+                                        N, C, image_size, channels_last_, tp);
+    } else {
+      return ComputeQLinearGlobalAvgPoolFixedPoint(X.Data<int8_t>(), x_scale, *(tensor_x_zero_point->Data<int8_t>()),
+                                        Y.MutableData<int8_t>(), y_scale, *(tensor_y_zero_point->Data<int8_t>()),
+                                        N, C, image_size, channels_last_, tp);
+    }
   } else {
-    return ComputeQLinearGlobalAvgPool(X.Data<int8_t>(), x_scale, *(tensor_x_zero_point->Data<int8_t>()),
-                                       Y.MutableData<int8_t>(), y_scale, *(tensor_y_zero_point->Data<int8_t>()),
-                                       N, C, image_size, channels_last_, tp);
+    if (dtype == ONNX_NAMESPACE::TensorProto_DataType_UINT8) {
+      return ComputeQLinearGlobalAvgPool(X.Data<uint8_t>(), x_scale, *(tensor_x_zero_point->Data<uint8_t>()),
+                                        Y.MutableData<uint8_t>(), y_scale, *(tensor_y_zero_point->Data<uint8_t>()),
+                                        N, C, image_size, channels_last_, tp);
+    } else {
+      return ComputeQLinearGlobalAvgPool(X.Data<int8_t>(), x_scale, *(tensor_x_zero_point->Data<int8_t>()),
+                                        Y.MutableData<int8_t>(), y_scale, *(tensor_y_zero_point->Data<int8_t>()),
+                                        N, C, image_size, channels_last_, tp);
+    }
   }
 }
 
diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.h b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.h
index 2f491328a4..c80333fbf5 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.h
+++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.h
@@ -35,5 +35,19 @@ Status ComputeQLinearGlobalAvgPool(
     bool channels_last,
     concurrency::ThreadPool* tp);
 
+template <typename T8Bits>
+Status ComputeQLinearGlobalAvgPoolFixedPoint(  // defined and explicitly instantiated for int8_t/uint8_t in the .cc
+    const T8Bits* x,       // quantized input
+    float x_scale,
+    T8Bits x_zero_point,
+    T8Bits* y,             // quantized output, one value per (n, c)
+    float y_scale,
+    T8Bits y_zero_point,
+    int64_t N,             // work is split over N batches (NHWC) or N * C planes (NCHW)
+    int64_t C,
+    int64_t image_size,    // elements pooled per channel
+    bool channels_last,    // true selects the NHWC kernel unless C == 1; false selects the NCHW kernel
+    concurrency::ThreadPool* tp);
+
 }  // namespace contrib
 }  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/cpu/quantization/quant_gemm.cc b/onnxruntime/contrib_ops/cpu/quantization/quant_gemm.cc
index ff8ad09082..9ad6dfb568 100644
--- a/onnxruntime/contrib_ops/cpu/quantization/quant_gemm.cc
+++ b/onnxruntime/contrib_ops/cpu/quantization/quant_gemm.cc
@@ -8,6 +8,9 @@
 #include "core/providers/cpu/quantization/matmul_integer_base.h"
 #include "core/quantization/quantization.h"
 #include "core/util/math_cpuonly.h"
+#include "core/util/qmath.h"
+#include "core/mlas/inc/mlas.h"
+#include "core/framework/op_kernel_context_internal.h"
 
 namespace onnxruntime {
 namespace contrib {
@@ -18,6 +21,14 @@ class QGemm : protected GemmBase, public MatMulIntegerBase {
   }
 
   Status Compute(OpKernelContext* context) const override {
+    auto* internal_context = dynamic_cast<OpKernelContextInternal*>(context);
+    if (!internal_context) {
+        return Status(common::ONNXRUNTIME, common::FAIL, "Failed to cast OpKernelContext to OpKernelContextInternal");
+    }
+    const auto& session_options = internal_context->GetSessionState().GetSessionOptions();
+    // Test to see if we have access to enable_gpnpu flag
+    const bool gpnpu_flag = session_options.enable_gpnpu;
+
     const auto* a = context->Input<Tensor>(IN_A);
     const auto* b = packed_b_ ? nullptr : context->Input<Tensor>(IN_B);
     const auto& b_shape = b ? b->Shape() : b_shape_;
@@ -106,9 +117,17 @@ class QGemm : protected GemmBase, public MatMulIntegerBase {
     gemm_param.PerColumnZeroPoints = !IsScalarOr1ElementVector(b_zp);
 
     std::vector<float> output_scales = ComputeOutputScale(a_scale, b_scale, y_scale);
-    std::optional<MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR> scale_bias_proc_ptr;
+
+    std::optional<MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR_FIXEDPOINT> requant_proc_ptr_fixedpoint;
+    std::optional<MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR_FIXEDPOINT> scale_bias_proc_ptr_fixedpoint;
     std::optional<MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR> requant_proc_ptr;
-    SetPostProcessor(y_zp, N, output_scales, y, gemm_param, scale_bias_proc_ptr, requant_proc_ptr);
+    std::optional<MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR> scale_bias_proc_ptr;
+
+    if (gpnpu_flag) {
+      SetPostProcessorFixedPoint(y_zp, N, output_scales, y, gemm_param, scale_bias_proc_ptr_fixedpoint, requant_proc_ptr_fixedpoint);
+    } else {
+      SetPostProcessor(y_zp, N, output_scales, y, gemm_param, scale_bias_proc_ptr, requant_proc_ptr);
+    }
 
     MlasGemmBatch(gemm_shape, &gemm_param, 1, context->GetOperatorThreadPool());
     return Status::OK();
@@ -210,6 +229,36 @@ class QGemm : protected GemmBase, public MatMulIntegerBase {
       gemm_param.OutputProcessor = &*scale_bias_proc_ptr;
     }
   }
+  static void SetPostProcessorFixedPoint(const Tensor* y_zp,
+                               size_t out_lda,
+                               const std::vector<float>& output_scales,
+                               Tensor* y,
+                               MLAS_GEMM_QUANT_DATA_PARAMS& gemm_param,
+                               std::optional<MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR_FIXEDPOINT>& scale_bias_proc_ptr,
+                               std::optional<MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR_FIXEDPOINT>& requant_proc_ptr) {
+    // More than one scale selects per-column handling in either processor.
+    const bool per_column = output_scales.size() > 1;
+
+    if (y_zp == nullptr) {
+      // No output zero point: install the scale/bias processor, which writes into y as float.
+      scale_bias_proc_ptr.emplace(
+          static_cast<float*>(y->MutableDataRaw()),
+          out_lda,
+          output_scales.data(),
+          nullptr,
+          MLAS_QGEMM_OUTPUT_MODE::ZeroMode,
+          per_column ? MLAS_QUANTIZATION_GRANULARITY::PerColumn : MLAS_QUANTIZATION_GRANULARITY::PerMatrix);
+      gemm_param.OutputProcessor = &*scale_bias_proc_ptr;
+      return;
+    }
+
+    // Output zero point present: requantize to int8/uint8, picking the width-matching view of y_zp.
+    const bool is_y_signed = y->IsDataType<int8_t>();
+    const int32_t y_zero_point = is_y_signed ? *y_zp->Data<int8_t>() : *y_zp->Data<uint8_t>();
+    requant_proc_ptr.emplace(
+        y->MutableDataRaw(), out_lda, nullptr, output_scales.data(), per_column, y_zero_point, is_y_signed);
+    gemm_param.OutputProcessor = &*requant_proc_ptr;
+  }
 };
 
 ONNX_OPERATOR_TYPED_KERNEL_EX(
diff --git a/onnxruntime/core/framework/op_kernel_context_internal.h b/onnxruntime/core/framework/op_kernel_context_internal.h
index 64bd70465a..f0e7ff32a2 100644
--- a/onnxruntime/core/framework/op_kernel_context_internal.h
+++ b/onnxruntime/core/framework/op_kernel_context_internal.h
@@ -42,6 +42,11 @@ class OpKernelContextInternal : public OpKernelContext {
     return session_state_.GetUseDeterministicCompute();
   }
 
+  // Exposes the owning SessionState so kernels can read session-wide settings (e.g. its SessionOptions).
+  const SessionState& GetSessionState() const {
+    return session_state_;
+  }
+
   const SessionState* SubgraphSessionState(const std::string& attribute_name) {
     return session_state_.GetSubgraphSessionState(GetNodeIndex(), attribute_name);
   }
diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h
index 8d4db36106..5bb3562240 100644
--- a/onnxruntime/core/framework/session_options.h
+++ b/onnxruntime/core/framework/session_options.h
@@ -80,6 +80,9 @@ struct SessionOptions {
   // set the execution order of the graph
   ExecutionOrder execution_order = ExecutionOrder::DEFAULT;
 
+  // Set to true to emulate GPNPU: the QLinearAdd, QLinearGlobalAveragePool and QGemm CPU kernels then use their fixed-point paths.
+  bool enable_gpnpu = false;
+
   // enable profiling for this session.
   bool enable_profiling = false;
 
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index 28ae64c4d5..c2533c797e 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -569,6 +569,56 @@ class MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR : public MLAS_QGEMM_OUTPUT_PROCESSO
     MLAS_QUANTIZATION_GRANULARITY QuantGran_;
 };
 
+class MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR_FIXEDPOINT : public MLAS_QGEMM_OUTPUT_PROCESSOR {  // QGEMM output processor with a float destination; Process is defined out of line
+public:
+    MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR_FIXEDPOINT(
+        float* Output,  // destination buffer for the processed results
+        size_t LeadingDimensionOutput,
+        const float* Scale,
+        const float* Bias,  // QGemm passes nullptr here
+        MLAS_QGEMM_OUTPUT_MODE Mode = MLAS_QGEMM_OUTPUT_MODE::ZeroMode,
+        MLAS_QUANTIZATION_GRANULARITY QuantGran = MLAS_QUANTIZATION_GRANULARITY::PerMatrix) :
+            Output_(Output),
+            LeadingDimensionOutput_(LeadingDimensionOutput),
+            Scale_(Scale),
+            Bias_(Bias),
+            OutputMode_(Mode),
+            QuantGran_(QuantGran)
+    {
+    }
+
+    void
+    Process(  // invoked with a tile of int32 accumulators C (leading dimension ldc) and the tile's position/extent
+        const int32_t* C,
+        size_t StartM,
+        size_t StartN,
+        size_t CountM,
+        size_t CountN,
+        size_t ldc
+        ) const override;
+
+private:
+    template<bool HasBias, MLAS_QGEMM_OUTPUT_MODE Mode, MLAS_QUANTIZATION_GRANULARITY QuantGran>  // NOTE(review): presumably Process dispatches on the runtime members to one instantiation; confirm in the definition
+    inline
+    void
+    ProcessImpl(
+        const int32_t* C,
+        size_t StartM,
+        size_t StartN,
+        size_t CountM,
+        size_t CountN,
+        size_t ldc
+        ) const;
+
+private:
+    float* Output_;  // non-owning
+    size_t LeadingDimensionOutput_;
+    const float* Scale_;  // non-owning
+    const float* Bias_;  // non-owning
+    MLAS_QGEMM_OUTPUT_MODE OutputMode_;
+    MLAS_QUANTIZATION_GRANULARITY QuantGran_;
+};
+
 /**
  * @brief Supply matrices shape and data type information to quantized gemm functions
  *
@@ -1268,6 +1318,24 @@ MlasRequantizeOutput(
     size_t CountN
     );
 
+template<typename OutputType>
+void
+MLASCALL
+MlasRequantizeOutputFixedPoint(  // called from MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR_FIXEDPOINT::Process with int8_t or uint8_t output
+    const int32_t* Input,  // int32 GEMM accumulators (Process passes its C tile)
+    size_t InputLeadingDimension,  // Process passes ldc
+    OutputType* Output,
+    size_t OutputLeadingDimension,
+    const int32_t* Bias,  // QGemm passes nullptr
+    const float* Scale,
+    bool PerColumnScale,  // QGemm sets this when it has more than one output scale
+    OutputType ZeroPoint,
+    size_t StartM,
+    size_t StartN,
+    size_t CountM,
+    size_t CountN
+    );
+
 class MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR : public MLAS_QGEMM_OUTPUT_PROCESSOR
 {
    public:
@@ -1318,6 +1386,56 @@ class MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR : public MLAS_QGEMM_OUTPUT_PROCESSOR
     bool OutputIsSigned_;
 };
 
+// Output processor that forwards int32 GEMM accumulators to MlasRequantizeOutputFixedPoint for int8/uint8 output.
+class MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR_FIXEDPOINT : public MLAS_QGEMM_OUTPUT_PROCESSOR
+{
+   public:
+    MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR_FIXEDPOINT(
+        void* Output,
+        size_t OutputLeadingDimension,
+        const int32_t* Bias,
+        const float* Scale,
+        bool PerColumnScale,
+        int32_t ZeroPoint,
+        bool OutputIsSigned)
+        : Output_(Output),
+          OutputLeadingDimension_(OutputLeadingDimension),
+          Bias_(Bias),
+          Scale_(Scale),
+          PerColumnScale_(PerColumnScale),
+          ZeroPoint_(ZeroPoint),
+          OutputIsSigned_(OutputIsSigned)
+    {}
+
+    void Process(const int32_t* C,
+                 size_t StartM,
+                 size_t StartN,
+                 size_t CountM,
+                 size_t CountN,
+                 size_t ldc) const override
+    {
+        // Output_ is type-erased; OutputIsSigned_ selects the int8_t or uint8_t view of it.
+        if (!OutputIsSigned_) {
+            MlasRequantizeOutputFixedPoint(C, ldc, static_cast<uint8_t*>(Output_), OutputLeadingDimension_,
+                                           Bias_, Scale_, PerColumnScale_, static_cast<uint8_t>(ZeroPoint_),
+                                           StartM, StartN, CountM, CountN);
+            return;
+        }
+        MlasRequantizeOutputFixedPoint(C, ldc, static_cast<int8_t*>(Output_), OutputLeadingDimension_,
+                                       Bias_, Scale_, PerColumnScale_, static_cast<int8_t>(ZeroPoint_),
+                                       StartM, StartN, CountM, CountN);
+    }
+
+   private:
+    void* Output_;                   // non-owning; element type given by OutputIsSigned_
+    size_t OutputLeadingDimension_;
+    const int32_t* Bias_;            // non-owning
+    const float* Scale_;             // non-owning
+    bool PerColumnScale_;
+    int32_t ZeroPoint_;              // narrowed to the output element type in Process
+    bool OutputIsSigned_;
+};
+
 
 void
 MLASCALL
@@ -1368,6 +1486,39 @@ MlasQLinearGlobalAveragePoolNhwc(
     const T8Bits* ZeroBuffer
     );
 
+template<typename T8Bits>
+void
+MLASCALL
+MlasQLinearGlobalAveragePoolNchwFixedPoint(
+    const T8Bits* Input,  // the contrib op passes the first plane of its work slice
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    T8Bits* Output,  // the contrib op advances this by one element per plane
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Channels,  // number of planes in this call; the contrib op passes a slice of N * C
+    size_t ImageSize,  // elements per plane
+    int32_t* AccumulateBuffer  // the contrib op sizes it with MlasQLinearSafePaddingElementCount(sizeof(int32_t), Channels)
+    );
+
+template <typename T8Bits>
+void
+MLASCALL
+MlasQLinearGlobalAveragePoolNhwcFixedPoint(
+    const T8Bits* Input,  // the contrib op passes the first batch of its work slice
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    T8Bits* Output,  // the contrib op advances this by C elements per batch
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Batch,  // number of batches in this call
+    size_t ImageSize,
+    size_t Stride,  // the contrib op passes C for both Stride and Channels
+    size_t Channels,
+    int32_t* AccumulateBuffer,  // the contrib op sizes it with MlasQLinearSafePaddingElementCount(sizeof(int32_t), C)
+    const T8Bits* ZeroBuffer  // the contrib op passes a zero-filled buffer of MlasQLinearSafePaddingElementCount(sizeof(T8Bits), C) elements
+    );
+
 //
 // InputA is of size N,
 // Input B is of size 1 if IsScalarB == true, otherwise it is of size N
@@ -1389,6 +1540,23 @@ MlasQLinearAdd(
     bool IsScalarB
     );
 
+template<typename DataType>
+void
+MLASCALL
+MlasQLinearAddFixedPoint(  // selected by QLinearAdd instead of MlasQLinearAdd when enable_gpnpu is set
+    const DataType* InputA,  // QLinearAdd always passes the span operand here
+    float ScaleA,
+    int32_t ZeroPointA,
+    const DataType* InputB,  // QLinearAdd passes a pointer to a single value when IsScalarB is true
+    float ScaleB,
+    int32_t ZeroPointB,
+    float ScaleC,
+    int32_t ZeroPointC,
+    DataType* OutputC,
+    size_t N,  // QLinearAdd passes the output span size
+    bool IsScalarB
+    );
+
 template<typename DataType>
 void
 MLASCALL
diff --git a/onnxruntime/core/mlas/lib/qfunctions_helper.cpp b/onnxruntime/core/mlas/lib/qfunctions_helper.cpp
new file mode 100644
index 0000000000..08bad5119a
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/qfunctions_helper.cpp
@@ -0,0 +1,47 @@
+#include "mlasi.h"
+#include <iostream>
+
+#include <cmath>
+#include <cstdlib>
+#include <vector>
+#include <cassert>
+#include <type_traits>
+#include <algorithm>
+#include <numeric>
+#include <iterator>
+#include <type_traits>
+
+// Copying logic from data_to_qfp in tvm/python/tvm/target/epu_fx_util.py
+// For purposes of calculating number of frac bits needed to represent scale in quantize ops
+
+// Number of fractional bits left in a signed qfpSize-bit value after storing the integer part of scalar.
+int deriveFractionalBits(double scalar, int qfpSize) {
+    const int valueBits = qfpSize - 1;  // one bit is reserved for the sign
+
+    double intPart;
+    ::modf(scalar, &intPart);  // only the integral part is needed; the returned fraction is ignored
+    intPart = std::abs(intPart);
+
+    // ilogb yields floor(log2(x)) exactly; log2f first rounded the double to float and could overcount bits.
+    const int intBits = (intPart == 0) ? 0 : std::ilogb(intPart) + 1;
+    const int fracBits = valueBits - intBits;
+
+    assert(fracBits >= 0 && "Scalar cannot be represented in qfp format.");
+    return fracBits;
+}
+
+// Convert a scalar to signed fixed point with fracBits fractional bits, rounding half away from zero.
+int scalarToQfp(double value, int fracBits) {
+    double integer;
+    const double frac = ::modf(value, &integer);
+
+    // Scale in double: 1 << fracBits overflows int at fracBits >= 31 and roundf keeps only 24 bits.
+    const double scaledInt = std::ldexp(std::abs(integer), fracBits);
+    const double scaledFrac = std::round(std::ldexp(std::abs(frac), fracBits));
+
+    // Saturate at the largest 32-bit magnitude so the narrowing conversion below is always defined.
+    const double magnitude = std::min(scaledInt + scaledFrac, 2147483647.0);
+    const int qfp = static_cast<int>(magnitude);
+
+    return (value < 0) ? -qfp : qfp;
+}
diff --git a/onnxruntime/core/mlas/lib/qfunctions_helper.h b/onnxruntime/core/mlas/lib/qfunctions_helper.h
new file mode 100644
index 0000000000..ab0ea38e6e
--- /dev/null
+++ b/onnxruntime/core/mlas/lib/qfunctions_helper.h
@@ -0,0 +1,80 @@
+#include <cstdint> // For uint8_t, int32_t, etc.
+#include <vector>
+#include <utility>
+#include <cmath>
+#include <cassert>
+#include <algorithm>
+#include <numeric>
+#include <iterator>
+#include <type_traits>
+
+// Copying logic from fxRoundPosInf in cgc_ccl.hpp: drop aFracBits (1..31) fractional bits, rounding half toward +inf.
+template <uint8_t aFracBits>
+inline int32_t customRound(const int32_t a) {
+    const int64_t half = int64_t{1} << (aFracBits - 1);  // 0.5 in the input's fixed-point format
+    return static_cast<int32_t>((int64_t{a} + half) >> aFracBits);  // 64-bit sum: no overflow near INT32_MAX
+}
+
+// Function to derive fractional bits
+int deriveFractionalBits(double scalar, int qfpSize);
+
+// Function to convert scalar to qfp
+int scalarToQfp(double value, int fracBits);
+
+// Convert data to signed fixed point; returns the converted values and the fractional bit count that was used.
+template <typename T>
+std::pair<std::vector<int>, int> dataToQfp(
+    const std::vector<T>& data, int fracBits = -1, int qfpSize = 32, bool scalarAsFloat = true
+) {
+    // fracBits == -1 asks for the count to be derived from the largest magnitude in data.
+    auto fracBitsFor = [qfpSize](double scalar) {
+        const int valueBits = qfpSize - 1;  // one bit is reserved for the sign
+        double intPart;
+        ::modf(scalar, &intPart);
+        intPart = std::abs(intPart);
+
+        // ilogb is exact; log2f first rounded the double to float and could overcount the integer bits.
+        const int intBits = (intPart == 0) ? 0 : std::ilogb(intPart) + 1;
+        const int derived = valueBits - intBits;
+
+        assert(derived >= 0 && "Scalar cannot be represented in qfp format.");
+        return derived;
+    };
+
+    auto toQfp = [](double value, int bits) {
+        double integer;
+        const double frac = ::modf(value, &integer);
+
+        // Scale in double: 1 << bits overflows int at bits >= 31 and roundf keeps only 24 bits.
+        const double scaledInt = std::ldexp(std::abs(integer), bits);
+        const double scaledFrac = std::round(std::ldexp(std::abs(frac), bits));
+
+        // Saturate at the largest 32-bit magnitude so the narrowing conversion is always defined.
+        const int qfp = static_cast<int>(std::min(scaledInt + scaledFrac, 2147483647.0));
+        return (value < 0) ? -qfp : qfp;
+    };
+
+    std::vector<int> qfp;
+    if (data.empty()) {
+        // Nothing to convert; max_element on an empty range would return end(), which must not be dereferenced.
+        return std::make_pair(qfp, fracBits);
+    }
+    if (data.size() != 1) {
+        if (fracBits == -1) {
+            const auto largest = std::max_element(
+                data.begin(), data.end(), [](T a, T b) { return std::abs(a) < std::abs(b); });
+            fracBits = fracBitsFor(*largest);
+        }
+        qfp.reserve(data.size());
+        std::transform(data.begin(), data.end(), std::back_inserter(qfp),
+                       [fracBits, &toQfp](T value) { return toQfp(value, fracBits); });
+    } else {
+        if (fracBits == -1) {
+            fracBits = fracBitsFor(data[0]);
+        }
+        // With scalarAsFloat a lone value is stored as a plain int cast, without fixed-point scaling.
+        qfp.push_back(scalarAsFloat ? static_cast<int>(data[0]) : toQfp(data[0], fracBits));
+    }
+
+    return std::make_pair(qfp, fracBits);
+}
diff --git a/onnxruntime/core/mlas/lib/qgemm.h b/onnxruntime/core/mlas/lib/qgemm.h
index 1ef5b5f741..2ae291d957 100644
--- a/onnxruntime/core/mlas/lib/qgemm.h
+++ b/onnxruntime/core/mlas/lib/qgemm.h
@@ -168,6 +168,22 @@ MlasGemmQuantKernel(
     bool ZeroMode
 );
 
+template<typename KernelType>  // declaration only; the default kernel specializes it in qgemm_kernel_default.cpp
+size_t  // NOTE(review): the default specialization always returns 1 - presumably the rows handled; confirm
+MlasGemmQuantKernelFixedPoint(
+    const typename KernelType::PackedAType* A,  // packed A operand
+    const typename KernelType::PackedBType* B,  // packed B operand
+    int32_t* C,                                 // int32 results, one per column in the default kernel
+    size_t PackedCountK,                        // packed K steps (the default kernel reads 4 values per step)
+    size_t CountM,                              // unused by the default kernel
+    size_t CountN,                              // number of columns to produce
+    size_t ldc,                                 // unused by the default kernel
+    const int32_t* RowSumBuffer,                // its first entry seeds every accumulator in the default kernel
+    const int32_t* ColumnSumBuffer,             // one entry per column, added to the seed
+    const int32_t* ZeroPointB,                  // may be nullptr; otherwise one multiplier per column for the seed
+    bool ZeroMode                               // true: overwrite C, false: add to the existing C
+);
+
 /**
  * @brief Usually a wrapper of assembly/intrinsic kernel
  *        of symmetric quant gemm
diff --git a/onnxruntime/core/mlas/lib/qgemm_kernel_default.cpp b/onnxruntime/core/mlas/lib/qgemm_kernel_default.cpp
index 8f4baaa0ff..769464ebbf 100644
--- a/onnxruntime/core/mlas/lib/qgemm_kernel_default.cpp
+++ b/onnxruntime/core/mlas/lib/qgemm_kernel_default.cpp
@@ -214,6 +214,63 @@ MlasGemmQuantKernel<MLAS_GEMM_QUANT_KERNEL_DEFAULT>(
     return 1;
 }
 
+//
+// Default (portable C++) fixed point quantized GEMM kernel. Always returns 1.
+//
+template<>
+size_t
+MlasGemmQuantKernelFixedPoint<MLAS_GEMM_QUANT_KERNEL_DEFAULT>(
+    const MLAS_GEMM_QUANT_KERNEL_DEFAULT::PackedAType* A,
+    const MLAS_GEMM_QUANT_KERNEL_DEFAULT::PackedBType* B,
+    int32_t* C,
+    size_t PackedCountK,
+    size_t CountM,
+    size_t CountN,
+    size_t ldc,
+    const int32_t* RowSumBuffer,
+    const int32_t* ColumnSumBuffer,
+    const int32_t* ZeroPointB,
+    bool ZeroMode
+    )
+{
+    // One value of C is written per column of B, so neither the row count
+    // nor the leading dimension of C is consulted.
+    MLAS_UNREFERENCED_PARAMETER(CountM);
+    MLAS_UNREFERENCED_PARAMETER(ldc);
+
+    for (size_t Column = 0; Column < CountN; Column++) {
+        //
+        // Seed the sum with *RowSumBuffer (multiplied by this column's
+        // ZeroPointB entry when one is supplied) plus this column's
+        // ColumnSumBuffer entry.
+        //
+
+        int32_t Sum = RowSumBuffer[0];
+
+        if (ZeroPointB != nullptr) {
+            Sum *= ZeroPointB[Column];
+        }
+
+        Sum += ColumnSumBuffer[Column];
+
+        //
+        // Each packed step consumes four consecutive values of A and of B.
+        //
+
+        for (size_t k = 0; k < PackedCountK; k++, B += 4) {
+            const auto* a = A + 4 * k;
+            Sum += a[0] * B[0];
+            Sum += a[1] * B[1];
+            Sum += a[2] * B[2];
+            Sum += a[3] * B[3];
+        }
+
+        // ZeroMode overwrites C; otherwise the sum is added to what C holds.
+        C[Column] = ZeroMode ? Sum : Sum + C[Column];
+    }
+
+    return 1;
+}
+
 const MLAS_GEMM_QUANT_DISPATCH MlasGemmQuantDispatchDefault = {
     MlasGemmQuantOperation<MLAS_GEMM_QUANT_KERNEL_DEFAULT>,
     nullptr,
diff --git a/onnxruntime/core/mlas/lib/qladd.cpp b/onnxruntime/core/mlas/lib/qladd.cpp
index 5dafa17c2a..73f148bb9c 100644
--- a/onnxruntime/core/mlas/lib/qladd.cpp
+++ b/onnxruntime/core/mlas/lib/qladd.cpp
@@ -19,6 +19,23 @@ Module Name:
 --*/
 
 #include "qladd.h"
+// #include "qfunctions_helper.h"
+
+#include "mlasi.h"
+#include <iostream>
+
+#include <cmath>
+#include <cstdlib>
+#include <vector>
+#include <cassert>
+#include <type_traits>
+#include <algorithm>
+#include <numeric>
+#include <iterator>
+#include <type_traits>
+
+#include "qfunctions_helper.h"
+
 
 // Pure C++ helper, back off here in rare case.
 template<typename DataType, bool IsScalarB>
@@ -58,6 +75,69 @@ MlasQLinearAddKernelRawHelper(
     }
 }
 
+//
+// Portable fixed point QLinearAdd. Each input is rescaled by ScaleX / ScaleC in
+// integer arithmetic, the two terms are added with 16 fractional bits, and the
+// sum goes through customRound<16>, ZeroPointC and a clamp to DataType's range.
+//
+template<typename DataType, bool IsScalarB>
+MLAS_FORCEINLINE
+static
+void
+MlasQLinearAddKernelRawHelperFixedPoint(
+    const DataType* InputA,
+    float ScaleA,
+    int32_t ZeroPointA,
+    const DataType* InputB,
+    float ScaleB,
+    int32_t ZeroPointB,
+    float ScaleC,
+    int32_t ZeroPointC,
+    DataType* OutputC,
+    size_t N
+    )
+{
+    // Fractional bits kept in each term ("hard coded in tvm python").
+    constexpr int DequantFracBits = 16;
+    const float RatioA = ScaleA / ScaleC;
+    const float RatioB = ScaleB / ScaleC;
+
+    // dataToQfp is only consulted for the fractional width it derives for a
+    // 32-bit representation of each ratio; the converted values are unused.
+    const int FracBitsA = dataToQfp(std::vector<double>{RatioA}, -1, 32, false).second;
+    const int FracBitsB = dataToQfp(std::vector<double>{RatioB}, -1, 32, false).second;
+
+    // Both ratios use the larger width so the two terms share one scale. These
+    // are plain locals on purpose: nothing here needs heap storage or cleanup.
+    const int FracBits = std::max(FracBitsA, FracBitsB);
+    const int64_t FixedScaleA = static_cast<int64_t>(RatioA * (1LL << FracBits));
+    const int64_t FixedScaleB = static_cast<int64_t>(RatioB * (1LL << FracBits));
+
+    // Right shift that leaves DequantFracBits; assumes FracBits >= DequantFracBits.
+    const int Shift = FracBits - DequantFracBits;
+
+    const int32_t MinimumValue = std::numeric_limits<DataType>::lowest();
+    const int32_t MaximumValue = std::numeric_limits<DataType>::max();
+
+    // With a scalar B its term is computed once, otherwise per element below.
+    int64_t ValueB = 0;
+    if (IsScalarB) {
+        ValueB = ((int64_t(InputB[0]) - ZeroPointB) * FixedScaleB) >> Shift;
+    }
+
+    for (size_t n = 0; n < N; n++) {
+        const int64_t ValueA = (FixedScaleA * (int64_t(InputA[n]) - ZeroPointA)) >> Shift;
+        if (!IsScalarB) {
+            ValueB = (FixedScaleB * (int64_t(InputB[n]) - ZeroPointB)) >> Shift;
+        }
+
+        const int64_t ValueC = customRound<DequantFracBits>(static_cast<int32_t>(ValueA + ValueB));
+        int32_t ValueCInt = static_cast<int32_t>(ValueC + ZeroPointC);
+        ValueCInt = std::min(std::max(ValueCInt, MinimumValue), MaximumValue);
+        OutputC[n] = (DataType)(ValueCInt);
+    }
+}
+
 #if defined(MLAS_NEON_INTRINSICS)
 
 bool MlasCalcQLinearAddParameters(
@@ -717,6 +797,33 @@ MlasQLinearAddKernel(
     }
 }
 
+// Forwards to the raw fixed point helper instantiated for the run-time
+// IsScalarB flag (the helper takes that flag as a template argument).
+template<typename DataType>
+static
+void
+MLASCALL
+MlasQLinearAddKernelFixedPoint(
+    const DataType* InputA,
+    float ScaleA,
+    int32_t ZeroPointA,
+    const DataType* InputB,
+    float ScaleB,
+    int32_t ZeroPointB,
+    float ScaleC,
+    int32_t ZeroPointC,
+    DataType* OutputC,
+    size_t N,
+    bool IsScalarB
+    )
+{
+    if (!IsScalarB) {
+        MlasQLinearAddKernelRawHelperFixedPoint<DataType, false>(InputA, ScaleA, ZeroPointA, InputB, ScaleB, ZeroPointB, ScaleC, ZeroPointC, OutputC, N);
+        return;
+    }
+    MlasQLinearAddKernelRawHelperFixedPoint<DataType, true>(InputA, ScaleA, ZeroPointA, InputB, ScaleB, ZeroPointB, ScaleC, ZeroPointC, OutputC, N);
+}
+
 template<>
 void
 MLASCALL
@@ -767,6 +874,46 @@ MlasQLinearAdd<uint8_t>(
             InputA, ScaleA, ZeroPointA, InputB, ScaleB, ZeroPointB, ScaleC, ZeroPointC, OutputC, N, IsScalarB);
 }
 
+// Explicit specialization for int8_t: hands every argument to the kernel template.
+template<>
+void MLASCALL
+MlasQLinearAddFixedPoint<int8_t>(
+    const int8_t* InputA,
+    float ScaleA,
+    int32_t ZeroPointA,
+    const int8_t* InputB,
+    float ScaleB,
+    int32_t ZeroPointB,
+    float ScaleC,
+    int32_t ZeroPointC,
+    int8_t* OutputC,
+    size_t N,
+    bool IsScalarB
+    )
+{
+    MlasQLinearAddKernelFixedPoint(InputA, ScaleA, ZeroPointA, InputB, ScaleB, ZeroPointB, ScaleC, ZeroPointC, OutputC, N, IsScalarB);  // DataType deduces to int8_t
+}
+
+// Explicit specialization for uint8_t: hands every argument to the kernel template.
+template<>
+void MLASCALL
+MlasQLinearAddFixedPoint<uint8_t>(
+    const uint8_t* InputA,
+    float ScaleA,
+    int32_t ZeroPointA,
+    const uint8_t* InputB,
+    float ScaleB,
+    int32_t ZeroPointB,
+    float ScaleC,
+    int32_t ZeroPointC,
+    uint8_t* OutputC,
+    size_t N,
+    bool IsScalarB
+    )
+{
+    MlasQLinearAddKernelFixedPoint(InputA, ScaleA, ZeroPointA, InputB, ScaleB, ZeroPointB, ScaleC, ZeroPointC, OutputC, N, IsScalarB);  // DataType deduces to uint8_t
+}
+
 //
 // Function definition for platform usage
 //
diff --git a/onnxruntime/core/mlas/lib/qlgavgpool.cpp b/onnxruntime/core/mlas/lib/qlgavgpool.cpp
index e44d7ad25c..4905cd4cee 100644
--- a/onnxruntime/core/mlas/lib/qlgavgpool.cpp
+++ b/onnxruntime/core/mlas/lib/qlgavgpool.cpp
@@ -15,6 +15,9 @@ Module Name:
 --*/
 
 #include "mlasi.h"
+#include "qfunctions_helper.h"
+#include <iostream>
+
 
 size_t
 MLASCALL
@@ -240,8 +243,617 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
 
             CALCULATE_ACCUMULATE_VECTORS();
 
-            vst1q_s32(acc, vacc_lo);
-            vst1q_s32(acc + 4, vacc_hi);
+            vst1q_s32(acc, vacc_lo);
+            vst1q_s32(acc + 4, vacc_hi);
+        }
+        finish_one_pass = true;
+
+        i0 += step_next_group;
+        i1 += step_next_group;
+        i2 += step_next_group;
+        i3 += step_next_group;
+        i4 += step_next_group;
+        i5 += step_next_group;
+        i6 += step_next_group;
+    }
+
+    if (ImageSize > 0) {
+
+        switch (ImageSize) {
+            case 1:
+                i1 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 2:
+                i2 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 3:
+                i3 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 4:
+                i4 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 5:
+                i5 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 6:
+                i6 = (const uint8_t*)ZeroBuffer; /* fall through */
+            default:
+                break;
+        }
+
+        int32_t* acc = AccumulateBuffer;
+        size_t c = Channels;
+        for (; c >= 8; c -= 8) {
+
+            LOAD_FULL_CHANNELS();
+
+            CALCULATE_ACCUMULATE_VECTORS();
+
+            vst1q_s32(acc, vacc_lo);
+            vst1q_s32(acc + 4, vacc_hi);
+            acc += 8;
+        }
+
+        if (c > 0) {
+
+            const uint8x8_t vi0 =
+                vld1_u8(((i0 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i0, c) : i0));
+            const uint8x8_t vi1 = vld1_u8(
+                ((1 < ImageSize && i1 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i1, c) : i1));
+            const uint8x8_t vi2 = vld1_u8(
+                ((2 < ImageSize && i2 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i2, c) : i2));
+            const uint8x8_t vi3 = vld1_u8(
+                ((3 < ImageSize && i3 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i3, c) : i3));
+            const uint8x8_t vi4 = vld1_u8(
+                ((4 < ImageSize && i4 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i4, c) : i4));
+            const uint8x8_t vi5 = vld1_u8(
+                ((5 < ImageSize && i5 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i5, c) : i5));
+            const uint8x8_t vi6 = vld1_u8(
+                ((6 < ImageSize && i6 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i6, c) : i6));
+
+            CALCULATE_ACCUMULATE_VECTORS();
+
+            vst1q_s32(acc, vacc_lo);
+            vst1q_s32(acc + 4, vacc_hi);
+        }
+    }
+    MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &Scale, false,
+                         Output_zero_point, 0, 0, 1, Channels);
+}
+
+template <typename T8Bits>
+void
+MLASCALL
+MlasQLinearGlobalAveragePoolNchwFixedPoint(
+    const T8Bits* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    T8Bits* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Channels,
+    size_t ImageSize,
+    int32_t* AccumulateBuffer
+    )
+{  // NEON: sums ImageSize consecutive 8-bit inputs per channel, then requantizes the Channels sums.
+    float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize);
+    int32_t bias[] = {-ZeroPointInput * static_cast<int32_t>(ImageSize), 0, 0, 0};
+    const int32x4_t vbias = vld1q_s32(bias);
+    const int32x4_t vzero = vmovq_n_s32(0);
+    const uint8_t* InputU8 = (const uint8_t*)(Input);
+    // tail_buffer keeps its zeros past the bytes copied into it, so the padding adds nothing to a sum.
+    int32_t* sum_buffer = AccumulateBuffer;
+    uint8_t tail_buffer[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+    for (size_t c = Channels; c > 0; c--) {
+        // Lane 0 of vacc_lo starts at -ZeroPointInput * ImageSize; every other lane starts at 0.
+        int32x4_t vacc_lo = vbias;
+        int32x4_t vacc_hi = vzero;
+        auto Len = ImageSize;
+        for (; Len >= 32; Len -= 32) {
+            // Main loop: 32 input bytes per iteration, read as four 8-byte loads.
+            const uint8x8_t vi0 = vld1_u8(InputU8);
+            const uint8x8_t vi1 = vld1_u8(InputU8 + 8);
+            const uint8x8_t vi2 = vld1_u8(InputU8 + 16);
+            const uint8x8_t vi3 = vld1_u8(InputU8 + 24);
+            // Widen to 16 bits while adding pairs, signed or unsigned to match T8Bits.
+            int16x8_t vsum;
+            if constexpr (std::is_signed<T8Bits>::value) {
+
+                const int16x8_t vs01 = vaddl_s8(vreinterpret_s8_u8(vi0), vreinterpret_s8_u8(vi1));
+                const int16x8_t vs23 = vaddl_s8(vreinterpret_s8_u8(vi2), vreinterpret_s8_u8(vi3));
+                vsum = vaddq_s16(vs01, vs23);
+            } else {
+                // Four u8 values sum to at most 1020, so reading the u16 result as s16 is safe.
+                const uint16x8_t vs01 = vaddl_u8(vi0, vi1);
+                const uint16x8_t vs23 = vaddl_u8(vi2, vi3);
+                vsum = vreinterpretq_s16_u16(vaddq_u16(vs01, vs23));
+            }
+
+            vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum));
+            vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum));
+            InputU8 += 32;
+        }
+        for (; Len >= 8; Len -= 8) {
+            // Remaining full groups of 8 bytes.
+            int16x8_t vsum;
+            if constexpr (std::is_signed<T8Bits>::value) {
+                vsum = vmovl_s8(vreinterpret_s8_u8(vld1_u8(InputU8)));
+            } else {
+                vsum = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(InputU8)));
+            }
+            vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum));
+            vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum));
+            InputU8 += 8;
+        }
+
+        if (Len > 0) {
+            // Fewer than 8 bytes left: load them through the zero-padded tail_buffer.
+            memcpy(tail_buffer, InputU8, Len);
+            int16x8_t vsum;
+            if constexpr (std::is_signed<T8Bits>::value) {
+                vsum = vmovl_s8(vreinterpret_s8_u8(vld1_u8(tail_buffer)));
+            } else {
+                vsum = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(tail_buffer)));
+            }
+
+            vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum));
+            vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum));
+            InputU8 += Len;
+        }
+        // Add the eight 32-bit lanes together into this channel's sum.
+        vacc_lo = vaddq_s32(vacc_lo, vacc_hi);
+        int32x2_t vacc = vadd_s32(vget_high_s32(vacc_lo), vget_low_s32(vacc_lo));
+        *sum_buffer++ = vget_lane_s32(vpadd_s32(vacc, vacc), 0);
+    }
+    // NOTE(review): presumably requantizes the Channels sums as a 1 x Channels block into Output; confirm.
+    MlasRequantizeOutputFixedPoint(AccumulateBuffer, Channels, Output, Channels, nullptr, &scale, false,
+                         static_cast<T8Bits>(ZeroPointOutput), 0, 0, 1, Channels);
+}
+
+template <typename T8Bits>
+MLAS_FORCEINLINE
+void
+MlasQLinearGlobalAveragePoolNhwcSingleBatchFixedPoint(
+    const T8Bits* Input,
+    T8Bits* Output,
+    const T8Bits* LastOf8,
+    size_t ImageSize,
+    size_t Channels,
+    size_t Stride,
+    int32_t Bias,
+    float Scale,
+    T8Bits Output_zero_point,
+    int32_t* AccumulateBuffer,
+    const T8Bits* ZeroBuffer
+    )
+{  // NEON: adds up ImageSize pixels (Stride bytes apart) per channel, 7 pixels x 8 channels per step, then requantizes.
+#define LOAD_FULL_CHANNELS()           \
+    const uint8x8_t vi0 = vld1_u8(i0); \
+    i0 += 8;                           \
+    const uint8x8_t vi1 = vld1_u8(i1); \
+    i1 += 8;                           \
+    const uint8x8_t vi2 = vld1_u8(i2); \
+    i2 += 8;                           \
+    const uint8x8_t vi3 = vld1_u8(i3); \
+    i3 += 8;                           \
+    const uint8x8_t vi4 = vld1_u8(i4); \
+    i4 += 8;                           \
+    const uint8x8_t vi5 = vld1_u8(i5); \
+    i5 += 8;                           \
+    const uint8x8_t vi6 = vld1_u8(i6); \
+    i6 += 8
+// CALCULATE_ACCUMULATE_VECTORS: starts from vbias on the first pass, otherwise from acc[0..7], then adds vi0..vi6.
+#define CALCULATE_ACCUMULATE_VECTORS()                                                         \
+    int32x4_t vacc_lo = finish_one_pass ? vld1q_s32(acc) : vbias;                              \
+    int32x4_t vacc_hi = finish_one_pass ? vld1q_s32(acc + 4) : vbias;                          \
+    int16x8_t vsum;                                                                            \
+    if constexpr (std::is_signed<T8Bits>::value) {                                             \
+        const int16x8_t vsum01 = vaddl_s8(vreinterpret_s8_u8(vi0), vreinterpret_s8_u8(vi1));   \
+        const int16x8_t vsum23 = vaddl_s8(vreinterpret_s8_u8(vi2), vreinterpret_s8_u8(vi3));   \
+        const int16x8_t vsum45 = vaddl_s8(vreinterpret_s8_u8(vi4), vreinterpret_s8_u8(vi5));   \
+        const int16x8_t vsum016 = vaddw_s8(vsum01, vreinterpret_s8_u8(vi6));                   \
+        const int16x8_t vsum2345 = vaddq_s16(vsum23, vsum45);                                  \
+        vsum = vaddq_s16(vsum016, vsum2345);                                                   \
+    } else {                                                                                   \
+        const uint16x8_t vsum01 = vaddl_u8(vi0, vi1);                                          \
+        const uint16x8_t vsum23 = vaddl_u8(vi2, vi3);                                          \
+        const uint16x8_t vsum45 = vaddl_u8(vi4, vi5);                                          \
+        const uint16x8_t vsum016 = vaddw_u8(vsum01, vi6);                                      \
+        const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45);                                 \
+        vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345));                            \
+    }                                                                                          \
+    vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum));                                          \
+    vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum))
+    // tail: zero-initialized 8-byte staging area for the last Channels % 8 channels of a pixel.
+    uint8_t tail[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+    const int32x4_t vbias = vld1q_dup_s32(&Bias);
+    bool finish_one_pass = false;
+    const size_t step_next_group = 7 * Stride - (Channels & ~size_t{7});  // the channel loop already advanced each pointer by Channels & ~7
+    // i0..i6 address seven consecutive pixels, Stride bytes apart.
+    const uint8_t* LastOf8U8 = (const uint8_t*)LastOf8;
+    const uint8_t* i0 = (const uint8_t*)Input;
+    const uint8_t* i1 = i0 + Stride;
+    const uint8_t* i4 = i0 + Stride * 4;
+    const uint8_t* i2 = i1 + Stride;
+    const uint8_t* i5 = i4 + Stride;
+    const uint8_t* i3 = i2 + Stride;
+    const uint8_t* i6 = i5 + Stride;
+    // Full groups of seven pixels; the final 1..7 pixels are handled after this loop.
+    for (; ImageSize > 7; ImageSize -= 7) {
+
+        int32_t* acc = AccumulateBuffer;
+        size_t c = Channels;
+        for (; c >= 8; c -= 8) {
+
+            LOAD_FULL_CHANNELS();
+
+            CALCULATE_ACCUMULATE_VECTORS();
+
+            vst1q_s32(acc, vacc_lo);
+            vst1q_s32(acc + 4, vacc_hi);
+            acc += 8;
+        }
+        if (c > 0) {
+            // NOTE(review): presumably LastOf8 is the last address where a full 8-byte load stays in bounds; confirm.
+            const uint8x8_t vi0 = vld1_u8(((i0 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i0, c) : i0));
+            const uint8x8_t vi1 = vld1_u8(((i1 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i1, c) : i1));
+            const uint8x8_t vi2 = vld1_u8(((i2 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i2, c) : i2));
+            const uint8x8_t vi3 = vld1_u8(((i3 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i3, c) : i3));
+            const uint8x8_t vi4 = vld1_u8(((i4 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i4, c) : i4));
+            const uint8x8_t vi5 = vld1_u8(((i5 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i5, c) : i5));
+            const uint8x8_t vi6 = vld1_u8(((i6 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i6, c) : i6));
+
+            CALCULATE_ACCUMULATE_VECTORS();
+
+            vst1q_s32(acc, vacc_lo);
+            vst1q_s32(acc + 4, vacc_hi);
+        }
+        finish_one_pass = true;
+        // Move all seven pointers on to the next group of seven pixels.
+        i0 += step_next_group;
+        i1 += step_next_group;
+        i2 += step_next_group;
+        i3 += step_next_group;
+        i4 += step_next_group;
+        i5 += step_next_group;
+        i6 += step_next_group;
+    }
+    // Last 1..7 pixels: pointers for pixels that do not exist are redirected to ZeroBuffer.
+    if (ImageSize > 0) {
+        // The cases fall through on purpose so that every pointer past the last real pixel is redirected.
+        switch (ImageSize) {
+            case 1:
+                i1 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 2:
+                i2 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 3:
+                i3 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 4:
+                i4 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 5:
+                i5 = (const uint8_t*)ZeroBuffer; /* fall through */
+            case 6:
+                i6 = (const uint8_t*)ZeroBuffer; /* fall through */
+            default:
+                break;
+        }
+
+        int32_t* acc = AccumulateBuffer;
+        size_t c = Channels;
+        for (; c >= 8; c -= 8) {
+
+            LOAD_FULL_CHANNELS();
+
+            CALCULATE_ACCUMULATE_VECTORS();
+
+            vst1q_s32(acc, vacc_lo);
+            vst1q_s32(acc + 4, vacc_hi);
+            acc += 8;
+        }
+
+        if (c > 0) {
+            // The `k < ImageSize` tests keep pointers that were redirected to ZeroBuffer out of the LastOf8 check.
+            const uint8x8_t vi0 =
+                vld1_u8(((i0 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i0, c) : i0));
+            const uint8x8_t vi1 = vld1_u8(
+                ((1 < ImageSize && i1 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i1, c) : i1));
+            const uint8x8_t vi2 = vld1_u8(
+                ((2 < ImageSize && i2 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i2, c) : i2));
+            const uint8x8_t vi3 = vld1_u8(
+                ((3 < ImageSize && i3 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i3, c) : i3));
+            const uint8x8_t vi4 = vld1_u8(
+                ((4 < ImageSize && i4 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i4, c) : i4));
+            const uint8x8_t vi5 = vld1_u8(
+                ((5 < ImageSize && i5 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i5, c) : i5));
+            const uint8x8_t vi6 = vld1_u8(
+                ((6 < ImageSize && i6 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i6, c) : i6));
+
+            CALCULATE_ACCUMULATE_VECTORS();
+
+            vst1q_s32(acc, vacc_lo);
+            vst1q_s32(acc + 4, vacc_hi);
+        }
+    }
+    MlasRequantizeOutputFixedPoint(AccumulateBuffer, Channels, Output, Channels, nullptr, &Scale, false,
+                         Output_zero_point, 0, 0, 1, Channels);
+}
+
+#elif defined(MLAS_SSE2_INTRINSICS)
+
+template <typename T8Bits>
+void MLASCALL
+MlasQLinearGlobalAveragePoolNchw(
+    const T8Bits* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    T8Bits* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Channels,
+    size_t ImageSize,
+    int32_t* AccumulateBuffer
+    )
+{  // SSE2: sums ImageSize consecutive 8-bit inputs per channel, then requantizes the Channels sums.
+    float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize);
+    const int32_t bias[] = {-ZeroPointInput * static_cast<int32_t>(ImageSize), 0, 0, 0};
+    const auto vbias = _mm_loadu_si128((const __m128i*)&bias);
+    const auto vzero = _mm_setzero_si128();
+    uint8_t buffer[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+    // buffer keeps its zeros past the bytes copied into it, so the padding adds nothing to a sum.
+    int32_t* sum_buffer = AccumulateBuffer;
+    for (size_t c = Channels; c > 0; c--) {
+        // Lane 0 of vacc_lo starts at -ZeroPointInput * ImageSize; every other lane starts at 0.
+        __m128i vacc_lo = vbias;
+        __m128i vacc_hi = vzero;
+        auto Len = ImageSize;
+        for (; Len >= 32; Len -= 32) {
+            // Main loop: 32 input bytes per iteration, read as four 8-byte loads.
+            const __m128i vi0 = _mm_loadl_epi64((const __m128i*)Input);
+            const __m128i vi1 = _mm_loadl_epi64((const __m128i*)(Input + 8));
+            const __m128i vi2 = _mm_loadl_epi64((const __m128i*)(Input + 16));
+            const __m128i vi3 = _mm_loadl_epi64((const __m128i*)(Input + 24));
+
+            if constexpr (std::is_signed<T8Bits>::value) {
+                // Sign extension: interleave each value above a zero byte, then shift right arithmetically.
+                const __m128i vxi0 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi0), 8);
+                const __m128i vxi1 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi1), 8);
+                const __m128i vxi2 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi2), 8);
+                const __m128i vxi3 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi3), 8);
+                const __m128i vsum = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1),
+                                                   _mm_add_epi16(vxi2, vxi3));
+                vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16));
+                vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16));
+            } else {
+                // Zero extension: interleave each value below a zero byte (or zero word).
+                const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero);
+                const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero);
+                const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero);
+                const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero);
+                const __m128i vsum = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1),
+                                                   _mm_add_epi16(vxi2, vxi3));
+                vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
+                vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
+            }
+
+            Input += 32;
+        }
+        for (; Len >= 8; Len -= 8) {
+            // Remaining full groups of 8 bytes, widened the same way as above.
+            if constexpr (std::is_signed<T8Bits>::value) {
+
+                const __m128i vsum = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, _mm_loadl_epi64((const __m128i*)Input)), 8);
+                vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16));
+                vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16));
+            } else {
+
+                const __m128i vsum = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)Input), vzero);
+                vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
+                vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
+            }
+
+            Input += 8;
+        }
+        if (Len > 0) {
+            // Fewer than 8 bytes left: load them through the zero-padded buffer.
+            memcpy(buffer, Input, Len);
+
+            if constexpr (std::is_signed<T8Bits>::value) {
+
+                const __m128i vsum = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, _mm_loadl_epi64((const __m128i*)buffer)), 8);
+                vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16));
+                vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16));
+            } else {
+
+                const __m128i vsum = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)buffer), vzero);
+                vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));
+                vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));
+            }
+
+            Input += Len;
+        }
+        // Horizontal add: two shuffle-and-add steps leave the total of all lanes in lane 0.
+        __m128i vacc = _mm_add_epi32(vacc_lo, vacc_hi);                    // [ D C | B A ]
+        __m128i vshuf = _mm_shuffle_epi32(vacc, _MM_SHUFFLE(2, 3, 0, 1));  // [ C D | A B ]
+        __m128i vsums = _mm_add_epi32(vacc, vshuf);                        // [ D+C C+D | B+A A+B ]
+        vshuf = _mm_shuffle_epi32(vsums, _MM_SHUFFLE(1, 0, 3, 2));         // [ B+A A+B | D+C C+D ]
+        vsums = _mm_add_epi32(vsums, vshuf);
+        *sum_buffer++ = _mm_cvtsi128_si32(vsums);
+    }
+    // NOTE(review): presumably requantizes the Channels sums as a 1 x Channels block into Output; confirm.
+    MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &scale, false,
+                         static_cast<T8Bits>(ZeroPointOutput), 0, 0, 1, Channels);
+}
+
+template <typename T8Bits>
+MLAS_FORCEINLINE
+void
+MlasQLinearGlobalAveragePoolNhwcSingleBatch(
+    const T8Bits* Input,
+    T8Bits* Output,
+    const T8Bits* LastOf8,
+    size_t ImageSize,
+    size_t Channels,
+    size_t Stride,
+    int32_t Bias,
+    float Scale,
+    T8Bits Output_zero_point,
+    int32_t* AccumulateBuffer,
+    const T8Bits* ZeroBuffer
+    )
+{
+#if defined(MLAS_TARGET_IX86)
+
+    constexpr size_t PixelsPerIteration = 4;
+
+#define LOAD_FULL_CHANNELS()                                 \
+    const __m128i vi0 = _mm_loadl_epi64((const __m128i*)i0); \
+    i0 += 8;                                                 \
+    const __m128i vi1 = _mm_loadl_epi64((const __m128i*)i1); \
+    i1 += 8;                                                 \
+    const __m128i vi2 = _mm_loadl_epi64((const __m128i*)i2); \
+    i2 += 8;                                                 \
+    const __m128i vi3 = _mm_loadl_epi64((const __m128i*)i3); \
+    i3 += 8;
+
+#define CALCULATE_ACCUMULATE_VECTORS()                                                         \
+    __m128i vacc_lo = finish_one_pass ? _mm_loadu_si128((__m128i*)acc) : vbias;                \
+    __m128i vacc_hi = finish_one_pass ? _mm_loadu_si128(((__m128i*)acc) + 1) : vbias;          \
+    __m128i vxi0;                                                                              \
+    __m128i vxi1;                                                                              \
+    __m128i vxi2;                                                                              \
+    __m128i vxi3;                                                                              \
+    if constexpr (std::is_signed<T8Bits>::value) {                                             \
+        vxi0 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi0), 8);                               \
+        vxi1 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi1), 8);                               \
+        vxi2 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi2), 8);                               \
+        vxi3 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi3), 8);                               \
+    } else {                                                                                   \
+        vxi0 = _mm_unpacklo_epi8(vi0, vzero);                                                  \
+        vxi1 = _mm_unpacklo_epi8(vi1, vzero);                                                  \
+        vxi2 = _mm_unpacklo_epi8(vi2, vzero);                                                  \
+        vxi3 = _mm_unpacklo_epi8(vi3, vzero);                                                  \
+    }                                                                                          \
+    __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);                                                \
+    __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);                                                \
+    __m128i vsum = _mm_add_epi16(vsum01, vsum23);                                              \
+                                                                                               \
+    if constexpr (std::is_signed<T8Bits>::value) {                                             \
+        vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16)); \
+        vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16)); \
+    } else {                                                                                   \
+        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));                     \
+        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));                     \
+    }
+
+#else
+
+    constexpr size_t PixelsPerIteration = 7;
+#define LOAD_FULL_CHANNELS()                                 \
+    const __m128i vi0 = _mm_loadl_epi64((const __m128i*)i0); \
+    i0 += 8;                                                 \
+    const __m128i vi1 = _mm_loadl_epi64((const __m128i*)i1); \
+    i1 += 8;                                                 \
+    const __m128i vi2 = _mm_loadl_epi64((const __m128i*)i2); \
+    i2 += 8;                                                 \
+    const __m128i vi3 = _mm_loadl_epi64((const __m128i*)i3); \
+    i3 += 8;                                                 \
+    const __m128i vi4 = _mm_loadl_epi64((const __m128i*)i4); \
+    i4 += 8;                                                 \
+    const __m128i vi5 = _mm_loadl_epi64((const __m128i*)i5); \
+    i5 += 8;                                                 \
+    const __m128i vi6 = _mm_loadl_epi64((const __m128i*)i6); \
+    i6 += 8
+
+#define CALCULATE_ACCUMULATE_VECTORS()                                                         \
+    __m128i vacc_lo = finish_one_pass ? _mm_loadu_si128((__m128i*)acc) : vbias;                \
+    __m128i vacc_hi = finish_one_pass ? _mm_loadu_si128(((__m128i*)acc) + 1) : vbias;          \
+    __m128i vxi0;                                                                              \
+    __m128i vxi1;                                                                              \
+    __m128i vxi2;                                                                              \
+    __m128i vxi3;                                                                              \
+    __m128i vxi4;                                                                              \
+    __m128i vxi5;                                                                              \
+    __m128i vxi6;                                                                              \
+    if constexpr (std::is_signed<T8Bits>::value) {                                             \
+        vxi0 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi0), 8);                               \
+        vxi1 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi1), 8);                               \
+        vxi2 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi2), 8);                               \
+        vxi3 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi3), 8);                               \
+        vxi4 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi4), 8);                               \
+        vxi5 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi5), 8);                               \
+        vxi6 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi6), 8);                               \
+    } else {                                                                                   \
+        vxi0 = _mm_unpacklo_epi8(vi0, vzero);                                                  \
+        vxi1 = _mm_unpacklo_epi8(vi1, vzero);                                                  \
+        vxi2 = _mm_unpacklo_epi8(vi2, vzero);                                                  \
+        vxi3 = _mm_unpacklo_epi8(vi3, vzero);                                                  \
+        vxi4 = _mm_unpacklo_epi8(vi4, vzero);                                                  \
+        vxi5 = _mm_unpacklo_epi8(vi5, vzero);                                                  \
+        vxi6 = _mm_unpacklo_epi8(vi6, vzero);                                                  \
+    }                                                                                          \
+    const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1);                                          \
+    const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3);                                          \
+    const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5);                                          \
+    const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6);                                       \
+    const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45);                                    \
+    const __m128i vsum = _mm_add_epi16(vsum016, vsum2345);                                     \
+    if constexpr (std::is_signed<T8Bits>::value) {                                             \
+        vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16)); \
+        vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16)); \
+    } else {                                                                                   \
+        vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero));                     \
+        vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero));                     \
+    }
+
+#endif
+
+    T8Bits tail[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+    bool finish_one_pass = false;
+    const __m128i vbias = _mm_set1_epi32(Bias);
+    const __m128i vzero = _mm_setzero_si128();
+    size_t step_next_group = PixelsPerIteration * Stride - (Channels & ~size_t{7});
+
+    const T8Bits* i0 = Input;
+    const T8Bits* i1 = i0 + Stride;
+    const T8Bits* i2 = i1 + Stride;
+    const T8Bits* i3 = i2 + Stride;
+#if !defined(MLAS_TARGET_IX86)
+    const T8Bits* i4 = i0 + Stride * 4;
+    const T8Bits* i5 = i4 + Stride;
+    const T8Bits* i6 = i5 + Stride;
+#endif
+
+    for (; ImageSize > PixelsPerIteration; ImageSize -= PixelsPerIteration) {
+
+        int32_t* acc = AccumulateBuffer;
+        size_t c = Channels;
+        for (; c >= 8; c -= 8) {
+
+            LOAD_FULL_CHANNELS();
+
+            CALCULATE_ACCUMULATE_VECTORS();
+
+            _mm_storeu_si128((__m128i*)acc, vacc_lo);
+            _mm_storeu_si128(((__m128i*)acc) + 1, vacc_hi);
+            acc += 8;
+        }
+        if (c > 0) {
+            const __m128i vi0 =
+                _mm_loadl_epi64((const __m128i*)(i0 >= LastOf8 ? memcpy(tail, i0, c) : i0));
+            const __m128i vi1 =
+                _mm_loadl_epi64((const __m128i*)(i1 >= LastOf8 ? memcpy(tail, i1, c) : i1));
+            const __m128i vi2 =
+                _mm_loadl_epi64((const __m128i*)(i2 >= LastOf8 ? memcpy(tail, i2, c) : i2));
+            const __m128i vi3 =
+                _mm_loadl_epi64((const __m128i*)(i3 >= LastOf8 ? memcpy(tail, i3, c) : i3));
+#if !defined(MLAS_TARGET_IX86)
+            const __m128i vi4 =
+                _mm_loadl_epi64((const __m128i*)(i4 >= LastOf8 ? memcpy(tail, i4, c) : i4));
+            const __m128i vi5 =
+                _mm_loadl_epi64((const __m128i*)(i5 >= LastOf8 ? memcpy(tail, i5, c) : i5));
+            const __m128i vi6 =
+                _mm_loadl_epi64((const __m128i*)(i6 >= LastOf8 ? memcpy(tail, i6, c) : i6));
+#endif
+
+            CALCULATE_ACCUMULATE_VECTORS();
+
+            _mm_storeu_si128((__m128i*)acc, vacc_lo);
+            _mm_storeu_si128(((__m128i*)acc) + 1, vacc_hi);
         }
         finish_one_pass = true;
 
@@ -249,29 +861,52 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
         i1 += step_next_group;
         i2 += step_next_group;
         i3 += step_next_group;
+#if !defined(MLAS_TARGET_IX86)
         i4 += step_next_group;
         i5 += step_next_group;
         i6 += step_next_group;
+#endif
     }
 
     if (ImageSize > 0) {
-
+#if defined(MLAS_TARGET_IX86)
         switch (ImageSize) {
             case 1:
-                i1 = (const uint8_t*)ZeroBuffer; /* fall through */
+                i1 = ZeroBuffer;
+                [[fallthrough]];
             case 2:
-                i2 = (const uint8_t*)ZeroBuffer; /* fall through */
+                i2 = ZeroBuffer;
+                [[fallthrough]];
             case 3:
-                i3 = (const uint8_t*)ZeroBuffer; /* fall through */
+                i3 = ZeroBuffer;
+                [[fallthrough]];
+            default:
+                break;
+        }
+#else
+        switch (ImageSize) {
+            case 1:
+                i1 = ZeroBuffer;
+                [[fallthrough]];
+            case 2:
+                i2 = ZeroBuffer;
+                [[fallthrough]];
+            case 3:
+                i3 = ZeroBuffer;
+                [[fallthrough]];
             case 4:
-                i4 = (const uint8_t*)ZeroBuffer; /* fall through */
+                i4 = ZeroBuffer;
+                [[fallthrough]];
             case 5:
-                i5 = (const uint8_t*)ZeroBuffer; /* fall through */
+                i5 = ZeroBuffer;
+                [[fallthrough]];
             case 6:
-                i6 = (const uint8_t*)ZeroBuffer; /* fall through */
+                i6 = ZeroBuffer;
+                [[fallthrough]];
             default:
                 break;
         }
+#endif
 
         int32_t* acc = AccumulateBuffer;
         size_t c = Channels;
@@ -281,43 +916,42 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
 
             CALCULATE_ACCUMULATE_VECTORS();
 
-            vst1q_s32(acc, vacc_lo);
-            vst1q_s32(acc + 4, vacc_hi);
+            _mm_storeu_si128((__m128i*)acc, vacc_lo);
+            _mm_storeu_si128(((__m128i*)acc) + 1, vacc_hi);
             acc += 8;
         }
 
         if (c > 0) {
-
-            const uint8x8_t vi0 =
-                vld1_u8(((i0 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i0, c) : i0));
-            const uint8x8_t vi1 = vld1_u8(
-                ((1 < ImageSize && i1 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i1, c) : i1));
-            const uint8x8_t vi2 = vld1_u8(
-                ((2 < ImageSize && i2 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i2, c) : i2));
-            const uint8x8_t vi3 = vld1_u8(
-                ((3 < ImageSize && i3 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i3, c) : i3));
-            const uint8x8_t vi4 = vld1_u8(
-                ((4 < ImageSize && i4 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i4, c) : i4));
-            const uint8x8_t vi5 = vld1_u8(
-                ((5 < ImageSize && i5 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i5, c) : i5));
-            const uint8x8_t vi6 = vld1_u8(
-                ((6 < ImageSize && i6 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i6, c) : i6));
+            const __m128i vi0 =
+                _mm_loadl_epi64((const __m128i*)(i0 >= LastOf8 ? memcpy(tail, i0, c) : i0));
+            const __m128i vi1 = _mm_loadl_epi64(
+                (const __m128i*)(1 < ImageSize && i1 >= LastOf8 ? memcpy(tail, i1, c) : i1));
+            const __m128i vi2 = _mm_loadl_epi64(
+                (const __m128i*)(2 < ImageSize && i2 >= LastOf8 ? memcpy(tail, i2, c) : i2));
+            const __m128i vi3 = _mm_loadl_epi64(
+                (const __m128i*)(3 < ImageSize && i3 >= LastOf8 ? memcpy(tail, i3, c) : i3));
+#if !defined(MLAS_TARGET_IX86)
+            const __m128i vi4 = _mm_loadl_epi64(
+                (const __m128i*)(4 < ImageSize && i4 >= LastOf8 ? memcpy(tail, i4, c) : i4));
+            const __m128i vi5 = _mm_loadl_epi64(
+                (const __m128i*)(5 < ImageSize && i5 >= LastOf8 ? memcpy(tail, i5, c) : i5));
+            const __m128i vi6 = _mm_loadl_epi64(
+                (const __m128i*)(6 < ImageSize && i6 >= LastOf8 ? memcpy(tail, i6, c) : i6));
+#endif
 
             CALCULATE_ACCUMULATE_VECTORS();
 
-            vst1q_s32(acc, vacc_lo);
-            vst1q_s32(acc + 4, vacc_hi);
+            _mm_storeu_si128((__m128i*)acc, vacc_lo);
+            _mm_storeu_si128(((__m128i*)acc) + 1, vacc_hi);
         }
     }
     MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &Scale, false,
                          Output_zero_point, 0, 0, 1, Channels);
 }
 
-#elif defined(MLAS_SSE2_INTRINSICS)
-
 template <typename T8Bits>
 void MLASCALL
-MlasQLinearGlobalAveragePoolNchw(
+MlasQLinearGlobalAveragePoolNchwFixedPoint(
     const T8Bits* Input,
     float ScaleInput,
     int32_t ZeroPointInput,
@@ -415,14 +1049,14 @@ MlasQLinearGlobalAveragePoolNchw(
         *sum_buffer++ = _mm_cvtsi128_si32(vsums);
     }
 
-    MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &scale, false,
+    MlasRequantizeOutputFixedPoint(AccumulateBuffer, Channels, Output, Channels, nullptr, &scale, false,
                          static_cast<T8Bits>(ZeroPointOutput), 0, 0, 1, Channels);
 }
 
 template <typename T8Bits>
 MLAS_FORCEINLINE
 void
-MlasQLinearGlobalAveragePoolNhwcSingleBatch(
+MlasQLinearGlobalAveragePoolNhwcSingleBatchFixedPoint(
     const T8Bits* Input,
     T8Bits* Output,
     const T8Bits* LastOf8,
@@ -685,7 +1319,7 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch(
             _mm_storeu_si128(((__m128i*)acc) + 1, vacc_hi);
         }
     }
-    MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &Scale, false,
+    MlasRequantizeOutputFixedPoint(AccumulateBuffer, Channels, Output, Channels, nullptr, &Scale, false,
                          Output_zero_point, 0, 0, 1, Channels);
 }
 
@@ -1079,8 +1713,97 @@ MlasQLinearGlobalAveragePoolNhwc(
     }
 }
 
+// Global average pool over NCHW 8-bit input using an integer (fixed-point) requantization scale.
+// Each channel's ImageSize inputs are summed, zero-point corrected, rescaled, and clamped to T8Bits.
+template <typename T8Bits>
+void MLASCALL
+MlasQLinearGlobalAveragePoolNchwFixedPoint(
+    const T8Bits* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    T8Bits* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Channels,
+    size_t ImageSize,
+    int32_t* /* AccumulateBuffer */
+    )
+{
+    const float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize);
+    std::vector<double> ScaleValueVec = {scale};  // Create single-element vector
+    const auto pair = dataToQfp(ScaleValueVec, -1, 32, false); // Returns std::make_pair(qfp, fracBits)
+    const int fracBits = pair.second;
+    const int64_t fpScale = static_cast<int64_t>((scale) * (1LL << fracBits));  // scale * 2^fracBits
+
+    // Starting each sum at -ZeroPointInput * ImageSize subtracts the zero point from every input.
+    const int32_t bias = -ZeroPointInput * static_cast<int32_t>(ImageSize);
+    for (; Channels > 0; Channels--) {
+        int32_t acc = bias;
+        for (size_t i = 0; i < ImageSize; ++i) {
+            acc += static_cast<int32_t>(*Input++);
+        }
+        int32_t v = static_cast<int32_t>((acc * fpScale) >> fracBits) + ZeroPointOutput;
+        v = std::min(static_cast<int32_t>(std::numeric_limits<T8Bits>::max()), v);
+        v = std::max(static_cast<int32_t>(std::numeric_limits<T8Bits>::lowest()), v);
+        *Output++ = static_cast<T8Bits>(v);
+    }
+}
+
+// Global average pool over NHWC 8-bit input using an integer (fixed-point) requantization scale.
+// For each batch, AccumulateBuffer holds one running sum per channel; the sums are then
+// rescaled, offset by ZeroPointOutput, and clamped to the range of T8Bits.
+template <typename T8Bits>
+void MLASCALL
+MlasQLinearGlobalAveragePoolNhwcFixedPoint(
+    const T8Bits* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    T8Bits* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Batch,
+    size_t ImageSize,
+    size_t Stride,
+    size_t Channels,
+    int32_t* AccumulateBuffer,
+    const T8Bits* /*ZeroBuffer*/
+    )
+{
+    const float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize);
+    std::vector<double> ScaleValueVec = {scale};  // Create single-element vector
+    const auto pair = dataToQfp(ScaleValueVec, -1, 32, false); // Returns std::make_pair(qfp, fracBits)
+    const int fracBits = pair.second;
+    const int64_t fpScale = static_cast<int64_t>((scale) * (1LL << fracBits));  // scale * 2^fracBits
+
+    // Seeding each sum with -ZeroPointInput * ImageSize subtracts the zero point from every input.
+    const int32_t bias = -ZeroPointInput * static_cast<int32_t>(ImageSize);
+    for (; Batch > 0; Batch--) {
+        const T8Bits* batch_input = Input;
+        T8Bits* batch_output = Output;
+        Input += Stride * ImageSize;
+        Output += Stride;
+        std::fill_n(AccumulateBuffer, Channels, bias);
+        // Consecutive pixels are Stride elements apart; only the first Channels of each are read.
+        for (size_t i = 0; i < ImageSize; ++i) {
+            for (size_t c = 0; c < Channels; ++c) {
+                AccumulateBuffer[c] += static_cast<int>(batch_input[c]);
+            }
+            batch_input += Stride;
+        }
+
+        // Requantize: 64-bit product with the fixed-point scale, shifted back down by fracBits.
+        for (size_t c = 0; c < Channels; ++c) {
+            int32_t v = static_cast<int32_t>((AccumulateBuffer[c] * fpScale) >> fracBits) + ZeroPointOutput;
+            v = std::min(static_cast<int32_t>(std::numeric_limits<T8Bits>::max()), v);
+            v = std::max(static_cast<int32_t>(std::numeric_limits<T8Bits>::lowest()), v);
+            *batch_output++ = static_cast<T8Bits>(v);
+        }
+    }
+}
+
 #endif
 
+
 #if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_LSX_INTRINSICS)
 
 template <typename T8Bits>
@@ -1114,6 +1837,37 @@ MlasQLinearGlobalAveragePoolNhwc(
     }
 }
 
+// NHWC fixed-point global average pool: invokes the single-batch kernel once per batch.
+template <typename T8Bits>
+void MLASCALL
+MlasQLinearGlobalAveragePoolNhwcFixedPoint(
+    const T8Bits* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    T8Bits* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Batch,
+    size_t ImageSize,
+    size_t Stride,
+    size_t Channels,
+    int32_t* AccumulateBuffer,
+    const T8Bits* ZeroBuffer
+    )
+{
+    const float RequantScale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize);
+    const int32_t ZeroPointBias = -ZeroPointInput * static_cast<int32_t>(ImageSize);
+    const size_t BatchStride = ImageSize * Stride;  // elements between consecutive batches
+    const T8Bits* const InputTail = Input + (Batch * BatchStride - Stride + Channels) - 8;  // 8 before the last pixel's channel end
+    while (Batch-- > 0) {
+        MlasQLinearGlobalAveragePoolNhwcSingleBatchFixedPoint(
+            Input, Output, InputTail, ImageSize, Channels, Stride, ZeroPointBias, RequantScale,
+            static_cast<T8Bits>(ZeroPointOutput), AccumulateBuffer, ZeroBuffer);
+        Input += BatchStride;
+        Output += Stride;
+    }
+}
+
 #endif
 
 template
@@ -1181,3 +1935,69 @@ MlasQLinearGlobalAveragePoolNhwc<uint8_t>(
     int32_t* AccumulateBuffer,
     const uint8_t* ZeroBuffer
     );
+
+template  // Explicit instantiation of the NCHW fixed-point global average pool for int8_t.
+void
+MLASCALL
+MlasQLinearGlobalAveragePoolNchwFixedPoint<int8_t>(
+    const int8_t* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    int8_t* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Channels,
+    size_t ImageSize,
+    int32_t* AccumulateBuffer
+    );
+
+template  // Explicit instantiation of the NCHW fixed-point global average pool for uint8_t.
+void
+MLASCALL
+MlasQLinearGlobalAveragePoolNchwFixedPoint<uint8_t>(
+    const uint8_t* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    uint8_t* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Channels,
+    size_t ImageSize,
+    int32_t* AccumulateBuffer
+    );
+
+template  // Explicit instantiation of the NHWC fixed-point global average pool for int8_t.
+void
+MLASCALL
+MlasQLinearGlobalAveragePoolNhwcFixedPoint<int8_t>(
+    const int8_t* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    int8_t* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Batch,
+    size_t ImageSize,
+    size_t Stride,
+    size_t Channels,
+    int32_t* AccumulateBuffer,
+    const int8_t* ZeroBuffer
+    );
+
+template  // Explicit instantiation of the NHWC fixed-point global average pool for uint8_t.
+void
+MLASCALL
+MlasQLinearGlobalAveragePoolNhwcFixedPoint<uint8_t>(
+    const uint8_t* Input,
+    float ScaleInput,
+    int32_t ZeroPointInput,
+    uint8_t* Output,
+    float ScaleOutput,
+    int32_t ZeroPointOutput,
+    size_t Batch,
+    size_t ImageSize,
+    size_t Stride,
+    size_t Channels,
+    int32_t* AccumulateBuffer,
+    const uint8_t* ZeroBuffer
+    );
diff --git a/onnxruntime/core/mlas/lib/qpostprocessor.cpp b/onnxruntime/core/mlas/lib/qpostprocessor.cpp
index 97e9000a19..620a5efa50 100644
--- a/onnxruntime/core/mlas/lib/qpostprocessor.cpp
+++ b/onnxruntime/core/mlas/lib/qpostprocessor.cpp
@@ -158,6 +158,227 @@ Return Value:
     Output += StartM * LeadingDimensionOutput_ + StartN;
 
 
+    while (CountM-- > 0) {
+
+        float* c_out = Output;
+        const int32_t* c = C;
+        const float* bias = Bias;
+        const float* scale = Scale;
+
+        size_t n = CountN;
+
+        while (n >= 4) {
+
+            MLAS_FLOAT32X4 FloatVector = MlasCastToFloat32x4(MlasLoadInt32x4(c));
+
+            if (QuantGran == MLAS_QUANTIZATION_GRANULARITY::PerColumn) {
+                ScaleVector = MlasLoadFloat32x4(scale);
+                scale += 4;
+            }
+
+            if (Mode == MLAS_QGEMM_OUTPUT_MODE::AccumulateMode) {
+                FloatVector = MlasMultiplyAddFloat32x4(FloatVector, ScaleVector, MlasLoadFloat32x4(c_out));
+            } else {
+                FloatVector = MlasMultiplyFloat32x4(FloatVector, ScaleVector);
+            }
+
+            if (HasBias) {
+                FloatVector = MlasAddFloat32x4(FloatVector, MlasLoadFloat32x4(bias));
+                bias += 4;
+            }
+
+            MlasStoreFloat32x4(c_out, FloatVector);
+
+            c_out += 4;
+            c += 4;
+            n -= 4;
+        }
+
+        for (size_t offset = 0; offset < n; offset++) {
+
+#if defined(MLAS_SSE2_INTRINSICS)
+            __m128 FloatVector = _mm_set_ss(float(c[offset]));
+
+            if (QuantGran == MLAS_QUANTIZATION_GRANULARITY::PerColumn) {
+                ScaleVector = _mm_load_ss(&scale[offset]);
+            }
+
+            if (Mode == MLAS_QGEMM_OUTPUT_MODE::AccumulateMode) {
+                FloatVector = _mm_add_ps(_mm_mul_ss(FloatVector, ScaleVector), _mm_load_ss(&c_out[offset]));
+            } else {
+                FloatVector = _mm_mul_ss(FloatVector, ScaleVector);
+            }
+
+            if (HasBias) {
+                FloatVector = _mm_add_ss(FloatVector, _mm_load_ss(&bias[offset]));
+            }
+
+            _mm_store_ss(&c_out[offset], FloatVector);
+#else
+            if (QuantGran == MLAS_QUANTIZATION_GRANULARITY::PerColumn) {
+                ScaleValue = scale[offset];
+            }
+
+            float result = float(c[offset]) * ScaleValue;
+            if (HasBias) {
+                result += bias[offset];
+            }
+
+            if (Mode == MLAS_QGEMM_OUTPUT_MODE::AccumulateMode) {
+                c_out[offset] += result;
+            } else {
+                c_out[offset] = result;
+            }
+#endif
+        }
+
+        C += ldc;
+        Output += LeadingDimensionOutput_;
+    }
+}
+
+void MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR_FIXEDPOINT::Process(
+    const int32_t* C,
+    size_t StartM,
+    size_t StartN,
+    size_t CountM,
+    size_t CountN,
+    size_t ldc
+    ) const
+/*++
+
+Routine Description:
+
+    This routine selects the ProcessImpl specialization that matches the
+    stored bias pointer, quantization granularity, and output mode, and
+    forwards all arguments to it unchanged. Exactly one specialization
+    runs per call.
+
+Arguments:
+
+    C - Supplies the address of matrix C.
+
+    StartM - Supplies the starting row offset relative to the matrix.
+
+    StartN - Supplies the starting column offset relative to the matrix.
+
+    CountM - Supplies the number of rows of the output matrix to process.
+
+    CountN - Supplies the number of columns of the output matrix to process.
+
+    ldc - Supplies the leading dimension of C.
+
+Return Value:
+
+    None.
+
+--*/
+{
+    // Read the runtime settings once; together with the bias pointer they
+    // pick exactly one of the eight ProcessImpl specializations below.
+    const bool IsPerColumn = (QuantGran_ == MLAS_QUANTIZATION_GRANULARITY::PerColumn);
+    const bool IsAccumulate = (OutputMode_ == MLAS_QGEMM_OUTPUT_MODE::AccumulateMode);
+
+    if (Bias_) {
+        // A bias pointer is present: HasBias = true.
+        if (IsPerColumn && IsAccumulate) {
+            ProcessImpl<true, MLAS_QGEMM_OUTPUT_MODE::AccumulateMode, MLAS_QUANTIZATION_GRANULARITY::PerColumn>(
+                C, StartM, StartN,
+                CountM, CountN, ldc);
+        } else if (IsPerColumn) {
+            ProcessImpl<true, MLAS_QGEMM_OUTPUT_MODE::ZeroMode, MLAS_QUANTIZATION_GRANULARITY::PerColumn>(
+                C, StartM, StartN,
+                CountM, CountN, ldc);
+        } else if (IsAccumulate) {
+            ProcessImpl<true, MLAS_QGEMM_OUTPUT_MODE::AccumulateMode, MLAS_QUANTIZATION_GRANULARITY::PerMatrix>(
+                C, StartM, StartN,
+                CountM, CountN, ldc);
+        } else {
+            ProcessImpl<true, MLAS_QGEMM_OUTPUT_MODE::ZeroMode, MLAS_QUANTIZATION_GRANULARITY::PerMatrix>(
+                C, StartM, StartN,
+                CountM, CountN, ldc);
+        }
+        return;
+    }
+
+    // No bias pointer: HasBias = false.
+    if (IsPerColumn && IsAccumulate) {
+        ProcessImpl<false, MLAS_QGEMM_OUTPUT_MODE::AccumulateMode, MLAS_QUANTIZATION_GRANULARITY::PerColumn>(
+            C, StartM, StartN,
+            CountM, CountN, ldc);
+    } else if (IsPerColumn) {
+        ProcessImpl<false, MLAS_QGEMM_OUTPUT_MODE::ZeroMode, MLAS_QUANTIZATION_GRANULARITY::PerColumn>(
+            C, StartM, StartN,
+            CountM, CountN, ldc);
+    } else if (IsAccumulate) {
+        ProcessImpl<false, MLAS_QGEMM_OUTPUT_MODE::AccumulateMode, MLAS_QUANTIZATION_GRANULARITY::PerMatrix>(
+            C, StartM, StartN,
+            CountM, CountN, ldc);
+    } else {
+        ProcessImpl<false, MLAS_QGEMM_OUTPUT_MODE::ZeroMode, MLAS_QUANTIZATION_GRANULARITY::PerMatrix>(
+            C, StartM, StartN,
+            CountM, CountN, ldc);
+    }
+}
+
+template<bool HasBias, MLAS_QGEMM_OUTPUT_MODE Mode, MLAS_QUANTIZATION_GRANULARITY QuantGran>
+inline
+void
+MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR_FIXEDPOINT::ProcessImpl(
+    const int32_t* C,
+    size_t StartM,
+    size_t StartN,
+    size_t CountM,
+    size_t CountN,
+    size_t ldc) const
+/*++
+
+Routine Description:
+
+    This routine converts the output matrix C to a floating point format using
+    the stored scale and bias parameters.
+
+Arguments:
+
+    C - Supplies the address of matrix C.
+
+    StartM - Supplies the starting row offset relative to the matrix.
+
+    StartN - Supplies the starting column offset relative to the matrix.
+
+    CountM - Supplies the number of rows of the output matrix to process.
+
+    CountN - Supplies the number of columns of the output matrix to process.
+
+    ldc - Supplies the leading dimension of C.
+
+Return Value:
+
+    None.
+
+--*/
+{
+    float* Output = Output_;
+    const float* Bias = Bias_;
+    const float* Scale = Scale_;
+
+    if (HasBias) {
+        Bias += StartN;
+    }
+
+    if(QuantGran == MLAS_QUANTIZATION_GRANULARITY::PerColumn){
+        Scale += StartN;
+    }
+
+    MLAS_FLOAT32X4 ScaleVector = MlasBroadcastFloat32x4(Scale_);
+#if !defined(MLAS_SSE2_INTRINSICS)
+    float ScaleValue = MlasExtractLaneFloat32x4<0>(ScaleVector);
+#endif
+
+    C += StartM * ldc + StartN;
+    Output += StartM * LeadingDimensionOutput_ + StartN;
+
+
     while (CountM-- > 0) {
 
         float* c_out = Output;
diff --git a/onnxruntime/core/mlas/lib/quantize.cpp b/onnxruntime/core/mlas/lib/quantize.cpp
index ae638fafee..ccddd8a4ad 100644
--- a/onnxruntime/core/mlas/lib/quantize.cpp
+++ b/onnxruntime/core/mlas/lib/quantize.cpp
@@ -19,12 +19,24 @@ Module Name:
 --*/
 
 #include "mlasi.h"
+#include <iostream>
+
+#include <cmath>
+#include <cstdlib>
+#include <vector>
+#include <cassert>
+#include <type_traits>
+#include <algorithm>
+#include <numeric>
+#include <iterator>
 
 #if defined(MLAS_NEON64_INTRINSICS) || defined(MLAS_SSE2_INTRINSICS) || \
-    defined(MLAS_LSX_INTRINSICS)
+    defined(MLAS_LSX_INTRINSICS) || defined(MLAS_SSE41_INTRINSICS)
 
 #include <type_traits>
 
+#include "qfunctions_helper.h"
+
 //
 // QuantizeLinear implementation using NEON or SSE2 intrinsics.
 //
@@ -1023,7 +1035,7 @@ MlasRequantizeOutput(
     size_t CountN
     )
 {
-    const __m128 PerMatrixScaleVector = PerColumnScale ? _mm_setzero_ps() : _mm_load1_ps(Scale);
+    const __m128 PerMatrixScaleVector = PerColumnScale ? _mm_setzero_ps() : _mm_load1_ps(Scale); // _mm_load1_ps loads one float into 4 words
     const __m128 MinimumValueVector = _mm_set1_ps(float(std::numeric_limits<OutputType>::lowest() - ZeroPoint));
     const __m128 MaximumValueVector = _mm_set1_ps(float(std::numeric_limits<OutputType>::max() - ZeroPoint));
     const __m128i ZeroPointVector = _mm_set1_epi32(ZeroPoint);
@@ -2047,6 +2059,93 @@ MlasRequantizeOutput(
 
 #endif
 
+// Requantizes a block of int32 accumulators to OutputType using integer (fixed-point) scaling.
+// Each float scale becomes scale * 2^fracBits, with fracBits taken from dataToQfp; the 64-bit
+// product is shifted right by (fracBits - 2), passed through customRound<2>, offset by
+// ZeroPoint, and clamped to the range of OutputType.
+template <typename OutputType>
+void MLASCALL
+MlasRequantizeOutputFixedPoint(
+    const int32_t* Input,
+    size_t InputLeadingDimension,
+    OutputType* Output,
+    size_t OutputLeadingDimension,
+    const int32_t* Bias,
+    const float* Scale,
+    bool PerColumnScale,
+    OutputType ZeroPoint,
+    size_t StartM,
+    size_t StartN,
+    size_t CountM,
+    size_t CountN
+    )
+{
+    // Scale holds one value for the whole matrix, or one value per column; in the per-column
+    // case only the CountN entries starting at column StartN are needed here.
+    const size_t ScaleCount = PerColumnScale ? CountN : size_t{1};
+    const float* ScaleBase = PerColumnScale ? Scale + StartN : Scale;
+
+    // Convert every scale up front so the row loop below is pure integer arithmetic.
+    // The vectors also own the converted values, so nothing needs to be freed by hand.
+    std::vector<int64_t> FixedScale(ScaleCount);
+    std::vector<int> ProductShift(ScaleCount);
+    for (size_t i = 0; i < ScaleCount; ++i) {
+        std::vector<float> ScaleValueVec = {ScaleBase[i]};  // Create single-element vector
+        const auto p = dataToQfp(ScaleValueVec, -1, 32, false); // Returns std::make_pair(qfp, fracBits)
+        const int fracBits = p.second;
+        const int64_t fpScale = static_cast<int64_t>(ScaleBase[i] * (1LL << fracBits));
+        // NOTE(review): the per-matrix scale is narrowed to 32 bits — confirm that is intended.
+        FixedScale[i] = PerColumnScale ? fpScale : static_cast<int32_t>(fpScale);
+        ProductShift[i] = fracBits - 2;  // leaves two fractional bits for customRound<2>
+    }
+
+    const int32_t MinimumValue = std::numeric_limits<OutputType>::lowest();
+    const int32_t MaximumValue = std::numeric_limits<OutputType>::max();
+
+    if (nullptr != Bias) {
+        Bias += StartN;
+    }
+
+    Input += StartM * InputLeadingDimension + StartN;
+    Output += StartM * OutputLeadingDimension + StartN;
+
+    //
+    // Step through each row of the output matrix.
+    //
+
+    while (CountM-- > 0) {
+
+        const int32_t* bias = Bias;
+        auto* RowInput = Input;
+        auto* RowOutput = Output;
+
+        for (size_t n = 0; n < CountN; ++n) {
+
+            int32_t IntegerValue = *RowInput++;
+
+            if (bias != nullptr) {
+                IntegerValue += *bias++;
+            }
+
+            // Column n has its own scale entry when PerColumnScale is set; otherwise use entry 0.
+            const size_t ScaleIndex = PerColumnScale ? n : size_t{0};
+
+            int64_t largeInt = static_cast<int64_t>(IntegerValue) * FixedScale[ScaleIndex];
+            largeInt = largeInt >> ProductShift[ScaleIndex];
+            IntegerValue = customRound<2>(static_cast<int32_t>(largeInt));
+            int32_t Intermediate = IntegerValue + ZeroPoint;
+            Intermediate = std::max(Intermediate, MinimumValue);
+            Intermediate = std::min(Intermediate, MaximumValue);
+
+            *RowOutput++ = OutputType(Intermediate);
+        }
+
+        // Next Row
+        Input += InputLeadingDimension;
+        Output += OutputLeadingDimension;
+    }
+}
+
 template
 void
 MLASCALL
@@ -2083,6 +2182,42 @@ MlasRequantizeOutput<uint8_t>(
     size_t CountN
     );
 
+template
+void
+MLASCALL
+MlasRequantizeOutputFixedPoint<int8_t>(
+    const int32_t* Input,
+    size_t InputLeadingDimension,
+    int8_t* Output,
+    size_t OutputLeadingDimension,
+    const int32_t* Bias,
+    const float* Scale,
+    bool PerColumnScale,
+    int8_t ZeroPoint,
+    size_t StartM,
+    size_t StartN,
+    size_t CountM,
+    size_t CountN
+    );
+
+template
+void
+MLASCALL
+MlasRequantizeOutputFixedPoint<uint8_t>(
+    const int32_t* Input,
+    size_t InputLeadingDimension,
+    uint8_t* Output,
+    size_t OutputLeadingDimension,
+    const int32_t* Bias,
+    const float* Scale,
+    bool PerColumnScale,
+    uint8_t ZeroPoint,
+    size_t StartM,
+    size_t StartN,
+    size_t CountM,
+    size_t CountN
+    );
+
 void
 MLASCALL
 MlasFindMinMaxElement(
diff --git a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc
index 7797cbe678..42291f89c9 100644
--- a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc
+++ b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc
@@ -10,6 +10,7 @@
 #include "core/util/math_cpuonly.h"
 #include "core/util/qmath.h"
 #include "core/mlas/inc/mlas.h"
+#include "core/framework/op_kernel_context_internal.h"
 
 namespace onnxruntime {
 
@@ -513,6 +514,15 @@ Status QLinearConv<ActType>::UseSharedPrePackedBuffers(std::vector<BufferUniqueP
 
 template <typename ActType>
 Status QLinearConv<ActType>::Compute(OpKernelContext* context) const {
+  // Cast to internal type because we want to access session_options parameter
+  auto* internal_context = dynamic_cast<OpKernelContextInternal*>(context);
+  if (!internal_context) {
+      return Status(common::ONNXRUNTIME, common::FAIL, "Failed to cast OpKernelContext to OpKernelContextInternal");
+  }
+  const auto& session_options = internal_context->GetSessionState().GetSessionOptions();
+  // Test to see if we have access to enable_gpnpu flag
+  const bool gpnpu_flag = session_options.enable_gpnpu;
+
   const Tensor* X = context->Input<Tensor>(InputTensors::IN_X);
   const Tensor* W = is_W_packed_ ? nullptr : context->Input<Tensor>(InputTensors::IN_W);
   const auto& W_shape = W ? W->Shape() : W_shape_;
@@ -973,8 +983,9 @@ Status QLinearConv<ActType>::Compute(OpKernelContext* context) const {
           }
         }
       }
-
-      MlasRequantizeOutput(
+      if (gpnpu_flag) {
+        // Fixed-point variant of MlasRequantizeOutput (integer scaling instead of floating point)
+        MlasRequantizeOutputFixedPoint(
           worker_gemm_output,
           static_cast<size_t>(M),
           worker_output,
@@ -987,6 +998,21 @@ Status QLinearConv<ActType>::Compute(OpKernelContext* context) const {
           0,
           static_cast<size_t>(output_count),
           static_cast<size_t>(M));
+      } else {
+        MlasRequantizeOutput(
+          worker_gemm_output,
+          static_cast<size_t>(M),
+          worker_output,
+          static_cast<size_t>(M),
+          Bdata,
+          output_scales.data(),
+          output_scales.size() > 1,
+          Y_zero_point_value,
+          0,
+          0,
+          static_cast<size_t>(output_count),
+          static_cast<size_t>(M));
+      }
     };
 
     concurrency::ThreadPool::TrySimpleParallelFor(thread_pool, onnxruntime::narrow<ptrdiff_t>(task_count), conv_worker);
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index 7af659851e..4a6c9a0e9f 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -1654,6 +1654,13 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra
           },
           R"pbdoc(Enables the memory arena on CPU. Arena may pre-allocate memory for future usage.
 Set this option to false if you don't want it. Default is True.)pbdoc")
+      .def_property(
+          "enable_gpnpu",
+          [](const PySessionOptions* options) -> bool { return options->value.enable_gpnpu; },
+          [](PySessionOptions* options, bool enable_gpnpu) -> void {
+            options->value.enable_gpnpu = enable_gpnpu;
+          },
+          R"pbdoc(Enable GPNPU mode. Default is false.)pbdoc")
       .def_property(
           "enable_profiling",
           [](const PySessionOptions* options) -> bool { return options->value.enable_profiling; },
diff --git a/onnxruntime/test/python/gpnpumode/16gpnpu_2025-01-31_18-22-14.json b/onnxruntime/test/python/gpnpumode/16gpnpu_2025-01-31_18-22-14.json
new file mode 100644
index 0000000000..81c449c552
--- /dev/null
+++ b/onnxruntime/test/python/gpnpumode/16gpnpu_2025-01-31_18-22-14.json
@@ -0,0 +1,237 @@
+[
+{"cat" : "Session","pid" :635173,"tid" :635173,"dur" :24021,"ts" :6,"ph" : "X","name" :"model_loading_uri","args" : {}},
+{"cat" : "Session","pid" :635173,"tid" :635173,"dur" :85073,"ts" :24066,"ph" : "X","name" :"session_initialization","args" : {}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :109329,"ph" : "X","name" :"input_QuantizeLinear_fence_before","args" : {"op_name" : "QuantizeLinear"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1325,"ts" :109332,"ph" : "X","name" :"input_QuantizeLinear_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [109], "core": 2, "Distribution": 4, "DistributionEnqueue": 3, "Run": 1254, "Wait": 21, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 1, "core": 30},"132401808279232": {"num_run": 1, "core": 19},"132401663575744": {"num_run": 1, "core": 16},"132401797793472": {"num_run": 1, "core": 26},"132401787307712": {"num_run": 1, "core": 31},"132401776821952": {"num_run": 1, "core": 17},"132401692935872": {"num_run": 1, "core": 24},"132401682450112": {"num_run": 1, "core": 29},"132401653089984": {"num_run": 1, "core": 9},"132401642604224": {"num_run": 0, "core": -1},"132401558718144": {"num_run": 0, "core": -1},"132401548232384": {"num_run": 0, "core": -1},"132401537746624": {"num_run": 0, "core": -1},"132401527260864": {"num_run": 0, "core": -1},"132401516775104": {"num_run": 0, "core": -1}}},"output_type_shape" : [{"int8":[1,3,224,224]}],"output_size" : "150528","parameter_size" : "5","activation_size" : "602112","node_index" : "0","input_type_shape" : [{"float":[1,3,224,224]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QuantizeLinear"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :110663,"ph" : "X","name" :"input_QuantizeLinear_fence_after","args" : {"op_name" : "QuantizeLinear"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :110667,"ph" : "X","name" :"Transpose_fence_before","args" : {"op_name" : "Transpose"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :36,"ts" :110668,"ph" : "X","name" :"Transpose_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 1, "core": 30},"132401808279232": {"num_run": 1, "core": 19},"132401663575744": {"num_run": 1, "core": 16},"132401797793472": {"num_run": 1, "core": 26},"132401787307712": {"num_run": 1, "core": 31},"132401776821952": {"num_run": 1, "core": 17},"132401692935872": {"num_run": 1, "core": 24},"132401682450112": {"num_run": 1, "core": 29},"132401653089984": {"num_run": 1, "core": 9},"132401642604224": {"num_run": 0, "core": -1},"132401558718144": {"num_run": 0, "core": -1},"132401548232384": {"num_run": 0, "core": -1},"132401537746624": {"num_run": 0, "core": -1},"132401527260864": {"num_run": 0, "core": -1},"132401516775104": {"num_run": 0, "core": -1}}},"output_type_shape" : [{"int8":[1,224,224,3]}],"output_size" : "150528","parameter_size" : "0","activation_size" : "150528","node_index" : "77","input_type_shape" : [{"int8":[1,3,224,224]}],"provider" : "CPUExecutionProvider","op_name" : "Transpose"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :110709,"ph" : "X","name" :"Transpose_fence_after","args" : {"op_name" : "Transpose"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :110712,"ph" : "X","name" :"/conv1/Conv_quant_token_1_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1781,"ts" :110713,"ph" : "X","name" :"/conv1/Conv_quant_token_1_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 0, "Distribution": 1, "DistributionEnqueue": 0, "Run": 727, "Wait": 151, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 2, "core": 30},"132401808279232": {"num_run": 2, "core": 19},"132401663575744": {"num_run": 2, "core": 16},"132401797793472": {"num_run": 2, "core": 26},"132401787307712": {"num_run": 2, "core": 31},"132401776821952": {"num_run": 2, "core": 17},"132401692935872": {"num_run": 2, "core": 24},"132401682450112": {"num_run": 2, "core": 29},"132401653089984": {"num_run": 2, "core": 9},"132401642604224": {"num_run": 1, "core": 6},"132401558718144": {"num_run": 1, "core": 3},"132401548232384": {"num_run": 1, "core": 27},"132401537746624": {"num_run": 1, "core": 21},"132401527260864": {"num_run": 1, "core": 25},"132401516775104": {"num_run": 1, "core": 20}}},"output_type_shape" : [{"int8":[1,112,112,64]}],"output_size" : "802816","parameter_size" : "271","activation_size" : "150528","node_index" : "79","input_type_shape" : [{"int8":[1,224,224,3]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :112499,"ph" : "X","name" :"/conv1/Conv_quant_token_1_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :112503,"ph" : "X","name" :"/maxpool/MaxPool_token_175_fence_before","args" : {"op_name" : "NhwcMaxPool"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :68,"ts" :112503,"ph" : "X","name" :"/maxpool/MaxPool_token_175_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 0, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 2, "core": 30},"132401808279232": {"num_run": 2, "core": 19},"132401663575744": {"num_run": 2, "core": 16},"132401797793472": {"num_run": 2, "core": 26},"132401787307712": {"num_run": 2, "core": 31},"132401776821952": {"num_run": 2, "core": 17},"132401692935872": {"num_run": 2, "core": 24},"132401682450112": {"num_run": 2, "core": 29},"132401653089984": {"num_run": 2, "core": 9},"132401642604224": {"num_run": 1, "core": 6},"132401558718144": {"num_run": 1, "core": 3},"132401548232384": {"num_run": 1, "core": 27},"132401537746624": {"num_run": 1, "core": 21},"132401527260864": {"num_run": 1, "core": 25},"132401516775104": {"num_run": 1, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "0","activation_size" : "802816","node_index" : "200","input_type_shape" : [{"int8":[1,112,112,64]}],"provider" : "CPUExecutionProvider","op_name" : "NhwcMaxPool"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :112574,"ph" : "X","name" :"/maxpool/MaxPool_token_175_fence_after","args" : {"op_name" : "NhwcMaxPool"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :112577,"ph" : "X","name" :"/layer1/layer1.0/downsample/downsample.0/Conv_quant_token_14_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :398,"ts" :112577,"ph" : "X","name" :"/layer1/layer1.0/downsample/downsample.0/Conv_quant_token_14_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 320, "Wait": 64, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 3, "core": 30},"132401808279232": {"num_run": 3, "core": 19},"132401663575744": {"num_run": 3, "core": 16},"132401797793472": {"num_run": 3, "core": 26},"132401787307712": {"num_run": 3, "core": 31},"132401776821952": {"num_run": 3, "core": 17},"132401692935872": {"num_run": 3, "core": 24},"132401682450112": {"num_run": 3, "core": 29},"132401653089984": {"num_run": 3, "core": 9},"132401642604224": {"num_run": 2, "core": 6},"132401558718144": {"num_run": 2, "core": 3},"132401548232384": {"num_run": 2, "core": 27},"132401537746624": {"num_run": 2, "core": 21},"132401527260864": {"num_run": 2, "core": 25},"132401516775104": {"num_run": 2, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "1039","activation_size" : "200704","node_index" : "88","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :112979,"ph" : "X","name" :"/layer1/layer1.0/downsample/downsample.0/Conv_quant_token_14_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :112981,"ph" : "X","name" :"/layer1/layer1.0/conv1/Conv_quant_token_5_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :160,"ts" :112981,"ph" : "X","name" :"/layer1/layer1.0/conv1/Conv_quant_token_5_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 90, "Wait": 60, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 4, "core": 30},"132401808279232": {"num_run": 4, "core": 19},"132401663575744": {"num_run": 4, "core": 16},"132401797793472": {"num_run": 4, "core": 26},"132401787307712": {"num_run": 4, "core": 31},"132401776821952": {"num_run": 4, "core": 17},"132401692935872": {"num_run": 4, "core": 24},"132401682450112": {"num_run": 4, "core": 29},"132401653089984": {"num_run": 4, "core": 9},"132401642604224": {"num_run": 3, "core": 6},"132401558718144": {"num_run": 3, "core": 3},"132401548232384": {"num_run": 3, "core": 27},"132401537746624": {"num_run": 3, "core": 21},"132401527260864": {"num_run": 3, "core": 25},"132401516775104": {"num_run": 3, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "271","activation_size" : "200704","node_index" : "82","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :113144,"ph" : "X","name" :"/layer1/layer1.0/conv1/Conv_quant_token_5_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :113145,"ph" : "X","name" :"/layer1/layer1.0/conv2/Conv_quant_token_8_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :679,"ts" :113146,"ph" : "X","name" :"/layer1/layer1.0/conv2/Conv_quant_token_8_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 1, "DistributionEnqueue": 0, "Run": 574, "Wait": 48, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 5, "core": 30},"132401808279232": {"num_run": 5, "core": 19},"132401663575744": {"num_run": 5, "core": 16},"132401797793472": {"num_run": 5, "core": 26},"132401787307712": {"num_run": 5, "core": 31},"132401776821952": {"num_run": 5, "core": 17},"132401692935872": {"num_run": 5, "core": 24},"132401682450112": {"num_run": 5, "core": 29},"132401653089984": {"num_run": 5, "core": 9},"132401642604224": {"num_run": 4, "core": 6},"132401558718144": {"num_run": 4, "core": 3},"132401548232384": {"num_run": 4, "core": 27},"132401537746624": {"num_run": 4, "core": 21},"132401527260864": {"num_run": 4, "core": 25},"132401516775104": {"num_run": 4, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "271","activation_size" : "200704","node_index" : "84","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :113826,"ph" : "X","name" :"/layer1/layer1.0/conv2/Conv_quant_token_8_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :113828,"ph" : "X","name" :"/layer1/layer1.0/conv3/Conv_quant_token_11_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :356,"ts" :113828,"ph" : "X","name" :"/layer1/layer1.0/conv3/Conv_quant_token_11_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 325, "Wait": 21, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 6, "core": 30},"132401808279232": {"num_run": 6, "core": 19},"132401663575744": {"num_run": 6, "core": 16},"132401797793472": {"num_run": 6, "core": 26},"132401787307712": {"num_run": 6, "core": 31},"132401776821952": {"num_run": 6, "core": 17},"132401692935872": {"num_run": 6, "core": 24},"132401682450112": {"num_run": 6, "core": 29},"132401653089984": {"num_run": 6, "core": 9},"132401642604224": {"num_run": 5, "core": 6},"132401558718144": {"num_run": 5, "core": 3},"132401548232384": {"num_run": 5, "core": 27},"132401537746624": {"num_run": 5, "core": 21},"132401527260864": {"num_run": 5, "core": 25},"132401516775104": {"num_run": 5, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "1039","activation_size" : "200704","node_index" : "86","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :114186,"ph" : "X","name" :"/layer1/layer1.0/conv3/Conv_quant_token_11_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :114186,"ph" : "X","name" :"/layer1/layer1.0/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :139,"ts" :114187,"ph" : "X","name" :"/layer1/layer1.0/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [50176], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 68, "Wait": 59, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 7, "core": 30},"132401808279232": {"num_run": 7, "core": 19},"132401663575744": {"num_run": 7, "core": 16},"132401797793472": {"num_run": 7, "core": 26},"132401787307712": {"num_run": 7, "core": 31},"132401776821952": {"num_run": 7, "core": 17},"132401692935872": {"num_run": 7, "core": 24},"132401682450112": {"num_run": 7, "core": 29},"132401653089984": {"num_run": 7, "core": 9},"132401642604224": {"num_run": 6, "core": 6},"132401558718144": {"num_run": 6, "core": 3},"132401548232384": {"num_run": 6, "core": 27},"132401537746624": {"num_run": 6, "core": 21},"132401527260864": {"num_run": 6, "core": 25},"132401516775104": {"num_run": 6, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "15","activation_size" : "1605632","node_index" : "7","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :114327,"ph" : "X","name" :"/layer1/layer1.0/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :114328,"ph" : "X","name" :"/layer1/layer1.1/conv1/Conv_quant_token_18_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :297,"ts" :114328,"ph" : "X","name" :"/layer1/layer1.1/conv1/Conv_quant_token_18_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 216, "Wait": 73, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 8, "core": 30},"132401808279232": {"num_run": 8, "core": 19},"132401663575744": {"num_run": 8, "core": 16},"132401797793472": {"num_run": 8, "core": 26},"132401787307712": {"num_run": 8, "core": 31},"132401776821952": {"num_run": 8, "core": 17},"132401692935872": {"num_run": 8, "core": 24},"132401682450112": {"num_run": 8, "core": 29},"132401653089984": {"num_run": 8, "core": 9},"132401642604224": {"num_run": 7, "core": 6},"132401558718144": {"num_run": 7, "core": 3},"132401548232384": {"num_run": 7, "core": 27},"132401537746624": {"num_run": 7, "core": 21},"132401527260864": {"num_run": 7, "core": 25},"132401516775104": {"num_run": 7, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "271","activation_size" : "802816","node_index" : "91","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :114627,"ph" : "X","name" :"/layer1/layer1.1/conv1/Conv_quant_token_18_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :114629,"ph" : "X","name" :"/layer1/layer1.1/conv2/Conv_quant_token_21_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :708,"ts" :114630,"ph" : "X","name" :"/layer1/layer1.1/conv2/Conv_quant_token_21_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 594, "Wait": 66, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 9, "core": 30},"132401808279232": {"num_run": 9, "core": 19},"132401663575744": {"num_run": 9, "core": 16},"132401797793472": {"num_run": 9, "core": 26},"132401787307712": {"num_run": 9, "core": 31},"132401776821952": {"num_run": 9, "core": 17},"132401692935872": {"num_run": 9, "core": 24},"132401682450112": {"num_run": 9, "core": 29},"132401653089984": {"num_run": 9, "core": 9},"132401642604224": {"num_run": 8, "core": 6},"132401558718144": {"num_run": 8, "core": 3},"132401548232384": {"num_run": 8, "core": 27},"132401537746624": {"num_run": 8, "core": 21},"132401527260864": {"num_run": 8, "core": 25},"132401516775104": {"num_run": 8, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "271","activation_size" : "200704","node_index" : "93","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :115341,"ph" : "X","name" :"/layer1/layer1.1/conv2/Conv_quant_token_21_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :115341,"ph" : "X","name" :"/layer1/layer1.1/conv3/Conv_quant_token_24_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :368,"ts" :115342,"ph" : "X","name" :"/layer1/layer1.1/conv3/Conv_quant_token_24_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 333, "Wait": 26, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 10, "core": 30},"132401808279232": {"num_run": 10, "core": 19},"132401663575744": {"num_run": 10, "core": 16},"132401797793472": {"num_run": 10, "core": 26},"132401787307712": {"num_run": 10, "core": 31},"132401776821952": {"num_run": 10, "core": 17},"132401692935872": {"num_run": 10, "core": 24},"132401682450112": {"num_run": 10, "core": 29},"132401653089984": {"num_run": 10, "core": 9},"132401642604224": {"num_run": 9, "core": 6},"132401558718144": {"num_run": 9, "core": 3},"132401548232384": {"num_run": 9, "core": 27},"132401537746624": {"num_run": 9, "core": 21},"132401527260864": {"num_run": 9, "core": 25},"132401516775104": {"num_run": 9, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "1039","activation_size" : "200704","node_index" : "95","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :115712,"ph" : "X","name" :"/layer1/layer1.1/conv3/Conv_quant_token_24_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :115713,"ph" : "X","name" :"/layer1/layer1.1/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :139,"ts" :115713,"ph" : "X","name" :"/layer1/layer1.1/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [50176], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 69, "Wait": 63, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 11, "core": 30},"132401808279232": {"num_run": 11, "core": 19},"132401663575744": {"num_run": 11, "core": 16},"132401797793472": {"num_run": 11, "core": 26},"132401787307712": {"num_run": 11, "core": 31},"132401776821952": {"num_run": 11, "core": 17},"132401692935872": {"num_run": 11, "core": 24},"132401682450112": {"num_run": 11, "core": 29},"132401653089984": {"num_run": 11, "core": 9},"132401642604224": {"num_run": 10, "core": 6},"132401558718144": {"num_run": 10, "core": 3},"132401548232384": {"num_run": 10, "core": 27},"132401537746624": {"num_run": 10, "core": 21},"132401527260864": {"num_run": 10, "core": 25},"132401516775104": {"num_run": 10, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "15","activation_size" : "1605632","node_index" : "11","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :115854,"ph" : "X","name" :"/layer1/layer1.1/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :115855,"ph" : "X","name" :"/layer1/layer1.2/conv1/Conv_quant_token_28_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :321,"ts" :115855,"ph" : "X","name" :"/layer1/layer1.2/conv1/Conv_quant_token_28_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 257, "Wait": 56, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 12, "core": 30},"132401808279232": {"num_run": 12, "core": 19},"132401663575744": {"num_run": 12, "core": 16},"132401797793472": {"num_run": 12, "core": 26},"132401787307712": {"num_run": 12, "core": 31},"132401776821952": {"num_run": 12, "core": 17},"132401692935872": {"num_run": 12, "core": 24},"132401682450112": {"num_run": 12, "core": 29},"132401653089984": {"num_run": 12, "core": 9},"132401642604224": {"num_run": 11, "core": 6},"132401558718144": {"num_run": 11, "core": 3},"132401548232384": {"num_run": 11, "core": 27},"132401537746624": {"num_run": 11, "core": 21},"132401527260864": {"num_run": 11, "core": 25},"132401516775104": {"num_run": 11, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "271","activation_size" : "802816","node_index" : "98","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :116178,"ph" : "X","name" :"/layer1/layer1.2/conv1/Conv_quant_token_28_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :116179,"ph" : "X","name" :"/layer1/layer1.2/conv2/Conv_quant_token_31_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :729,"ts" :116179,"ph" : "X","name" :"/layer1/layer1.2/conv2/Conv_quant_token_31_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 629, "Wait": 51, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 13, "core": 30},"132401808279232": {"num_run": 13, "core": 19},"132401663575744": {"num_run": 13, "core": 16},"132401797793472": {"num_run": 13, "core": 26},"132401787307712": {"num_run": 13, "core": 31},"132401776821952": {"num_run": 13, "core": 17},"132401692935872": {"num_run": 13, "core": 24},"132401682450112": {"num_run": 13, "core": 29},"132401653089984": {"num_run": 13, "core": 9},"132401642604224": {"num_run": 12, "core": 6},"132401558718144": {"num_run": 12, "core": 3},"132401548232384": {"num_run": 12, "core": 27},"132401537746624": {"num_run": 12, "core": 21},"132401527260864": {"num_run": 12, "core": 25},"132401516775104": {"num_run": 12, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "271","activation_size" : "200704","node_index" : "100","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :116910,"ph" : "X","name" :"/layer1/layer1.2/conv2/Conv_quant_token_31_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :116912,"ph" : "X","name" :"/layer1/layer1.2/conv3/Conv_quant_token_34_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :412,"ts" :116912,"ph" : "X","name" :"/layer1/layer1.2/conv3/Conv_quant_token_34_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 367, "Wait": 36, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 14, "core": 30},"132401808279232": {"num_run": 14, "core": 19},"132401663575744": {"num_run": 14, "core": 16},"132401797793472": {"num_run": 14, "core": 26},"132401787307712": {"num_run": 14, "core": 31},"132401776821952": {"num_run": 14, "core": 17},"132401692935872": {"num_run": 14, "core": 24},"132401682450112": {"num_run": 14, "core": 29},"132401653089984": {"num_run": 14, "core": 9},"132401642604224": {"num_run": 13, "core": 6},"132401558718144": {"num_run": 13, "core": 3},"132401548232384": {"num_run": 13, "core": 27},"132401537746624": {"num_run": 13, "core": 21},"132401527260864": {"num_run": 13, "core": 25},"132401516775104": {"num_run": 13, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "1039","activation_size" : "200704","node_index" : "102","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :117326,"ph" : "X","name" :"/layer1/layer1.2/conv3/Conv_quant_token_34_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :117327,"ph" : "X","name" :"/layer1/layer1.2/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :151,"ts" :117327,"ph" : "X","name" :"/layer1/layer1.2/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [50176], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 75, "Wait": 69, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 15, "core": 30},"132401808279232": {"num_run": 15, "core": 19},"132401663575744": {"num_run": 15, "core": 16},"132401797793472": {"num_run": 15, "core": 26},"132401787307712": {"num_run": 15, "core": 31},"132401776821952": {"num_run": 15, "core": 17},"132401692935872": {"num_run": 15, "core": 24},"132401682450112": {"num_run": 15, "core": 29},"132401653089984": {"num_run": 15, "core": 9},"132401642604224": {"num_run": 14, "core": 6},"132401558718144": {"num_run": 14, "core": 3},"132401548232384": {"num_run": 14, "core": 27},"132401537746624": {"num_run": 14, "core": 21},"132401527260864": {"num_run": 14, "core": 25},"132401516775104": {"num_run": 14, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "15","activation_size" : "1605632","node_index" : "15","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :117481,"ph" : "X","name" :"/layer1/layer1.2/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :117482,"ph" : "X","name" :"/layer2/layer2.0/downsample/downsample.0/Conv_quant_token_47_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1009,"ts" :117483,"ph" : "X","name" :"/layer2/layer2.0/downsample/downsample.0/Conv_quant_token_47_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 940, "Wait": 52, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 16, "core": 30},"132401808279232": {"num_run": 16, "core": 19},"132401663575744": {"num_run": 16, "core": 16},"132401797793472": {"num_run": 16, "core": 26},"132401787307712": {"num_run": 16, "core": 31},"132401776821952": {"num_run": 16, "core": 17},"132401692935872": {"num_run": 16, "core": 24},"132401682450112": {"num_run": 16, "core": 29},"132401653089984": {"num_run": 16, "core": 9},"132401642604224": {"num_run": 15, "core": 6},"132401558718144": {"num_run": 15, "core": 3},"132401548232384": {"num_run": 15, "core": 27},"132401537746624": {"num_run": 15, "core": 21},"132401527260864": {"num_run": 15, "core": 25},"132401516775104": {"num_run": 15, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "2063","activation_size" : "802816","node_index" : "111","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :118495,"ph" : "X","name" :"/layer2/layer2.0/downsample/downsample.0/Conv_quant_token_47_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :118495,"ph" : "X","name" :"/layer2/layer2.0/conv1/Conv_quant_token_38_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :671,"ts" :118496,"ph" : "X","name" :"/layer2/layer2.0/conv1/Conv_quant_token_38_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 529, "Wait": 134, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 17, "core": 30},"132401808279232": {"num_run": 17, "core": 19},"132401663575744": {"num_run": 17, "core": 16},"132401797793472": {"num_run": 17, "core": 26},"132401787307712": {"num_run": 17, "core": 31},"132401776821952": {"num_run": 17, "core": 17},"132401692935872": {"num_run": 17, "core": 24},"132401682450112": {"num_run": 17, "core": 29},"132401653089984": {"num_run": 17, "core": 9},"132401642604224": {"num_run": 16, "core": 6},"132401558718144": {"num_run": 16, "core": 3},"132401548232384": {"num_run": 16, "core": 27},"132401537746624": {"num_run": 16, "core": 21},"132401527260864": {"num_run": 16, "core": 25},"132401516775104": {"num_run": 16, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,128]}],"output_size" : "401408","parameter_size" : "527","activation_size" : "802816","node_index" : "105","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :119169,"ph" : "X","name" :"/layer2/layer2.0/conv1/Conv_quant_token_38_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :119171,"ph" : "X","name" :"/layer2/layer2.0/conv2/Conv_quant_token_41_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :972,"ts" :119171,"ph" : "X","name" :"/layer2/layer2.0/conv2/Conv_quant_token_41_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 862, "Wait": 61, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 18, "core": 30},"132401808279232": {"num_run": 18, "core": 19},"132401663575744": {"num_run": 18, "core": 16},"132401797793472": {"num_run": 18, "core": 26},"132401787307712": {"num_run": 18, "core": 31},"132401776821952": {"num_run": 18, "core": 17},"132401692935872": {"num_run": 18, "core": 24},"132401682450112": {"num_run": 18, "core": 29},"132401653089984": {"num_run": 18, "core": 9},"132401642604224": {"num_run": 17, "core": 6},"132401558718144": {"num_run": 17, "core": 3},"132401548232384": {"num_run": 17, "core": 27},"132401537746624": {"num_run": 17, "core": 21},"132401527260864": {"num_run": 17, "core": 25},"132401516775104": {"num_run": 17, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "401408","node_index" : "107","input_type_shape" : [{"int8":[1,56,56,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :120145,"ph" : "X","name" :"/layer2/layer2.0/conv2/Conv_quant_token_41_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :120146,"ph" : "X","name" :"/layer2/layer2.0/conv3/Conv_quant_token_44_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :545,"ts" :120147,"ph" : "X","name" :"/layer2/layer2.0/conv3/Conv_quant_token_44_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 506, "Wait": 29, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 19, "core": 30},"132401808279232": {"num_run": 19, "core": 19},"132401663575744": {"num_run": 19, "core": 16},"132401797793472": {"num_run": 19, "core": 26},"132401787307712": {"num_run": 19, "core": 31},"132401776821952": {"num_run": 19, "core": 17},"132401692935872": {"num_run": 19, "core": 24},"132401682450112": {"num_run": 19, "core": 29},"132401653089984": {"num_run": 19, "core": 9},"132401642604224": {"num_run": 18, "core": 6},"132401558718144": {"num_run": 18, "core": 3},"132401548232384": {"num_run": 18, "core": 27},"132401537746624": {"num_run": 18, "core": 21},"132401527260864": {"num_run": 18, "core": 25},"132401516775104": {"num_run": 18, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "2063","activation_size" : "100352","node_index" : "109","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :120693,"ph" : "X","name" :"/layer2/layer2.0/conv3/Conv_quant_token_44_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :120697,"ph" : "X","name" :"/layer2/layer2.0/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :81,"ts" :120697,"ph" : "X","name" :"/layer2/layer2.0/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 71, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 20, "core": 30},"132401808279232": {"num_run": 20, "core": 4},"132401663575744": {"num_run": 20, "core": 16},"132401797793472": {"num_run": 20, "core": 26},"132401787307712": {"num_run": 20, "core": 12},"132401776821952": {"num_run": 20, "core": 17},"132401692935872": {"num_run": 20, "core": 24},"132401682450112": {"num_run": 20, "core": 29},"132401653089984": {"num_run": 20, "core": 31},"132401642604224": {"num_run": 19, "core": 6},"132401558718144": {"num_run": 19, "core": 3},"132401548232384": {"num_run": 19, "core": 27},"132401537746624": {"num_run": 18, "core": 21},"132401527260864": {"num_run": 18, "core": 25},"132401516775104": {"num_run": 18, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "15","activation_size" : "802816","node_index" : "20","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :120781,"ph" : "X","name" :"/layer2/layer2.0/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :120782,"ph" : "X","name" :"/layer2/layer2.1/conv1/Conv_quant_token_51_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :408,"ts" :120782,"ph" : "X","name" :"/layer2/layer2.1/conv1/Conv_quant_token_51_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 392, "Wait": 6, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 21, "core": 30},"132401808279232": {"num_run": 21, "core": 4},"132401663575744": {"num_run": 21, "core": 16},"132401797793472": {"num_run": 21, "core": 26},"132401787307712": {"num_run": 21, "core": 12},"132401776821952": {"num_run": 21, "core": 17},"132401692935872": {"num_run": 21, "core": 24},"132401682450112": {"num_run": 21, "core": 29},"132401653089984": {"num_run": 21, "core": 31},"132401642604224": {"num_run": 20, "core": 6},"132401558718144": {"num_run": 20, "core": 3},"132401548232384": {"num_run": 20, "core": 27},"132401537746624": {"num_run": 19, "core": 21},"132401527260864": {"num_run": 19, "core": 25},"132401516775104": {"num_run": 19, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "401408","node_index" : "114","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :121193,"ph" : "X","name" :"/layer2/layer2.1/conv1/Conv_quant_token_51_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :121194,"ph" : "X","name" :"/layer2/layer2.1/conv2/Conv_quant_token_54_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :957,"ts" :121194,"ph" : "X","name" :"/layer2/layer2.1/conv2/Conv_quant_token_54_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 864, "Wait": 58, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 22, "core": 30},"132401808279232": {"num_run": 22, "core": 4},"132401663575744": {"num_run": 22, "core": 16},"132401797793472": {"num_run": 22, "core": 26},"132401787307712": {"num_run": 22, "core": 12},"132401776821952": {"num_run": 22, "core": 17},"132401692935872": {"num_run": 22, "core": 24},"132401682450112": {"num_run": 22, "core": 29},"132401653089984": {"num_run": 22, "core": 31},"132401642604224": {"num_run": 21, "core": 6},"132401558718144": {"num_run": 21, "core": 3},"132401548232384": {"num_run": 21, "core": 27},"132401537746624": {"num_run": 20, "core": 21},"132401527260864": {"num_run": 20, "core": 25},"132401516775104": {"num_run": 20, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "100352","node_index" : "116","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :122153,"ph" : "X","name" :"/layer2/layer2.1/conv2/Conv_quant_token_54_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :122153,"ph" : "X","name" :"/layer2/layer2.1/conv3/Conv_quant_token_57_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :540,"ts" :122154,"ph" : "X","name" :"/layer2/layer2.1/conv3/Conv_quant_token_57_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 488, "Wait": 42, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 23, "core": 30},"132401808279232": {"num_run": 23, "core": 4},"132401663575744": {"num_run": 23, "core": 16},"132401797793472": {"num_run": 23, "core": 26},"132401787307712": {"num_run": 23, "core": 12},"132401776821952": {"num_run": 23, "core": 17},"132401692935872": {"num_run": 23, "core": 24},"132401682450112": {"num_run": 23, "core": 29},"132401653089984": {"num_run": 23, "core": 31},"132401642604224": {"num_run": 22, "core": 6},"132401558718144": {"num_run": 22, "core": 3},"132401548232384": {"num_run": 22, "core": 27},"132401537746624": {"num_run": 21, "core": 21},"132401527260864": {"num_run": 21, "core": 25},"132401516775104": {"num_run": 21, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "2063","activation_size" : "100352","node_index" : "118","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :122695,"ph" : "X","name" :"/layer2/layer2.1/conv3/Conv_quant_token_57_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :122696,"ph" : "X","name" :"/layer2/layer2.1/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :77,"ts" :122696,"ph" : "X","name" :"/layer2/layer2.1/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 70, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 24, "core": 30},"132401808279232": {"num_run": 24, "core": 4},"132401663575744": {"num_run": 24, "core": 16},"132401797793472": {"num_run": 24, "core": 26},"132401787307712": {"num_run": 24, "core": 12},"132401776821952": {"num_run": 24, "core": 17},"132401692935872": {"num_run": 24, "core": 24},"132401682450112": {"num_run": 24, "core": 29},"132401653089984": {"num_run": 24, "core": 31},"132401642604224": {"num_run": 23, "core": 6},"132401558718144": {"num_run": 23, "core": 3},"132401548232384": {"num_run": 23, "core": 27},"132401537746624": {"num_run": 21, "core": 21},"132401527260864": {"num_run": 21, "core": 25},"132401516775104": {"num_run": 21, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "15","activation_size" : "802816","node_index" : "24","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :122775,"ph" : "X","name" :"/layer2/layer2.1/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :122776,"ph" : "X","name" :"/layer2/layer2.2/conv1/Conv_quant_token_61_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :395,"ts" :122777,"ph" : "X","name" :"/layer2/layer2.2/conv1/Conv_quant_token_61_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 294, "Wait": 93, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 25, "core": 30},"132401808279232": {"num_run": 25, "core": 4},"132401663575744": {"num_run": 25, "core": 16},"132401797793472": {"num_run": 25, "core": 26},"132401787307712": {"num_run": 25, "core": 12},"132401776821952": {"num_run": 25, "core": 17},"132401692935872": {"num_run": 25, "core": 24},"132401682450112": {"num_run": 25, "core": 29},"132401653089984": {"num_run": 25, "core": 31},"132401642604224": {"num_run": 24, "core": 6},"132401558718144": {"num_run": 24, "core": 3},"132401548232384": {"num_run": 24, "core": 27},"132401537746624": {"num_run": 22, "core": 21},"132401527260864": {"num_run": 22, "core": 25},"132401516775104": {"num_run": 22, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "401408","node_index" : "121","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :123173,"ph" : "X","name" :"/layer2/layer2.2/conv1/Conv_quant_token_61_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :123174,"ph" : "X","name" :"/layer2/layer2.2/conv2/Conv_quant_token_64_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :914,"ts" :123175,"ph" : "X","name" :"/layer2/layer2.2/conv2/Conv_quant_token_64_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 834, "Wait": 45, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 26, "core": 30},"132401808279232": {"num_run": 26, "core": 4},"132401663575744": {"num_run": 26, "core": 16},"132401797793472": {"num_run": 26, "core": 26},"132401787307712": {"num_run": 26, "core": 12},"132401776821952": {"num_run": 26, "core": 17},"132401692935872": {"num_run": 26, "core": 24},"132401682450112": {"num_run": 26, "core": 29},"132401653089984": {"num_run": 26, "core": 31},"132401642604224": {"num_run": 25, "core": 6},"132401558718144": {"num_run": 25, "core": 3},"132401548232384": {"num_run": 25, "core": 27},"132401537746624": {"num_run": 23, "core": 21},"132401527260864": {"num_run": 23, "core": 25},"132401516775104": {"num_run": 23, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "100352","node_index" : "123","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :124090,"ph" : "X","name" :"/layer2/layer2.2/conv2/Conv_quant_token_64_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :124090,"ph" : "X","name" :"/layer2/layer2.2/conv3/Conv_quant_token_67_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :527,"ts" :124091,"ph" : "X","name" :"/layer2/layer2.2/conv3/Conv_quant_token_67_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 369, "Wait": 149, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 27, "core": 30},"132401808279232": {"num_run": 27, "core": 4},"132401663575744": {"num_run": 27, "core": 16},"132401797793472": {"num_run": 27, "core": 26},"132401787307712": {"num_run": 27, "core": 12},"132401776821952": {"num_run": 27, "core": 17},"132401692935872": {"num_run": 27, "core": 24},"132401682450112": {"num_run": 27, "core": 29},"132401653089984": {"num_run": 27, "core": 31},"132401642604224": {"num_run": 26, "core": 6},"132401558718144": {"num_run": 26, "core": 3},"132401548232384": {"num_run": 26, "core": 27},"132401537746624": {"num_run": 24, "core": 21},"132401527260864": {"num_run": 24, "core": 25},"132401516775104": {"num_run": 24, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "2063","activation_size" : "100352","node_index" : "125","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :124620,"ph" : "X","name" :"/layer2/layer2.2/conv3/Conv_quant_token_67_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :124621,"ph" : "X","name" :"/layer2/layer2.2/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :75,"ts" :124621,"ph" : "X","name" :"/layer2/layer2.2/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 68, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 28, "core": 30},"132401808279232": {"num_run": 28, "core": 4},"132401663575744": {"num_run": 28, "core": 16},"132401797793472": {"num_run": 28, "core": 26},"132401787307712": {"num_run": 28, "core": 12},"132401776821952": {"num_run": 28, "core": 17},"132401692935872": {"num_run": 28, "core": 24},"132401682450112": {"num_run": 28, "core": 29},"132401653089984": {"num_run": 28, "core": 31},"132401642604224": {"num_run": 27, "core": 6},"132401558718144": {"num_run": 27, "core": 3},"132401548232384": {"num_run": 27, "core": 27},"132401537746624": {"num_run": 24, "core": 21},"132401527260864": {"num_run": 24, "core": 25},"132401516775104": {"num_run": 24, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "15","activation_size" : "802816","node_index" : "28","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :124699,"ph" : "X","name" :"/layer2/layer2.2/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :124699,"ph" : "X","name" :"/layer2/layer2.3/conv1/Conv_quant_token_71_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :396,"ts" :124700,"ph" : "X","name" :"/layer2/layer2.3/conv1/Conv_quant_token_71_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 382, "Wait": 5, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 29, "core": 30},"132401808279232": {"num_run": 29, "core": 4},"132401663575744": {"num_run": 29, "core": 16},"132401797793472": {"num_run": 29, "core": 26},"132401787307712": {"num_run": 29, "core": 12},"132401776821952": {"num_run": 29, "core": 17},"132401692935872": {"num_run": 29, "core": 24},"132401682450112": {"num_run": 29, "core": 29},"132401653089984": {"num_run": 29, "core": 31},"132401642604224": {"num_run": 28, "core": 6},"132401558718144": {"num_run": 28, "core": 3},"132401548232384": {"num_run": 28, "core": 27},"132401537746624": {"num_run": 25, "core": 21},"132401527260864": {"num_run": 25, "core": 25},"132401516775104": {"num_run": 25, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "401408","node_index" : "128","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :125098,"ph" : "X","name" :"/layer2/layer2.3/conv1/Conv_quant_token_71_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :125099,"ph" : "X","name" :"/layer2/layer2.3/conv2/Conv_quant_token_74_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :873,"ts" :125099,"ph" : "X","name" :"/layer2/layer2.3/conv2/Conv_quant_token_74_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 824, "Wait": 8, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 30, "core": 30},"132401808279232": {"num_run": 30, "core": 4},"132401663575744": {"num_run": 30, "core": 16},"132401797793472": {"num_run": 30, "core": 26},"132401787307712": {"num_run": 30, "core": 12},"132401776821952": {"num_run": 30, "core": 17},"132401692935872": {"num_run": 30, "core": 24},"132401682450112": {"num_run": 30, "core": 29},"132401653089984": {"num_run": 30, "core": 31},"132401642604224": {"num_run": 29, "core": 6},"132401558718144": {"num_run": 29, "core": 3},"132401548232384": {"num_run": 29, "core": 27},"132401537746624": {"num_run": 26, "core": 21},"132401527260864": {"num_run": 26, "core": 25},"132401516775104": {"num_run": 26, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "100352","node_index" : "130","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :125977,"ph" : "X","name" :"/layer2/layer2.3/conv2/Conv_quant_token_74_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :125980,"ph" : "X","name" :"/layer2/layer2.3/conv3/Conv_quant_token_77_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :528,"ts" :125980,"ph" : "X","name" :"/layer2/layer2.3/conv3/Conv_quant_token_77_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 362, "Wait": 152, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 31, "core": 30},"132401808279232": {"num_run": 31, "core": 4},"132401663575744": {"num_run": 31, "core": 16},"132401797793472": {"num_run": 31, "core": 26},"132401787307712": {"num_run": 31, "core": 12},"132401776821952": {"num_run": 31, "core": 17},"132401692935872": {"num_run": 31, "core": 24},"132401682450112": {"num_run": 31, "core": 29},"132401653089984": {"num_run": 31, "core": 31},"132401642604224": {"num_run": 30, "core": 6},"132401558718144": {"num_run": 30, "core": 3},"132401548232384": {"num_run": 30, "core": 27},"132401537746624": {"num_run": 27, "core": 21},"132401527260864": {"num_run": 27, "core": 25},"132401516775104": {"num_run": 27, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "2063","activation_size" : "100352","node_index" : "132","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :126512,"ph" : "X","name" :"/layer2/layer2.3/conv3/Conv_quant_token_77_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :126514,"ph" : "X","name" :"/layer2/layer2.3/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :76,"ts" :126514,"ph" : "X","name" :"/layer2/layer2.3/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 67, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 32, "core": 30},"132401808279232": {"num_run": 32, "core": 4},"132401663575744": {"num_run": 32, "core": 16},"132401797793472": {"num_run": 32, "core": 26},"132401787307712": {"num_run": 32, "core": 12},"132401776821952": {"num_run": 32, "core": 17},"132401692935872": {"num_run": 32, "core": 24},"132401682450112": {"num_run": 32, "core": 29},"132401653089984": {"num_run": 32, "core": 31},"132401642604224": {"num_run": 31, "core": 6},"132401558718144": {"num_run": 31, "core": 3},"132401548232384": {"num_run": 31, "core": 27},"132401537746624": {"num_run": 27, "core": 21},"132401527260864": {"num_run": 27, "core": 25},"132401516775104": {"num_run": 27, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "15","activation_size" : "802816","node_index" : "32","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :126593,"ph" : "X","name" :"/layer2/layer2.3/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :126594,"ph" : "X","name" :"/layer3/layer3.0/downsample/downsample.0/Conv_quant_token_90_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1147,"ts" :126594,"ph" : "X","name" :"/layer3/layer3.0/downsample/downsample.0/Conv_quant_token_90_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 906, "Wait": 222, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 33, "core": 30},"132401808279232": {"num_run": 33, "core": 4},"132401663575744": {"num_run": 33, "core": 16},"132401797793472": {"num_run": 33, "core": 26},"132401787307712": {"num_run": 33, "core": 12},"132401776821952": {"num_run": 33, "core": 17},"132401692935872": {"num_run": 33, "core": 24},"132401682450112": {"num_run": 33, "core": 29},"132401653089984": {"num_run": 33, "core": 31},"132401642604224": {"num_run": 32, "core": 6},"132401558718144": {"num_run": 32, "core": 3},"132401548232384": {"num_run": 32, "core": 27},"132401537746624": {"num_run": 27, "core": 21},"132401527260864": {"num_run": 27, "core": 25},"132401516775104": {"num_run": 27, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "401408","node_index" : "141","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :127744,"ph" : "X","name" :"/layer3/layer3.0/downsample/downsample.0/Conv_quant_token_90_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :127745,"ph" : "X","name" :"/layer3/layer3.0/conv1/Conv_quant_token_81_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :725,"ts" :127745,"ph" : "X","name" :"/layer3/layer3.0/conv1/Conv_quant_token_81_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 715, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 34, "core": 30},"132401808279232": {"num_run": 34, "core": 4},"132401663575744": {"num_run": 34, "core": 16},"132401797793472": {"num_run": 34, "core": 26},"132401787307712": {"num_run": 34, "core": 12},"132401776821952": {"num_run": 34, "core": 17},"132401692935872": {"num_run": 34, "core": 24},"132401682450112": {"num_run": 34, "core": 29},"132401653089984": {"num_run": 34, "core": 31},"132401642604224": {"num_run": 33, "core": 6},"132401558718144": {"num_run": 33, "core": 3},"132401548232384": {"num_run": 33, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,256]}],"output_size" : "200704","parameter_size" : "1039","activation_size" : "401408","node_index" : "135","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :128472,"ph" : "X","name" :"/layer3/layer3.0/conv1/Conv_quant_token_81_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :128473,"ph" : "X","name" :"/layer3/layer3.0/conv2/Conv_quant_token_84_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1198,"ts" :128473,"ph" : "X","name" :"/layer3/layer3.0/conv2/Conv_quant_token_84_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 797, "Wait": 367, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 35, "core": 30},"132401808279232": {"num_run": 35, "core": 4},"132401663575744": {"num_run": 35, "core": 16},"132401797793472": {"num_run": 35, "core": 26},"132401787307712": {"num_run": 35, "core": 12},"132401776821952": {"num_run": 35, "core": 17},"132401692935872": {"num_run": 35, "core": 24},"132401682450112": {"num_run": 35, "core": 29},"132401653089984": {"num_run": 35, "core": 31},"132401642604224": {"num_run": 34, "core": 6},"132401558718144": {"num_run": 34, "core": 3},"132401548232384": {"num_run": 34, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "200704","node_index" : "137","input_type_shape" : [{"int8":[1,28,28,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :129675,"ph" : "X","name" :"/layer3/layer3.0/conv2/Conv_quant_token_84_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :129676,"ph" : "X","name" :"/layer3/layer3.0/conv3/Conv_quant_token_87_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :629,"ts" :129676,"ph" : "X","name" :"/layer3/layer3.0/conv3/Conv_quant_token_87_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 435, "Wait": 185, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 36, "core": 30},"132401808279232": {"num_run": 36, "core": 4},"132401663575744": {"num_run": 36, "core": 16},"132401797793472": {"num_run": 36, "core": 26},"132401787307712": {"num_run": 36, "core": 12},"132401776821952": {"num_run": 36, "core": 17},"132401692935872": {"num_run": 36, "core": 24},"132401682450112": {"num_run": 36, "core": 29},"132401653089984": {"num_run": 36, "core": 31},"132401642604224": {"num_run": 35, "core": 6},"132401558718144": {"num_run": 35, "core": 3},"132401548232384": {"num_run": 35, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "50176","node_index" : "139","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :130308,"ph" : "X","name" :"/layer3/layer3.0/conv3/Conv_quant_token_87_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :130309,"ph" : "X","name" :"/layer3/layer3.0/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :81,"ts" :130310,"ph" : "X","name" :"/layer3/layer3.0/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 73, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 37, "core": 30},"132401808279232": {"num_run": 37, "core": 4},"132401663575744": {"num_run": 37, "core": 16},"132401797793472": {"num_run": 37, "core": 26},"132401787307712": {"num_run": 37, "core": 12},"132401776821952": {"num_run": 36, "core": 17},"132401692935872": {"num_run": 36, "core": 24},"132401682450112": {"num_run": 36, "core": 29},"132401653089984": {"num_run": 36, "core": 31},"132401642604224": {"num_run": 35, "core": 6},"132401558718144": {"num_run": 35, "core": 3},"132401548232384": {"num_run": 35, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "15","activation_size" : "401408","node_index" : "37","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :130393,"ph" : "X","name" :"/layer3/layer3.0/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :130394,"ph" : "X","name" :"/layer3/layer3.1/conv1/Conv_quant_token_94_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :538,"ts" :130394,"ph" : "X","name" :"/layer3/layer3.1/conv1/Conv_quant_token_94_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 342, "Wait": 187, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 38, "core": 30},"132401808279232": {"num_run": 38, "core": 4},"132401663575744": {"num_run": 38, "core": 16},"132401797793472": {"num_run": 38, "core": 26},"132401787307712": {"num_run": 38, "core": 28},"132401776821952": {"num_run": 37, "core": 17},"132401692935872": {"num_run": 37, "core": 24},"132401682450112": {"num_run": 37, "core": 29},"132401653089984": {"num_run": 37, "core": 31},"132401642604224": {"num_run": 36, "core": 6},"132401558718144": {"num_run": 36, "core": 3},"132401548232384": {"num_run": 36, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "200704","node_index" : "144","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :130934,"ph" : "X","name" :"/layer3/layer3.1/conv1/Conv_quant_token_94_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :130935,"ph" : "X","name" :"/layer3/layer3.1/conv2/Conv_quant_token_97_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1167,"ts" :130935,"ph" : "X","name" :"/layer3/layer3.1/conv2/Conv_quant_token_97_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 797, "Wait": 345, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 39, "core": 30},"132401808279232": {"num_run": 39, "core": 23},"132401663575744": {"num_run": 39, "core": 16},"132401797793472": {"num_run": 39, "core": 26},"132401787307712": {"num_run": 39, "core": 28},"132401776821952": {"num_run": 38, "core": 17},"132401692935872": {"num_run": 38, "core": 24},"132401682450112": {"num_run": 38, "core": 29},"132401653089984": {"num_run": 38, "core": 31},"132401642604224": {"num_run": 37, "core": 6},"132401558718144": {"num_run": 37, "core": 3},"132401548232384": {"num_run": 37, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "50176","node_index" : "146","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :132105,"ph" : "X","name" :"/layer3/layer3.1/conv2/Conv_quant_token_97_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :132106,"ph" : "X","name" :"/layer3/layer3.1/conv3/Conv_quant_token_100_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :641,"ts" :132106,"ph" : "X","name" :"/layer3/layer3.1/conv3/Conv_quant_token_100_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 441, "Wait": 191, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 40, "core": 30},"132401808279232": {"num_run": 40, "core": 23},"132401663575744": {"num_run": 40, "core": 16},"132401797793472": {"num_run": 40, "core": 26},"132401787307712": {"num_run": 40, "core": 28},"132401776821952": {"num_run": 39, "core": 17},"132401692935872": {"num_run": 39, "core": 24},"132401682450112": {"num_run": 39, "core": 29},"132401653089984": {"num_run": 39, "core": 31},"132401642604224": {"num_run": 38, "core": 6},"132401558718144": {"num_run": 38, "core": 3},"132401548232384": {"num_run": 38, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "50176","node_index" : "148","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :132750,"ph" : "X","name" :"/layer3/layer3.1/conv3/Conv_quant_token_100_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :132750,"ph" : "X","name" :"/layer3/layer3.1/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :84,"ts" :132755,"ph" : "X","name" :"/layer3/layer3.1/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 76, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 41, "core": 30},"132401808279232": {"num_run": 41, "core": 23},"132401663575744": {"num_run": 41, "core": 16},"132401797793472": {"num_run": 41, "core": 26},"132401787307712": {"num_run": 41, "core": 28},"132401776821952": {"num_run": 39, "core": 17},"132401692935872": {"num_run": 39, "core": 24},"132401682450112": {"num_run": 39, "core": 29},"132401653089984": {"num_run": 39, "core": 31},"132401642604224": {"num_run": 38, "core": 6},"132401558718144": {"num_run": 38, "core": 3},"132401548232384": {"num_run": 38, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "15","activation_size" : "401408","node_index" : "41","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :132841,"ph" : "X","name" :"/layer3/layer3.1/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :132842,"ph" : "X","name" :"/layer3/layer3.2/conv1/Conv_quant_token_104_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :499,"ts" :132842,"ph" : "X","name" :"/layer3/layer3.2/conv1/Conv_quant_token_104_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 361, "Wait": 130, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 42, "core": 30},"132401808279232": {"num_run": 42, "core": 23},"132401663575744": {"num_run": 42, "core": 16},"132401797793472": {"num_run": 42, "core": 26},"132401787307712": {"num_run": 42, "core": 28},"132401776821952": {"num_run": 40, "core": 17},"132401692935872": {"num_run": 40, "core": 24},"132401682450112": {"num_run": 40, "core": 29},"132401653089984": {"num_run": 40, "core": 31},"132401642604224": {"num_run": 39, "core": 6},"132401558718144": {"num_run": 39, "core": 3},"132401548232384": {"num_run": 39, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "200704","node_index" : "151","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :133344,"ph" : "X","name" :"/layer3/layer3.2/conv1/Conv_quant_token_104_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :133345,"ph" : "X","name" :"/layer3/layer3.2/conv2/Conv_quant_token_107_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1104,"ts" :133345,"ph" : "X","name" :"/layer3/layer3.2/conv2/Conv_quant_token_107_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 800, "Wait": 280, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 43, "core": 30},"132401808279232": {"num_run": 43, "core": 23},"132401663575744": {"num_run": 43, "core": 16},"132401797793472": {"num_run": 43, "core": 26},"132401787307712": {"num_run": 43, "core": 28},"132401776821952": {"num_run": 41, "core": 17},"132401692935872": {"num_run": 41, "core": 24},"132401682450112": {"num_run": 41, "core": 29},"132401653089984": {"num_run": 41, "core": 31},"132401642604224": {"num_run": 40, "core": 6},"132401558718144": {"num_run": 40, "core": 3},"132401548232384": {"num_run": 40, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "50176","node_index" : "153","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :134451,"ph" : "X","name" :"/layer3/layer3.2/conv2/Conv_quant_token_107_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :134453,"ph" : "X","name" :"/layer3/layer3.2/conv3/Conv_quant_token_110_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :605,"ts" :134453,"ph" : "X","name" :"/layer3/layer3.2/conv3/Conv_quant_token_110_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 439, "Wait": 156, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 44, "core": 30},"132401808279232": {"num_run": 44, "core": 23},"132401663575744": {"num_run": 44, "core": 16},"132401797793472": {"num_run": 44, "core": 26},"132401787307712": {"num_run": 44, "core": 28},"132401776821952": {"num_run": 42, "core": 17},"132401692935872": {"num_run": 42, "core": 24},"132401682450112": {"num_run": 42, "core": 29},"132401653089984": {"num_run": 42, "core": 31},"132401642604224": {"num_run": 41, "core": 6},"132401558718144": {"num_run": 41, "core": 3},"132401548232384": {"num_run": 41, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "50176","node_index" : "155","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :135061,"ph" : "X","name" :"/layer3/layer3.2/conv3/Conv_quant_token_110_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :135061,"ph" : "X","name" :"/layer3/layer3.2/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :85,"ts" :135062,"ph" : "X","name" :"/layer3/layer3.2/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 78, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 45, "core": 30},"132401808279232": {"num_run": 45, "core": 23},"132401663575744": {"num_run": 45, "core": 16},"132401797793472": {"num_run": 45, "core": 26},"132401787307712": {"num_run": 45, "core": 28},"132401776821952": {"num_run": 42, "core": 17},"132401692935872": {"num_run": 42, "core": 24},"132401682450112": {"num_run": 42, "core": 29},"132401653089984": {"num_run": 42, "core": 31},"132401642604224": {"num_run": 41, "core": 6},"132401558718144": {"num_run": 41, "core": 3},"132401548232384": {"num_run": 41, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "15","activation_size" : "401408","node_index" : "45","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :135149,"ph" : "X","name" :"/layer3/layer3.2/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :135150,"ph" : "X","name" :"/layer3/layer3.3/conv1/Conv_quant_token_114_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :471,"ts" :135150,"ph" : "X","name" :"/layer3/layer3.3/conv1/Conv_quant_token_114_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 368, "Wait": 94, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 46, "core": 30},"132401808279232": {"num_run": 46, "core": 23},"132401663575744": {"num_run": 46, "core": 16},"132401797793472": {"num_run": 46, "core": 26},"132401787307712": {"num_run": 46, "core": 28},"132401776821952": {"num_run": 43, "core": 17},"132401692935872": {"num_run": 43, "core": 24},"132401682450112": {"num_run": 43, "core": 29},"132401653089984": {"num_run": 43, "core": 31},"132401642604224": {"num_run": 42, "core": 6},"132401558718144": {"num_run": 42, "core": 3},"132401548232384": {"num_run": 42, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "200704","node_index" : "158","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :135623,"ph" : "X","name" :"/layer3/layer3.3/conv1/Conv_quant_token_114_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :135624,"ph" : "X","name" :"/layer3/layer3.3/conv2/Conv_quant_token_117_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1092,"ts" :135624,"ph" : "X","name" :"/layer3/layer3.3/conv2/Conv_quant_token_117_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 810, "Wait": 258, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 47, "core": 30},"132401808279232": {"num_run": 47, "core": 23},"132401663575744": {"num_run": 47, "core": 16},"132401797793472": {"num_run": 47, "core": 26},"132401787307712": {"num_run": 47, "core": 28},"132401776821952": {"num_run": 44, "core": 17},"132401692935872": {"num_run": 44, "core": 24},"132401682450112": {"num_run": 44, "core": 29},"132401653089984": {"num_run": 44, "core": 31},"132401642604224": {"num_run": 43, "core": 6},"132401558718144": {"num_run": 43, "core": 3},"132401548232384": {"num_run": 43, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "50176","node_index" : "160","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :136718,"ph" : "X","name" :"/layer3/layer3.3/conv2/Conv_quant_token_117_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :136719,"ph" : "X","name" :"/layer3/layer3.3/conv3/Conv_quant_token_120_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :468,"ts" :136719,"ph" : "X","name" :"/layer3/layer3.3/conv3/Conv_quant_token_120_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 434, "Wait": 25, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 48, "core": 30},"132401808279232": {"num_run": 48, "core": 23},"132401663575744": {"num_run": 48, "core": 16},"132401797793472": {"num_run": 48, "core": 26},"132401787307712": {"num_run": 48, "core": 28},"132401776821952": {"num_run": 45, "core": 17},"132401692935872": {"num_run": 45, "core": 24},"132401682450112": {"num_run": 45, "core": 29},"132401653089984": {"num_run": 45, "core": 31},"132401642604224": {"num_run": 44, "core": 6},"132401558718144": {"num_run": 44, "core": 3},"132401548232384": {"num_run": 44, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "50176","node_index" : "162","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :137188,"ph" : "X","name" :"/layer3/layer3.3/conv3/Conv_quant_token_120_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :137190,"ph" : "X","name" :"/layer3/layer3.3/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :80,"ts" :137190,"ph" : "X","name" :"/layer3/layer3.3/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 73, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 49, "core": 30},"132401808279232": {"num_run": 49, "core": 23},"132401663575744": {"num_run": 49, "core": 16},"132401797793472": {"num_run": 49, "core": 26},"132401787307712": {"num_run": 49, "core": 28},"132401776821952": {"num_run": 45, "core": 17},"132401692935872": {"num_run": 45, "core": 24},"132401682450112": {"num_run": 45, "core": 29},"132401653089984": {"num_run": 45, "core": 31},"132401642604224": {"num_run": 44, "core": 6},"132401558718144": {"num_run": 44, "core": 3},"132401548232384": {"num_run": 44, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "15","activation_size" : "401408","node_index" : "49","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :137271,"ph" : "X","name" :"/layer3/layer3.3/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :137272,"ph" : "X","name" :"/layer3/layer3.4/conv1/Conv_quant_token_124_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :516,"ts" :137272,"ph" : "X","name" :"/layer3/layer3.4/conv1/Conv_quant_token_124_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 348, "Wait": 160, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 50, "core": 30},"132401808279232": {"num_run": 50, "core": 23},"132401663575744": {"num_run": 50, "core": 16},"132401797793472": {"num_run": 50, "core": 26},"132401787307712": {"num_run": 50, "core": 28},"132401776821952": {"num_run": 46, "core": 17},"132401692935872": {"num_run": 46, "core": 24},"132401682450112": {"num_run": 46, "core": 29},"132401653089984": {"num_run": 46, "core": 31},"132401642604224": {"num_run": 45, "core": 6},"132401558718144": {"num_run": 45, "core": 3},"132401548232384": {"num_run": 45, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "200704","node_index" : "165","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :137790,"ph" : "X","name" :"/layer3/layer3.4/conv1/Conv_quant_token_124_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :137790,"ph" : "X","name" :"/layer3/layer3.4/conv2/Conv_quant_token_127_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :930,"ts" :137791,"ph" : "X","name" :"/layer3/layer3.4/conv2/Conv_quant_token_127_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 786, "Wait": 120, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 51, "core": 30},"132401808279232": {"num_run": 51, "core": 23},"132401663575744": {"num_run": 51, "core": 16},"132401797793472": {"num_run": 51, "core": 26},"132401787307712": {"num_run": 51, "core": 28},"132401776821952": {"num_run": 47, "core": 17},"132401692935872": {"num_run": 47, "core": 24},"132401682450112": {"num_run": 47, "core": 29},"132401653089984": {"num_run": 47, "core": 31},"132401642604224": {"num_run": 46, "core": 6},"132401558718144": {"num_run": 46, "core": 3},"132401548232384": {"num_run": 46, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "50176","node_index" : "167","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :138723,"ph" : "X","name" :"/layer3/layer3.4/conv2/Conv_quant_token_127_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :138724,"ph" : "X","name" :"/layer3/layer3.4/conv3/Conv_quant_token_130_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :465,"ts" :138724,"ph" : "X","name" :"/layer3/layer3.4/conv3/Conv_quant_token_130_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 431, "Wait": 25, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 52, "core": 30},"132401808279232": {"num_run": 52, "core": 23},"132401663575744": {"num_run": 52, "core": 16},"132401797793472": {"num_run": 52, "core": 26},"132401787307712": {"num_run": 52, "core": 28},"132401776821952": {"num_run": 48, "core": 17},"132401692935872": {"num_run": 48, "core": 24},"132401682450112": {"num_run": 48, "core": 29},"132401653089984": {"num_run": 48, "core": 31},"132401642604224": {"num_run": 47, "core": 6},"132401558718144": {"num_run": 47, "core": 3},"132401548232384": {"num_run": 47, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "50176","node_index" : "169","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :139191,"ph" : "X","name" :"/layer3/layer3.4/conv3/Conv_quant_token_130_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :139192,"ph" : "X","name" :"/layer3/layer3.4/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :81,"ts" :139192,"ph" : "X","name" :"/layer3/layer3.4/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 74, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 53, "core": 30},"132401808279232": {"num_run": 53, "core": 23},"132401663575744": {"num_run": 53, "core": 16},"132401797793472": {"num_run": 53, "core": 26},"132401787307712": {"num_run": 53, "core": 28},"132401776821952": {"num_run": 48, "core": 17},"132401692935872": {"num_run": 48, "core": 24},"132401682450112": {"num_run": 48, "core": 29},"132401653089984": {"num_run": 48, "core": 31},"132401642604224": {"num_run": 47, "core": 6},"132401558718144": {"num_run": 47, "core": 3},"132401548232384": {"num_run": 47, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "15","activation_size" : "401408","node_index" : "53","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :139275,"ph" : "X","name" :"/layer3/layer3.4/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :139276,"ph" : "X","name" :"/layer3/layer3.5/conv1/Conv_quant_token_134_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :485,"ts" :139276,"ph" : "X","name" :"/layer3/layer3.5/conv1/Conv_quant_token_134_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 352, "Wait": 125, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 54, "core": 30},"132401808279232": {"num_run": 54, "core": 23},"132401663575744": {"num_run": 54, "core": 16},"132401797793472": {"num_run": 54, "core": 26},"132401787307712": {"num_run": 54, "core": 28},"132401776821952": {"num_run": 49, "core": 17},"132401692935872": {"num_run": 49, "core": 24},"132401682450112": {"num_run": 49, "core": 29},"132401653089984": {"num_run": 49, "core": 31},"132401642604224": {"num_run": 48, "core": 6},"132401558718144": {"num_run": 48, "core": 3},"132401548232384": {"num_run": 48, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "200704","node_index" : "172","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :139763,"ph" : "X","name" :"/layer3/layer3.5/conv1/Conv_quant_token_134_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :139764,"ph" : "X","name" :"/layer3/layer3.5/conv2/Conv_quant_token_137_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1137,"ts" :139765,"ph" : "X","name" :"/layer3/layer3.5/conv2/Conv_quant_token_137_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 800, "Wait": 311, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 55, "core": 30},"132401808279232": {"num_run": 55, "core": 23},"132401663575744": {"num_run": 55, "core": 16},"132401797793472": {"num_run": 55, "core": 26},"132401787307712": {"num_run": 55, "core": 28},"132401776821952": {"num_run": 50, "core": 17},"132401692935872": {"num_run": 50, "core": 24},"132401682450112": {"num_run": 50, "core": 29},"132401653089984": {"num_run": 50, "core": 31},"132401642604224": {"num_run": 49, "core": 6},"132401558718144": {"num_run": 49, "core": 3},"132401548232384": {"num_run": 49, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "50176","node_index" : "174","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :140905,"ph" : "X","name" :"/layer3/layer3.5/conv2/Conv_quant_token_137_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :140906,"ph" : "X","name" :"/layer3/layer3.5/conv3/Conv_quant_token_140_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :642,"ts" :140906,"ph" : "X","name" :"/layer3/layer3.5/conv3/Conv_quant_token_140_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 439, "Wait": 195, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 56, "core": 30},"132401808279232": {"num_run": 56, "core": 23},"132401663575744": {"num_run": 56, "core": 16},"132401797793472": {"num_run": 56, "core": 26},"132401787307712": {"num_run": 56, "core": 28},"132401776821952": {"num_run": 51, "core": 17},"132401692935872": {"num_run": 51, "core": 24},"132401682450112": {"num_run": 51, "core": 29},"132401653089984": {"num_run": 51, "core": 31},"132401642604224": {"num_run": 50, "core": 6},"132401558718144": {"num_run": 50, "core": 3},"132401548232384": {"num_run": 50, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "50176","node_index" : "176","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :141550,"ph" : "X","name" :"/layer3/layer3.5/conv3/Conv_quant_token_140_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :141552,"ph" : "X","name" :"/layer3/layer3.5/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :82,"ts" :141552,"ph" : "X","name" :"/layer3/layer3.5/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 75, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 57, "core": 30},"132401808279232": {"num_run": 57, "core": 23},"132401663575744": {"num_run": 57, "core": 16},"132401797793472": {"num_run": 57, "core": 26},"132401787307712": {"num_run": 57, "core": 28},"132401776821952": {"num_run": 51, "core": 17},"132401692935872": {"num_run": 51, "core": 24},"132401682450112": {"num_run": 51, "core": 29},"132401653089984": {"num_run": 51, "core": 31},"132401642604224": {"num_run": 50, "core": 6},"132401558718144": {"num_run": 50, "core": 3},"132401548232384": {"num_run": 50, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "15","activation_size" : "401408","node_index" : "57","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :141636,"ph" : "X","name" :"/layer3/layer3.5/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :141637,"ph" : "X","name" :"/layer4/layer4.0/downsample/downsample.0/Conv_quant_token_153_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :4183,"ts" :141638,"ph" : "X","name" :"/layer4/layer4.0/downsample/downsample.0/Conv_quant_token_153_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 3762, "Wait": 409, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 58, "core": 30},"132401808279232": {"num_run": 58, "core": 23},"132401663575744": {"num_run": 58, "core": 16},"132401797793472": {"num_run": 57, "core": 26},"132401787307712": {"num_run": 57, "core": 28},"132401776821952": {"num_run": 51, "core": 17},"132401692935872": {"num_run": 51, "core": 24},"132401682450112": {"num_run": 51, "core": 29},"132401653089984": {"num_run": 51, "core": 31},"132401642604224": {"num_run": 50, "core": 6},"132401558718144": {"num_run": 50, "core": 3},"132401548232384": {"num_run": 50, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "8207","activation_size" : "200704","node_index" : "185","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[2048]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :145823,"ph" : "X","name" :"/layer4/layer4.0/downsample/downsample.0/Conv_quant_token_153_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :145824,"ph" : "X","name" :"/layer4/layer4.0/conv1/Conv_quant_token_144_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1348,"ts" :145824,"ph" : "X","name" :"/layer4/layer4.0/conv1/Conv_quant_token_144_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 825, "Wait": 513, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 59, "core": 30},"132401808279232": {"num_run": 59, "core": 23},"132401663575744": {"num_run": 59, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,512]}],"output_size" : "100352","parameter_size" : "2063","activation_size" : "200704","node_index" : "179","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :147174,"ph" : "X","name" :"/layer4/layer4.0/conv1/Conv_quant_token_144_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :147175,"ph" : "X","name" :"/layer4/layer4.0/conv2/Conv_quant_token_147_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :3996,"ts" :147176,"ph" : "X","name" :"/layer4/layer4.0/conv2/Conv_quant_token_147_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 3719, "Wait": 256, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 60, "core": 30},"132401808279232": {"num_run": 60, "core": 23},"132401663575744": {"num_run": 60, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,512]}],"output_size" : "25088","parameter_size" : "2063","activation_size" : "100352","node_index" : "181","input_type_shape" : [{"int8":[1,14,14,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :151174,"ph" : "X","name" :"/layer4/layer4.0/conv2/Conv_quant_token_147_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :151177,"ph" : "X","name" :"/layer4/layer4.0/conv3/Conv_quant_token_150_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1972,"ts" :151177,"ph" : "X","name" :"/layer4/layer4.0/conv3/Conv_quant_token_150_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 1845, "Wait": 117, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 61, "core": 30},"132401808279232": {"num_run": 61, "core": 23},"132401663575744": {"num_run": 61, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "8207","activation_size" : "25088","node_index" : "183","input_type_shape" : [{"int8":[1,7,7,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[2048]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :153150,"ph" : "X","name" :"/layer4/layer4.0/conv3/Conv_quant_token_150_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :153151,"ph" : "X","name" :"/layer4/layer4.0/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :144,"ts" :153151,"ph" : "X","name" :"/layer4/layer4.0/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 61, "core": 30},"132401808279232": {"num_run": 61, "core": 23},"132401663575744": {"num_run": 61, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "15","activation_size" : "200704","node_index" : "62","input_type_shape" : [{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :153296,"ph" : "X","name" :"/layer4/layer4.0/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :153297,"ph" : "X","name" :"/layer4/layer4.1/conv1/Conv_quant_token_157_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1740,"ts" :153297,"ph" : "X","name" :"/layer4/layer4.1/conv1/Conv_quant_token_157_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 1617, "Wait": 114, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 62, "core": 30},"132401808279232": {"num_run": 62, "core": 23},"132401663575744": {"num_run": 62, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,512]}],"output_size" : "25088","parameter_size" : "2063","activation_size" : "100352","node_index" : "188","input_type_shape" : [{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :155039,"ph" : "X","name" :"/layer4/layer4.1/conv1/Conv_quant_token_157_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :155040,"ph" : "X","name" :"/layer4/layer4.1/conv2/Conv_quant_token_160_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :3893,"ts" :155040,"ph" : "X","name" :"/layer4/layer4.1/conv2/Conv_quant_token_160_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 3646, "Wait": 230, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 63, "core": 30},"132401808279232": {"num_run": 63, "core": 5},"132401663575744": {"num_run": 63, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,512]}],"output_size" : "25088","parameter_size" : "2063","activation_size" : "25088","node_index" : "190","input_type_shape" : [{"int8":[1,7,7,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :158935,"ph" : "X","name" :"/layer4/layer4.1/conv2/Conv_quant_token_160_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :158936,"ph" : "X","name" :"/layer4/layer4.1/conv3/Conv_quant_token_163_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1929,"ts" :158936,"ph" : "X","name" :"/layer4/layer4.1/conv3/Conv_quant_token_163_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 1837, "Wait": 82, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 64, "core": 30},"132401808279232": {"num_run": 64, "core": 5},"132401663575744": {"num_run": 64, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "8207","activation_size" : "25088","node_index" : "192","input_type_shape" : [{"int8":[1,7,7,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[2048]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :160868,"ph" : "X","name" :"/layer4/layer4.1/conv3/Conv_quant_token_163_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :160870,"ph" : "X","name" :"/layer4/layer4.1/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :143,"ts" :160870,"ph" : "X","name" :"/layer4/layer4.1/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 64, "core": 30},"132401808279232": {"num_run": 64, "core": 5},"132401663575744": {"num_run": 64, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "15","activation_size" : "200704","node_index" : "66","input_type_shape" : [{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :161015,"ph" : "X","name" :"/layer4/layer4.1/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :161016,"ph" : "X","name" :"/layer4/layer4.2/conv1/Conv_quant_token_167_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1726,"ts" :161016,"ph" : "X","name" :"/layer4/layer4.2/conv1/Conv_quant_token_167_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 1, "DistributionEnqueue": 1, "Run": 1609, "Wait": 107, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 65, "core": 30},"132401808279232": {"num_run": 65, "core": 5},"132401663575744": {"num_run": 65, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,512]}],"output_size" : "25088","parameter_size" : "2063","activation_size" : "100352","node_index" : "195","input_type_shape" : [{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :162743,"ph" : "X","name" :"/layer4/layer4.2/conv1/Conv_quant_token_167_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :162744,"ph" : "X","name" :"/layer4/layer4.2/conv2/Conv_quant_token_170_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :4001,"ts" :162744,"ph" : "X","name" :"/layer4/layer4.2/conv2/Conv_quant_token_170_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 3559, "Wait": 425, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 66, "core": 30},"132401808279232": {"num_run": 66, "core": 5},"132401663575744": {"num_run": 66, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,512]}],"output_size" : "25088","parameter_size" : "2063","activation_size" : "25088","node_index" : "197","input_type_shape" : [{"int8":[1,7,7,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :166747,"ph" : "X","name" :"/layer4/layer4.2/conv2/Conv_quant_token_170_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :166748,"ph" : "X","name" :"/layer4/layer4.2/conv3/Conv_quant_token_173_fence_before","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1999,"ts" :166749,"ph" : "X","name" :"/layer4/layer4.2/conv3/Conv_quant_token_173_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 1765, "Wait": 224, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 67, "core": 30},"132401808279232": {"num_run": 67, "core": 5},"132401663575744": {"num_run": 67, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "8207","activation_size" : "25088","node_index" : "199","input_type_shape" : [{"int8":[1,7,7,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[2048]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168751,"ph" : "X","name" :"/layer4/layer4.2/conv3/Conv_quant_token_173_fence_after","args" : {"op_name" : "QLinearConv"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168753,"ph" : "X","name" :"/layer4/layer4.2/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :137,"ts" :168753,"ph" : "X","name" :"/layer4/layer4.2/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 67, "core": 30},"132401808279232": {"num_run": 67, "core": 5},"132401663575744": {"num_run": 67, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "15","activation_size" : "200704","node_index" : "70","input_type_shape" : [{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168892,"ph" : "X","name" :"/layer4/layer4.2/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168895,"ph" : "X","name" :"/avgpool/GlobalAveragePool_quant_fence_before","args" : {"op_name" : "QLinearGlobalAveragePool"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :19,"ts" :168895,"ph" : "X","name" :"/avgpool/GlobalAveragePool_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 67, "core": 30},"132401808279232": {"num_run": 67, "core": 5},"132401663575744": {"num_run": 67, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,1,1,2048]}],"output_size" : "2048","parameter_size" : "10","activation_size" : "100352","node_index" : "71","input_type_shape" : [{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearGlobalAveragePool"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168916,"ph" : "X","name" :"/avgpool/GlobalAveragePool_quant_fence_after","args" : {"op_name" : "QLinearGlobalAveragePool"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168919,"ph" : "X","name" :"Transpose_token_193_fence_before","args" : {"op_name" : "Transpose"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :5,"ts" :168920,"ph" : "X","name" :"Transpose_token_193_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 67, "core": 30},"132401808279232": {"num_run": 67, "core": 5},"132401663575744": {"num_run": 67, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,2048,1,1]}],"output_size" : "2048","parameter_size" : "0","activation_size" : "2048","node_index" : "218","input_type_shape" : [{"int8":[1,1,1,2048]}],"provider" : "CPUExecutionProvider","op_name" : "Transpose"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168926,"ph" : "X","name" :"Transpose_token_193_fence_after","args" : {"op_name" : "Transpose"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168927,"ph" : "X","name" :"/Flatten_fence_before","args" : {"op_name" : "Flatten"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :3,"ts" :168927,"ph" : "X","name" :"/Flatten_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 67, "core": 30},"132401808279232": {"num_run": 67, "core": 5},"132401663575744": {"num_run": 67, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,2048]}],"output_size" : "2048","parameter_size" : "0","activation_size" : "2048","node_index" : "73","input_type_shape" : [{"int8":[1,2048,1,1]}],"provider" : "CPUExecutionProvider","op_name" : "Flatten"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168932,"ph" : "X","name" :"/Flatten_fence_after","args" : {"op_name" : "Flatten"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168932,"ph" : "X","name" :"/fc/Gemm_quant_fence_before","args" : {"op_name" : "QGemm"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :627,"ts" :168933,"ph" : "X","name" :"/fc/Gemm_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 59, "Wait": 78, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 68, "core": 30},"132401808279232": {"num_run": 68, "core": 5},"132401663575744": {"num_run": 68, "core": 16},"132401797793472": {"num_run": 59, "core": 26},"132401787307712": {"num_run": 59, "core": 28},"132401776821952": {"num_run": 53, "core": 17},"132401692935872": {"num_run": 53, "core": 24},"132401682450112": {"num_run": 53, "core": 29},"132401653089984": {"num_run": 53, "core": 25},"132401642604224": {"num_run": 52, "core": 6},"132401558718144": {"num_run": 52, "core": 3},"132401548232384": {"num_run": 52, "core": 27},"132401537746624": {"num_run": 29, "core": 21},"132401527260864": {"num_run": 29, "core": 31},"132401516775104": {"num_run": 29, "core": 20}}},"output_type_shape" : [{"int8":[1,1000]}],"output_size" : "1000","parameter_size" : "2052015","activation_size" : "2048","node_index" : "75","input_type_shape" : [{"int8":[1,2048]},{"float":[]},{"int8":[]},{"int8":[1000,2048]},{"float":[]},{"int8":[]},{"int32":[1000]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QGemm"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :169562,"ph" : "X","name" :"/fc/Gemm_quant_fence_after","args" : {"op_name" : "QGemm"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :169564,"ph" : "X","name" :"output_DequantizeLinear_fence_before","args" : {"op_name" : "DequantizeLinear"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :7,"ts" :169565,"ph" : "X","name" :"output_DequantizeLinear_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 68, "core": 30},"132401808279232": {"num_run": 68, "core": 5},"132401663575744": {"num_run": 68, "core": 16},"132401797793472": {"num_run": 59, "core": 26},"132401787307712": {"num_run": 59, "core": 28},"132401776821952": {"num_run": 53, "core": 17},"132401692935872": {"num_run": 53, "core": 24},"132401682450112": {"num_run": 53, "core": 29},"132401653089984": {"num_run": 53, "core": 25},"132401642604224": {"num_run": 52, "core": 6},"132401558718144": {"num_run": 52, "core": 3},"132401548232384": {"num_run": 52, "core": 27},"132401537746624": {"num_run": 29, "core": 21},"132401527260864": {"num_run": 29, "core": 31},"132401516775104": {"num_run": 29, "core": 20}}},"output_type_shape" : [{"float":[1,1000]}],"output_size" : "4000","parameter_size" : "5","activation_size" : "1000","node_index" : "76","input_type_shape" : [{"int8":[1,1000]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "DequantizeLinear"}},
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :169573,"ph" : "X","name" :"output_DequantizeLinear_fence_after","args" : {"op_name" : "DequantizeLinear"}},
+{"cat" : "Session","pid" :635173,"tid" :635173,"dur" :60252,"ts" :109326,"ph" : "X","name" :"SequentialExecutor::Execute","args" : {}},
+{"cat" : "Session","pid" :635173,"tid" :635173,"dur" :60331,"ts" :109268,"ph" : "X","name" :"model_run","args" : {}}
+]
diff --git a/onnxruntime/test/python/gpnpumode/analyze_json.py b/onnxruntime/test/python/gpnpumode/analyze_json.py
new file mode 100644
index 0000000000..1a7a704210
--- /dev/null
+++ b/onnxruntime/test/python/gpnpumode/analyze_json.py
@@ -0,0 +1,18 @@
+import os
+import sys
+import numpy as np
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from helper import load_json, json_to_df
+
+def get_time(jsons):
+    """Print the summed CPU durations of the given profile files; return their (mean, std)."""
+    # Pool every event into one flat array so profiles with differing event counts still work.
+    frames = [json_to_df(load_json(path), lambda x: True) for path in jsons]
+    pooled = np.concatenate([cpu_df['duration'].values for cpu_df, _gpu_df in frames]) if frames else np.array([])
+    print(np.sum(pooled))
+    return np.mean(pooled), np.std(pooled)
+
+cpu_mean_time, cpu_std_time = get_time(['onnxruntime_profile__2025-01-28_21-44-59.json'])
+print(f"CPU Time:   {cpu_mean_time:8.3f} ± {cpu_std_time:.3f} ms")
diff --git a/onnxruntime/test/python/gpnpumode/test_lutop.py b/onnxruntime/test/python/gpnpumode/test_lutop.py
new file mode 100644
index 0000000000..6493b2516e
--- /dev/null
+++ b/onnxruntime/test/python/gpnpumode/test_lutop.py
@@ -0,0 +1,114 @@
+# import onnx
+# import numpy as np
+# import onnxruntime as ort
+# from onnx import helper, TensorProto
+
+# # Define the custom op node
+# input_tensor = helper.make_tensor_value_info('input', TensorProto.INT8, [3])
+# lut_tensor = helper.make_tensor_value_info('lut', TensorProto.INT8, [256])
+# output_tensor = helper.make_tensor_value_info('output', TensorProto.INT8, [3])
+
+# node = helper.make_node(
+#     'LookupTable',  # Custom op name
+#     inputs=['input', 'lut'],
+#     outputs=['output'],
+#     domain='test.customop'  # Custom domain
+# )
+
+# # Create the graph and model
+# graph = helper.make_graph(
+#     [node],
+#     'test_graph',
+#     [input_tensor, lut_tensor],
+#     [output_tensor]
+# )
+
+# # Add opset import for the custom domain
+# opset_imports = [
+#     helper.make_opsetid("", 13),  # Default domain (ONNX)
+#     helper.make_opsetid("test.customop", 1)  # Custom domain
+# ]
+
+# model = helper.make_model(graph, opset_imports=opset_imports, producer_name='custom_op_test')
+
+# # Save the model
+# onnx.save(model, 'test_model.onnx')
+
+# # Prepare input data
+# input_data = np.array([-128, 0, 127], dtype=np.int8)
+# lut_data = np.array([127 - i for i in range(256)], dtype=np.int8)  # Example LUT: invert values
+
+# # Run the model with ONNX Runtime
+# so = ort.SessionOptions()
+# so.register_custom_ops_library('/home/maggies/onnxruntime/build/Linux/Release/libcustom_op_library.so')  # Path to your custom op library
+
+# session = ort.InferenceSession('test_model.onnx', so)
+# inputs = {'input': input_data, 'lut': lut_data}
+# outputs = session.run(None, inputs)
+
+# print('Input:', input_data)
+# print('LUT:', lut_data)
+# print('Output:', outputs[0])
+
+
+import os
+import onnx
+import numpy as np
+import onnxruntime as ort
+from onnx import helper, TensorProto
+
+# Define the custom op node
+input_tensor = helper.make_tensor_value_info('input', TensorProto.INT8, [3])
+output_tensor = helper.make_tensor_value_info('output', TensorProto.INT8, [3])
+
+# Example LUT: invert values
+lut_data = np.array([127 - i for i in range(256)], dtype=np.int8)
+
+# Create the LUT tensor attribute
+lut_tensor = helper.make_tensor(
+    name='lut',
+    data_type=TensorProto.INT8,
+    dims=[256],
+    vals=lut_data
+)
+
+node = helper.make_node(
+    'LookupTable',  # Custom op name
+    inputs=['input'],
+    outputs=['output'],
+    domain='test.customop',  # Custom domain
+    lut=lut_tensor  # LUT as an attribute
+)
+
+# Create the graph and model
+graph = helper.make_graph(
+    [node],
+    'test_graph',
+    [input_tensor],
+    [output_tensor]
+)
+
+# Add opset import for the custom domain
+opset_imports = [
+    helper.make_opsetid("", 13),  # Default domain (ONNX)
+    helper.make_opsetid("test.customop", 1)  # Custom domain
+]
+
+model = helper.make_model(graph, opset_imports=opset_imports, producer_name='custom_op_test')
+# Save the model
+onnx.save(model, 'test_model.onnx')
+
+# Prepare input data
+input_data = np.array([-128, 0, 127], dtype=np.int8)
+
+# Run the model with ONNX Runtime. The custom op library path can be overridden through the
+# ORT_CUSTOM_OP_LIBRARY environment variable; the default below is the original build location.
+default_library = '/home/maggies/onnxruntime/build/Linux/Release/libcustom_op_library.so'
+so = ort.SessionOptions()
+so.register_custom_ops_library(os.environ.get('ORT_CUSTOM_OP_LIBRARY', default_library))
+session = ort.InferenceSession('test_model.onnx', so)
+outputs = session.run(None, {'input': input_data})
+
+print('Input:', input_data)
+print('LUT:', lut_data)
+print('Output:', outputs[0])
diff --git a/onnxruntime/test/python/gpnpumode/test_qgemm.py b/onnxruntime/test/python/gpnpumode/test_qgemm.py
new file mode 100644
index 0000000000..f0f7d44e20
--- /dev/null
+++ b/onnxruntime/test/python/gpnpumode/test_qgemm.py
@@ -0,0 +1,166 @@
+#!/usr/bin/env python
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+import unittest
+import numpy as np
+import onnx
+import onnxruntime as ort
+from onnx import helper, TensorProto
+import os
+import sys
+import time
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from helper import get_onnx_const, generate_normal_inputs
+
+
+m = 1
+k = 2024
+n = 1000
+
+
+class TestQGemm(unittest.TestCase):
+    """Compare a single-node QGemm model's output with enable_gpnpu off versus on."""
+
+    def setUp(self):
+        # Create a specific ONNX model with a single QGemm node
+        self.model_path = "qgemm_model.onnx"
+        self.create_qgemm_model(self.model_path)
+
+    def create_qgemm_model(self, output_model_path):
+        """Build, validate and save the one-node QGemm model at `output_model_path`.
+
+        The graph's only runtime input is A with shape [m, k]; B, the bias and all
+        quantization parameters are stored as initializers.
+        """
+        a_scale, a_zero_point = 0.2039528638124466, -14
+        b_scale, b_zero_point = 0.003937007859349251, 0
+        y_scale, y_zero_point = 0.1019764319062233, -6
+
+        # Define node names
+        input_a_name = "input_a"
+        input_a_scale_name = "input_a_scale"
+        input_a_zp_name = "input_a_zero_point"
+        input_b_name = "input_b"
+        input_b_scale_name = "input_b_scale"
+        input_b_zp_name = "input_b_zero_point"
+        output_scale_name = "output_scale"
+        output_zp_name = "output_zero_point"
+        # "output" names the int32 initializer fed as the node's 7th input; the node's result is "out".
+        bias_name = "output"
+
+        a_sc = get_onnx_const(input_a_scale_name, a_scale, TensorProto.FLOAT)
+        a_zp = get_onnx_const(input_a_zp_name, a_zero_point, TensorProto.INT8)
+        b_sc = get_onnx_const(input_b_scale_name, b_scale, TensorProto.FLOAT)
+        b_zp = get_onnx_const(input_b_zp_name, b_zero_point, TensorProto.INT8)
+        y_sc = get_onnx_const(output_scale_name, y_scale, TensorProto.FLOAT)
+        y_zp = get_onnx_const(output_zp_name, y_zero_point, TensorProto.INT8)
+        # Define input and output tensors
+        input_a_tensor = helper.make_tensor_value_info(input_a_name, TensorProto.INT8, [m, k])
+        output_tensor = helper.make_tensor_value_info("out", TensorProto.INT8, [m, n])
+        b = get_onnx_const(input_b_name, generate_normal_inputs([n, k], np.int8, 0, 32))
+        bias = get_onnx_const(bias_name, generate_normal_inputs([n, ], np.int32, 0, 32))
+
+        # Create QGemm node
+        qgemm_node = onnx.helper.make_node(
+            "QGemm",
+            inputs=[input_a_name, input_a_scale_name, input_a_zp_name,
+                input_b_name, input_b_scale_name, input_b_zp_name,
+                bias_name,
+                output_scale_name, output_zp_name],
+            outputs=["out"],
+            alpha=0.5,
+            transA=0,
+            transB=1,
+            domain="com.microsoft"
+        )
+
+        # Create graph
+        graph_name = "com.microsoft.QLinearAdd_test"
+        graph = helper.make_graph(
+            [qgemm_node],
+            graph_name,
+            [input_a_tensor],
+            [output_tensor],
+            initializer=[a_sc, a_zp, b, b_sc, b_zp, bias, y_sc, y_zp],
+        )
+
+        # Create model
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("com.microsoft", 1),helper.make_opsetid("", 12)])
+        model.ir_version = 8  # use stable onnx ir version
+
+        # Save model
+        onnx.checker.check_model(model, True)
+        onnx.save(model, output_model_path)
+
+    def tearDown(self):
+        # Delete the ONNX file after testing
+        if os.path.exists(self.model_path):
+            os.remove(self.model_path)
+
+    def test_qlinearconv_inference(self):
+        """Run the model with enable_gpnpu False, then True; outputs may differ by at most 1."""
+        session_options = ort.SessionOptions()
+        session_options.enable_gpnpu = False
+        print(f"Flag enable_gpnpu: {session_options.enable_gpnpu}")
+
+        # Create an inference session
+        session1 = ort.InferenceSession(self.model_path, sess_options=session_options, providers=["CPUExecutionProvider"])
+        print(f"Check flag enable_gpnpu: {session1.get_session_options().enable_gpnpu}")
+
+        session_options.enable_gpnpu = True
+        session2 = ort.InferenceSession(self.model_path, sess_options=session_options, providers=["CPUExecutionProvider"])
+        print(f"Check flag enable_gpnpu: {session2.get_session_options().enable_gpnpu}")
+
+        # The graph declares a single input (A); only its metadata is needed
+        input_a_info = session1.get_inputs()[0]
+
+        # Create random INT8 data matching the input shape (non-integer dims become 1)
+        shape_tuple_a = tuple(dim if isinstance(dim, int) else 1 for dim in input_a_info.shape)
+
+        # Generate random data for the input
+        x_data_a = np.random.randint(
+            low=-128, high=128, size=shape_tuple_a, dtype=np.int8
+        )
+
+        # Create the input dictionary
+        input_dict = {
+            input_a_info.name: x_data_a
+        }
+
+        # Run inference on both sessions, timing each run
+        output_name1 = session1.get_outputs()[0].name
+        t1 = time.time()
+        output_data1 = session1.run([output_name1], input_dict)[0]
+        t2 = time.time()
+        output_name2 = session2.get_outputs()[0].name
+        t3 = time.time()
+        output_data2 = session2.run([output_name2], input_dict)[0]
+        t4 = time.time()
+
+        print("CPU  ", t2-t1)
+        print("GPNPU", t4-t3)
+
+        # Print shapes and types
+        print(f"Input A data shape: {x_data_a.shape}, dtype: {x_data_a.dtype}")
+        print(f"Output data shape: {output_data1.shape}, dtype: {output_data1.dtype}")
+
+        # Widen before subtracting: int8 arithmetic wraps around (127 - (-128) becomes -1),
+        # which would let a maximal mismatch slip past the tolerance check below.
+        difference = output_data1.astype(np.int32) - output_data2.astype(np.int32)
+        max_diff = np.max(np.abs(difference))
+        print(max_diff)
+
+        # Check the output shape and type
+        self.assertEqual(output_data1.shape, (m,n))
+        self.assertEqual(output_data1.dtype, np.int8)
+        self.assertEqual(output_data2.shape, output_data1.shape)
+        self.assertEqual(output_data2.dtype, output_data1.dtype)
+        self.assertLessEqual(max_diff, 1)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/onnxruntime/test/python/gpnpumode/test_qlinearadd.py b/onnxruntime/test/python/gpnpumode/test_qlinearadd.py
new file mode 100644
index 0000000000..9a2ee5f0ba
--- /dev/null
+++ b/onnxruntime/test/python/gpnpumode/test_qlinearadd.py
@@ -0,0 +1,159 @@
+#!/usr/bin/env python
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+import unittest
+import numpy as np
+import onnx
+import onnxruntime as ort
+from onnx import helper, TensorProto
+import os
+import sys
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+from helper import get_onnx_const, generate_normal_inputs
+
+
+batch_size = 1
+h=128
+w=128
+channels=8
+
+class TestQLinearAdd(unittest.TestCase):
+    """Compare a single-node QLinearAdd model's output with enable_gpnpu off versus on."""
+
+    def setUp(self):
+        # Create a specific ONNX model with a single QLinearAdd node
+        self.model_path = "qlinearadd_model.onnx"
+        self.create_qlinearadd_model(self.model_path)
+
+    def create_qlinearadd_model(self, output_model_path):
+        """Build, validate and save the one-node QLinearAdd model at `output_model_path`."""
+        a_scale, a_zero_point = 0.2039528638124466, -14
+        b_scale, b_zero_point = 0.003937007859349251, 0
+        y_scale, y_zero_point = 0.1019764319062233, -6
+
+        # Create input shapes
+        input_shape = [batch_size, channels, h, w]
+
+        # Define node names
+        input_a_name = "input_a"
+        input_a_scale_name = "input_a_scale"
+        input_a_zp_name = "input_a_zero_point"
+        input_b_name = "input_b"
+        input_b_scale_name = "input_b_scale"
+        input_b_zp_name = "input_b_zero_point"
+        output_scale_name = "output_scale"
+        output_zp_name = "output_zero_point"
+        output_name = "output"
+
+        a_sc = get_onnx_const(input_a_scale_name, a_scale)
+        a_zp = get_onnx_const(input_a_zp_name, a_zero_point)
+        b_sc = get_onnx_const(input_b_scale_name, b_scale)
+        b_zp = get_onnx_const(input_b_zp_name, b_zero_point)
+        y_sc = get_onnx_const(output_scale_name, y_scale)
+        y_zp = get_onnx_const(output_zp_name, y_zero_point)
+
+        # Create QLinearAdd node
+        qlinear_add_node = onnx.helper.make_node(
+            "QLinearAdd",
+            inputs=[
+                input_a_name, input_a_scale_name, input_a_zp_name,
+                input_b_name, input_b_scale_name, input_b_zp_name,
+                output_scale_name, output_zp_name
+            ],
+            outputs=[output_name],
+            domain="com.microsoft"
+        )
+
+        # Define input and output tensors (B is supplied as an initializer, not a graph input)
+        input_a_tensor = helper.make_tensor_value_info(input_a_name, TensorProto.INT8, input_shape)
+        b = get_onnx_const(input_b_name, generate_normal_inputs(input_shape, np.int8, 0, 32))
+        output_tensor = helper.make_tensor_value_info(output_name, TensorProto.INT8, input_shape)
+
+        # Create graph
+        graph_name = "com.microsoft.QLinearAdd_test"
+        graph = helper.make_graph(
+            [qlinear_add_node],
+            graph_name,
+            [input_a_tensor],
+            [output_tensor],
+            initializer=[a_sc, a_zp, b, b_sc, b_zp, y_sc, y_zp],
+        )
+
+        # Create model
+        model = helper.make_model(graph, opset_imports=[helper.make_opsetid("com.microsoft", 1),helper.make_opsetid("", 12)])
+        model.ir_version = 8  # use stable onnx ir version
+
+        # Save model
+        onnx.checker.check_model(model, True)
+        onnx.save(model, output_model_path)
+
+    def tearDown(self):
+        # Delete the ONNX file after testing
+        if os.path.exists(self.model_path):
+            os.remove(self.model_path)
+
+    def test_qlinearconv_inference(self):
+        """Run the model with enable_gpnpu False, then True; outputs may differ by at most 1."""
+        session_options = ort.SessionOptions()
+        session_options.enable_gpnpu = False
+        print(f"Flag enable_gpnpu: {session_options.enable_gpnpu}")
+
+        # Create an inference session
+        session1 = ort.InferenceSession(self.model_path, sess_options=session_options, providers=["CPUExecutionProvider"])
+        print(f"Check flag enable_gpnpu: {session1.get_session_options().enable_gpnpu}")
+
+        session_options.enable_gpnpu = True
+        session2 = ort.InferenceSession(self.model_path, sess_options=session_options, providers=["CPUExecutionProvider"])
+        print(f"Check flag enable_gpnpu: {session2.get_session_options().enable_gpnpu}")
+
+        # The graph declares a single input (A); only its metadata is needed
+        input_a_info = session1.get_inputs()[0]
+        print(f"Model input names: {input_a_info.name}")
+        print(f"Model input shapes: {input_a_info.shape}")
+
+        # Create random INT8 data matching the input shape (non-integer dims become 1)
+        shape_tuple_a = tuple(dim if isinstance(dim, int) else 1 for dim in input_a_info.shape)
+
+        # Generate random data for the input
+        x_data_a = np.random.randint(
+            low=-128, high=128, size=shape_tuple_a, dtype=np.int8
+        )
+
+        # Create the input dictionary
+        input_dict = {
+            input_a_info.name: x_data_a
+        }
+
+        # Run inference on both sessions
+        output_name1 = session1.get_outputs()[0].name
+        print(f"Process ID: {os.getpid()}")
+        output_data1 = session1.run([output_name1], input_dict)[0]
+        output_name2 = session2.get_outputs()[0].name
+        print(f"Process ID: {os.getpid()}")
+        output_data2 = session2.run([output_name2], input_dict)[0]
+
+        # Print shapes and types
+        print(f"Input A data shape: {x_data_a.shape}, dtype: {x_data_a.dtype}")
+        print("Output data (truncated):\n", output_data1.flatten()[:50], "...\n")
+        print("Output data (truncated):\n", output_data2.flatten()[:50], "...\n")
+
+        # Widen before subtracting: int8 arithmetic wraps around (127 - (-128) becomes -1),
+        # which would let a maximal mismatch slip past the tolerance check below.
+        difference = output_data1.astype(np.int32) - output_data2.astype(np.int32)
+        max_diff = np.max(np.abs(difference))
+        print(max_diff)
+
+        # Check the output shape and type
+        self.assertEqual(output_data1.shape, (batch_size, channels, h, w))
+        self.assertEqual(output_data1.dtype, np.int8)
+        self.assertEqual(output_data2.shape, output_data1.shape)
+        self.assertEqual(output_data2.dtype, output_data1.dtype)
+        self.assertLessEqual(max_diff, 1)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/onnxruntime/test/python/gpnpumode/test_qlinearconv.py b/onnxruntime/test/python/gpnpumode/test_qlinearconv.py
new file mode 100644
index 0000000000..0a35136f4f
--- /dev/null
+++ b/onnxruntime/test/python/gpnpumode/test_qlinearconv.py
@@ -0,0 +1,366 @@
+#!/usr/bin/env python
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+import unittest
+import numpy as np
+import onnx
+import onnxruntime as ort
+import os
+import sys
+import glob
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from helper import json_to_df, load_json, get_onnx_const, generate_normal_inputs
+
+x_scale, x_zp = 0.018654844, -14
+w_scale, w_zp = 0.044774472, 0
+y_scale, y_zp = 0.023529412, -30
+
+
+def canonicalize_conv_params(kernel, strides, padding, dilation):
+    """Expand scalar conv parameters to list form and check their lengths.
+
+    Scalar kernel/strides/dilation become [v, v]; scalar padding becomes [p, p, p, p]
+    (None or 0 gives zeros). Lists/tuples pass through. Raises AssertionError on bad length.
+    """
+    def _pair(value):
+        return value if isinstance(value, (list, tuple)) else [value, value]
+
+    # Each failure message reports the offending value.
+    kernel = _pair(kernel)
+    assert len(kernel) == 2, f"Unexpected kernel: {kernel}"
+    strides = _pair(strides)
+    assert len(strides) == 2, f"Unexpected strides: {strides}"
+    if not isinstance(padding, (list, tuple)):
+        padding = [int(padding or 0)] * 4
+    assert len(padding) == 4, f"Unexpected padding: {padding}"
+    dilation = _pair(dilation)
+    assert len(dilation) == 2, f"Unexpected dilation: {dilation}"
+
+    return kernel, strides, padding, dilation
+
+def conv_output_height_width(kernel, strides, padding, dilation, input_dims):
+    """Return the (height, width) a 2-D convolution yields for `input_dims` = (H, W)."""
+    kernel, strides, padding, dilation = canonicalize_conv_params(
+        kernel, strides, padding, dilation
+    )
+
+    def _extent(axis):
+        # padding[axis] and padding[axis + 2] are the two pad amounts applied along this axis
+        padded = input_dims[axis] + padding[axis] + padding[axis + 2]
+        return int((padded - dilation[axis] * (kernel[axis] - 1) - 1) // strides[axis] + 1)
+    return _extent(0), _extent(1)
+
+def get_onnx_linear_conv(
+    op_name,
+    inp,  # Should be a ValueInfo
+    oc,
+    kernel_shape,
+    strides=[1, 1],
+    auto_pad="NOTSET",
+    padding=None,
+    dilations=[1, 1],
+    groups=1,
+    x_scale=1.0,
+    x_zp=0,
+    w_scale=1.0,
+    w_zp=0,
+    y_scale=1.0,
+    y_zp=0,
+    with_bias=True,
+):
+    """Build a QLinearConv node together with the initializers it consumes.
+
+    Args:
+        op_name: Node name; also the prefix of every generated tensor name.
+        inp: ValueInfo of the input tensor; channel and spatial dims are read from it.
+        oc: Number of output channels.
+        kernel_shape: Kernel size, a scalar or a two-element list/tuple.
+        strides, dilations, groups: Convolution attributes set on the node.
+        auto_pad: "NOTSET" attaches explicit `pads`; any other value is attached as `auto_pad`.
+        padding: Explicit pads; when None and auto_pad is "NOTSET" it defaults to
+            kernel_shape[0] // 2 on all four sides.
+        x_scale, x_zp, w_scale, w_zp, y_scale, y_zp: Quantization parameters.
+        with_bias: Not read by this function; a bias initializer is always created.
+
+    Returns:
+        Tuple of (node, output ValueInfo, list of initializers).
+    """
+    if not isinstance(kernel_shape, (list, tuple)):
+        kernel_shape = [kernel_shape, kernel_shape]
+
+    # Default pads: half of kernel_shape[0] on every side.
+    if auto_pad == "NOTSET" and padding is None:
+        padding = [int(kernel_shape[0]) // 2] * 4
+
+    # Quantization parameters become initializers named "<op_name>.<suffix>".
+    xs, xz, ws, wz, ys, yz = (
+        get_onnx_const(f"{op_name}.{suffix}", value)
+        for suffix, value in (
+            ("x_scale", x_scale),
+            ("x_zp", x_zp),
+            ("w_scale", w_scale),
+            ("w_zp", w_zp),
+            ("y_scale", y_scale),
+            ("y_zp", y_zp),
+        )
+    )
+
+    in_dims = [dim.dim_value for dim in inp.type.tensor_type.shape.dim]
+    # FIXME: Need to take into account padding and what not
+    ic = in_dims[1]
+    if padding:
+        out_height, out_width = conv_output_height_width(
+            kernel_shape, strides, padding, dilations, in_dims[-2:]
+        )
+    else:
+        # No pads available: the spatial size is taken as input size floor-divided by stride.
+        out_height, out_width = in_dims[-2] // strides[-2], in_dims[-1] // strides[-1]
+    out_dims = [1, oc, out_height, out_width]
+
+    # Weight dims are [oc, ic // groups, kernel_shape[0], kernel_shape[1]]; the bias has oc entries.
+    wt_dims = [oc, ic // groups, kernel_shape[0], kernel_shape[1]]
+    wt = get_onnx_const(f"{op_name}.wt", generate_normal_inputs(wt_dims, np.int8, 0, 32))
+    bias = get_onnx_const(
+        f"{op_name}.bias",
+        generate_normal_inputs([oc], np.int32, 0, 256, -1024, 1024),
+        onnx.TensorProto.INT32,
+    )
+
+    out_name = f"{op_name}.output"
+    out = onnx.helper.make_tensor_value_info(out_name, onnx.TensorProto.INT8, out_dims)
+
+    # Node inputs, in order: x, x_scale, x_zp, w, w_scale, w_zp, y_scale, y_zp, bias.
+    suffixes = ("x_scale", "x_zp", "wt", "w_scale", "w_zp", "y_scale", "y_zp", "bias")
+    names = [inp.name] + [f"{op_name}.{suffix}" for suffix in suffixes]
+    initializers = [xs, xz, wt, ws, wz, ys, yz, bias]
+
+    # Exactly one of `pads` / `auto_pad` is attached, selected by the auto_pad argument.
+    pad_attr = {"pads": padding} if auto_pad == "NOTSET" else {"auto_pad": auto_pad}
+    conv = onnx.helper.make_node(
+        "QLinearConv",
+        names,
+        [out_name],
+        name=op_name,
+        dilations=dilations,
+        group=groups,
+        strides=strides,
+        kernel_shape=kernel_shape,
+        **pad_attr,
+    )
+
+    return conv, out, initializers
+
+def get_onnx(
+    h,
+    w,
+    ic,
+    oc,
+    kernel_size,
+    strides,
+    padding=None,
+    dilation=[1, 1],
+    auto_pad="NOTSET",
+    pad_mode="constant",
+    include_pre_op=False,
+    groups=1,
+):
+    """Build an ONNX model with one QLinearConv over an INT8 ``(1, ic, h, w)`` input.
+
+    Args:
+        h, w: Spatial size of the graph input.
+        ic, oc: Input and output channel counts.
+        kernel_size: A single int (square kernel) or a ``[kh, kw]`` list/tuple.
+        strides: Strides forwarded to the convolution.
+        padding: Explicit pads forwarded to the convolution. Replaced by
+            ``[0, 0, 0, 0]`` when ``pad_mode == "reflect"``, because the padding
+            is then performed by a separate ``Pad`` node.
+        dilation: Dilations forwarded to the convolution. The default list is
+            only read here, never mutated.
+        auto_pad: ``auto_pad`` mode forwarded to the convolution.
+        pad_mode: ``"reflect"`` inserts a ``Pad`` node ahead of the convolution.
+            It is ignored when ``include_pre_op`` is set.
+        include_pre_op: Insert a ``Clip`` (min 0, max 6) ahead of the convolution.
+        groups: Convolution group count.
+
+    Returns:
+        The ``onnx.ModelProto`` (opsets: ``com.microsoft`` 1, default domain 12).
+    """
+    if not isinstance(kernel_size, (list, tuple)):
+        kernel_size = [kernel_size, kernel_size]
+
+    inp_dims = (1, ic, h, w)
+
+    ops = []
+    inits = []
+    if include_pre_op:
+        op_name = "inp_relu"
+        relu_min = get_onnx_const(f"{op_name}.min", 0, dtype=onnx.TensorProto.INT8)
+        relu_max = get_onnx_const(f"{op_name}.max", 6, dtype=onnx.TensorProto.INT8)
+        inits = [relu_min, relu_max]
+        inp_pre = onnx.helper.make_tensor_value_info("inp.pre", onnx.TensorProto.INT8, inp_dims)
+        inp = onnx.helper.make_tensor_value_info("inp", onnx.TensorProto.INT8, inp_dims)
+
+        relu6 = onnx.helper.make_node(
+            "Clip", ["inp.pre", f"{op_name}.min", f"{op_name}.max"], ["inp"], name=op_name
+        )
+        ops.append(relu6)
+    elif pad_mode == "reflect":
+        # Create a pad node ahead of the conv
+        op_name = "inp_pad"
+        pad_h = kernel_size[0] // 2
+        pad_w = kernel_size[1] // 2
+        inp_pre = onnx.helper.make_tensor_value_info("inp.pre", onnx.TensorProto.INT8, inp_dims)
+        # The Pad output keeps the original extent and grows by the pad amount
+        # on both sides of each spatial axis.
+        padded_dims = list(inp_dims)
+        padded_dims[-2] += 2 * pad_h
+        padded_dims[-1] += 2 * pad_w
+        inp = onnx.helper.make_tensor_value_info("inp", onnx.TensorProto.INT8, padded_dims)
+        # Pads layout: begin values for every axis, then end values for every axis.
+        inp_pads = get_onnx_const(
+            "inp.pads",
+            np.array([0, 0, pad_h, pad_w, 0, 0, pad_h, pad_w], dtype=np.int64),
+            onnx.TensorProto.INT64,
+        )
+        inits = [inp_pads]
+        pad = onnx.helper.make_node(
+            "Pad", ["inp.pre", "inp.pads"], ["inp"], name=op_name, mode=pad_mode
+        )
+        # Padding is already applied by the Pad node, so the conv itself adds none.
+        padding = [0, 0, 0, 0]
+        ops.append(pad)
+    else:
+        inp = onnx.helper.make_tensor_value_info("inp", onnx.TensorProto.INT8, inp_dims)
+
+    # NOTE(review): x_scale, x_zp, w_scale, w_zp, y_scale and y_zp are not
+    # parameters of this function; they must resolve at module scope - confirm.
+    conv, outp, conv_inits = get_onnx_linear_conv(
+        "conv_0",
+        inp,
+        oc,
+        kernel_size,
+        strides,
+        auto_pad=auto_pad,
+        padding=padding,
+        dilations=dilation,
+        groups=groups,
+        x_scale=x_scale,
+        x_zp=x_zp,
+        w_scale=w_scale,
+        w_zp=w_zp,
+        y_scale=y_scale,
+        y_zp=y_zp,
+    )
+    ops.append(conv)
+    inits = inits + conv_inits
+
+    # With a pre-op or a Pad node, the graph input is the tensor feeding that node.
+    graph_input = (inp_pre if include_pre_op or pad_mode == "reflect" else inp,)
+    graph = onnx.helper.make_graph(
+        ops,
+        "test_conv",
+        graph_input,
+        [outp],
+        initializer=inits,
+    )
+
+    model = onnx.helper.make_model(
+        graph,
+        opset_imports=[
+            onnx.helper.make_opsetid("com.microsoft", 1),
+            onnx.helper.make_opsetid("", 12),
+        ],
+    )
+    return model
+
+class TestQLinearConv(unittest.TestCase):
+    """Compare a single QLinearConv node with ``enable_gpnpu`` off and on.
+
+    Both sessions use the CPU execution provider. Each iteration checks that
+    the two outputs differ by at most 1 and records a profile for timing.
+    """
+
+    def setUp(self):
+        # Create a specific ONNX model with a single QLinearConv node
+        self.model_path = "qlinearconv_model.onnx"
+        self.create_qlinearconv_model(self.model_path)
+        # Profile file names returned by end_profiling(), one per iteration.
+        self.cpu_jsons = []
+        self.gpnpu_jsons = []
+
+    def create_qlinearconv_model(self, model_path):
+        """Save a 3x3, stride-1 QLinearConv model (8 -> 64 channels, 128x128) to ``model_path``."""
+        h = 128
+        w = 128
+        ic = 8
+        oc = 64
+        kernel_size = 3
+        strides = [1, 1]
+        model_def = get_onnx(
+            h,
+            w,
+            ic,
+            oc,
+            kernel_size,
+            strides)
+        onnx.save(model_def, model_path)
+
+    def tearDown(self):
+        # Delete the ONNX file and the profiler output after testing
+        if os.path.exists(self.model_path):
+            os.remove(self.model_path)
+        # Only remove files carrying the profile prefixes used by this test;
+        # a bare "*.json" would also delete unrelated JSON files in the cwd.
+        for prefix in ("cpu", "gpnpu"):
+            for json_file in glob.glob(f"{prefix}*.json"):
+                os.remove(json_file)
+
+    def performance_and_accuracy_test(self, num_iterations=100):
+        """Run both sessions ``num_iterations`` times on random int8 input.
+
+        New sessions are created on every iteration, so each run yields its
+        own profile file from ``end_profiling()``.
+        """
+        for _ in range(num_iterations):
+            # CPU Session
+            session_options_cpu = ort.SessionOptions()
+            session_options_cpu.enable_gpnpu = False
+            session_options_cpu.enable_profiling = True
+            session_options_cpu.profile_file_prefix = "cpu"
+            session_cpu = ort.InferenceSession(
+                self.model_path,
+                sess_options=session_options_cpu,
+                providers=["CPUExecutionProvider"]
+            )
+
+            # GPNPU Session
+            session_options_gpnpu = ort.SessionOptions()
+            session_options_gpnpu.enable_gpnpu = True
+            session_options_gpnpu.enable_profiling = True
+            session_options_gpnpu.profile_file_prefix = "gpnpu"
+            session_gpnpu = ort.InferenceSession(
+                self.model_path,
+                sess_options=session_options_gpnpu,
+                providers=["CPUExecutionProvider"]
+            )
+
+            # Prepare input; non-integer (symbolic) dimensions are replaced by 1.
+            input_a_info = session_cpu.get_inputs()[0]
+            shape_tuple_a = tuple(dim if isinstance(dim, int) else 1 for dim in input_a_info.shape)
+            x_data_a = np.random.randint(
+                low=-128, high=128, size=shape_tuple_a, dtype=np.int8
+            )
+            input_dict = {input_a_info.name: x_data_a}
+
+            # Time and run CPU inference
+            output_cpu = session_cpu.run(
+                [session_cpu.get_outputs()[0].name],
+                input_dict
+            )[0]
+            json_name_cpu = session_cpu.end_profiling()
+            self.cpu_jsons.append(json_name_cpu)
+
+            # Time and run GPNPU inference
+            output_gpnpu = session_gpnpu.run(
+                [session_gpnpu.get_outputs()[0].name],
+                input_dict
+            )[0]
+            json_name_gpnpu = session_gpnpu.end_profiling()
+            self.gpnpu_jsons.append(json_name_gpnpu)
+
+            # Calculate max difference. Widen to int32 first: subtracting int8
+            # arrays wraps around (127 - (-128) becomes -1), which would hide
+            # large mismatches.
+            max_diff = np.max(
+                np.abs(output_cpu.astype(np.int32) - output_gpnpu.astype(np.int32))
+            )
+
+            self.assertLessEqual(max_diff, 1)
+
+    def test_performance_and_accuracy(self):
+        # Run test
+        self.performance_and_accuracy_test(num_iterations=1000)
+        self.json_time_profiling()
+
+    def json_time_profiling(self):
+        """Print mean and standard deviation of the QLinearConv duration per mode."""
+        def get_time(profile_paths):
+            times = []
+            for profile_path in profile_paths:
+                cpu_df, gpu_df = json_to_df(load_json(profile_path), lambda x: True)
+                # First QLinearConv entry of the profile.
+                times.append(cpu_df[cpu_df['name'] == 'QLinearConv']['duration'].values[0])
+            return np.mean(np.array(times)), np.std(np.array(times))
+        cpu_mean_time, cpu_std_time = get_time(self.cpu_jsons)
+        gpnpu_mean_time, gpnpu_std_time = get_time(self.gpnpu_jsons)
+        # The profiler reports "dur" in microseconds, so label the numbers accordingly.
+        print(f"CPU Time:   {cpu_mean_time:8.3f} ± {cpu_std_time:.3f} us")
+        print(f"GPNPU Time: {gpnpu_mean_time:8.3f} ± {gpnpu_std_time:.3f} us")
+
+
+# Run the unittest runner when this file is executed directly as a script.
+if __name__ == '__main__':
+    unittest.main()
diff --git a/onnxruntime/test/python/gpnpumode/test_qlineargap.py b/onnxruntime/test/python/gpnpumode/test_qlineargap.py
new file mode 100644
index 0000000000..3eba389272
--- /dev/null
+++ b/onnxruntime/test/python/gpnpumode/test_qlineargap.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+import unittest
+import numpy as np
+import onnx
+import onnxruntime as ort
+from onnx import helper, TensorProto
+import os
+import sys
+import time
+import glob
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from helper import json_to_df, load_json
+
+# Shape constants for the test model: X is declared as [N, C, H, W] and Y as [N, C, 1, 1].
+N, C, H, W = 1, 2048, 7, 7
+
+class TestQGemm(unittest.TestCase):
+    """Compare a single QLinearGlobalAveragePool node with ``enable_gpnpu`` off and on.
+
+    The class and helper names still say "QGemm", but the model built here
+    contains one ``com.microsoft`` QLinearGlobalAveragePool node.
+    """
+
+    def setUp(self):
+        # Create a specific ONNX model with a single QLinearGlobalAveragePool node
+        self.model_path = "qlineargap.onnx"
+        self.create_qgemm_model(self.model_path)
+        # Profile file names returned by end_profiling(), one per iteration.
+        self.cpu_jsons = []
+        self.gpnpu_jsons = []
+
+    def create_qgemm_model(self, output_model_path):
+        """Build the QLinearGlobalAveragePool model and save it to ``output_model_path``."""
+        # Define the quantization parameters for X.
+        # The zero points are stored as INT8 tensors, so they must lie in
+        # [-128, 127]; 128 is not representable. 0 is used as the int8 mid-point.
+        x_scale = 0.1
+        x_zero_point = 0
+
+        # Create tensor for input X (quantized data)
+        X = helper.make_tensor_value_info("X", TensorProto.INT8, [N, C, H, W])
+        Y = helper.make_tensor_value_info("Y", TensorProto.INT8, [N, C, 1, 1])
+
+
+        # Define quantization parameters for output Y
+        y_scale = 0.2
+        y_zero_point = 0
+
+        # Step 2: Create the QLinearGlobalAveragePool node
+        node = helper.make_node(
+            'QLinearGlobalAveragePool',  # Operator name
+            inputs=['X', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],  # Input tensors
+            outputs=['Y'],  # Output tensor
+            channels_last=0,  # Attribute indicating whether the channels are last in the shape (1 = True)
+            domain="com.microsoft"
+        )
+
+        # Step 3: Define the scale and zero point tensors for input/output
+        x_scale_tensor = helper.make_tensor('x_scale', TensorProto.FLOAT, [1], [x_scale])
+        x_zero_point_tensor = helper.make_tensor('x_zero_point', TensorProto.INT8, [1], [x_zero_point])
+        y_scale_tensor = helper.make_tensor('y_scale', TensorProto.FLOAT, [1], [y_scale])
+        y_zero_point_tensor = helper.make_tensor('y_zero_point', TensorProto.INT8, [1], [y_zero_point])
+
+        # Step 4: Create the graph (composed of the node and input/output tensors)
+        graph = helper.make_graph(
+            [node],  # List of nodes (here, just our QLinearGlobalAveragePool node)
+            'QLinearGlobalAveragePoolModel',  # Name of the graph
+            [X],  # Inputs
+            [Y],
+            initializer=[x_scale_tensor, x_zero_point_tensor, y_scale_tensor, y_zero_point_tensor]
+        )
+
+        # Step 5: Create the model
+        model = helper.make_model(
+            graph,
+            producer_name='onnx-example',
+            opset_imports=[helper.make_opsetid("com.microsoft", 1), helper.make_opsetid('', 12)]  # Operator set version
+        )
+
+        # Step 6: Save the model to file
+        onnx.save(model, output_model_path)
+
+    def tearDown(self):
+        # Delete the ONNX file and the profiler output after testing
+        if os.path.exists(self.model_path):
+            os.remove(self.model_path)
+        # Only remove files carrying the profile prefixes used by this test;
+        # a bare "*.json" would also delete unrelated JSON files in the cwd.
+        for prefix in ("cpu", "gpnpu"):
+            for json_file in glob.glob(f"{prefix}*.json"):
+                os.remove(json_file)
+
+    def performance_and_accuracy_test(self, num_iterations=100):
+        """Run both sessions ``num_iterations`` times on random int8 input.
+
+        New sessions are created on every iteration, so each run yields its
+        own profile file from ``end_profiling()``.
+        """
+        for _ in range(num_iterations):
+            # CPU Session
+            session_options_cpu = ort.SessionOptions()
+            session_options_cpu.enable_gpnpu = False
+            session_options_cpu.enable_profiling = True
+            session_options_cpu.profile_file_prefix = "cpu"
+            session_cpu = ort.InferenceSession(
+                self.model_path,
+                sess_options=session_options_cpu,
+                providers=["CPUExecutionProvider"]
+            )
+
+            # GPNPU Session
+            session_options_gpnpu = ort.SessionOptions()
+            session_options_gpnpu.enable_gpnpu = True
+            session_options_gpnpu.enable_profiling = True
+            session_options_gpnpu.profile_file_prefix = "gpnpu"
+            session_gpnpu = ort.InferenceSession(
+                self.model_path,
+                sess_options=session_options_gpnpu,
+                providers=["CPUExecutionProvider"]
+            )
+
+            # Prepare input; non-integer (symbolic) dimensions are replaced by 1.
+            input_a_info = session_cpu.get_inputs()[0]
+            shape_tuple_a = tuple(dim if isinstance(dim, int) else 1 for dim in input_a_info.shape)
+            x_data_a = np.random.randint(
+                low=-128, high=128, size=shape_tuple_a, dtype=np.int8
+            )
+            input_dict = {input_a_info.name: x_data_a}
+
+            # Time and run CPU inference
+            output_cpu = session_cpu.run(
+                [session_cpu.get_outputs()[0].name],
+                input_dict
+            )[0]
+            json_name_cpu = session_cpu.end_profiling()
+            self.cpu_jsons.append(json_name_cpu)
+
+            # Time and run GPNPU inference
+            output_gpnpu = session_gpnpu.run(
+                [session_gpnpu.get_outputs()[0].name],
+                input_dict
+            )[0]
+            json_name_gpnpu = session_gpnpu.end_profiling()
+            self.gpnpu_jsons.append(json_name_gpnpu)
+
+            # Calculate max difference. Widen to int32 first: subtracting int8
+            # arrays wraps around (127 - (-128) becomes -1), which would hide
+            # large mismatches.
+            max_diff = np.max(
+                np.abs(output_cpu.astype(np.int32) - output_gpnpu.astype(np.int32))
+            )
+
+            self.assertLessEqual(max_diff, 1)
+
+    def test_performance_and_accuracy(self):
+        # Run test
+        self.performance_and_accuracy_test(num_iterations=1)
+        self.json_time_profiling()
+
+    def json_time_profiling(self):
+        """Print mean and standard deviation of the QLinearGlobalAveragePool duration per mode."""
+        def get_time(profile_paths):
+            times = []
+            for profile_path in profile_paths:
+                cpu_df, gpu_df = json_to_df(load_json(profile_path), lambda x: True)
+                # Collect every QLinearGlobalAveragePool entry of the profile.
+                times.extend(cpu_df[cpu_df['name'] == 'QLinearGlobalAveragePool']['duration'].values)
+            return np.mean(np.array(times)), np.std(np.array(times))
+        cpu_mean_time, cpu_std_time = get_time(self.cpu_jsons)
+        gpnpu_mean_time, gpnpu_std_time = get_time(self.gpnpu_jsons)
+        # The profiler reports "dur" in microseconds, so label the numbers accordingly.
+        print(f"CPU Time:   {cpu_mean_time:8.3f} ± {cpu_std_time:.3f} us")
+        print(f"GPNPU Time: {gpnpu_mean_time:8.3f} ± {gpnpu_std_time:.3f} us")
+
+
+# Run the unittest runner when this file is executed directly as a script.
+if __name__ == '__main__':
+    unittest.main()
diff --git a/onnxruntime/test/python/gpnpumode/test_resnet50.py b/onnxruntime/test/python/gpnpumode/test_resnet50.py
new file mode 100644
index 0000000000..1232e279e0
--- /dev/null
+++ b/onnxruntime/test/python/gpnpumode/test_resnet50.py
@@ -0,0 +1,65 @@
+import numpy as np
+import onnxruntime as ort
+import time
+import os
+import sys
+# from tvm.contrib.epu.chimera_job.chimera_job import ChimeraJob
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from helper import json_to_df, load_json
+
+# Print the NumPy version in use; this runs at import time, before anything else.
+print(np.__version__)
+def run_ort(x_data, flag, onnx_file_path="resnet_50.onnx"):
+    """Run one inference of ``onnx_file_path`` and return the first output flattened.
+
+    Args:
+        x_data: Array fed to the model's first input.
+        flag: Value assigned to ``SessionOptions.enable_gpnpu``.
+        onnx_file_path: Path of the model to load.
+
+    Returns:
+        The first model output as a 1-D array.
+    """
+    # Create an inference session
+    session_options = ort.SessionOptions()
+    session_options.enable_gpnpu = flag
+    session_options.intra_op_num_threads = 16
+    # Profiling is not enabled here; the prefix only matters if it is switched on.
+    session_options.profile_file_prefix = str(16) + "gpnpu"
+    session = ort.InferenceSession(
+        onnx_file_path, sess_options=session_options, providers=["CPUExecutionProvider"]
+    )
+
+    # Feed the first declared input and fetch the first declared output.
+    input_name = session.get_inputs()[0].name
+    output_name = session.get_outputs()[0].name
+    output_data = session.run([output_name], {input_name: x_data})[0]
+    return output_data.flatten()
+
+if __name__ == "__main__":
+    # Feed the same random image-shaped tensor through both modes, save the
+    # two outputs, and report how far apart they are.
+    x_data = np.random.rand(1, 3, 224, 224).astype(np.float32)
+    print(x_data)
+    ort_cpu = run_ort(x_data, False)
+    ort_gpnpu = run_ort(x_data, True)
+    np.save("ort_cpu.npy", ort_cpu)
+    np.save("ort_gpnpu.npy", ort_gpnpu)
+
+    # Maximum absolute element-wise error. The abs must wrap the difference,
+    # not just the CPU output.
+    print(np.max(np.abs(ort_cpu - ort_gpnpu)))
diff --git a/onnxruntime/test/python/helper.py b/onnxruntime/test/python/helper.py
index 2a2c3fc9b4..1ce79cc2cf 100644
--- a/onnxruntime/test/python/helper.py
+++ b/onnxruntime/test/python/helper.py
@@ -1,5 +1,8 @@
 import os
-
+import onnx
+import numpy as np
+import json
+import pandas as pd
 
 def get_name(name):
     if os.path.exists(name):
@@ -13,3 +16,131 @@ def get_name(name):
     if os.path.exists(res):
         return res
     raise FileNotFoundError(f"Unable to find '{name}' or '{rel}' or '{res}'")
+
+def get_onnx_const(name, val, dtype=None):
+    """Wrap ``val`` in an ONNX ``TensorProto`` called ``name``.
+
+    Args:
+        name: Tensor name.
+        val: A NumPy array, or a single Python scalar.
+        dtype: ONNX element type for a scalar ``val``. When omitted, ints map to
+            INT8 and everything else to FLOAT. For an array ``val`` this
+            argument is ignored and the array's own dtype is used.
+
+    Returns:
+        The tensor built by ``onnx.helper.make_tensor``. Scalars get rank-0 dims.
+    """
+    if isinstance(val, np.ndarray):
+        # onnx.mapping.NP_TYPE_TO_TENSOR_TYPE is deprecated in favour of
+        # onnx.helper.np_dtype_to_tensor_dtype; fall back to the mapping only on
+        # onnx versions that predate the helper.
+        if hasattr(onnx.helper, "np_dtype_to_tensor_dtype"):
+            dtype = onnx.helper.np_dtype_to_tensor_dtype(val.dtype)
+        else:
+            dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[val.dtype]
+        dims = val.shape
+    else:
+        if not dtype:
+            dtype = onnx.TensorProto.INT8 if isinstance(val, int) else onnx.TensorProto.FLOAT
+        dims = ()
+        val = [val]
+
+    return onnx.helper.make_tensor(name=name, data_type=dtype, dims=dims, vals=val)
+
+def generate_normal_inputs(shape, dtype, mu=0, sigma=32, a_min=-127, a_max=127):
+    """Draw rounded normal samples, saturate them to ``[a_min, a_max]``, return as ``dtype``.
+
+    Args:
+        shape: Output shape.
+        dtype: NumPy dtype of the returned array.
+        mu, sigma: Mean and standard deviation of the normal distribution.
+        a_min, a_max: Inclusive clipping bounds.
+
+    Returns:
+        Array of ``dtype`` with every value inside ``[a_min, a_max]``.
+    """
+    samples = np.rint(np.random.normal(mu, sigma, shape))
+    # Clip while the values are still floating point: casting to a narrow
+    # integer type first would wrap out-of-range samples instead of saturating.
+    return np.clip(samples, a_min, a_max).astype(dtype)
+
+def load_json(profile_path):
+    """Read a UTF-8 JSON profile and return its event list.
+
+    If the top-level JSON value is an object, its ``"traceEvents"`` entry is
+    returned; any other top-level value is returned unchanged.
+    """
+    with open(profile_path, encoding="utf-8") as profile_file:
+        parsed = json.load(profile_file)
+    return parsed["traceEvents"] if isinstance(parsed, dict) else parsed
+
+def _shape_to_string(shape):
+    """Render a list of single-entry ``{type: dims}`` dicts as ``"type(d0xd1),type(d0)"``.
+
+    Raises:
+        ValueError: If any dict holds more than one entry.
+    """
+    rendered = []
+    for entry in shape:
+        if len(entry) > 1:
+            raise ValueError("Unhandled type in _shape_to_string()")
+        type_name = next(iter(entry.keys()))
+        dims = next(iter(entry.values()))
+        rendered.append(f'{type_name}({"x".join(str(d) for d in dims)})')
+    return ",".join(rendered)
+
+def json_to_df(data, filter_matcher):
+    """Split profiler events into a CPU DataFrame and a GPU-kernel DataFrame.
+
+    Args:
+        data: Iterable of event dicts (as returned by ``load_json``).
+        filter_matcher: Callable taking a name and returning truthy to keep it.
+
+    Returns:
+        ``(cpu_df, gpu_df)``. CPU rows carry ``name`` (the event's ``op_name``),
+        ``duration``, ``input_type_shape`` and ``output_type_shape``; GPU rows
+        carry ``name``, ``duration``, ``dimensions``, ``op_name`` and
+        ``input_type_shape``. Both frames get a constant ``count`` column of 1.
+    """
+    cpu_entries = []
+    gpu_entries = []
+
+    # Last "*kernel_time" event seen so far; "Kernel" events borrow its input shapes.
+    most_recent_kernel_launch_event = None
+    num_missing_kernel_launch_events = 0
+    total_kernel_events = 0
+
+    for item in data:
+        # Events without a category, a duration or an args dict are ignored.
+        cat = item.get("cat")
+        if cat is None:
+            continue
+        dur = item.get("dur")
+        if dur is None:
+            continue
+        arg = item.get("args")
+        if arg is None:
+            continue
+        op_name = arg.get("op_name")
+
+        name = item["name"]
+
+        # Skipped only when the name fails the filter AND an op_name is present
+        # that fails it too; an event without op_name is never skipped here.
+        if not filter_matcher(name) and op_name is not None and not filter_matcher(op_name):
+            continue
+
+        # Keep only "Kernel"-category events and events named "...kernel_time".
+        if cat != "Kernel" and not name.endswith("kernel_time"):
+            continue
+        if name.endswith("kernel_time"):
+            most_recent_kernel_launch_event = item
+
+        # Launch dimensions default to -1 when the event does not report them.
+        block_x = arg.get("block_x", -1)
+        block_y = arg.get("block_y", -1)
+        block_z = arg.get("block_z", -1)
+        grid_x = arg.get("grid_x", -1)
+        grid_y = arg.get("grid_y", -1)
+        grid_z = arg.get("grid_z", -1)
+
+        if cat == "Kernel":
+            gpu_entries.append(
+                {
+                    "name": name,
+                    "duration": dur,
+                    "dimensions": f"b{block_x}x{block_y}x{block_z},g{grid_x}x{grid_y}x{grid_z}",
+                    "op_name": op_name,
+                    "input_type_shape": (
+                        _shape_to_string(most_recent_kernel_launch_event["args"]["input_type_shape"])
+                        if most_recent_kernel_launch_event is not None
+                        else "unknown"
+                    ),
+                }
+            )
+            total_kernel_events += 1
+            # Entries whose name contains "hipMem" are not counted as missing shapes.
+            if gpu_entries[-1]["input_type_shape"] == "unknown" and "hipMem" not in gpu_entries[-1]["name"]:
+                num_missing_kernel_launch_events += 1
+        else:
+            # Non-"Kernel" events reaching this point are the "...kernel_time" ones;
+            # their args must contain op_name and both shape lists (KeyError otherwise).
+            cpu_entries.append(
+                {
+                    "name": item["args"]["op_name"],
+                    "duration": dur,
+                    "input_type_shape": _shape_to_string(item["args"]["input_type_shape"]),
+                    "output_type_shape": _shape_to_string(item["args"]["output_type_shape"]),
+                }
+            )
+
+    if num_missing_kernel_launch_events > 0:
+        print(
+            f"WARNING: Could not resolve shapes for {num_missing_kernel_launch_events} of {total_kernel_events} kernels."
+        )
+
+    cpu_df = pd.DataFrame(cpu_entries)
+    gpu_df = pd.DataFrame(gpu_entries)
+    cpu_df["count"] = 1
+    gpu_df["count"] = 1
+    return cpu_df, gpu_df
+
+def construct_filter_matcher(args):
+    """Build a predicate from ``args.filter`` that tells whether a name is selected.
+
+    Args:
+        args: Object with a ``filter`` attribute: ``None``, empty, or a list of
+            patterns. Patterns containing ``*``, ``?``, ``[`` or ``]`` are
+            matched with ``fnmatch``; all others must match exactly.
+
+    Returns:
+        A callable ``name -> bool``. With no filter it accepts every name.
+    """
+    # Imported locally because the module's import block does not bring in fnmatch.
+    import fnmatch
+
+    if args.filter is None or len(args.filter) == 0:
+        return lambda x: True
+    filter_list = args.filter
+    concrete_filter_set = set()
+    fnmatch_filter_set = set()
+    for pattern in filter_list:
+        if "*" in pattern or "?" in pattern or "[" in pattern or "]" in pattern:
+            fnmatch_filter_set.add(pattern)
+        else:
+            concrete_filter_set.add(pattern)
+
+    def _match_item(item):
+        # Exact names are a set lookup; only wildcard patterns need fnmatch.
+        if item in concrete_filter_set:
+            return True
+        return any(fnmatch.fnmatch(item, pattern) for pattern in fnmatch_filter_set)
+
+    return _match_item
diff --git a/onnxruntime/test/python/quantization/calibration.cache b/onnxruntime/test/python/quantization/calibration.cache
new file mode 100644
index 0000000000..592d63ff76
--- /dev/null
+++ b/onnxruntime/test/python/quantization/calibration.cache
@@ -0,0 +1 @@
+td 1.100000023841858
diff --git a/onnxruntime/test/python/quantization/calibration.flatbuffers b/onnxruntime/test/python/quantization/calibration.flatbuffers
new file mode 100644
index 0000000000..9bbe626650
Binary files /dev/null and b/onnxruntime/test/python/quantization/calibration.flatbuffers differ
diff --git a/onnxruntime/test/python/quantization/calibration.json b/onnxruntime/test/python/quantization/calibration.json
new file mode 100644
index 0000000000..30a0221f0d
--- /dev/null
+++ b/onnxruntime/test/python/quantization/calibration.json
@@ -0,0 +1 @@
+{"CLS": "TensorsData", "data": {"td": {"lowest": {"data": [0.10000000149011612], "dtype": "float32", "CLS": "numpy.array"}, "highest": {"data": [1.100000023841858], "dtype": "float32", "CLS": "numpy.array"}, "CLS": "TensorData"}}, "calibration_method": {"CLS": "CalibrationMethod", "value": "CalibrationMethod.MinMax"}}
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc b/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc
index ebef441350..b63c35ad5f 100644
--- a/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc
+++ b/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc
@@ -1,6 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
+#include <iostream>
 #define ORT_API_MANUAL_INIT
 #include "onnxruntime_cxx_api.h"
 #undef ORT_API_MANUAL_INIT
@@ -322,6 +322,117 @@ struct AttrTesterStringOp : Ort::CustomOpBase<AttrTesterStringOp, AttrTesterStri
   }
 };
 
+// Element-wise table lookup: output[i] = lut[input[i] + 128].
+// `output` is allocated with the shape of `input`. `lut` must hold at least
+// 256 entries, one for every possible int8 input value.
+void LookupTable(const Ort::Custom::Tensor<int8_t>& input,
+                 const Ort::Custom::Tensor<int8_t>& lut,
+                 Ort::Custom::Tensor<int8_t>& output) {
+  // Indices span [0, 255], so a shorter table would be read out of bounds.
+  CUSTOM_ENFORCE(lut.NumberOfElement() >= 256, "Lookup table must contain at least 256 values");
+
+  const auto& input_shape = input.Shape();
+  const auto* input_data = input.Data();
+  const auto* lut_data = lut.Data();
+  auto* output_data = output.Allocate(input_shape);
+
+  const int64_t element_count = input.NumberOfElement();
+  for (int64_t i = 0; i < element_count; ++i) {
+    // Shift the signed range [-128, 127] onto table indices [0, 255].
+    const int32_t ind = static_cast<int32_t>(input_data[i]) + 128;
+    output_data[i] = lut_data[ind];
+  }
+}
+
+
+// Kernel for the "LookupTable" custom op: maps every int8 input element through
+// a 256-entry table taken from the node's "lut" tensor attribute.
+struct LookupTableKernel {
+  // One table entry per possible int8 input value.
+  static constexpr size_t kLutSize = 256;
+
+  // Table contents, copied from the "lut" attribute by Init().
+  std::vector<int8_t> lut_values;
+
+  // Initialize from kernel info - this is called during CreateKernel.
+  // Reads the "lut" tensor attribute (INT8 or INT64, exactly 256 elements) into
+  // lut_values; INT64 entries are narrowed to int8_t. Any failure is reported
+  // through CUSTOM_ENFORCE.
+  // NOTE(review): a non-null OrtStatus returned by a failing API call is not
+  // released before CUSTOM_ENFORCE fires - confirm whether that matters here.
+  void Init(const OrtApi* api, const OrtKernelInfo* info) {
+    // Get default allocator
+    OrtAllocator* allocator = nullptr;
+    CUSTOM_ENFORCE(api->GetAllocatorWithDefaultOptions(&allocator) == nullptr,
+                   "Failed to get default allocator");
+
+    // Deleters for the API objects acquired below. Owning them through
+    // unique_ptr releases them on every exit path, including a throwing check.
+    auto release_value = [api](OrtValue* value) { api->ReleaseValue(value); };
+    auto release_shape_info = [api](OrtTensorTypeAndShapeInfo* shape) {
+      api->ReleaseTensorTypeAndShapeInfo(shape);
+    };
+
+    // Get the lookup table tensor attribute
+    OrtValue* raw_lut_tensor = nullptr;
+    CUSTOM_ENFORCE(api->KernelInfoGetAttribute_tensor(info, "lut", allocator, &raw_lut_tensor) == nullptr,
+                   "Failed to get lut tensor attribute");
+    std::unique_ptr<OrtValue, decltype(release_value)> lut_tensor(raw_lut_tensor, release_value);
+
+    OrtTensorTypeAndShapeInfo* raw_shape_info = nullptr;
+    CUSTOM_ENFORCE(api->GetTensorTypeAndShape(lut_tensor.get(), &raw_shape_info) == nullptr,
+                   "Failed to get tensor shape info");
+    std::unique_ptr<OrtTensorTypeAndShapeInfo, decltype(release_shape_info)> shape_info(raw_shape_info,
+                                                                                        release_shape_info);
+
+    ONNXTensorElementDataType tensor_type = ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
+    CUSTOM_ENFORCE(api->GetTensorElementType(shape_info.get(), &tensor_type) == nullptr,
+                   "Failed to get tensor element type");
+
+    size_t num_elements = 0;
+    CUSTOM_ENFORCE(api->GetTensorShapeElementCount(shape_info.get(), &num_elements) == nullptr,
+                   "Failed to get tensor element count");
+
+    CUSTOM_ENFORCE(num_elements == kLutSize, "Lookup table must contain exactly 256 values");
+
+    lut_values.resize(kLutSize);
+
+    void* raw_data = nullptr;
+    if (tensor_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8) {
+      CUSTOM_ENFORCE(api->GetTensorMutableData(lut_tensor.get(), &raw_data) == nullptr,
+                     "Failed to get tensor data");
+
+      const auto* tensor_data = static_cast<const int8_t*>(raw_data);
+      for (size_t i = 0; i < kLutSize; ++i) {
+        lut_values[i] = tensor_data[i];
+      }
+    } else if (tensor_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) {
+      CUSTOM_ENFORCE(api->GetTensorMutableData(lut_tensor.get(), &raw_data) == nullptr,
+                     "Failed to get tensor data");
+
+      const auto* tensor_data = static_cast<const int64_t*>(raw_data);
+      for (size_t i = 0; i < kLutSize; ++i) {
+        lut_values[i] = static_cast<int8_t>(tensor_data[i]);
+      }
+    } else {
+      CUSTOM_ENFORCE(false, "Unsupported tensor type for LUT attribute");
+    }
+    // shape_info and lut_tensor are released here by their guards.
+  }
+
+  // Writes output[i] = lut_values[input[i] + 128]; the output has the input's shape.
+  void Compute(OrtKernelContext* context) {
+    Ort::KernelContext ctx(context);
+    auto input = ctx.GetInput(0);
+    const auto* input_data = input.GetTensorData<int8_t>();
+    auto dimensions = input.GetTensorTypeAndShapeInfo().GetShape();
+    auto output = ctx.GetOutput(0, dimensions);
+    auto* output_data = output.GetTensorMutableData<int8_t>();
+    const size_t size = output.GetTensorTypeAndShapeInfo().GetElementCount();
+
+    for (size_t i = 0; i < size; i++) {
+      // Shift the signed range [-128, 127] onto table indices [0, 255] in a
+      // wide type, rather than relying on uint8_t wrap-around.
+      const auto index = static_cast<size_t>(static_cast<int32_t>(input_data[i]) + 128);
+      output_data[i] = lut_values[index];
+    }
+  }
+};
+
+// Custom-op descriptor for "LookupTable" on the CPU execution provider:
+// one int8 input, one int8 output of the same shape, backed by LookupTableKernel.
+struct LookupTableOp : Ort::CustomOpBase<LookupTableOp, LookupTableKernel> {
+  // Builds and initializes a kernel; ownership of the returned pointer passes to the caller.
+  void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const {
+    // Keep the kernel in a unique_ptr until Init() has run, so a throwing
+    // Init() does not leak it.
+    auto lookup_kernel = std::make_unique<LookupTableKernel>();
+    lookup_kernel->Init(&api, info);
+    return lookup_kernel.release();
+  }
+
+  const char* GetName() const { return "LookupTable"; }
+  const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; }
+
+  // Single int8 input.
+  size_t GetInputTypeCount() const { return 1; }
+  ONNXTensorElementDataType GetInputType(size_t) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; }
+
+  // Single int8 output.
+  size_t GetOutputTypeCount() const { return 1; }
+  ONNXTensorElementDataType GetOutputType(size_t) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; }
+
+  // The output shape is the shape of input 0.
+  static Ort::Status InferOutputShape(Ort::ShapeInferContext& ctx) {
+    ctx.SetOutputShape(0, ctx.GetInputShape(0));
+    return Ort::Status{nullptr};
+  }
+};
+
+
+
 void RegisterOps(Ort::CustomOpDomain& domain) {
   static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpOne{Ort::Custom::CreateLiteCustomOp<KernelOne>("CustomOpOne", "CPUExecutionProvider")};
   static const std::unique_ptr<OrtLiteCustomOp> c_CustomOpTwo{Ort::Custom::CreateLiteCustomOp("CustomOpTwo", "CPUExecutionProvider", KernelTwo)};
@@ -337,6 +448,8 @@ void RegisterOps(Ort::CustomOpDomain& domain) {
   static const std::unique_ptr<OrtLiteCustomOp> c_AtterTesterIntFloat{Ort::Custom::CreateLiteCustomOp("AttrTesterIntFloat", "CPUExecutionProvider", AttrTesterIntFloatCompute, AttrTesterIntFloatShapeInfer)};
   static const AttrTesterStringOp c_AtterTesterString;
 
+  static const LookupTableOp c_LookupTable;
+
 #if !defined(DISABLE_FLOAT8_TYPES)
   static const CustomOpOneFloat8 c_CustomOpOneFloat8;
   static const std::unique_ptr<OrtLiteCustomOp> c_FilterFloat8{Ort::Custom::CreateLiteCustomOp("FilterFloat8", "CPUExecutionProvider", FilterFloat8)};
@@ -354,6 +467,8 @@ void RegisterOps(Ort::CustomOpDomain& domain) {
   domain.Add(c_CopyTensorArrayCombined.get());
   domain.Add(c_AtterTesterIntFloat.get());
   domain.Add(&c_AtterTesterString);
+  domain.Add(&c_LookupTable);
+
 
 #if !defined(DISABLE_FLOAT8_TYPES)
   domain.Add(&c_CustomOpOneFloat8);
diff --git a/validation/qlinearconv_model.onnx b/validation/qlinearconv_model.onnx
new file mode 100644
index 0000000000..2a474788eb
Binary files /dev/null and b/validation/qlinearconv_model.onnx differ
diff --git a/validation/validate.py b/validation/validate.py
new file mode 100644
index 0000000000..e668797297
--- /dev/null
+++ b/validation/validate.py
@@ -0,0 +1,77 @@
+import numpy as np
+import onnxruntime as ort
+import time
+import os
+import sys
+from tvm.contrib.epu.chimera_job.chimera_job import ChimeraJob
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+print(np.__version__)
+def run_ort(flag, x_data, onnx_file_path="/Users/maggies/Desktop/resnet50_512_1024_int8_opset11.onnx"):
+    """Run one ONNX Runtime inference on the CPU execution provider.
+
+    Args:
+        flag: Value assigned to ``session_options.enable_gpnpu``.
+        x_data: Array fed to the model's first input.
+        onnx_file_path: Path of the ONNX model to load.
+
+    Returns:
+        The model's first output, flattened to 1-D.
+
+    Profiling is enabled on the session with the file prefix "16gpnpu"
+    and is ended again before this function returns.
+    """
+    session_options = ort.SessionOptions()
+    # NOTE(review): enable_gpnpu is not a stock onnxruntime option; it
+    # presumably requires the custom build from this change -- confirm.
+    session_options.enable_gpnpu = flag
+    session_options.enable_profiling = True
+    session_options.intra_op_num_threads = 16
+    session_options.profile_file_prefix = str(16)+"gpnpu"
+    session = ort.InferenceSession(onnx_file_path, sess_options = session_options, providers=["CPUExecutionProvider"])
+
+    # Only the first input and the first output of the model are used.
+    input_name = session.get_inputs()[0].name
+    output_name = session.get_outputs()[0].name
+    output_data = session.run([output_name], {input_name: x_data})[0]
+
+    # End profiling explicitly; the value it returns is not needed here,
+    # the call is kept for its side effect only.
+    session.end_profiling()
+    return output_data.flatten()
+
+def run_tvm(img_input, model_path):
+    """Analyze, compile and run model_path via ChimeraJob; return output '495' flattened."""
+    job = ChimeraJob(model_p=model_path, macs_per_pe=8, quiet_iss=False)
+    job.analyze_network()
+    job.compile(quiet=True)
+    print("compile finished!")
+
+    # img_input is bound to the input named "input"; only result '495' is kept.
+    harness_outputs = job.run_inference_harness(inputs={"input": img_input})
+    return harness_outputs['495'].flatten()
+
+if __name__ == "__main__":
+    # Validation driver: run the same random int8 input through ONNX Runtime
+    # (with enable_gpnpu on and off) and through the TVM/Chimera flow, and
+    # save each flattened output to a .npy file for offline comparison.
+    #
+    # np.random.rand() yields floats in [0, 1), which all truncate to 0 when
+    # cast to int8, so the input is drawn with randint over the full int8
+    # range instead (the upper bound is exclusive).
+    model_file = "qlinearconv_model.onnx"
+    x_data = np.random.randint(-128, 128, size=(1, 8, 128, 128), dtype=np.int8)
+
+    output_ort_gpnpu = run_ort(True, x_data, model_file)
+    output_ort_cpu = run_ort(False, x_data, model_file)
+    np.save("gpnpu.npy", output_ort_gpnpu)
+    np.save("cpu.npy", output_ort_cpu)
+
+    output_tvm = run_tvm(x_data, model_file)
+    print(output_tvm)
+    # run_tvm returns a flattened ndarray, which has no .keys(); printing the
+    # shape keeps a quick sanity check without raising AttributeError before
+    # the result is saved.
+    print(output_tvm.shape)
+    np.save("tvm.npy", output_tvm)