diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index 6df77cb132..33394eb924 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -83,6 +83,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int16_t, QuantizeLinear); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, UInt4x2, QuantizeLinear); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, Int4x2, QuantizeLinear); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLUT); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QLUT); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLinearLeakyRelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, int8_t, QLinearLeakyRelu); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kMSDomain, 1, uint8_t, QLinearSigmoid); diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_binary_op.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_binary_op.cc index c4c738960b..80460e77e7 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_binary_op.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_binary_op.cc @@ -6,6 +6,7 @@ #include "core/providers/common.h" #include "core/mlas/inc/mlas.h" #include "core/platform/threadpool.h" +#include "core/framework/op_kernel_context_internal.h" using onnxruntime::concurrency::ThreadPool; @@ -95,45 +96,95 @@ void QLinearImpl(OpKernelContext& context, double unit_cost, const ProcessBroadc template Status QLinearAdd::Compute(OpKernelContext* context) const { - const ProcessBroadcastSpanFuncs functors = { - [](BroadcastHelper& per_iter_bh) { - 
QLinearBroadcastHelper& qlbh = static_cast(per_iter_bh); - const T input0 = per_iter_bh.ScalarInput0(); - auto input1 = per_iter_bh.SpanInput1(); - auto output = per_iter_bh.OutputSpan(); - - MlasQLinearAdd(input1.data(), - qlbh.B_scale, static_cast(qlbh.B_zero_point), - &input0, - qlbh.A_scale, static_cast(qlbh.A_zero_point), - qlbh.C_scale, static_cast(qlbh.C_zero_point), - output.data(), output.size(), true); - }, - [](BroadcastHelper& per_iter_bh) { - QLinearBroadcastHelper& qlbh = static_cast(per_iter_bh); - auto input0 = per_iter_bh.SpanInput0(); - const T input1 = per_iter_bh.ScalarInput1(); - auto output = per_iter_bh.OutputSpan(); - MlasQLinearAdd(input0.data(), - qlbh.A_scale, static_cast(qlbh.A_zero_point), - &input1, - qlbh.B_scale, static_cast(qlbh.B_zero_point), - qlbh.C_scale, static_cast(qlbh.C_zero_point), - output.data(), output.size(), true); - }, - [](BroadcastHelper& per_iter_bh) { - QLinearBroadcastHelper& qlbh = static_cast(per_iter_bh); - auto input0 = per_iter_bh.SpanInput0(); - auto input1 = per_iter_bh.SpanInput1(); - auto output = per_iter_bh.OutputSpan(); - - MlasQLinearAdd(input0.data(), - qlbh.A_scale, static_cast(qlbh.A_zero_point), - input1.data(), - qlbh.B_scale, static_cast(qlbh.B_zero_point), - qlbh.C_scale, static_cast(qlbh.C_zero_point), - output.data(), output.size(), false); - }}; + auto* internal_context = dynamic_cast(context); + if (!internal_context) { + return Status(common::ONNXRUNTIME, common::FAIL, "Failed to cast OpKernelContext to OpKernelContextInternal"); + } + const auto& session_options = internal_context->GetSessionState().GetSessionOptions(); + // Test to see if we have access to enable_gpnpu flag + const bool gpnpu_flag = session_options.enable_gpnpu; + + const ProcessBroadcastSpanFuncs functors = gpnpu_flag ? 
ProcessBroadcastSpanFuncs{ + [](BroadcastHelper& per_iter_bh) { + QLinearBroadcastHelper& qlbh = static_cast(per_iter_bh); + const T input0 = per_iter_bh.ScalarInput0(); + auto input1 = per_iter_bh.SpanInput1(); + auto output = per_iter_bh.OutputSpan(); + + MlasQLinearAddFixedPoint(input1.data(), + qlbh.B_scale, static_cast(qlbh.B_zero_point), + &input0, + qlbh.A_scale, static_cast(qlbh.A_zero_point), + qlbh.C_scale, static_cast(qlbh.C_zero_point), + output.data(), output.size(), true); + }, + [](BroadcastHelper& per_iter_bh) { + QLinearBroadcastHelper& qlbh = static_cast(per_iter_bh); + auto input0 = per_iter_bh.SpanInput0(); + const T input1 = per_iter_bh.ScalarInput1(); + auto output = per_iter_bh.OutputSpan(); + + MlasQLinearAddFixedPoint(input0.data(), + qlbh.A_scale, static_cast(qlbh.A_zero_point), + &input1, + qlbh.B_scale, static_cast(qlbh.B_zero_point), + qlbh.C_scale, static_cast(qlbh.C_zero_point), + output.data(), output.size(), true); + }, + [](BroadcastHelper& per_iter_bh) { + QLinearBroadcastHelper& qlbh = static_cast(per_iter_bh); + auto input0 = per_iter_bh.SpanInput0(); + auto input1 = per_iter_bh.SpanInput1(); + auto output = per_iter_bh.OutputSpan(); + + MlasQLinearAddFixedPoint(input0.data(), + qlbh.A_scale, static_cast(qlbh.A_zero_point), + input1.data(), + qlbh.B_scale, static_cast(qlbh.B_zero_point), + qlbh.C_scale, static_cast(qlbh.C_zero_point), + output.data(), output.size(), false); + } + } : ProcessBroadcastSpanFuncs{ + [](BroadcastHelper& per_iter_bh) { + QLinearBroadcastHelper& qlbh = static_cast(per_iter_bh); + const T input0 = per_iter_bh.ScalarInput0(); + auto input1 = per_iter_bh.SpanInput1(); + auto output = per_iter_bh.OutputSpan(); + + MlasQLinearAdd(input1.data(), + qlbh.B_scale, static_cast(qlbh.B_zero_point), + &input0, + qlbh.A_scale, static_cast(qlbh.A_zero_point), + qlbh.C_scale, static_cast(qlbh.C_zero_point), + output.data(), output.size(), true); + }, + [](BroadcastHelper& per_iter_bh) { + QLinearBroadcastHelper& qlbh 
= static_cast(per_iter_bh); + auto input0 = per_iter_bh.SpanInput0(); + const T input1 = per_iter_bh.ScalarInput1(); + auto output = per_iter_bh.OutputSpan(); + + MlasQLinearAdd(input0.data(), + qlbh.A_scale, static_cast(qlbh.A_zero_point), + &input1, + qlbh.B_scale, static_cast(qlbh.B_zero_point), + qlbh.C_scale, static_cast(qlbh.C_zero_point), + output.data(), output.size(), true); + }, + [](BroadcastHelper& per_iter_bh) { + QLinearBroadcastHelper& qlbh = static_cast(per_iter_bh); + auto input0 = per_iter_bh.SpanInput0(); + auto input1 = per_iter_bh.SpanInput1(); + auto output = per_iter_bh.OutputSpan(); + + MlasQLinearAdd(input0.data(), + qlbh.A_scale, static_cast(qlbh.A_zero_point), + input1.data(), + qlbh.B_scale, static_cast(qlbh.B_zero_point), + qlbh.C_scale, static_cast(qlbh.C_zero_point), + output.data(), output.size(), false); + } + }; QLinearImpl(*context, 1.0, functors); diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc index e9924bf616..8f3ebcec7c 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.cc @@ -9,6 +9,7 @@ #include "core/util/math.h" #include "core/mlas/inc/mlas.h" #include +#include "core/framework/op_kernel_context_internal.h" using onnxruntime::concurrency::ThreadPool; @@ -55,6 +56,46 @@ Status ComputeQLinearGlobalAvgPool( return Status::OK(); } +template +Status ComputeQLinearGlobalAvgPoolFixedPoint( + const T8Bits* x, + float x_scale, + T8Bits x_zero_point, + T8Bits* y, + float y_scale, + T8Bits y_zero_point, + int64_t N, + int64_t C, + int64_t image_size, + bool channels_last, + concurrency::ThreadPool* tp) { + if (!channels_last || C == 1) { + auto worker = [=](std::ptrdiff_t first, std::ptrdiff_t last) { + const T8Bits* input = (const T8Bits*)(x + (first * image_size)); + T8Bits* output = (T8Bits*)(y + first); + 
std::vector acc_buffer(MlasQLinearSafePaddingElementCount(sizeof(int32_t), last - first)); + MlasQLinearGlobalAveragePoolNchwFixedPoint(input, x_scale, x_zero_point, output, y_scale, y_zero_point, last - first, narrow(image_size), acc_buffer.data()); + }; + concurrency::ThreadPool::TryParallelFor( + tp, static_cast(N * C), {1.0 * image_size, 1.0, 8.0 * image_size}, worker); + } else { + auto worker = [=](std::ptrdiff_t first, std::ptrdiff_t last) { + const T8Bits* input = x + first * C * image_size; + T8Bits* output = y + first * C; + std::vector acc_buffer(MlasQLinearSafePaddingElementCount(sizeof(int32_t), narrow(C))); + std::vector zero_buffer(MlasQLinearSafePaddingElementCount(sizeof(T8Bits), narrow(C)), 0); + MlasQLinearGlobalAveragePoolNhwcFixedPoint( + input, x_scale, x_zero_point, output, y_scale, y_zero_point, + last - first, narrow(image_size), narrow(C), narrow(C), acc_buffer.data(), zero_buffer.data()); + }; + concurrency::ThreadPool::TryParallelFor( + tp, static_cast(N), + {1.0 * image_size * C, 1.0 * C, 8.0 * image_size * C}, + worker); + } + return Status::OK(); +} + // GCC's unexplained behavior: // GCC wouldn't generate corresponding symbols versus function instances below when "--disable-exceptions" // and "--minimal-build" are combined on linux build. 
@@ -87,6 +128,32 @@ template Status ComputeQLinearGlobalAvgPool( bool channels_last, concurrency::ThreadPool* tp); +template Status ComputeQLinearGlobalAvgPoolFixedPoint( + const int8_t* x, + float x_scale, + int8_t x_zero_point, + int8_t* y, + float y_scale, + int8_t y_zero_point, + int64_t N, + int64_t C, + int64_t image_size, + bool channels_last, + concurrency::ThreadPool* tp); + +template Status ComputeQLinearGlobalAvgPoolFixedPoint( + const uint8_t* x, + float x_scale, + uint8_t x_zero_point, + uint8_t* y, + float y_scale, + uint8_t y_zero_point, + int64_t N, + int64_t C, + int64_t image_size, + bool channels_last, + concurrency::ThreadPool* tp); + Status QLinearGlobalAveragePool::Compute(OpKernelContext* context) const { const auto tensor_x_scale = context->Input(1); const auto tensor_x_zero_point = context->Input(2); @@ -124,14 +191,35 @@ Status QLinearGlobalAveragePool::Compute(OpKernelContext* context) const { const float y_scale = *(tensor_y_scale->Data()); auto dtype = X.GetElementType(); - if (dtype == ONNX_NAMESPACE::TensorProto_DataType_UINT8) { - return ComputeQLinearGlobalAvgPool(X.Data(), x_scale, *(tensor_x_zero_point->Data()), - Y.MutableData(), y_scale, *(tensor_y_zero_point->Data()), - N, C, image_size, channels_last_, tp); + + auto* internal_context = dynamic_cast(context); + if (!internal_context) { + return Status(common::ONNXRUNTIME, common::FAIL, "Failed to cast OpKernelContext to OpKernelContextInternal"); + } + const auto& session_options = internal_context->GetSessionState().GetSessionOptions(); + // Test to see if we have access to enable_gpnpu flag + const bool gpnpu_flag = session_options.enable_gpnpu; + + if (gpnpu_flag) { + if (dtype == ONNX_NAMESPACE::TensorProto_DataType_UINT8) { + return ComputeQLinearGlobalAvgPoolFixedPoint(X.Data(), x_scale, *(tensor_x_zero_point->Data()), + Y.MutableData(), y_scale, *(tensor_y_zero_point->Data()), + N, C, image_size, channels_last_, tp); + } else { + return 
ComputeQLinearGlobalAvgPoolFixedPoint(X.Data(), x_scale, *(tensor_x_zero_point->Data()), + Y.MutableData(), y_scale, *(tensor_y_zero_point->Data()), + N, C, image_size, channels_last_, tp); + } } else { - return ComputeQLinearGlobalAvgPool(X.Data(), x_scale, *(tensor_x_zero_point->Data()), - Y.MutableData(), y_scale, *(tensor_y_zero_point->Data()), - N, C, image_size, channels_last_, tp); + if (dtype == ONNX_NAMESPACE::TensorProto_DataType_UINT8) { + return ComputeQLinearGlobalAvgPool(X.Data(), x_scale, *(tensor_x_zero_point->Data()), + Y.MutableData(), y_scale, *(tensor_y_zero_point->Data()), + N, C, image_size, channels_last_, tp); + } else { + return ComputeQLinearGlobalAvgPool(X.Data(), x_scale, *(tensor_x_zero_point->Data()), + Y.MutableData(), y_scale, *(tensor_y_zero_point->Data()), + N, C, image_size, channels_last_, tp); + } } } diff --git a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.h b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.h index 2f491328a4..c80333fbf5 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.h +++ b/onnxruntime/contrib_ops/cpu/quantization/qlinear_global_average_pool.h @@ -35,5 +35,19 @@ Status ComputeQLinearGlobalAvgPool( bool channels_last, concurrency::ThreadPool* tp); +template +Status ComputeQLinearGlobalAvgPoolFixedPoint( + const T8Bits* x, + float x_scale, + T8Bits x_zero_point, + T8Bits* y, + float y_scale, + T8Bits y_zero_point, + int64_t N, + int64_t C, + int64_t image_size, + bool channels_last, + concurrency::ThreadPool* tp); + } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/quantization/quant_gemm.cc b/onnxruntime/contrib_ops/cpu/quantization/quant_gemm.cc index ff8ad09082..9ad6dfb568 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/quant_gemm.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/quant_gemm.cc @@ -8,6 +8,9 @@ #include "core/providers/cpu/quantization/matmul_integer_base.h" #include 
"core/quantization/quantization.h" #include "core/util/math_cpuonly.h" +#include "core/util/qmath.h" +#include "core/mlas/inc/mlas.h" +#include "core/framework/op_kernel_context_internal.h" namespace onnxruntime { namespace contrib { @@ -18,6 +21,14 @@ class QGemm : protected GemmBase, public MatMulIntegerBase { } Status Compute(OpKernelContext* context) const override { + auto* internal_context = dynamic_cast(context); + if (!internal_context) { + return Status(common::ONNXRUNTIME, common::FAIL, "Failed to cast OpKernelContext to OpKernelContextInternal"); + } + const auto& session_options = internal_context->GetSessionState().GetSessionOptions(); + // Test to see if we have access to enable_gpnpu flag + const bool gpnpu_flag = session_options.enable_gpnpu; + const auto* a = context->Input(IN_A); const auto* b = packed_b_ ? nullptr : context->Input(IN_B); const auto& b_shape = b ? b->Shape() : b_shape_; @@ -106,9 +117,17 @@ class QGemm : protected GemmBase, public MatMulIntegerBase { gemm_param.PerColumnZeroPoints = !IsScalarOr1ElementVector(b_zp); std::vector output_scales = ComputeOutputScale(a_scale, b_scale, y_scale); - std::optional scale_bias_proc_ptr; + + std::optional requant_proc_ptr_fixedpoint; + std::optional scale_bias_proc_ptr_fixedpoint; std::optional requant_proc_ptr; - SetPostProcessor(y_zp, N, output_scales, y, gemm_param, scale_bias_proc_ptr, requant_proc_ptr); + std::optional scale_bias_proc_ptr; + + if (gpnpu_flag) { + SetPostProcessorFixedPoint(y_zp, N, output_scales, y, gemm_param, scale_bias_proc_ptr_fixedpoint, requant_proc_ptr_fixedpoint); + } else { + SetPostProcessor(y_zp, N, output_scales, y, gemm_param, scale_bias_proc_ptr, requant_proc_ptr); + } MlasGemmBatch(gemm_shape, &gemm_param, 1, context->GetOperatorThreadPool()); return Status::OK(); @@ -210,6 +229,36 @@ class QGemm : protected GemmBase, public MatMulIntegerBase { gemm_param.OutputProcessor = &*scale_bias_proc_ptr; } } + static void SetPostProcessorFixedPoint(const Tensor* 
y_zp, + size_t out_lda, + const std::vector& output_scales, + Tensor* y, + MLAS_GEMM_QUANT_DATA_PARAMS& gemm_param, + std::optional& scale_bias_proc_ptr, + std::optional& requant_proc_ptr) { + if (nullptr != y_zp) { + bool is_y_signed = y->IsDataType(); + int32_t y_zero_point = is_y_signed ? *y_zp->Data() : *y_zp->Data(); + requant_proc_ptr.emplace( + y->MutableDataRaw(), + out_lda, + nullptr, + output_scales.data(), + output_scales.size() > 1, + y_zero_point, + is_y_signed); + gemm_param.OutputProcessor = &*requant_proc_ptr; + } else { + scale_bias_proc_ptr.emplace( + static_cast(y->MutableDataRaw()), + out_lda, + output_scales.data(), + nullptr, + MLAS_QGEMM_OUTPUT_MODE::ZeroMode, + output_scales.size() > 1 ? MLAS_QUANTIZATION_GRANULARITY::PerColumn : MLAS_QUANTIZATION_GRANULARITY::PerMatrix); + gemm_param.OutputProcessor = &*scale_bias_proc_ptr; + } + } }; ONNX_OPERATOR_TYPED_KERNEL_EX( diff --git a/onnxruntime/core/framework/op_kernel_context_internal.h b/onnxruntime/core/framework/op_kernel_context_internal.h index 64bd70465a..f0e7ff32a2 100644 --- a/onnxruntime/core/framework/op_kernel_context_internal.h +++ b/onnxruntime/core/framework/op_kernel_context_internal.h @@ -42,6 +42,11 @@ class OpKernelContextInternal : public OpKernelContext { return session_state_.GetUseDeterministicCompute(); } + // Add a getter method for session_state_ + const SessionState& GetSessionState() const { + return session_state_; + } + const SessionState* SubgraphSessionState(const std::string& attribute_name) { return session_state_.GetSubgraphSessionState(GetNodeIndex(), attribute_name); } diff --git a/onnxruntime/core/framework/session_options.h b/onnxruntime/core/framework/session_options.h index 8d4db36106..5bb3562240 100644 --- a/onnxruntime/core/framework/session_options.h +++ b/onnxruntime/core/framework/session_options.h @@ -80,6 +80,9 @@ struct SessionOptions { // set the execution order of the graph ExecutionOrder execution_order = ExecutionOrder::DEFAULT; + // set to 
true if emulating gpnpu + bool enable_gpnpu = false; + // enable profiling for this session. bool enable_profiling = false; diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 28ae64c4d5..c2533c797e 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -569,6 +569,56 @@ class MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR : public MLAS_QGEMM_OUTPUT_PROCESSO MLAS_QUANTIZATION_GRANULARITY QuantGran_; }; +class MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR_FIXEDPOINT : public MLAS_QGEMM_OUTPUT_PROCESSOR { +public: + MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR_FIXEDPOINT( + float* Output, + size_t LeadingDimensionOutput, + const float* Scale, + const float* Bias, + MLAS_QGEMM_OUTPUT_MODE Mode = MLAS_QGEMM_OUTPUT_MODE::ZeroMode, + MLAS_QUANTIZATION_GRANULARITY QuantGran = MLAS_QUANTIZATION_GRANULARITY::PerMatrix) : + Output_(Output), + LeadingDimensionOutput_(LeadingDimensionOutput), + Scale_(Scale), + Bias_(Bias), + OutputMode_(Mode), + QuantGran_(QuantGran) + { + } + + void + Process( + const int32_t* C, + size_t StartM, + size_t StartN, + size_t CountM, + size_t CountN, + size_t ldc + ) const override; + +private: + template + inline + void + ProcessImpl( + const int32_t* C, + size_t StartM, + size_t StartN, + size_t CountM, + size_t CountN, + size_t ldc + ) const; + +private: + float* Output_; + size_t LeadingDimensionOutput_; + const float* Scale_; + const float* Bias_; + MLAS_QGEMM_OUTPUT_MODE OutputMode_; + MLAS_QUANTIZATION_GRANULARITY QuantGran_; +}; + /** * @brief Supply matrices shape and data type information to quantized gemm functions * @@ -1268,6 +1318,24 @@ MlasRequantizeOutput( size_t CountN ); +template +void +MLASCALL +MlasRequantizeOutputFixedPoint( + const int32_t* Input, + size_t InputLeadingDimension, + OutputType* Output, + size_t OutputLeadingDimension, + const int32_t* Bias, + const float* Scale, + bool PerColumnScale, + OutputType ZeroPoint, + size_t StartM, + size_t StartN, + size_t CountM, + 
size_t CountN + ); + class MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR : public MLAS_QGEMM_OUTPUT_PROCESSOR { public: @@ -1318,6 +1386,56 @@ class MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR : public MLAS_QGEMM_OUTPUT_PROCESSOR bool OutputIsSigned_; }; +class MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR_FIXEDPOINT : public MLAS_QGEMM_OUTPUT_PROCESSOR +{ + public: + MLAS_QGEMM_REQUANT_OUTPUT_PROCESSOR_FIXEDPOINT( + void* Output, + size_t OutputLeadingDimension, + const int32_t* Bias, + const float* Scale, + bool PerColumnScale, + int32_t ZeroPoint, + bool OutputIsSigned) + : Output_(Output), + OutputLeadingDimension_(OutputLeadingDimension), + Bias_(Bias), + Scale_(Scale), + PerColumnScale_(PerColumnScale), + ZeroPoint_(ZeroPoint), + OutputIsSigned_(OutputIsSigned) + { + } + + void Process(const int32_t* C, + size_t StartM, + size_t StartN, + size_t CountM, + size_t CountN, + size_t ldc) const override + { + if(OutputIsSigned_){ + MlasRequantizeOutputFixedPoint(C, ldc, reinterpret_cast(Output_), OutputLeadingDimension_, + Bias_, Scale_, PerColumnScale_, static_cast(ZeroPoint_), + StartM, StartN, CountM, CountN); + } else { + MlasRequantizeOutputFixedPoint(C, ldc, reinterpret_cast(Output_), OutputLeadingDimension_, + Bias_, Scale_, PerColumnScale_, static_cast(ZeroPoint_), + StartM, StartN, CountM, CountN); + } + } + + + private: + void* Output_; + size_t OutputLeadingDimension_; + const int32_t* Bias_; + const float* Scale_; + bool PerColumnScale_; + int32_t ZeroPoint_; + bool OutputIsSigned_; +}; + void MLASCALL @@ -1368,6 +1486,39 @@ MlasQLinearGlobalAveragePoolNhwc( const T8Bits* ZeroBuffer ); +template +void +MLASCALL +MlasQLinearGlobalAveragePoolNchwFixedPoint( + const T8Bits* Input, + float ScaleInput, + int32_t ZeroPointInput, + T8Bits* Output, + float ScaleOutput, + int32_t ZeroPointOutput, + size_t Channels, + size_t ImageSize, + int32_t* AccumulateBuffer + ); + +template +void +MLASCALL +MlasQLinearGlobalAveragePoolNhwcFixedPoint( + const T8Bits* Input, + float ScaleInput, + 
int32_t ZeroPointInput, + T8Bits* Output, + float ScaleOutput, + int32_t ZeroPointOutput, + size_t Batch, + size_t ImageSize, + size_t Stride, + size_t Channels, + int32_t* AccumulateBuffer, + const T8Bits* ZeroBuffer + ); + // // InputA is of size N, // Input B is of size 1 if IsScalarB == true, otherwise it is of size N @@ -1389,6 +1540,23 @@ MlasQLinearAdd( bool IsScalarB ); +template +void +MLASCALL +MlasQLinearAddFixedPoint( + const DataType* InputA, + float ScaleA, + int32_t ZeroPointA, + const DataType* InputB, + float ScaleB, + int32_t ZeroPointB, + float ScaleC, + int32_t ZeroPointC, + DataType* OutputC, + size_t N, + bool IsScalarB + ); + template void MLASCALL diff --git a/onnxruntime/core/mlas/lib/qfunctions_helper.cpp b/onnxruntime/core/mlas/lib/qfunctions_helper.cpp new file mode 100644 index 0000000000..08bad5119a --- /dev/null +++ b/onnxruntime/core/mlas/lib/qfunctions_helper.cpp @@ -0,0 +1,47 @@ +#include "mlasi.h" +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// Copying logic from data_to_qfp in tvm/python/tvm/target/epu_fx_util.py +// For purposes of calculating number of frac bits needed to represent scale in quantize ops + +// Function to derive fractional bits +int deriveFractionalBits(double scalar, int qfpSize) { + int valueBits = qfpSize - 1; + + double intPart; + ::modf(scalar, &intPart); // Returns the frac part which we dont care about, int part gets stored in pointer + intPart = std::abs(intPart); + + int intBits = (intPart == 0) ? 
0 : static_cast(std::log2f(intPart)) + 1; + int fracBits = valueBits - intBits; + + assert(fracBits >= 0 && "Scalar cannot be represented in qfp format."); + + return fracBits; +} + +// Function to convert scalar to qfp +int scalarToQfp(double value, int fracBits) { + double frac, integer; + frac = ::modf(value, &integer); + + integer = static_cast(std::abs(integer)) << fracBits; + frac = std::roundf(std::abs(frac) * (1 << fracBits)); + + int qfp = static_cast(integer + frac); + if (value < 0) { + qfp *= -1; + } + + return qfp; +} diff --git a/onnxruntime/core/mlas/lib/qfunctions_helper.h b/onnxruntime/core/mlas/lib/qfunctions_helper.h new file mode 100644 index 0000000000..ab0ea38e6e --- /dev/null +++ b/onnxruntime/core/mlas/lib/qfunctions_helper.h @@ -0,0 +1,80 @@ +#include // For uint8_t, int32_t, etc. +#include +#include +#include +#include +#include +#include +#include +#include + +// Copying logic from fxRoundPosInf in cgc_ccl.hpp for custom round +template +inline int32_t customRound(const int32_t a) { + const int32_t zp5 = 1 << (aFracBits - 1); + return (a + zp5) >> aFracBits; +} + +// Function to derive fractional bits +int deriveFractionalBits(double scalar, int qfpSize); + +// Function to convert scalar to qfp +int scalarToQfp(double value, int fracBits); + +// Function to convert data to qfp +template +std::pair, int> dataToQfp( + const std::vector& data, int fracBits = -1, int qfpSize = 32, bool scalarAsFloat = true +) { + auto deriveFractionalBits = [qfpSize](double scalar) { + int valueBits = qfpSize - 1; + + double intPart; + ::modf(scalar, &intPart); + intPart = std::abs(intPart); + + int intBits = (intPart == 0) ? 
0 : static_cast(std::log2f(intPart)) + 1; + int fracBits = valueBits - intBits; + + assert(fracBits >= 0 && "Scalar cannot be represented in qfp format."); + + return fracBits; + }; + + auto scalarToQfp = [](double value, int fracBits) { + double frac, integer; + frac = ::modf(value, &integer); + + integer = static_cast(std::abs(integer)) << fracBits; + frac = std::roundf(std::abs(frac) * (1 << fracBits)); + + int qfp = static_cast(integer + frac); + if (value < 0) { + qfp *= -1; + } + + return qfp; + }; + + std::vector qfp; + if (data.size() != 1) { + if (fracBits == -1) { + fracBits = deriveFractionalBits(*std::max_element(data.begin(), data.end(), [](T a, T b) { return std::abs(a) < std::abs(b); })); + } + qfp.reserve(data.size()); + std::transform(data.begin(), data.end(), std::back_inserter(qfp), [fracBits, &scalarToQfp](T value) { + return scalarToQfp(value, fracBits); + }); + } else { + if (fracBits == -1) { + fracBits = deriveFractionalBits(data[0]); + } + if (scalarAsFloat) { + qfp.push_back(static_cast(data[0])); + } else { + qfp.push_back(scalarToQfp(data[0], fracBits)); + } + } + + return std::make_pair(qfp, fracBits); +} diff --git a/onnxruntime/core/mlas/lib/qgemm.h b/onnxruntime/core/mlas/lib/qgemm.h index 1ef5b5f741..2ae291d957 100644 --- a/onnxruntime/core/mlas/lib/qgemm.h +++ b/onnxruntime/core/mlas/lib/qgemm.h @@ -168,6 +168,22 @@ MlasGemmQuantKernel( bool ZeroMode ); +template +size_t +MlasGemmQuantKernelFixedPoint( + const typename KernelType::PackedAType* A, + const typename KernelType::PackedBType* B, + int32_t* C, + size_t PackedCountK, + size_t CountM, + size_t CountN, + size_t ldc, + const int32_t* RowSumBuffer, + const int32_t* ColumnSumBuffer, + const int32_t* ZeroPointB, + bool ZeroMode +); + /** * @brief Usually a wrapper of assembly/intrinsic kernel * of symmetric quant gemm diff --git a/onnxruntime/core/mlas/lib/qgemm_kernel_default.cpp b/onnxruntime/core/mlas/lib/qgemm_kernel_default.cpp index 8f4baaa0ff..769464ebbf 100644 --- 
a/onnxruntime/core/mlas/lib/qgemm_kernel_default.cpp +++ b/onnxruntime/core/mlas/lib/qgemm_kernel_default.cpp @@ -214,6 +214,63 @@ MlasGemmQuantKernel( return 1; } +template<> +size_t +MlasGemmQuantKernelFixedPoint( + const MLAS_GEMM_QUANT_KERNEL_DEFAULT::PackedAType* A, + const MLAS_GEMM_QUANT_KERNEL_DEFAULT::PackedBType* B, + int32_t* C, + size_t PackedCountK, + size_t CountM, + size_t CountN, + size_t ldc, + const int32_t* RowSumBuffer, + const int32_t* ColumnSumBuffer, + const int32_t* ZeroPointB, + bool ZeroMode + ) +{ + MLAS_UNREFERENCED_PARAMETER(CountM); + MLAS_UNREFERENCED_PARAMETER(ldc); + + // + // Process a single column of matrix B in a loop. + // + + while (CountN-- > 0) { + + int32_t Accumulator = *RowSumBuffer; + + if (ZeroPointB != nullptr) { + Accumulator *= *ZeroPointB++; + } + + Accumulator += *ColumnSumBuffer++; + + const auto* a = A; + + for (size_t k = 0; k < PackedCountK; k++) { + + Accumulator += a[0] * B[0]; + Accumulator += a[1] * B[1]; + Accumulator += a[2] * B[2]; + Accumulator += a[3] * B[3]; + + a += 4; + B += 4; + } + + if (!ZeroMode) { + Accumulator += C[0]; + } + + C[0] = Accumulator; + C += 1; + } + + return 1; +} + const MLAS_GEMM_QUANT_DISPATCH MlasGemmQuantDispatchDefault = { MlasGemmQuantOperation, nullptr, diff --git a/onnxruntime/core/mlas/lib/qladd.cpp b/onnxruntime/core/mlas/lib/qladd.cpp index 5dafa17c2a..73f148bb9c 100644 --- a/onnxruntime/core/mlas/lib/qladd.cpp +++ b/onnxruntime/core/mlas/lib/qladd.cpp @@ -19,6 +19,23 @@ Module Name: --*/ #include "qladd.h" +// #include "qfunctions_helper.h" + +#include "mlasi.h" +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "qfunctions_helper.h" + // Pure C++ helper, back off here in rare case. 
template @@ -58,6 +75,69 @@ MlasQLinearAddKernelRawHelper( } } +template +MLAS_FORCEINLINE +static +void +MlasQLinearAddKernelRawHelperFixedPoint( + const DataType* InputA, + float ScaleA, + int32_t ZeroPointA, + const DataType* InputB, + float ScaleB, + int32_t ZeroPointB, + float ScaleC, + int32_t ZeroPointC, + DataType* OutputC, + size_t N + ) +{ + int dequant_frac_bits = 16; // hard coded in tvm python + + std::vector ScaleValueVecA = {ScaleA/ScaleC}; // Create single-element vector + auto pairA = dataToQfp(ScaleValueVecA, -1, 32, false); // Returns std::make_pair(qfp, fracBits) + int fracBitsA = pairA.second; + int mulScaleA = fracBitsA - dequant_frac_bits; + int64_t* fpScaleA = new int64_t; + + std::vector ScaleValueVecB = {ScaleB/ScaleC}; // Create single-element vector + auto pairB = dataToQfp(ScaleValueVecB, -1, 32, false); // Returns std::make_pair(qfp, fracBits) + int fracBitsB = pairB.second; + int mulScaleB = fracBitsB - dequant_frac_bits; + int64_t* fpScaleB = new int64_t; + + int fracBits = (fracBitsA > fracBitsB) ? fracBitsA : fracBitsB; + + *fpScaleA = static_cast((ScaleA/ScaleC) * (1LL << fracBits)); + *fpScaleB = static_cast((ScaleB/ScaleC) * (1LL << fracBits)); + + const int32_t MinimumValue = std::numeric_limits::lowest(); + const int32_t MaximumValue = std::numeric_limits::max(); + + int64_t ValueB; + + int mulScale = (fracBitsA > fracBitsB) ? 
mulScaleA : mulScaleB; + + if (IsScalarB) { + ValueB = ((int64_t(InputB[0]) - ZeroPointB) * (*fpScaleB)) >> mulScale; + } + + // here + for (size_t n = 0; n < N; n++) { + int64_t ValueA = ((*fpScaleA) * (int64_t(InputA[n]) - ZeroPointA)) >> mulScale; + if (!IsScalarB) { + ValueB = ((*fpScaleB) * (int64_t(InputB[n]) - ZeroPointB)) >> mulScale; + } + int64_t ValueC = ValueA + ValueB; + + // ValueC = ValueC >> mulScaleC; + ValueC = customRound<16>(static_cast(ValueC)); + int32_t ValueCInt = static_cast(ValueC + ZeroPointC); + ValueCInt = std::min(std::max(ValueCInt, MinimumValue), MaximumValue); + OutputC[n] = (DataType)(ValueCInt); + } +} + #if defined(MLAS_NEON_INTRINSICS) bool MlasCalcQLinearAddParameters( @@ -717,6 +797,33 @@ MlasQLinearAddKernel( } } +template +static +void +MLASCALL +MlasQLinearAddKernelFixedPoint( + const DataType* InputA, + float ScaleA, + int32_t ZeroPointA, + const DataType* InputB, + float ScaleB, + int32_t ZeroPointB, + float ScaleC, + int32_t ZeroPointC, + DataType* OutputC, + size_t N, + bool IsScalarB + ) +{ + if (IsScalarB) { + MlasQLinearAddKernelRawHelperFixedPoint( + InputA, ScaleA, ZeroPointA, InputB, ScaleB, ZeroPointB, ScaleC, ZeroPointC, OutputC, N); + } else { + MlasQLinearAddKernelRawHelperFixedPoint( + InputA, ScaleA, ZeroPointA, InputB, ScaleB, ZeroPointB, ScaleC, ZeroPointC, OutputC, N); + } +} + template<> void MLASCALL @@ -767,6 +874,46 @@ MlasQLinearAdd( InputA, ScaleA, ZeroPointA, InputB, ScaleB, ZeroPointB, ScaleC, ZeroPointC, OutputC, N, IsScalarB); } +template<> +void +MLASCALL +MlasQLinearAddFixedPoint( + const int8_t* InputA, + float ScaleA, + int32_t ZeroPointA, + const int8_t* InputB, + float ScaleB, + int32_t ZeroPointB, + float ScaleC, + int32_t ZeroPointC, + int8_t* OutputC, + size_t N, + bool IsScalarB + ) +{ + MlasQLinearAddKernelFixedPoint(InputA, ScaleA, ZeroPointA, InputB, ScaleB, ZeroPointB, ScaleC, ZeroPointC, OutputC, N, IsScalarB); +} + +template<> +void +MLASCALL +MlasQLinearAddFixedPoint( + const 
uint8_t* InputA, + float ScaleA, + int32_t ZeroPointA, + const uint8_t* InputB, + float ScaleB, + int32_t ZeroPointB, + float ScaleC, + int32_t ZeroPointC, + uint8_t* OutputC, + size_t N, + bool IsScalarB + ) +{ + MlasQLinearAddKernelFixedPoint(InputA, ScaleA, ZeroPointA, InputB, ScaleB, ZeroPointB, ScaleC, ZeroPointC, OutputC, N, IsScalarB); +} + // // Function definition for platform usage // diff --git a/onnxruntime/core/mlas/lib/qlgavgpool.cpp b/onnxruntime/core/mlas/lib/qlgavgpool.cpp index e44d7ad25c..4905cd4cee 100644 --- a/onnxruntime/core/mlas/lib/qlgavgpool.cpp +++ b/onnxruntime/core/mlas/lib/qlgavgpool.cpp @@ -15,6 +15,9 @@ Module Name: --*/ #include "mlasi.h" +#include "qfunctions_helper.h" +#include + size_t MLASCALL @@ -240,8 +243,617 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch( CALCULATE_ACCUMULATE_VECTORS(); - vst1q_s32(acc, vacc_lo); - vst1q_s32(acc + 4, vacc_hi); + vst1q_s32(acc, vacc_lo); + vst1q_s32(acc + 4, vacc_hi); + } + finish_one_pass = true; + + i0 += step_next_group; + i1 += step_next_group; + i2 += step_next_group; + i3 += step_next_group; + i4 += step_next_group; + i5 += step_next_group; + i6 += step_next_group; + } + + if (ImageSize > 0) { + + switch (ImageSize) { + case 1: + i1 = (const uint8_t*)ZeroBuffer; /* fall through */ + case 2: + i2 = (const uint8_t*)ZeroBuffer; /* fall through */ + case 3: + i3 = (const uint8_t*)ZeroBuffer; /* fall through */ + case 4: + i4 = (const uint8_t*)ZeroBuffer; /* fall through */ + case 5: + i5 = (const uint8_t*)ZeroBuffer; /* fall through */ + case 6: + i6 = (const uint8_t*)ZeroBuffer; /* fall through */ + default: + break; + } + + int32_t* acc = AccumulateBuffer; + size_t c = Channels; + for (; c >= 8; c -= 8) { + + LOAD_FULL_CHANNELS(); + + CALCULATE_ACCUMULATE_VECTORS(); + + vst1q_s32(acc, vacc_lo); + vst1q_s32(acc + 4, vacc_hi); + acc += 8; + } + + if (c > 0) { + + const uint8x8_t vi0 = + vld1_u8(((i0 >= LastOf8U8) ? 
(const uint8_t*)memcpy(tail, i0, c) : i0)); + const uint8x8_t vi1 = vld1_u8( + ((1 < ImageSize && i1 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i1, c) : i1)); + const uint8x8_t vi2 = vld1_u8( + ((2 < ImageSize && i2 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i2, c) : i2)); + const uint8x8_t vi3 = vld1_u8( + ((3 < ImageSize && i3 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i3, c) : i3)); + const uint8x8_t vi4 = vld1_u8( + ((4 < ImageSize && i4 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i4, c) : i4)); + const uint8x8_t vi5 = vld1_u8( + ((5 < ImageSize && i5 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i5, c) : i5)); + const uint8x8_t vi6 = vld1_u8( + ((6 < ImageSize && i6 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i6, c) : i6)); + + CALCULATE_ACCUMULATE_VECTORS(); + + vst1q_s32(acc, vacc_lo); + vst1q_s32(acc + 4, vacc_hi); + } + } + MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &Scale, false, + Output_zero_point, 0, 0, 1, Channels); +} + +template +void +MLASCALL +MlasQLinearGlobalAveragePoolNchwFixedPoint( + const T8Bits* Input, + float ScaleInput, + int32_t ZeroPointInput, + T8Bits* Output, + float ScaleOutput, + int32_t ZeroPointOutput, + size_t Channels, + size_t ImageSize, + int32_t* AccumulateBuffer + ) +{ + float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize); + int32_t bias[] = {-ZeroPointInput * static_cast(ImageSize), 0, 0, 0}; + const int32x4_t vbias = vld1q_s32(bias); + const int32x4_t vzero = vmovq_n_s32(0); + const uint8_t* InputU8 = (const uint8_t*)(Input); + + int32_t* sum_buffer = AccumulateBuffer; + uint8_t tail_buffer[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + for (size_t c = Channels; c > 0; c--) { + + int32x4_t vacc_lo = vbias; + int32x4_t vacc_hi = vzero; + auto Len = ImageSize; + for (; Len >= 32; Len -= 32) { + + const uint8x8_t vi0 = vld1_u8(InputU8); + const uint8x8_t vi1 = vld1_u8(InputU8 + 8); + const uint8x8_t vi2 = vld1_u8(InputU8 + 16); + const uint8x8_t vi3 = 
vld1_u8(InputU8 + 24); + + int16x8_t vsum; + if constexpr (std::is_signed::value) { + + const int16x8_t vs01 = vaddl_s8(vreinterpret_s8_u8(vi0), vreinterpret_s8_u8(vi1)); + const int16x8_t vs23 = vaddl_s8(vreinterpret_s8_u8(vi2), vreinterpret_s8_u8(vi3)); + vsum = vaddq_s16(vs01, vs23); + } else { + + const uint16x8_t vs01 = vaddl_u8(vi0, vi1); + const uint16x8_t vs23 = vaddl_u8(vi2, vi3); + vsum = vreinterpretq_s16_u16(vaddq_u16(vs01, vs23)); + } + + vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum)); + vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum)); + InputU8 += 32; + } + for (; Len >= 8; Len -= 8) { + + int16x8_t vsum; + if constexpr (std::is_signed::value) { + vsum = vmovl_s8(vreinterpret_s8_u8(vld1_u8(InputU8))); + } else { + vsum = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(InputU8))); + } + vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum)); + vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum)); + InputU8 += 8; + } + + if (Len > 0) { + + memcpy(tail_buffer, InputU8, Len); + int16x8_t vsum; + if constexpr (std::is_signed::value) { + vsum = vmovl_s8(vreinterpret_s8_u8(vld1_u8(tail_buffer))); + } else { + vsum = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(tail_buffer))); + } + + vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum)); + vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum)); + InputU8 += Len; + } + + vacc_lo = vaddq_s32(vacc_lo, vacc_hi); + int32x2_t vacc = vadd_s32(vget_high_s32(vacc_lo), vget_low_s32(vacc_lo)); + *sum_buffer++ = vget_lane_s32(vpadd_s32(vacc, vacc), 0); + } + + MlasRequantizeOutputFixedPoint(AccumulateBuffer, Channels, Output, Channels, nullptr, &scale, false, + static_cast(ZeroPointOutput), 0, 0, 1, Channels); +} + +template +MLAS_FORCEINLINE +void +MlasQLinearGlobalAveragePoolNhwcSingleBatchFixedPoint( + const T8Bits* Input, + T8Bits* Output, + const T8Bits* LastOf8, + size_t ImageSize, + size_t Channels, + size_t Stride, + int32_t Bias, + float Scale, + T8Bits Output_zero_point, + int32_t* AccumulateBuffer, + const T8Bits* ZeroBuffer + ) +{ 
+#define LOAD_FULL_CHANNELS() \ + const uint8x8_t vi0 = vld1_u8(i0); \ + i0 += 8; \ + const uint8x8_t vi1 = vld1_u8(i1); \ + i1 += 8; \ + const uint8x8_t vi2 = vld1_u8(i2); \ + i2 += 8; \ + const uint8x8_t vi3 = vld1_u8(i3); \ + i3 += 8; \ + const uint8x8_t vi4 = vld1_u8(i4); \ + i4 += 8; \ + const uint8x8_t vi5 = vld1_u8(i5); \ + i5 += 8; \ + const uint8x8_t vi6 = vld1_u8(i6); \ + i6 += 8 + +#define CALCULATE_ACCUMULATE_VECTORS() \ + int32x4_t vacc_lo = finish_one_pass ? vld1q_s32(acc) : vbias; \ + int32x4_t vacc_hi = finish_one_pass ? vld1q_s32(acc + 4) : vbias; \ + int16x8_t vsum; \ + if constexpr (std::is_signed::value) { \ + const int16x8_t vsum01 = vaddl_s8(vreinterpret_s8_u8(vi0), vreinterpret_s8_u8(vi1)); \ + const int16x8_t vsum23 = vaddl_s8(vreinterpret_s8_u8(vi2), vreinterpret_s8_u8(vi3)); \ + const int16x8_t vsum45 = vaddl_s8(vreinterpret_s8_u8(vi4), vreinterpret_s8_u8(vi5)); \ + const int16x8_t vsum016 = vaddw_s8(vsum01, vreinterpret_s8_u8(vi6)); \ + const int16x8_t vsum2345 = vaddq_s16(vsum23, vsum45); \ + vsum = vaddq_s16(vsum016, vsum2345); \ + } else { \ + const uint16x8_t vsum01 = vaddl_u8(vi0, vi1); \ + const uint16x8_t vsum23 = vaddl_u8(vi2, vi3); \ + const uint16x8_t vsum45 = vaddl_u8(vi4, vi5); \ + const uint16x8_t vsum016 = vaddw_u8(vsum01, vi6); \ + const uint16x8_t vsum2345 = vaddq_u16(vsum23, vsum45); \ + vsum = vreinterpretq_s16_u16(vaddq_u16(vsum016, vsum2345)); \ + } \ + vacc_lo = vaddw_s16(vacc_lo, vget_low_s16(vsum)); \ + vacc_hi = vaddw_s16(vacc_hi, vget_high_s16(vsum)) + + uint8_t tail[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + const int32x4_t vbias = vld1q_dup_s32(&Bias); + bool finish_one_pass = false; + const size_t step_next_group = 7 * Stride - (Channels & ~size_t{7}); + + const uint8_t* LastOf8U8 = (const uint8_t*)LastOf8; + const uint8_t* i0 = (const uint8_t*)Input; + const uint8_t* i1 = i0 + Stride; + const uint8_t* i4 = i0 + Stride * 4; + const uint8_t* i2 = i1 + Stride; + const uint8_t* i5 = i4 + Stride; + const uint8_t* i3 = i2 + 
Stride; + const uint8_t* i6 = i5 + Stride; + + for (; ImageSize > 7; ImageSize -= 7) { + + int32_t* acc = AccumulateBuffer; + size_t c = Channels; + for (; c >= 8; c -= 8) { + + LOAD_FULL_CHANNELS(); + + CALCULATE_ACCUMULATE_VECTORS(); + + vst1q_s32(acc, vacc_lo); + vst1q_s32(acc + 4, vacc_hi); + acc += 8; + } + if (c > 0) { + + const uint8x8_t vi0 = vld1_u8(((i0 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i0, c) : i0)); + const uint8x8_t vi1 = vld1_u8(((i1 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i1, c) : i1)); + const uint8x8_t vi2 = vld1_u8(((i2 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i2, c) : i2)); + const uint8x8_t vi3 = vld1_u8(((i3 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i3, c) : i3)); + const uint8x8_t vi4 = vld1_u8(((i4 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i4, c) : i4)); + const uint8x8_t vi5 = vld1_u8(((i5 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i5, c) : i5)); + const uint8x8_t vi6 = vld1_u8(((i6 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i6, c) : i6)); + + CALCULATE_ACCUMULATE_VECTORS(); + + vst1q_s32(acc, vacc_lo); + vst1q_s32(acc + 4, vacc_hi); + } + finish_one_pass = true; + + i0 += step_next_group; + i1 += step_next_group; + i2 += step_next_group; + i3 += step_next_group; + i4 += step_next_group; + i5 += step_next_group; + i6 += step_next_group; + } + + if (ImageSize > 0) { + + switch (ImageSize) { + case 1: + i1 = (const uint8_t*)ZeroBuffer; /* fall through */ + case 2: + i2 = (const uint8_t*)ZeroBuffer; /* fall through */ + case 3: + i3 = (const uint8_t*)ZeroBuffer; /* fall through */ + case 4: + i4 = (const uint8_t*)ZeroBuffer; /* fall through */ + case 5: + i5 = (const uint8_t*)ZeroBuffer; /* fall through */ + case 6: + i6 = (const uint8_t*)ZeroBuffer; /* fall through */ + default: + break; + } + + int32_t* acc = AccumulateBuffer; + size_t c = Channels; + for (; c >= 8; c -= 8) { + + LOAD_FULL_CHANNELS(); + + CALCULATE_ACCUMULATE_VECTORS(); + + vst1q_s32(acc, vacc_lo); + vst1q_s32(acc + 4, vacc_hi); + acc += 8; + } + 
+ if (c > 0) { + + const uint8x8_t vi0 = + vld1_u8(((i0 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i0, c) : i0)); + const uint8x8_t vi1 = vld1_u8( + ((1 < ImageSize && i1 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i1, c) : i1)); + const uint8x8_t vi2 = vld1_u8( + ((2 < ImageSize && i2 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i2, c) : i2)); + const uint8x8_t vi3 = vld1_u8( + ((3 < ImageSize && i3 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i3, c) : i3)); + const uint8x8_t vi4 = vld1_u8( + ((4 < ImageSize && i4 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i4, c) : i4)); + const uint8x8_t vi5 = vld1_u8( + ((5 < ImageSize && i5 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i5, c) : i5)); + const uint8x8_t vi6 = vld1_u8( + ((6 < ImageSize && i6 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i6, c) : i6)); + + CALCULATE_ACCUMULATE_VECTORS(); + + vst1q_s32(acc, vacc_lo); + vst1q_s32(acc + 4, vacc_hi); + } + } + MlasRequantizeOutputFixedPoint(AccumulateBuffer, Channels, Output, Channels, nullptr, &Scale, false, + Output_zero_point, 0, 0, 1, Channels); +} + +#elif defined(MLAS_SSE2_INTRINSICS) + +template +void MLASCALL +MlasQLinearGlobalAveragePoolNchw( + const T8Bits* Input, + float ScaleInput, + int32_t ZeroPointInput, + T8Bits* Output, + float ScaleOutput, + int32_t ZeroPointOutput, + size_t Channels, + size_t ImageSize, + int32_t* AccumulateBuffer + ) +{ + float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize); + const int32_t bias[] = {-ZeroPointInput * static_cast(ImageSize), 0, 0, 0}; + const auto vbias = _mm_loadu_si128((const __m128i*)&bias); + const auto vzero = _mm_setzero_si128(); + uint8_t buffer[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + + int32_t* sum_buffer = AccumulateBuffer; + for (size_t c = Channels; c > 0; c--) { + + __m128i vacc_lo = vbias; + __m128i vacc_hi = vzero; + auto Len = ImageSize; + for (; Len >= 32; Len -= 32) { + + const __m128i vi0 = _mm_loadl_epi64((const __m128i*)Input); + const __m128i vi1 = 
_mm_loadl_epi64((const __m128i*)(Input + 8)); + const __m128i vi2 = _mm_loadl_epi64((const __m128i*)(Input + 16)); + const __m128i vi3 = _mm_loadl_epi64((const __m128i*)(Input + 24)); + + if constexpr (std::is_signed::value) { + + const __m128i vxi0 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi0), 8); + const __m128i vxi1 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi1), 8); + const __m128i vxi2 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi2), 8); + const __m128i vxi3 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi3), 8); + const __m128i vsum = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1), + _mm_add_epi16(vxi2, vxi3)); + vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16)); + vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16)); + } else { + + const __m128i vxi0 = _mm_unpacklo_epi8(vi0, vzero); + const __m128i vxi1 = _mm_unpacklo_epi8(vi1, vzero); + const __m128i vxi2 = _mm_unpacklo_epi8(vi2, vzero); + const __m128i vxi3 = _mm_unpacklo_epi8(vi3, vzero); + const __m128i vsum = _mm_add_epi16(_mm_add_epi16(vxi0, vxi1), + _mm_add_epi16(vxi2, vxi3)); + vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero)); + vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero)); + } + + Input += 32; + } + for (; Len >= 8; Len -= 8) { + + if constexpr (std::is_signed::value) { + + const __m128i vsum = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, _mm_loadl_epi64((const __m128i*)Input)), 8); + vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16)); + vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16)); + } else { + + const __m128i vsum = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)Input), vzero); + vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero)); + vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero)); + } + + Input += 8; + } + if (Len > 0) { + + memcpy(buffer, Input, Len); + + if constexpr (std::is_signed::value) { + 
+ const __m128i vsum = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, _mm_loadl_epi64((const __m128i*)buffer)), 8); + vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16)); + vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16)); + } else { + + const __m128i vsum = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)buffer), vzero); + vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero)); + vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero)); + } + + Input += Len; + } + + __m128i vacc = _mm_add_epi32(vacc_lo, vacc_hi); // [ D C | B A ] + __m128i vshuf = _mm_shuffle_epi32(vacc, _MM_SHUFFLE(2, 3, 0, 1)); // [ C D | A B ] + __m128i vsums = _mm_add_epi32(vacc, vshuf); // [ D+C C+D | B+A A+B ] + vshuf = _mm_shuffle_epi32(vsums, _MM_SHUFFLE(1, 0, 3, 2)); // [ B+A A+B | D+C C+D ] + vsums = _mm_add_epi32(vsums, vshuf); + *sum_buffer++ = _mm_cvtsi128_si32(vsums); + } + + MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &scale, false, + static_cast(ZeroPointOutput), 0, 0, 1, Channels); +} + +template +MLAS_FORCEINLINE +void +MlasQLinearGlobalAveragePoolNhwcSingleBatch( + const T8Bits* Input, + T8Bits* Output, + const T8Bits* LastOf8, + size_t ImageSize, + size_t Channels, + size_t Stride, + int32_t Bias, + float Scale, + T8Bits Output_zero_point, + int32_t* AccumulateBuffer, + const T8Bits* ZeroBuffer + ) +{ +#if defined(MLAS_TARGET_IX86) + + constexpr size_t PixelsPerIteration = 4; + +#define LOAD_FULL_CHANNELS() \ + const __m128i vi0 = _mm_loadl_epi64((const __m128i*)i0); \ + i0 += 8; \ + const __m128i vi1 = _mm_loadl_epi64((const __m128i*)i1); \ + i1 += 8; \ + const __m128i vi2 = _mm_loadl_epi64((const __m128i*)i2); \ + i2 += 8; \ + const __m128i vi3 = _mm_loadl_epi64((const __m128i*)i3); \ + i3 += 8; + +#define CALCULATE_ACCUMULATE_VECTORS() \ + __m128i vacc_lo = finish_one_pass ? _mm_loadu_si128((__m128i*)acc) : vbias; \ + __m128i vacc_hi = finish_one_pass ? 
_mm_loadu_si128(((__m128i*)acc) + 1) : vbias; \ + __m128i vxi0; \ + __m128i vxi1; \ + __m128i vxi2; \ + __m128i vxi3; \ + if constexpr (std::is_signed::value) { \ + vxi0 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi0), 8); \ + vxi1 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi1), 8); \ + vxi2 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi2), 8); \ + vxi3 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi3), 8); \ + } else { \ + vxi0 = _mm_unpacklo_epi8(vi0, vzero); \ + vxi1 = _mm_unpacklo_epi8(vi1, vzero); \ + vxi2 = _mm_unpacklo_epi8(vi2, vzero); \ + vxi3 = _mm_unpacklo_epi8(vi3, vzero); \ + } \ + __m128i vsum01 = _mm_add_epi16(vxi0, vxi1); \ + __m128i vsum23 = _mm_add_epi16(vxi2, vxi3); \ + __m128i vsum = _mm_add_epi16(vsum01, vsum23); \ + \ + if constexpr (std::is_signed::value) { \ + vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16)); \ + vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16)); \ + } else { \ + vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero)); \ + vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero)); \ + } + +#else + + constexpr size_t PixelsPerIteration = 7; +#define LOAD_FULL_CHANNELS() \ + const __m128i vi0 = _mm_loadl_epi64((const __m128i*)i0); \ + i0 += 8; \ + const __m128i vi1 = _mm_loadl_epi64((const __m128i*)i1); \ + i1 += 8; \ + const __m128i vi2 = _mm_loadl_epi64((const __m128i*)i2); \ + i2 += 8; \ + const __m128i vi3 = _mm_loadl_epi64((const __m128i*)i3); \ + i3 += 8; \ + const __m128i vi4 = _mm_loadl_epi64((const __m128i*)i4); \ + i4 += 8; \ + const __m128i vi5 = _mm_loadl_epi64((const __m128i*)i5); \ + i5 += 8; \ + const __m128i vi6 = _mm_loadl_epi64((const __m128i*)i6); \ + i6 += 8 + +#define CALCULATE_ACCUMULATE_VECTORS() \ + __m128i vacc_lo = finish_one_pass ? _mm_loadu_si128((__m128i*)acc) : vbias; \ + __m128i vacc_hi = finish_one_pass ? 
_mm_loadu_si128(((__m128i*)acc) + 1) : vbias; \ + __m128i vxi0; \ + __m128i vxi1; \ + __m128i vxi2; \ + __m128i vxi3; \ + __m128i vxi4; \ + __m128i vxi5; \ + __m128i vxi6; \ + if constexpr (std::is_signed::value) { \ + vxi0 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi0), 8); \ + vxi1 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi1), 8); \ + vxi2 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi2), 8); \ + vxi3 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi3), 8); \ + vxi4 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi4), 8); \ + vxi5 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi5), 8); \ + vxi6 = _mm_srai_epi16(_mm_unpacklo_epi8(vzero, vi6), 8); \ + } else { \ + vxi0 = _mm_unpacklo_epi8(vi0, vzero); \ + vxi1 = _mm_unpacklo_epi8(vi1, vzero); \ + vxi2 = _mm_unpacklo_epi8(vi2, vzero); \ + vxi3 = _mm_unpacklo_epi8(vi3, vzero); \ + vxi4 = _mm_unpacklo_epi8(vi4, vzero); \ + vxi5 = _mm_unpacklo_epi8(vi5, vzero); \ + vxi6 = _mm_unpacklo_epi8(vi6, vzero); \ + } \ + const __m128i vsum01 = _mm_add_epi16(vxi0, vxi1); \ + const __m128i vsum23 = _mm_add_epi16(vxi2, vxi3); \ + const __m128i vsum45 = _mm_add_epi16(vxi4, vxi5); \ + const __m128i vsum016 = _mm_add_epi16(vsum01, vxi6); \ + const __m128i vsum2345 = _mm_add_epi16(vsum23, vsum45); \ + const __m128i vsum = _mm_add_epi16(vsum016, vsum2345); \ + if constexpr (std::is_signed::value) { \ + vacc_lo = _mm_add_epi32(vacc_lo, _mm_srai_epi32(_mm_unpacklo_epi16(vzero, vsum), 16)); \ + vacc_hi = _mm_add_epi32(vacc_hi, _mm_srai_epi32(_mm_unpackhi_epi16(vzero, vsum), 16)); \ + } else { \ + vacc_lo = _mm_add_epi32(vacc_lo, _mm_unpacklo_epi16(vsum, vzero)); \ + vacc_hi = _mm_add_epi32(vacc_hi, _mm_unpackhi_epi16(vsum, vzero)); \ + } + +#endif + + T8Bits tail[8] = {0, 0, 0, 0, 0, 0, 0, 0}; + bool finish_one_pass = false; + const __m128i vbias = _mm_set1_epi32(Bias); + const __m128i vzero = _mm_setzero_si128(); + size_t step_next_group = PixelsPerIteration * Stride - (Channels & ~size_t{7}); + + const T8Bits* i0 = Input; + const T8Bits* i1 = i0 + 
Stride; + const T8Bits* i2 = i1 + Stride; + const T8Bits* i3 = i2 + Stride; +#if !defined(MLAS_TARGET_IX86) + const T8Bits* i4 = i0 + Stride * 4; + const T8Bits* i5 = i4 + Stride; + const T8Bits* i6 = i5 + Stride; +#endif + + for (; ImageSize > PixelsPerIteration; ImageSize -= PixelsPerIteration) { + + int32_t* acc = AccumulateBuffer; + size_t c = Channels; + for (; c >= 8; c -= 8) { + + LOAD_FULL_CHANNELS(); + + CALCULATE_ACCUMULATE_VECTORS(); + + _mm_storeu_si128((__m128i*)acc, vacc_lo); + _mm_storeu_si128(((__m128i*)acc) + 1, vacc_hi); + acc += 8; + } + if (c > 0) { + const __m128i vi0 = + _mm_loadl_epi64((const __m128i*)(i0 >= LastOf8 ? memcpy(tail, i0, c) : i0)); + const __m128i vi1 = + _mm_loadl_epi64((const __m128i*)(i1 >= LastOf8 ? memcpy(tail, i1, c) : i1)); + const __m128i vi2 = + _mm_loadl_epi64((const __m128i*)(i2 >= LastOf8 ? memcpy(tail, i2, c) : i2)); + const __m128i vi3 = + _mm_loadl_epi64((const __m128i*)(i3 >= LastOf8 ? memcpy(tail, i3, c) : i3)); +#if !defined(MLAS_TARGET_IX86) + const __m128i vi4 = + _mm_loadl_epi64((const __m128i*)(i4 >= LastOf8 ? memcpy(tail, i4, c) : i4)); + const __m128i vi5 = + _mm_loadl_epi64((const __m128i*)(i5 >= LastOf8 ? memcpy(tail, i5, c) : i5)); + const __m128i vi6 = + _mm_loadl_epi64((const __m128i*)(i6 >= LastOf8 ? 
memcpy(tail, i6, c) : i6)); +#endif + + CALCULATE_ACCUMULATE_VECTORS(); + + _mm_storeu_si128((__m128i*)acc, vacc_lo); + _mm_storeu_si128(((__m128i*)acc) + 1, vacc_hi); } finish_one_pass = true; @@ -249,29 +861,52 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch( i1 += step_next_group; i2 += step_next_group; i3 += step_next_group; +#if !defined(MLAS_TARGET_IX86) i4 += step_next_group; i5 += step_next_group; i6 += step_next_group; +#endif } if (ImageSize > 0) { - +#if defined(MLAS_TARGET_IX86) switch (ImageSize) { case 1: - i1 = (const uint8_t*)ZeroBuffer; /* fall through */ + i1 = ZeroBuffer; + [[fallthrough]]; case 2: - i2 = (const uint8_t*)ZeroBuffer; /* fall through */ + i2 = ZeroBuffer; + [[fallthrough]]; case 3: - i3 = (const uint8_t*)ZeroBuffer; /* fall through */ + i3 = ZeroBuffer; + [[fallthrough]]; + default: + break; + } +#else + switch (ImageSize) { + case 1: + i1 = ZeroBuffer; + [[fallthrough]]; + case 2: + i2 = ZeroBuffer; + [[fallthrough]]; + case 3: + i3 = ZeroBuffer; + [[fallthrough]]; case 4: - i4 = (const uint8_t*)ZeroBuffer; /* fall through */ + i4 = ZeroBuffer; + [[fallthrough]]; case 5: - i5 = (const uint8_t*)ZeroBuffer; /* fall through */ + i5 = ZeroBuffer; + [[fallthrough]]; case 6: - i6 = (const uint8_t*)ZeroBuffer; /* fall through */ + i6 = ZeroBuffer; + [[fallthrough]]; default: break; } +#endif int32_t* acc = AccumulateBuffer; size_t c = Channels; @@ -281,43 +916,42 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch( CALCULATE_ACCUMULATE_VECTORS(); - vst1q_s32(acc, vacc_lo); - vst1q_s32(acc + 4, vacc_hi); + _mm_storeu_si128((__m128i*)acc, vacc_lo); + _mm_storeu_si128(((__m128i*)acc) + 1, vacc_hi); acc += 8; } if (c > 0) { - - const uint8x8_t vi0 = - vld1_u8(((i0 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i0, c) : i0)); - const uint8x8_t vi1 = vld1_u8( - ((1 < ImageSize && i1 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i1, c) : i1)); - const uint8x8_t vi2 = vld1_u8( - ((2 < ImageSize && i2 >= LastOf8U8) ? 
(const uint8_t*)memcpy(tail, i2, c) : i2)); - const uint8x8_t vi3 = vld1_u8( - ((3 < ImageSize && i3 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i3, c) : i3)); - const uint8x8_t vi4 = vld1_u8( - ((4 < ImageSize && i4 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i4, c) : i4)); - const uint8x8_t vi5 = vld1_u8( - ((5 < ImageSize && i5 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i5, c) : i5)); - const uint8x8_t vi6 = vld1_u8( - ((6 < ImageSize && i6 >= LastOf8U8) ? (const uint8_t*)memcpy(tail, i6, c) : i6)); + const __m128i vi0 = + _mm_loadl_epi64((const __m128i*)(i0 >= LastOf8 ? memcpy(tail, i0, c) : i0)); + const __m128i vi1 = _mm_loadl_epi64( + (const __m128i*)(1 < ImageSize && i1 >= LastOf8 ? memcpy(tail, i1, c) : i1)); + const __m128i vi2 = _mm_loadl_epi64( + (const __m128i*)(2 < ImageSize && i2 >= LastOf8 ? memcpy(tail, i2, c) : i2)); + const __m128i vi3 = _mm_loadl_epi64( + (const __m128i*)(3 < ImageSize && i3 >= LastOf8 ? memcpy(tail, i3, c) : i3)); +#if !defined(MLAS_TARGET_IX86) + const __m128i vi4 = _mm_loadl_epi64( + (const __m128i*)(4 < ImageSize && i4 >= LastOf8 ? memcpy(tail, i4, c) : i4)); + const __m128i vi5 = _mm_loadl_epi64( + (const __m128i*)(5 < ImageSize && i5 >= LastOf8 ? memcpy(tail, i5, c) : i5)); + const __m128i vi6 = _mm_loadl_epi64( + (const __m128i*)(6 < ImageSize && i6 >= LastOf8 ? 
memcpy(tail, i6, c) : i6)); +#endif CALCULATE_ACCUMULATE_VECTORS(); - vst1q_s32(acc, vacc_lo); - vst1q_s32(acc + 4, vacc_hi); + _mm_storeu_si128((__m128i*)acc, vacc_lo); + _mm_storeu_si128(((__m128i*)acc) + 1, vacc_hi); } } MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &Scale, false, Output_zero_point, 0, 0, 1, Channels); } -#elif defined(MLAS_SSE2_INTRINSICS) - template void MLASCALL -MlasQLinearGlobalAveragePoolNchw( +MlasQLinearGlobalAveragePoolNchwFixedPoint( const T8Bits* Input, float ScaleInput, int32_t ZeroPointInput, @@ -415,14 +1049,14 @@ MlasQLinearGlobalAveragePoolNchw( *sum_buffer++ = _mm_cvtsi128_si32(vsums); } - MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &scale, false, + MlasRequantizeOutputFixedPoint(AccumulateBuffer, Channels, Output, Channels, nullptr, &scale, false, static_cast(ZeroPointOutput), 0, 0, 1, Channels); } template MLAS_FORCEINLINE void -MlasQLinearGlobalAveragePoolNhwcSingleBatch( +MlasQLinearGlobalAveragePoolNhwcSingleBatchFixedPoint( const T8Bits* Input, T8Bits* Output, const T8Bits* LastOf8, @@ -685,7 +1319,7 @@ MlasQLinearGlobalAveragePoolNhwcSingleBatch( _mm_storeu_si128(((__m128i*)acc) + 1, vacc_hi); } } - MlasRequantizeOutput(AccumulateBuffer, Channels, Output, Channels, nullptr, &Scale, false, + MlasRequantizeOutputFixedPoint(AccumulateBuffer, Channels, Output, Channels, nullptr, &Scale, false, Output_zero_point, 0, 0, 1, Channels); } @@ -1079,8 +1713,97 @@ MlasQLinearGlobalAveragePoolNhwc( } } +template +void +MLASCALL +MlasQLinearGlobalAveragePoolNchwFixedPoint( + const T8Bits* Input, + float ScaleInput, + int32_t ZeroPointInput, + T8Bits* Output, + float ScaleOutput, + int32_t ZeroPointOutput, + size_t Channels, + size_t ImageSize, + int32_t* /* AccumulateBuffer */ + ) +{ + float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize); + std::vector ScaleValueVec = {scale}; // Create single-element vector + auto pair = 
dataToQfp(ScaleValueVec, -1, 32, false); // Returns std::make_pair(qfp, fracBits) + int fracBits = pair.second; + int64_t* fpScale = new int64_t; + *fpScale = static_cast((scale) * (1LL << fracBits)); + + int32_t bias = -ZeroPointInput * static_cast(ImageSize); + for (; Channels > 0; Channels--) { + + int32_t acc = bias; + for (size_t i = 0; i < ImageSize; ++i) { + acc += static_cast(*Input++); + } + int32_t v = static_cast((acc * (*fpScale)) >> fracBits) + ZeroPointOutput; + v = std::min(static_cast(std::numeric_limits::max()), v); + v = std::max(static_cast(std::numeric_limits::lowest()), v); + *Output++ = static_cast(v); + } +} + +template +void +MLASCALL +MlasQLinearGlobalAveragePoolNhwcFixedPoint( + const T8Bits* Input, + float ScaleInput, + int32_t ZeroPointInput, + T8Bits* Output, + float ScaleOutput, + int32_t ZeroPointOutput, + size_t Batch, + size_t ImageSize, + size_t Stride, + size_t Channels, + int32_t* AccumulateBuffer, + const T8Bits* /*ZeroBuffer*/ + ) +{ + float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize); + std::vector ScaleValueVec = {scale}; // Create single-element vector + auto pair = dataToQfp(ScaleValueVec, -1, 32, false); // Returns std::make_pair(qfp, fracBits) + int fracBits = pair.second; + int64_t* fpScale = new int64_t; + *fpScale = static_cast((scale) * (1LL << fracBits)); + + int32_t bias = -ZeroPointInput * static_cast(ImageSize); + for (; Batch > 0; Batch--) { + + const T8Bits* batch_input = Input; + T8Bits* batch_output = Output; + Input += Stride * ImageSize; + Output += Stride; + std::fill_n(AccumulateBuffer, Channels, bias); + for (size_t i = 0; i < ImageSize; ++i) { + + for (size_t c = 0; c < Channels; ++c) { + AccumulateBuffer[c] += static_cast(batch_input[c]); + } + + batch_input += Stride; + } + + for (size_t c = 0; c < Channels; ++c) { + + int32_t v = static_cast((AccumulateBuffer[c] * (*fpScale)) >> fracBits) + ZeroPointOutput; + v = 
std::min(static_cast(std::numeric_limits::max()), v); + v = std::max(static_cast(std::numeric_limits::lowest()), v); + *batch_output++ = static_cast(v); + } + } +} + #endif + #if defined(MLAS_NEON_INTRINSICS) || defined(MLAS_SSE2_INTRINSICS) || defined(MLAS_LSX_INTRINSICS) template @@ -1114,6 +1837,37 @@ MlasQLinearGlobalAveragePoolNhwc( } } +template +void +MLASCALL +MlasQLinearGlobalAveragePoolNhwcFixedPoint( + const T8Bits* Input, + float ScaleInput, + int32_t ZeroPointInput, + T8Bits* Output, + float ScaleOutput, + int32_t ZeroPointOutput, + size_t Batch, + size_t ImageSize, + size_t Stride, + size_t Channels, + int32_t* AccumulateBuffer, + const T8Bits* ZeroBuffer + ) +{ + float scale = CheckQLinearGlobalAveragePoolScaleAndSize(ScaleInput, ScaleOutput, ImageSize); + const int32_t bias = -ZeroPointInput * static_cast(ImageSize); + const T8Bits* inputLastOf8 = Input + (Batch * ImageSize * Stride - Stride + Channels) - 8; + + for (; Batch > 0; Batch--) { + MlasQLinearGlobalAveragePoolNhwcSingleBatchFixedPoint( + Input, Output, inputLastOf8, ImageSize, Channels, Stride, bias, scale, + static_cast(ZeroPointOutput), AccumulateBuffer, ZeroBuffer); + Input += ImageSize * Stride; + Output += Stride; + } +} + #endif template @@ -1181,3 +1935,69 @@ MlasQLinearGlobalAveragePoolNhwc( int32_t* AccumulateBuffer, const uint8_t* ZeroBuffer ); + +template +void +MLASCALL +MlasQLinearGlobalAveragePoolNchwFixedPoint( + const int8_t* Input, + float ScaleInput, + int32_t ZeroPointInput, + int8_t* Output, + float ScaleOutput, + int32_t ZeroPointOutput, + size_t Channels, + size_t ImageSize, + int32_t* AccumulateBuffer + ); + +template +void +MLASCALL +MlasQLinearGlobalAveragePoolNchwFixedPoint( + const uint8_t* Input, + float ScaleInput, + int32_t ZeroPointInput, + uint8_t* Output, + float ScaleOutput, + int32_t ZeroPointOutput, + size_t Channels, + size_t ImageSize, + int32_t* AccumulateBuffer + ); + +template +void +MLASCALL +MlasQLinearGlobalAveragePoolNhwcFixedPoint( + const 
int8_t* Input, + float ScaleInput, + int32_t ZeroPointInput, + int8_t* Output, + float ScaleOutput, + int32_t ZeroPointOutput, + size_t Batch, + size_t ImageSize, + size_t Stride, + size_t Channels, + int32_t* AccumulateBuffer, + const int8_t* ZeroBuffer + ); + +template +void +MLASCALL +MlasQLinearGlobalAveragePoolNhwcFixedPoint( + const uint8_t* Input, + float ScaleInput, + int32_t ZeroPointInput, + uint8_t* Output, + float ScaleOutput, + int32_t ZeroPointOutput, + size_t Batch, + size_t ImageSize, + size_t Stride, + size_t Channels, + int32_t* AccumulateBuffer, + const uint8_t* ZeroBuffer + ); diff --git a/onnxruntime/core/mlas/lib/qpostprocessor.cpp b/onnxruntime/core/mlas/lib/qpostprocessor.cpp index 97e9000a19..620a5efa50 100644 --- a/onnxruntime/core/mlas/lib/qpostprocessor.cpp +++ b/onnxruntime/core/mlas/lib/qpostprocessor.cpp @@ -158,6 +158,227 @@ Return Value: Output += StartM * LeadingDimensionOutput_ + StartN; + while (CountM-- > 0) { + + float* c_out = Output; + const int32_t* c = C; + const float* bias = Bias; + const float* scale = Scale; + + size_t n = CountN; + + while (n >= 4) { + + MLAS_FLOAT32X4 FloatVector = MlasCastToFloat32x4(MlasLoadInt32x4(c)); + + if (QuantGran == MLAS_QUANTIZATION_GRANULARITY::PerColumn) { + ScaleVector = MlasLoadFloat32x4(scale); + scale += 4; + } + + if (Mode == MLAS_QGEMM_OUTPUT_MODE::AccumulateMode) { + FloatVector = MlasMultiplyAddFloat32x4(FloatVector, ScaleVector, MlasLoadFloat32x4(c_out)); + } else { + FloatVector = MlasMultiplyFloat32x4(FloatVector, ScaleVector); + } + + if (HasBias) { + FloatVector = MlasAddFloat32x4(FloatVector, MlasLoadFloat32x4(bias)); + bias += 4; + } + + MlasStoreFloat32x4(c_out, FloatVector); + + c_out += 4; + c += 4; + n -= 4; + } + + for (size_t offset = 0; offset < n; offset++) { + +#if defined(MLAS_SSE2_INTRINSICS) + __m128 FloatVector = _mm_set_ss(float(c[offset])); + + if (QuantGran == MLAS_QUANTIZATION_GRANULARITY::PerColumn) { + ScaleVector = _mm_load_ss(&scale[offset]); + } + + if 
(Mode == MLAS_QGEMM_OUTPUT_MODE::AccumulateMode) { + FloatVector = _mm_add_ps(_mm_mul_ss(FloatVector, ScaleVector), _mm_load_ss(&c_out[offset])); + } else { + FloatVector = _mm_mul_ss(FloatVector, ScaleVector); + } + + if (HasBias) { + FloatVector = _mm_add_ss(FloatVector, _mm_load_ss(&bias[offset])); + } + + _mm_store_ss(&c_out[offset], FloatVector); +#else + if (QuantGran == MLAS_QUANTIZATION_GRANULARITY::PerColumn) { + ScaleValue = scale[offset]; + } + + float result = float(c[offset]) * ScaleValue; + if (HasBias) { + result += bias[offset]; + } + + if (Mode == MLAS_QGEMM_OUTPUT_MODE::AccumulateMode) { + c_out[offset] += result; + } else { + c_out[offset] = result; + } +#endif + } + + C += ldc; + Output += LeadingDimensionOutput_; + } +} +/* Process: picks one of eight ProcessImpl calls from three runtime checks (Bias_ set or not, QuantGran_ equal to PerColumn or not, OutputMode_ equal to AccumulateMode or not) and forwards C, StartM, StartN, CountM, CountN and ldc unchanged. NOTE(review): the eight calls read identically in this text; presumably each carried its own template argument list that has been lost here - confirm against the real patch. */ +void MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR_FIXEDPOINT::Process( + const int32_t* C, + size_t StartM, + size_t StartN, + size_t CountM, + size_t CountN, + size_t ldc + ) const +{ + if (Bias_) { + if (QuantGran_ == MLAS_QUANTIZATION_GRANULARITY::PerColumn) { + if (OutputMode_ == MLAS_QGEMM_OUTPUT_MODE::AccumulateMode) { + ProcessImpl( + C, + StartM, + StartN, + CountM, + CountN, + ldc); + } else { + ProcessImpl( + C, + StartM, + StartN, + CountM, + CountN, + ldc); + } + } else if (OutputMode_ == MLAS_QGEMM_OUTPUT_MODE::AccumulateMode) { + ProcessImpl( + C, + StartM, + StartN, + CountM, + CountN, + ldc); + } else { + ProcessImpl( + C, + StartM, + StartN, + CountM, + CountN, + ldc); + } + } else { + if (QuantGran_ == MLAS_QUANTIZATION_GRANULARITY::PerColumn) { + if (OutputMode_ == MLAS_QGEMM_OUTPUT_MODE::AccumulateMode) { + ProcessImpl( + C, + StartM, + StartN, + CountM, + CountN, + ldc); + } else { + ProcessImpl( + C, + StartM, + StartN, + CountM, + CountN, + ldc); + } + } else if (OutputMode_ == MLAS_QGEMM_OUTPUT_MODE::AccumulateMode) { + ProcessImpl( + C, + StartM, + StartN, + CountM, + CountN, + ldc); + } else { + ProcessImpl( + C, + StartM, + StartN, + CountM, + CountN, + ldc); + } + } +} + +template +inline +void 
+MLAS_QGEMM_SCALE_BIAS_OUTPUT_PROCESSOR_FIXEDPOINT::ProcessImpl( + const int32_t* C, + size_t StartM, + size_t StartN, + size_t CountM, + size_t CountN, + size_t ldc) const +/*++ + +Routine Description: + + This routine converts the output matrix C to a floating point format using + the stored scale and bias parameters. + +Arguments: + + C - Supplies the address of matrix C. + + StartM - Supplies the starting row offset relative to the matrix. + + StartN - Supplies the starting column offset relative to the matrix. + + CountM - Supplies the number of rows of the output matrix to process. + + CountN - Supplies the number of columns of the output matrix to process. + + ldc - Supplies the leading dimension of C. + +Return Value: + + None. + +--*/ +{ + float* Output = Output_; + const float* Bias = Bias_; + const float* Scale = Scale_; + + if (HasBias) { + Bias += StartN; + } + + if(QuantGran == MLAS_QUANTIZATION_GRANULARITY::PerColumn){ + Scale += StartN; + } + + MLAS_FLOAT32X4 ScaleVector = MlasBroadcastFloat32x4(Scale_); +#if !defined(MLAS_SSE2_INTRINSICS) + float ScaleValue = MlasExtractLaneFloat32x4<0>(ScaleVector); +#endif + + C += StartM * ldc + StartN; + Output += StartM * LeadingDimensionOutput_ + StartN; + + while (CountM-- > 0) { float* c_out = Output; diff --git a/onnxruntime/core/mlas/lib/quantize.cpp b/onnxruntime/core/mlas/lib/quantize.cpp index ae638fafee..ccddd8a4ad 100644 --- a/onnxruntime/core/mlas/lib/quantize.cpp +++ b/onnxruntime/core/mlas/lib/quantize.cpp @@ -19,12 +19,24 @@ Module Name: --*/ #include "mlasi.h" +#include + +#include +#include +#include +#include +#include +#include +#include +#include #if defined(MLAS_NEON64_INTRINSICS) || defined(MLAS_SSE2_INTRINSICS) || \ - defined(MLAS_LSX_INTRINSICS) + defined(MLAS_LSX_INTRINSICS) || defined(MLAS_SSE41_INTRINSICS) #include +#include "qfunctions_helper.h" + // // QuantizeLinear implementation using NEON or SSE2 intrinsics. 
// @@ -1023,7 +1035,7 @@ MlasRequantizeOutput( size_t CountN ) { - const __m128 PerMatrixScaleVector = PerColumnScale ? _mm_setzero_ps() : _mm_load1_ps(Scale); + const __m128 PerMatrixScaleVector = PerColumnScale ? _mm_setzero_ps() : _mm_load1_ps(Scale); // _mm_load1_ps loads one float into 4 words const __m128 MinimumValueVector = _mm_set1_ps(float(std::numeric_limits::lowest() - ZeroPoint)); const __m128 MaximumValueVector = _mm_set1_ps(float(std::numeric_limits::max() - ZeroPoint)); const __m128i ZeroPointVector = _mm_set1_epi32(ZeroPoint); @@ -2047,6 +2059,93 @@ MlasRequantizeOutput( #endif +template +void +MLASCALL +MlasRequantizeOutputFixedPoint( + const int32_t* Input, + size_t InputLeadingDimension, + OutputType* Output, + size_t OutputLeadingDimension, + const int32_t* Bias, + const float* Scale, + bool PerColumnScale, + OutputType ZeroPoint, + size_t StartM, + size_t StartN, + size_t CountM, + size_t CountN + ) +{ + // New MlasRequantizeOuput but for fixed point not floating point + // Floating point conversion to fixed point is multiply by 2**n where n is the number of decimal places + // Then, interpret this number as a 32 bit int + // Need to wrap into vector to use function scalarToQfp + std::vector ScaleValueVec = {*Scale}; // Create single-element vector + auto p = dataToQfp(ScaleValueVec, -1, 32, false); // Returns std::make_pair(qfp, fracBits) + int fracBits = p.second; + int mulScale = fracBits - 2; + + int64_t* fpScale = new int64_t; + *fpScale = static_cast(*Scale * (1LL << fracBits)); + + const int32_t PerMatrixScaleValue = PerColumnScale ? 
0 : static_cast(*fpScale); + const int32_t MinimumValue = std::numeric_limits::lowest(); + const int32_t MaximumValue = std::numeric_limits::max(); + + + if (nullptr != Bias) { + Bias += StartN; + } + if (PerColumnScale) { + fpScale += StartN; + } + + Input += StartM * InputLeadingDimension + StartN; + Output += StartM * OutputLeadingDimension + StartN; + + // + // Step through each row of the output matrix. + // + + while (CountM-- > 0) { + + const int32_t* bias = Bias; + const int64_t* fpscale = fpScale; + size_t n = CountN; + + auto* RowInput = Input; + auto* RowOutput = Output; + + while (n > 0) { + + int32_t IntegerValue = *RowInput++; + + if (bias != nullptr) { + IntegerValue += *bias++; + } + + int64_t ScaleValue = PerColumnScale ? *fpscale++ : PerMatrixScaleValue; + + int64_t largeInt = static_cast(IntegerValue) * ScaleValue; // This is a 29 fixed point + largeInt = largeInt >> mulScale; + IntegerValue = customRound<2>(static_cast(largeInt)); + int32_t Intermediate = IntegerValue + ZeroPoint; + Intermediate = std::max(Intermediate, MinimumValue); + Intermediate = std::min(Intermediate, MaximumValue); + + *RowOutput++ = OutputType(Intermediate); + + n -= 1; + } + + // Next Row + Input += InputLeadingDimension; + Output += OutputLeadingDimension; + } + delete fpScale; +} + template void MLASCALL @@ -2083,6 +2182,42 @@ MlasRequantizeOutput( size_t CountN ); +template +void +MLASCALL +MlasRequantizeOutputFixedPoint( + const int32_t* Input, + size_t InputLeadingDimension, + int8_t* Output, + size_t OutputLeadingDimension, + const int32_t* Bias, + const float* Scale, + bool PerColumnScale, + int8_t ZeroPoint, + size_t StartM, + size_t StartN, + size_t CountM, + size_t CountN + ); + +template +void +MLASCALL +MlasRequantizeOutputFixedPoint( + const int32_t* Input, + size_t InputLeadingDimension, + uint8_t* Output, + size_t OutputLeadingDimension, + const int32_t* Bias, + const float* Scale, + bool PerColumnScale, + uint8_t ZeroPoint, + size_t StartM, + size_t 
StartN, + size_t CountM, + size_t CountN + ); + void MLASCALL MlasFindMinMaxElement( diff --git a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc index 7797cbe678..42291f89c9 100644 --- a/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc +++ b/onnxruntime/core/providers/cpu/quantization/qlinearconv.cc @@ -10,6 +10,7 @@ #include "core/util/math_cpuonly.h" #include "core/util/qmath.h" #include "core/mlas/inc/mlas.h" +#include "core/framework/op_kernel_context_internal.h" namespace onnxruntime { @@ -513,6 +514,15 @@ Status QLinearConv::UseSharedPrePackedBuffers(std::vector Status QLinearConv::Compute(OpKernelContext* context) const { + // Cast to internal type because we want to access session_options parameter + auto* internal_context = dynamic_cast(context); + if (!internal_context) { + return Status(common::ONNXRUNTIME, common::FAIL, "Failed to cast OpKernelContext to OpKernelContextInternal"); + } + const auto& session_options = internal_context->GetSessionState().GetSessionOptions(); + // Test to see if we have access to enable_gpnpu flag + const bool gpnpu_flag = session_options.enable_gpnpu; + const Tensor* X = context->Input(InputTensors::IN_X); const Tensor* W = is_W_packed_ ? nullptr : context->Input(InputTensors::IN_W); const auto& W_shape = W ? 
W->Shape() : W_shape_; @@ -973,8 +983,9 @@ Status QLinearConv::Compute(OpKernelContext* context) const { } } } - - MlasRequantizeOutput( + if (gpnpu_flag) { + // New MlasRequantizeOuput but for fixed point not floating point + MlasRequantizeOutputFixedPoint( worker_gemm_output, static_cast(M), worker_output, @@ -987,6 +998,21 @@ Status QLinearConv::Compute(OpKernelContext* context) const { 0, static_cast(output_count), static_cast(M)); + } else { + MlasRequantizeOutput( + worker_gemm_output, + static_cast(M), + worker_output, + static_cast(M), + Bdata, + output_scales.data(), + output_scales.size() > 1, + Y_zero_point_value, + 0, + 0, + static_cast(output_count), + static_cast(M)); + } }; concurrency::ThreadPool::TrySimpleParallelFor(thread_pool, onnxruntime::narrow(task_count), conv_worker); diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 7af659851e..4a6c9a0e9f 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1654,6 +1654,13 @@ void addObjectMethods(py::module& m, ExecutionProviderRegistrationFn ep_registra }, R"pbdoc(Enables the memory arena on CPU. Arena may pre-allocate memory for future usage. Set this option to false if you don't want it. Default is True.)pbdoc") + .def_property( + "enable_gpnpu", + [](const PySessionOptions* options) -> bool { return options->value.enable_gpnpu; }, + [](PySessionOptions* options, bool enable_gpnpu) -> void { + options->value.enable_gpnpu = enable_gpnpu; + }, + R"pbdoc(Enable GPNPU mode. 
Default is false.)pbdoc") .def_property( "enable_profiling", [](const PySessionOptions* options) -> bool { return options->value.enable_profiling; }, diff --git a/onnxruntime/test/python/gpnpumode/16gpnpu_2025-01-31_18-22-14.json b/onnxruntime/test/python/gpnpumode/16gpnpu_2025-01-31_18-22-14.json new file mode 100644 index 0000000000..81c449c552 --- /dev/null +++ b/onnxruntime/test/python/gpnpumode/16gpnpu_2025-01-31_18-22-14.json @@ -0,0 +1,237 @@ +[ +{"cat" : "Session","pid" :635173,"tid" :635173,"dur" :24021,"ts" :6,"ph" : "X","name" :"model_loading_uri","args" : {}}, +{"cat" : "Session","pid" :635173,"tid" :635173,"dur" :85073,"ts" :24066,"ph" : "X","name" :"session_initialization","args" : {}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :109329,"ph" : "X","name" :"input_QuantizeLinear_fence_before","args" : {"op_name" : "QuantizeLinear"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1325,"ts" :109332,"ph" : "X","name" :"input_QuantizeLinear_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [109], "core": 2, "Distribution": 4, "DistributionEnqueue": 3, "Run": 1254, "Wait": 21, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 1, "core": 30},"132401808279232": {"num_run": 1, "core": 19},"132401663575744": {"num_run": 1, "core": 16},"132401797793472": {"num_run": 1, "core": 26},"132401787307712": {"num_run": 1, "core": 31},"132401776821952": {"num_run": 1, "core": 17},"132401692935872": {"num_run": 1, "core": 24},"132401682450112": {"num_run": 1, "core": 29},"132401653089984": {"num_run": 1, "core": 9},"132401642604224": {"num_run": 0, "core": -1},"132401558718144": {"num_run": 0, "core": -1},"132401548232384": {"num_run": 0, "core": -1},"132401537746624": {"num_run": 0, "core": -1},"132401527260864": {"num_run": 0, "core": -1},"132401516775104": {"num_run": 0, "core": -1}}},"output_type_shape" : 
[{"int8":[1,3,224,224]}],"output_size" : "150528","parameter_size" : "5","activation_size" : "602112","node_index" : "0","input_type_shape" : [{"float":[1,3,224,224]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QuantizeLinear"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :110663,"ph" : "X","name" :"input_QuantizeLinear_fence_after","args" : {"op_name" : "QuantizeLinear"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :110667,"ph" : "X","name" :"Transpose_fence_before","args" : {"op_name" : "Transpose"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :36,"ts" :110668,"ph" : "X","name" :"Transpose_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 1, "core": 30},"132401808279232": {"num_run": 1, "core": 19},"132401663575744": {"num_run": 1, "core": 16},"132401797793472": {"num_run": 1, "core": 26},"132401787307712": {"num_run": 1, "core": 31},"132401776821952": {"num_run": 1, "core": 17},"132401692935872": {"num_run": 1, "core": 24},"132401682450112": {"num_run": 1, "core": 29},"132401653089984": {"num_run": 1, "core": 9},"132401642604224": {"num_run": 0, "core": -1},"132401558718144": {"num_run": 0, "core": -1},"132401548232384": {"num_run": 0, "core": -1},"132401537746624": {"num_run": 0, "core": -1},"132401527260864": {"num_run": 0, "core": -1},"132401516775104": {"num_run": 0, "core": -1}}},"output_type_shape" : [{"int8":[1,224,224,3]}],"output_size" : "150528","parameter_size" : "0","activation_size" : "150528","node_index" : "77","input_type_shape" : [{"int8":[1,3,224,224]}],"provider" : "CPUExecutionProvider","op_name" : "Transpose"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :110709,"ph" : "X","name" 
:"Transpose_fence_after","args" : {"op_name" : "Transpose"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :110712,"ph" : "X","name" :"/conv1/Conv_quant_token_1_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1781,"ts" :110713,"ph" : "X","name" :"/conv1/Conv_quant_token_1_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 0, "Distribution": 1, "DistributionEnqueue": 0, "Run": 727, "Wait": 151, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 2, "core": 30},"132401808279232": {"num_run": 2, "core": 19},"132401663575744": {"num_run": 2, "core": 16},"132401797793472": {"num_run": 2, "core": 26},"132401787307712": {"num_run": 2, "core": 31},"132401776821952": {"num_run": 2, "core": 17},"132401692935872": {"num_run": 2, "core": 24},"132401682450112": {"num_run": 2, "core": 29},"132401653089984": {"num_run": 2, "core": 9},"132401642604224": {"num_run": 1, "core": 6},"132401558718144": {"num_run": 1, "core": 3},"132401548232384": {"num_run": 1, "core": 27},"132401537746624": {"num_run": 1, "core": 21},"132401527260864": {"num_run": 1, "core": 25},"132401516775104": {"num_run": 1, "core": 20}}},"output_type_shape" : [{"int8":[1,112,112,64]}],"output_size" : "802816","parameter_size" : "271","activation_size" : "150528","node_index" : "79","input_type_shape" : [{"int8":[1,224,224,3]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :112499,"ph" : "X","name" :"/conv1/Conv_quant_token_1_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :112503,"ph" : "X","name" :"/maxpool/MaxPool_token_175_fence_before","args" : {"op_name" : "NhwcMaxPool"}}, +{"cat" : 
"Node","pid" :635173,"tid" :635173,"dur" :68,"ts" :112503,"ph" : "X","name" :"/maxpool/MaxPool_token_175_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 0, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 2, "core": 30},"132401808279232": {"num_run": 2, "core": 19},"132401663575744": {"num_run": 2, "core": 16},"132401797793472": {"num_run": 2, "core": 26},"132401787307712": {"num_run": 2, "core": 31},"132401776821952": {"num_run": 2, "core": 17},"132401692935872": {"num_run": 2, "core": 24},"132401682450112": {"num_run": 2, "core": 29},"132401653089984": {"num_run": 2, "core": 9},"132401642604224": {"num_run": 1, "core": 6},"132401558718144": {"num_run": 1, "core": 3},"132401548232384": {"num_run": 1, "core": 27},"132401537746624": {"num_run": 1, "core": 21},"132401527260864": {"num_run": 1, "core": 25},"132401516775104": {"num_run": 1, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "0","activation_size" : "802816","node_index" : "200","input_type_shape" : [{"int8":[1,112,112,64]}],"provider" : "CPUExecutionProvider","op_name" : "NhwcMaxPool"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :112574,"ph" : "X","name" :"/maxpool/MaxPool_token_175_fence_after","args" : {"op_name" : "NhwcMaxPool"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :112577,"ph" : "X","name" :"/layer1/layer1.0/downsample/downsample.0/Conv_quant_token_14_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :398,"ts" :112577,"ph" : "X","name" :"/layer1/layer1.0/downsample/downsample.0/Conv_quant_token_14_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], 
"core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 320, "Wait": 64, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 3, "core": 30},"132401808279232": {"num_run": 3, "core": 19},"132401663575744": {"num_run": 3, "core": 16},"132401797793472": {"num_run": 3, "core": 26},"132401787307712": {"num_run": 3, "core": 31},"132401776821952": {"num_run": 3, "core": 17},"132401692935872": {"num_run": 3, "core": 24},"132401682450112": {"num_run": 3, "core": 29},"132401653089984": {"num_run": 3, "core": 9},"132401642604224": {"num_run": 2, "core": 6},"132401558718144": {"num_run": 2, "core": 3},"132401548232384": {"num_run": 2, "core": 27},"132401537746624": {"num_run": 2, "core": 21},"132401527260864": {"num_run": 2, "core": 25},"132401516775104": {"num_run": 2, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "1039","activation_size" : "200704","node_index" : "88","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :112979,"ph" : "X","name" :"/layer1/layer1.0/downsample/downsample.0/Conv_quant_token_14_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :112981,"ph" : "X","name" :"/layer1/layer1.0/conv1/Conv_quant_token_5_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :160,"ts" :112981,"ph" : "X","name" :"/layer1/layer1.0/conv1/Conv_quant_token_5_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 90, "Wait": 60, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 4, "core": 
30},"132401808279232": {"num_run": 4, "core": 19},"132401663575744": {"num_run": 4, "core": 16},"132401797793472": {"num_run": 4, "core": 26},"132401787307712": {"num_run": 4, "core": 31},"132401776821952": {"num_run": 4, "core": 17},"132401692935872": {"num_run": 4, "core": 24},"132401682450112": {"num_run": 4, "core": 29},"132401653089984": {"num_run": 4, "core": 9},"132401642604224": {"num_run": 3, "core": 6},"132401558718144": {"num_run": 3, "core": 3},"132401548232384": {"num_run": 3, "core": 27},"132401537746624": {"num_run": 3, "core": 21},"132401527260864": {"num_run": 3, "core": 25},"132401516775104": {"num_run": 3, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "271","activation_size" : "200704","node_index" : "82","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :113144,"ph" : "X","name" :"/layer1/layer1.0/conv1/Conv_quant_token_5_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :113145,"ph" : "X","name" :"/layer1/layer1.0/conv2/Conv_quant_token_8_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :679,"ts" :113146,"ph" : "X","name" :"/layer1/layer1.0/conv2/Conv_quant_token_8_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 1, "DistributionEnqueue": 0, "Run": 574, "Wait": 48, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 5, "core": 30},"132401808279232": {"num_run": 5, "core": 19},"132401663575744": {"num_run": 5, "core": 16},"132401797793472": {"num_run": 5, "core": 26},"132401787307712": {"num_run": 5, "core": 
31},"132401776821952": {"num_run": 5, "core": 17},"132401692935872": {"num_run": 5, "core": 24},"132401682450112": {"num_run": 5, "core": 29},"132401653089984": {"num_run": 5, "core": 9},"132401642604224": {"num_run": 4, "core": 6},"132401558718144": {"num_run": 4, "core": 3},"132401548232384": {"num_run": 4, "core": 27},"132401537746624": {"num_run": 4, "core": 21},"132401527260864": {"num_run": 4, "core": 25},"132401516775104": {"num_run": 4, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "271","activation_size" : "200704","node_index" : "84","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :113826,"ph" : "X","name" :"/layer1/layer1.0/conv2/Conv_quant_token_8_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :113828,"ph" : "X","name" :"/layer1/layer1.0/conv3/Conv_quant_token_11_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :356,"ts" :113828,"ph" : "X","name" :"/layer1/layer1.0/conv3/Conv_quant_token_11_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 325, "Wait": 21, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 6, "core": 30},"132401808279232": {"num_run": 6, "core": 19},"132401663575744": {"num_run": 6, "core": 16},"132401797793472": {"num_run": 6, "core": 26},"132401787307712": {"num_run": 6, "core": 31},"132401776821952": {"num_run": 6, "core": 17},"132401692935872": {"num_run": 6, "core": 24},"132401682450112": {"num_run": 6, "core": 29},"132401653089984": {"num_run": 6, "core": 
9},"132401642604224": {"num_run": 5, "core": 6},"132401558718144": {"num_run": 5, "core": 3},"132401548232384": {"num_run": 5, "core": 27},"132401537746624": {"num_run": 5, "core": 21},"132401527260864": {"num_run": 5, "core": 25},"132401516775104": {"num_run": 5, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "1039","activation_size" : "200704","node_index" : "86","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :114186,"ph" : "X","name" :"/layer1/layer1.0/conv3/Conv_quant_token_11_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :114186,"ph" : "X","name" :"/layer1/layer1.0/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :139,"ts" :114187,"ph" : "X","name" :"/layer1/layer1.0/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [50176], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 68, "Wait": 59, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 7, "core": 30},"132401808279232": {"num_run": 7, "core": 19},"132401663575744": {"num_run": 7, "core": 16},"132401797793472": {"num_run": 7, "core": 26},"132401787307712": {"num_run": 7, "core": 31},"132401776821952": {"num_run": 7, "core": 17},"132401692935872": {"num_run": 7, "core": 24},"132401682450112": {"num_run": 7, "core": 29},"132401653089984": {"num_run": 7, "core": 9},"132401642604224": {"num_run": 6, "core": 6},"132401558718144": {"num_run": 6, "core": 3},"132401548232384": {"num_run": 6, "core": 27},"132401537746624": {"num_run": 6, "core": 21},"132401527260864": {"num_run": 6, 
"core": 25},"132401516775104": {"num_run": 6, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "15","activation_size" : "1605632","node_index" : "7","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :114327,"ph" : "X","name" :"/layer1/layer1.0/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :114328,"ph" : "X","name" :"/layer1/layer1.1/conv1/Conv_quant_token_18_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :297,"ts" :114328,"ph" : "X","name" :"/layer1/layer1.1/conv1/Conv_quant_token_18_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 216, "Wait": 73, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 8, "core": 30},"132401808279232": {"num_run": 8, "core": 19},"132401663575744": {"num_run": 8, "core": 16},"132401797793472": {"num_run": 8, "core": 26},"132401787307712": {"num_run": 8, "core": 31},"132401776821952": {"num_run": 8, "core": 17},"132401692935872": {"num_run": 8, "core": 24},"132401682450112": {"num_run": 8, "core": 29},"132401653089984": {"num_run": 8, "core": 9},"132401642604224": {"num_run": 7, "core": 6},"132401558718144": {"num_run": 7, "core": 3},"132401548232384": {"num_run": 7, "core": 27},"132401537746624": {"num_run": 7, "core": 21},"132401527260864": {"num_run": 7, "core": 25},"132401516775104": {"num_run": 7, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "271","activation_size" : "802816","node_index" : 
"91","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :114627,"ph" : "X","name" :"/layer1/layer1.1/conv1/Conv_quant_token_18_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :114629,"ph" : "X","name" :"/layer1/layer1.1/conv2/Conv_quant_token_21_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :708,"ts" :114630,"ph" : "X","name" :"/layer1/layer1.1/conv2/Conv_quant_token_21_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 594, "Wait": 66, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 9, "core": 30},"132401808279232": {"num_run": 9, "core": 19},"132401663575744": {"num_run": 9, "core": 16},"132401797793472": {"num_run": 9, "core": 26},"132401787307712": {"num_run": 9, "core": 31},"132401776821952": {"num_run": 9, "core": 17},"132401692935872": {"num_run": 9, "core": 24},"132401682450112": {"num_run": 9, "core": 29},"132401653089984": {"num_run": 9, "core": 9},"132401642604224": {"num_run": 8, "core": 6},"132401558718144": {"num_run": 8, "core": 3},"132401548232384": {"num_run": 8, "core": 27},"132401537746624": {"num_run": 8, "core": 21},"132401527260864": {"num_run": 8, "core": 25},"132401516775104": {"num_run": 8, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "271","activation_size" : "200704","node_index" : "93","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : 
"QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :115341,"ph" : "X","name" :"/layer1/layer1.1/conv2/Conv_quant_token_21_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :115341,"ph" : "X","name" :"/layer1/layer1.1/conv3/Conv_quant_token_24_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :368,"ts" :115342,"ph" : "X","name" :"/layer1/layer1.1/conv3/Conv_quant_token_24_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 333, "Wait": 26, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 10, "core": 30},"132401808279232": {"num_run": 10, "core": 19},"132401663575744": {"num_run": 10, "core": 16},"132401797793472": {"num_run": 10, "core": 26},"132401787307712": {"num_run": 10, "core": 31},"132401776821952": {"num_run": 10, "core": 17},"132401692935872": {"num_run": 10, "core": 24},"132401682450112": {"num_run": 10, "core": 29},"132401653089984": {"num_run": 10, "core": 9},"132401642604224": {"num_run": 9, "core": 6},"132401558718144": {"num_run": 9, "core": 3},"132401548232384": {"num_run": 9, "core": 27},"132401537746624": {"num_run": 9, "core": 21},"132401527260864": {"num_run": 9, "core": 25},"132401516775104": {"num_run": 9, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "1039","activation_size" : "200704","node_index" : "95","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :115712,"ph" : "X","name" :"/layer1/layer1.1/conv3/Conv_quant_token_24_fence_after","args" : {"op_name" 
: "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :115713,"ph" : "X","name" :"/layer1/layer1.1/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :139,"ts" :115713,"ph" : "X","name" :"/layer1/layer1.1/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [50176], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 69, "Wait": 63, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 11, "core": 30},"132401808279232": {"num_run": 11, "core": 19},"132401663575744": {"num_run": 11, "core": 16},"132401797793472": {"num_run": 11, "core": 26},"132401787307712": {"num_run": 11, "core": 31},"132401776821952": {"num_run": 11, "core": 17},"132401692935872": {"num_run": 11, "core": 24},"132401682450112": {"num_run": 11, "core": 29},"132401653089984": {"num_run": 11, "core": 9},"132401642604224": {"num_run": 10, "core": 6},"132401558718144": {"num_run": 10, "core": 3},"132401548232384": {"num_run": 10, "core": 27},"132401537746624": {"num_run": 10, "core": 21},"132401527260864": {"num_run": 10, "core": 25},"132401516775104": {"num_run": 10, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "15","activation_size" : "1605632","node_index" : "11","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :115854,"ph" : "X","name" :"/layer1/layer1.1/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :115855,"ph" : "X","name" :"/layer1/layer1.2/conv1/Conv_quant_token_28_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : 
"Node","pid" :635173,"tid" :635173,"dur" :321,"ts" :115855,"ph" : "X","name" :"/layer1/layer1.2/conv1/Conv_quant_token_28_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 257, "Wait": 56, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 12, "core": 30},"132401808279232": {"num_run": 12, "core": 19},"132401663575744": {"num_run": 12, "core": 16},"132401797793472": {"num_run": 12, "core": 26},"132401787307712": {"num_run": 12, "core": 31},"132401776821952": {"num_run": 12, "core": 17},"132401692935872": {"num_run": 12, "core": 24},"132401682450112": {"num_run": 12, "core": 29},"132401653089984": {"num_run": 12, "core": 9},"132401642604224": {"num_run": 11, "core": 6},"132401558718144": {"num_run": 11, "core": 3},"132401548232384": {"num_run": 11, "core": 27},"132401537746624": {"num_run": 11, "core": 21},"132401527260864": {"num_run": 11, "core": 25},"132401516775104": {"num_run": 11, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "271","activation_size" : "802816","node_index" : "98","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :116178,"ph" : "X","name" :"/layer1/layer1.2/conv1/Conv_quant_token_28_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :116179,"ph" : "X","name" :"/layer1/layer1.2/conv2/Conv_quant_token_31_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :729,"ts" :116179,"ph" : "X","name" :"/layer1/layer1.2/conv2/Conv_quant_token_31_kernel_time","args" : {"thread_scheduling_stats" : 
{"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 629, "Wait": 51, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 13, "core": 30},"132401808279232": {"num_run": 13, "core": 19},"132401663575744": {"num_run": 13, "core": 16},"132401797793472": {"num_run": 13, "core": 26},"132401787307712": {"num_run": 13, "core": 31},"132401776821952": {"num_run": 13, "core": 17},"132401692935872": {"num_run": 13, "core": 24},"132401682450112": {"num_run": 13, "core": 29},"132401653089984": {"num_run": 13, "core": 9},"132401642604224": {"num_run": 12, "core": 6},"132401558718144": {"num_run": 12, "core": 3},"132401548232384": {"num_run": 12, "core": 27},"132401537746624": {"num_run": 12, "core": 21},"132401527260864": {"num_run": 12, "core": 25},"132401516775104": {"num_run": 12, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,64]}],"output_size" : "200704","parameter_size" : "271","activation_size" : "200704","node_index" : "100","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[64]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :116910,"ph" : "X","name" :"/layer1/layer1.2/conv2/Conv_quant_token_31_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :116912,"ph" : "X","name" :"/layer1/layer1.2/conv3/Conv_quant_token_34_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :412,"ts" :116912,"ph" : "X","name" :"/layer1/layer1.2/conv3/Conv_quant_token_34_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 
367, "Wait": 36, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 14, "core": 30},"132401808279232": {"num_run": 14, "core": 19},"132401663575744": {"num_run": 14, "core": 16},"132401797793472": {"num_run": 14, "core": 26},"132401787307712": {"num_run": 14, "core": 31},"132401776821952": {"num_run": 14, "core": 17},"132401692935872": {"num_run": 14, "core": 24},"132401682450112": {"num_run": 14, "core": 29},"132401653089984": {"num_run": 14, "core": 9},"132401642604224": {"num_run": 13, "core": 6},"132401558718144": {"num_run": 13, "core": 3},"132401548232384": {"num_run": 13, "core": 27},"132401537746624": {"num_run": 13, "core": 21},"132401527260864": {"num_run": 13, "core": 25},"132401516775104": {"num_run": 13, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "1039","activation_size" : "200704","node_index" : "102","input_type_shape" : [{"int8":[1,56,56,64]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :117326,"ph" : "X","name" :"/layer1/layer1.2/conv3/Conv_quant_token_34_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :117327,"ph" : "X","name" :"/layer1/layer1.2/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :151,"ts" :117327,"ph" : "X","name" :"/layer1/layer1.2/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [50176], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 75, "Wait": 69, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 15, "core": 30},"132401808279232": {"num_run": 15, "core": 19},"132401663575744": {"num_run": 15, "core": 
16},"132401797793472": {"num_run": 15, "core": 26},"132401787307712": {"num_run": 15, "core": 31},"132401776821952": {"num_run": 15, "core": 17},"132401692935872": {"num_run": 15, "core": 24},"132401682450112": {"num_run": 15, "core": 29},"132401653089984": {"num_run": 15, "core": 9},"132401642604224": {"num_run": 14, "core": 6},"132401558718144": {"num_run": 14, "core": 3},"132401548232384": {"num_run": 14, "core": 27},"132401537746624": {"num_run": 14, "core": 21},"132401527260864": {"num_run": 14, "core": 25},"132401516775104": {"num_run": 14, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,256]}],"output_size" : "802816","parameter_size" : "15","activation_size" : "1605632","node_index" : "15","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :117481,"ph" : "X","name" :"/layer1/layer1.2/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :117482,"ph" : "X","name" :"/layer2/layer2.0/downsample/downsample.0/Conv_quant_token_47_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1009,"ts" :117483,"ph" : "X","name" :"/layer2/layer2.0/downsample/downsample.0/Conv_quant_token_47_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 940, "Wait": 52, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 16, "core": 30},"132401808279232": {"num_run": 16, "core": 19},"132401663575744": {"num_run": 16, "core": 16},"132401797793472": {"num_run": 16, "core": 26},"132401787307712": {"num_run": 16, "core": 31},"132401776821952": {"num_run": 16, "core": 
17},"132401692935872": {"num_run": 16, "core": 24},"132401682450112": {"num_run": 16, "core": 29},"132401653089984": {"num_run": 16, "core": 9},"132401642604224": {"num_run": 15, "core": 6},"132401558718144": {"num_run": 15, "core": 3},"132401548232384": {"num_run": 15, "core": 27},"132401537746624": {"num_run": 15, "core": 21},"132401527260864": {"num_run": 15, "core": 25},"132401516775104": {"num_run": 15, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "2063","activation_size" : "802816","node_index" : "111","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :118495,"ph" : "X","name" :"/layer2/layer2.0/downsample/downsample.0/Conv_quant_token_47_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :118495,"ph" : "X","name" :"/layer2/layer2.0/conv1/Conv_quant_token_38_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :671,"ts" :118496,"ph" : "X","name" :"/layer2/layer2.0/conv1/Conv_quant_token_38_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 529, "Wait": 134, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 17, "core": 30},"132401808279232": {"num_run": 17, "core": 19},"132401663575744": {"num_run": 17, "core": 16},"132401797793472": {"num_run": 17, "core": 26},"132401787307712": {"num_run": 17, "core": 31},"132401776821952": {"num_run": 17, "core": 17},"132401692935872": {"num_run": 17, "core": 24},"132401682450112": {"num_run": 17, "core": 29},"132401653089984": {"num_run": 17, "core": 
9},"132401642604224": {"num_run": 16, "core": 6},"132401558718144": {"num_run": 16, "core": 3},"132401548232384": {"num_run": 16, "core": 27},"132401537746624": {"num_run": 16, "core": 21},"132401527260864": {"num_run": 16, "core": 25},"132401516775104": {"num_run": 16, "core": 20}}},"output_type_shape" : [{"int8":[1,56,56,128]}],"output_size" : "401408","parameter_size" : "527","activation_size" : "802816","node_index" : "105","input_type_shape" : [{"int8":[1,56,56,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :119169,"ph" : "X","name" :"/layer2/layer2.0/conv1/Conv_quant_token_38_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :119171,"ph" : "X","name" :"/layer2/layer2.0/conv2/Conv_quant_token_41_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :972,"ts" :119171,"ph" : "X","name" :"/layer2/layer2.0/conv2/Conv_quant_token_41_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 862, "Wait": 61, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 18, "core": 30},"132401808279232": {"num_run": 18, "core": 19},"132401663575744": {"num_run": 18, "core": 16},"132401797793472": {"num_run": 18, "core": 26},"132401787307712": {"num_run": 18, "core": 31},"132401776821952": {"num_run": 18, "core": 17},"132401692935872": {"num_run": 18, "core": 24},"132401682450112": {"num_run": 18, "core": 29},"132401653089984": {"num_run": 18, "core": 9},"132401642604224": {"num_run": 17, "core": 6},"132401558718144": {"num_run": 17, "core": 3},"132401548232384": {"num_run": 17, "core": 27},"132401537746624": {"num_run": 
17, "core": 21},"132401527260864": {"num_run": 17, "core": 25},"132401516775104": {"num_run": 17, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "401408","node_index" : "107","input_type_shape" : [{"int8":[1,56,56,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :120145,"ph" : "X","name" :"/layer2/layer2.0/conv2/Conv_quant_token_41_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :120146,"ph" : "X","name" :"/layer2/layer2.0/conv3/Conv_quant_token_44_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :545,"ts" :120147,"ph" : "X","name" :"/layer2/layer2.0/conv3/Conv_quant_token_44_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 506, "Wait": 29, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 19, "core": 30},"132401808279232": {"num_run": 19, "core": 19},"132401663575744": {"num_run": 19, "core": 16},"132401797793472": {"num_run": 19, "core": 26},"132401787307712": {"num_run": 19, "core": 31},"132401776821952": {"num_run": 19, "core": 17},"132401692935872": {"num_run": 19, "core": 24},"132401682450112": {"num_run": 19, "core": 29},"132401653089984": {"num_run": 19, "core": 9},"132401642604224": {"num_run": 18, "core": 6},"132401558718144": {"num_run": 18, "core": 3},"132401548232384": {"num_run": 18, "core": 27},"132401537746624": {"num_run": 18, "core": 21},"132401527260864": {"num_run": 18, "core": 25},"132401516775104": {"num_run": 18, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" 
: "401408","parameter_size" : "2063","activation_size" : "100352","node_index" : "109","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :120693,"ph" : "X","name" :"/layer2/layer2.0/conv3/Conv_quant_token_44_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :120697,"ph" : "X","name" :"/layer2/layer2.0/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :81,"ts" :120697,"ph" : "X","name" :"/layer2/layer2.0/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 71, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 20, "core": 30},"132401808279232": {"num_run": 20, "core": 4},"132401663575744": {"num_run": 20, "core": 16},"132401797793472": {"num_run": 20, "core": 26},"132401787307712": {"num_run": 20, "core": 12},"132401776821952": {"num_run": 20, "core": 17},"132401692935872": {"num_run": 20, "core": 24},"132401682450112": {"num_run": 20, "core": 29},"132401653089984": {"num_run": 20, "core": 31},"132401642604224": {"num_run": 19, "core": 6},"132401558718144": {"num_run": 19, "core": 3},"132401548232384": {"num_run": 19, "core": 27},"132401537746624": {"num_run": 18, "core": 21},"132401527260864": {"num_run": 18, "core": 25},"132401516775104": {"num_run": 18, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "15","activation_size" : "802816","node_index" : "20","input_type_shape" : 
[{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :120781,"ph" : "X","name" :"/layer2/layer2.0/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :120782,"ph" : "X","name" :"/layer2/layer2.1/conv1/Conv_quant_token_51_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :408,"ts" :120782,"ph" : "X","name" :"/layer2/layer2.1/conv1/Conv_quant_token_51_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 392, "Wait": 6, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 21, "core": 30},"132401808279232": {"num_run": 21, "core": 4},"132401663575744": {"num_run": 21, "core": 16},"132401797793472": {"num_run": 21, "core": 26},"132401787307712": {"num_run": 21, "core": 12},"132401776821952": {"num_run": 21, "core": 17},"132401692935872": {"num_run": 21, "core": 24},"132401682450112": {"num_run": 21, "core": 29},"132401653089984": {"num_run": 21, "core": 31},"132401642604224": {"num_run": 20, "core": 6},"132401558718144": {"num_run": 20, "core": 3},"132401548232384": {"num_run": 20, "core": 27},"132401537746624": {"num_run": 19, "core": 21},"132401527260864": {"num_run": 19, "core": 25},"132401516775104": {"num_run": 19, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "401408","node_index" : "114","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, 
+{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :121193,"ph" : "X","name" :"/layer2/layer2.1/conv1/Conv_quant_token_51_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :121194,"ph" : "X","name" :"/layer2/layer2.1/conv2/Conv_quant_token_54_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :957,"ts" :121194,"ph" : "X","name" :"/layer2/layer2.1/conv2/Conv_quant_token_54_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 864, "Wait": 58, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 22, "core": 30},"132401808279232": {"num_run": 22, "core": 4},"132401663575744": {"num_run": 22, "core": 16},"132401797793472": {"num_run": 22, "core": 26},"132401787307712": {"num_run": 22, "core": 12},"132401776821952": {"num_run": 22, "core": 17},"132401692935872": {"num_run": 22, "core": 24},"132401682450112": {"num_run": 22, "core": 29},"132401653089984": {"num_run": 22, "core": 31},"132401642604224": {"num_run": 21, "core": 6},"132401558718144": {"num_run": 21, "core": 3},"132401548232384": {"num_run": 21, "core": 27},"132401537746624": {"num_run": 20, "core": 21},"132401527260864": {"num_run": 20, "core": 25},"132401516775104": {"num_run": 20, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "100352","node_index" : "116","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :122153,"ph" : "X","name" :"/layer2/layer2.1/conv2/Conv_quant_token_54_fence_after","args" : {"op_name" : 
"QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :122153,"ph" : "X","name" :"/layer2/layer2.1/conv3/Conv_quant_token_57_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :540,"ts" :122154,"ph" : "X","name" :"/layer2/layer2.1/conv3/Conv_quant_token_57_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 488, "Wait": 42, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 23, "core": 30},"132401808279232": {"num_run": 23, "core": 4},"132401663575744": {"num_run": 23, "core": 16},"132401797793472": {"num_run": 23, "core": 26},"132401787307712": {"num_run": 23, "core": 12},"132401776821952": {"num_run": 23, "core": 17},"132401692935872": {"num_run": 23, "core": 24},"132401682450112": {"num_run": 23, "core": 29},"132401653089984": {"num_run": 23, "core": 31},"132401642604224": {"num_run": 22, "core": 6},"132401558718144": {"num_run": 22, "core": 3},"132401548232384": {"num_run": 22, "core": 27},"132401537746624": {"num_run": 21, "core": 21},"132401527260864": {"num_run": 21, "core": 25},"132401516775104": {"num_run": 21, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "2063","activation_size" : "100352","node_index" : "118","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :122695,"ph" : "X","name" :"/layer2/layer2.1/conv3/Conv_quant_token_57_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :122696,"ph" : "X","name" :"/layer2/layer2.1/Add_quant_fence_before","args" : {"op_name" : 
"QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :77,"ts" :122696,"ph" : "X","name" :"/layer2/layer2.1/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 70, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 24, "core": 30},"132401808279232": {"num_run": 24, "core": 4},"132401663575744": {"num_run": 24, "core": 16},"132401797793472": {"num_run": 24, "core": 26},"132401787307712": {"num_run": 24, "core": 12},"132401776821952": {"num_run": 24, "core": 17},"132401692935872": {"num_run": 24, "core": 24},"132401682450112": {"num_run": 24, "core": 29},"132401653089984": {"num_run": 24, "core": 31},"132401642604224": {"num_run": 23, "core": 6},"132401558718144": {"num_run": 23, "core": 3},"132401548232384": {"num_run": 23, "core": 27},"132401537746624": {"num_run": 21, "core": 21},"132401527260864": {"num_run": 21, "core": 25},"132401516775104": {"num_run": 21, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "15","activation_size" : "802816","node_index" : "24","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :122775,"ph" : "X","name" :"/layer2/layer2.1/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :122776,"ph" : "X","name" :"/layer2/layer2.2/conv1/Conv_quant_token_61_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :395,"ts" :122777,"ph" : "X","name" :"/layer2/layer2.2/conv1/Conv_quant_token_61_kernel_time","args" : {"thread_scheduling_stats" : 
{"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 294, "Wait": 93, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 25, "core": 30},"132401808279232": {"num_run": 25, "core": 4},"132401663575744": {"num_run": 25, "core": 16},"132401797793472": {"num_run": 25, "core": 26},"132401787307712": {"num_run": 25, "core": 12},"132401776821952": {"num_run": 25, "core": 17},"132401692935872": {"num_run": 25, "core": 24},"132401682450112": {"num_run": 25, "core": 29},"132401653089984": {"num_run": 25, "core": 31},"132401642604224": {"num_run": 24, "core": 6},"132401558718144": {"num_run": 24, "core": 3},"132401548232384": {"num_run": 24, "core": 27},"132401537746624": {"num_run": 22, "core": 21},"132401527260864": {"num_run": 22, "core": 25},"132401516775104": {"num_run": 22, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "401408","node_index" : "121","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :123173,"ph" : "X","name" :"/layer2/layer2.2/conv1/Conv_quant_token_61_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :123174,"ph" : "X","name" :"/layer2/layer2.2/conv2/Conv_quant_token_64_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :914,"ts" :123175,"ph" : "X","name" :"/layer2/layer2.2/conv2/Conv_quant_token_64_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 
834, "Wait": 45, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 26, "core": 30},"132401808279232": {"num_run": 26, "core": 4},"132401663575744": {"num_run": 26, "core": 16},"132401797793472": {"num_run": 26, "core": 26},"132401787307712": {"num_run": 26, "core": 12},"132401776821952": {"num_run": 26, "core": 17},"132401692935872": {"num_run": 26, "core": 24},"132401682450112": {"num_run": 26, "core": 29},"132401653089984": {"num_run": 26, "core": 31},"132401642604224": {"num_run": 25, "core": 6},"132401558718144": {"num_run": 25, "core": 3},"132401548232384": {"num_run": 25, "core": 27},"132401537746624": {"num_run": 23, "core": 21},"132401527260864": {"num_run": 23, "core": 25},"132401516775104": {"num_run": 23, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "100352","node_index" : "123","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :124090,"ph" : "X","name" :"/layer2/layer2.2/conv2/Conv_quant_token_64_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :124090,"ph" : "X","name" :"/layer2/layer2.2/conv3/Conv_quant_token_67_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :527,"ts" :124091,"ph" : "X","name" :"/layer2/layer2.2/conv3/Conv_quant_token_67_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 369, "Wait": 149, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 27, "core": 30},"132401808279232": {"num_run": 27, "core": 4},"132401663575744": 
{"num_run": 27, "core": 16},"132401797793472": {"num_run": 27, "core": 26},"132401787307712": {"num_run": 27, "core": 12},"132401776821952": {"num_run": 27, "core": 17},"132401692935872": {"num_run": 27, "core": 24},"132401682450112": {"num_run": 27, "core": 29},"132401653089984": {"num_run": 27, "core": 31},"132401642604224": {"num_run": 26, "core": 6},"132401558718144": {"num_run": 26, "core": 3},"132401548232384": {"num_run": 26, "core": 27},"132401537746624": {"num_run": 24, "core": 21},"132401527260864": {"num_run": 24, "core": 25},"132401516775104": {"num_run": 24, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "2063","activation_size" : "100352","node_index" : "125","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :124620,"ph" : "X","name" :"/layer2/layer2.2/conv3/Conv_quant_token_67_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :124621,"ph" : "X","name" :"/layer2/layer2.2/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :75,"ts" :124621,"ph" : "X","name" :"/layer2/layer2.2/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 68, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 28, "core": 30},"132401808279232": {"num_run": 28, "core": 4},"132401663575744": {"num_run": 28, "core": 16},"132401797793472": {"num_run": 28, "core": 26},"132401787307712": {"num_run": 28, "core": 12},"132401776821952": {"num_run": 28, "core": 17},"132401692935872": {"num_run": 28, 
"core": 24},"132401682450112": {"num_run": 28, "core": 29},"132401653089984": {"num_run": 28, "core": 31},"132401642604224": {"num_run": 27, "core": 6},"132401558718144": {"num_run": 27, "core": 3},"132401548232384": {"num_run": 27, "core": 27},"132401537746624": {"num_run": 24, "core": 21},"132401527260864": {"num_run": 24, "core": 25},"132401516775104": {"num_run": 24, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "15","activation_size" : "802816","node_index" : "28","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :124699,"ph" : "X","name" :"/layer2/layer2.2/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :124699,"ph" : "X","name" :"/layer2/layer2.3/conv1/Conv_quant_token_71_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :396,"ts" :124700,"ph" : "X","name" :"/layer2/layer2.3/conv1/Conv_quant_token_71_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 382, "Wait": 5, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 29, "core": 30},"132401808279232": {"num_run": 29, "core": 4},"132401663575744": {"num_run": 29, "core": 16},"132401797793472": {"num_run": 29, "core": 26},"132401787307712": {"num_run": 29, "core": 12},"132401776821952": {"num_run": 29, "core": 17},"132401692935872": {"num_run": 29, "core": 24},"132401682450112": {"num_run": 29, "core": 29},"132401653089984": {"num_run": 29, "core": 31},"132401642604224": {"num_run": 28, "core": 6},"132401558718144": {"num_run": 28, 
"core": 3},"132401548232384": {"num_run": 28, "core": 27},"132401537746624": {"num_run": 25, "core": 21},"132401527260864": {"num_run": 25, "core": 25},"132401516775104": {"num_run": 25, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "401408","node_index" : "128","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :125098,"ph" : "X","name" :"/layer2/layer2.3/conv1/Conv_quant_token_71_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :125099,"ph" : "X","name" :"/layer2/layer2.3/conv2/Conv_quant_token_74_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :873,"ts" :125099,"ph" : "X","name" :"/layer2/layer2.3/conv2/Conv_quant_token_74_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 23, "Distribution": 0, "DistributionEnqueue": 0, "Run": 824, "Wait": 8, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 30, "core": 30},"132401808279232": {"num_run": 30, "core": 4},"132401663575744": {"num_run": 30, "core": 16},"132401797793472": {"num_run": 30, "core": 26},"132401787307712": {"num_run": 30, "core": 12},"132401776821952": {"num_run": 30, "core": 17},"132401692935872": {"num_run": 30, "core": 24},"132401682450112": {"num_run": 30, "core": 29},"132401653089984": {"num_run": 30, "core": 31},"132401642604224": {"num_run": 29, "core": 6},"132401558718144": {"num_run": 29, "core": 3},"132401548232384": {"num_run": 29, "core": 27},"132401537746624": {"num_run": 26, "core": 21},"132401527260864": {"num_run": 26, "core": 25},"132401516775104": 
{"num_run": 26, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,128]}],"output_size" : "100352","parameter_size" : "527","activation_size" : "100352","node_index" : "130","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[128]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :125977,"ph" : "X","name" :"/layer2/layer2.3/conv2/Conv_quant_token_74_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :125980,"ph" : "X","name" :"/layer2/layer2.3/conv3/Conv_quant_token_77_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :528,"ts" :125980,"ph" : "X","name" :"/layer2/layer2.3/conv3/Conv_quant_token_77_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 362, "Wait": 152, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 31, "core": 30},"132401808279232": {"num_run": 31, "core": 4},"132401663575744": {"num_run": 31, "core": 16},"132401797793472": {"num_run": 31, "core": 26},"132401787307712": {"num_run": 31, "core": 12},"132401776821952": {"num_run": 31, "core": 17},"132401692935872": {"num_run": 31, "core": 24},"132401682450112": {"num_run": 31, "core": 29},"132401653089984": {"num_run": 31, "core": 31},"132401642604224": {"num_run": 30, "core": 6},"132401558718144": {"num_run": 30, "core": 3},"132401548232384": {"num_run": 30, "core": 27},"132401537746624": {"num_run": 27, "core": 21},"132401527260864": {"num_run": 27, "core": 25},"132401516775104": {"num_run": 27, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "2063","activation_size" : "100352","node_index" : 
"132","input_type_shape" : [{"int8":[1,28,28,128]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :126512,"ph" : "X","name" :"/layer2/layer2.3/conv3/Conv_quant_token_77_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :126514,"ph" : "X","name" :"/layer2/layer2.3/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :76,"ts" :126514,"ph" : "X","name" :"/layer2/layer2.3/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 67, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 32, "core": 30},"132401808279232": {"num_run": 32, "core": 4},"132401663575744": {"num_run": 32, "core": 16},"132401797793472": {"num_run": 32, "core": 26},"132401787307712": {"num_run": 32, "core": 12},"132401776821952": {"num_run": 32, "core": 17},"132401692935872": {"num_run": 32, "core": 24},"132401682450112": {"num_run": 32, "core": 29},"132401653089984": {"num_run": 32, "core": 31},"132401642604224": {"num_run": 31, "core": 6},"132401558718144": {"num_run": 31, "core": 3},"132401548232384": {"num_run": 31, "core": 27},"132401537746624": {"num_run": 27, "core": 21},"132401527260864": {"num_run": 27, "core": 25},"132401516775104": {"num_run": 27, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,512]}],"output_size" : "401408","parameter_size" : "15","activation_size" : "802816","node_index" : "32","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : 
"QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :126593,"ph" : "X","name" :"/layer2/layer2.3/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :126594,"ph" : "X","name" :"/layer3/layer3.0/downsample/downsample.0/Conv_quant_token_90_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1147,"ts" :126594,"ph" : "X","name" :"/layer3/layer3.0/downsample/downsample.0/Conv_quant_token_90_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 906, "Wait": 222, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 33, "core": 30},"132401808279232": {"num_run": 33, "core": 4},"132401663575744": {"num_run": 33, "core": 16},"132401797793472": {"num_run": 33, "core": 26},"132401787307712": {"num_run": 33, "core": 12},"132401776821952": {"num_run": 33, "core": 17},"132401692935872": {"num_run": 33, "core": 24},"132401682450112": {"num_run": 33, "core": 29},"132401653089984": {"num_run": 33, "core": 31},"132401642604224": {"num_run": 32, "core": 6},"132401558718144": {"num_run": 32, "core": 3},"132401548232384": {"num_run": 32, "core": 27},"132401537746624": {"num_run": 27, "core": 21},"132401527260864": {"num_run": 27, "core": 25},"132401516775104": {"num_run": 27, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "401408","node_index" : "141","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :127744,"ph" : "X","name" 
:"/layer3/layer3.0/downsample/downsample.0/Conv_quant_token_90_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :127745,"ph" : "X","name" :"/layer3/layer3.0/conv1/Conv_quant_token_81_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :725,"ts" :127745,"ph" : "X","name" :"/layer3/layer3.0/conv1/Conv_quant_token_81_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 715, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 34, "core": 30},"132401808279232": {"num_run": 34, "core": 4},"132401663575744": {"num_run": 34, "core": 16},"132401797793472": {"num_run": 34, "core": 26},"132401787307712": {"num_run": 34, "core": 12},"132401776821952": {"num_run": 34, "core": 17},"132401692935872": {"num_run": 34, "core": 24},"132401682450112": {"num_run": 34, "core": 29},"132401653089984": {"num_run": 34, "core": 31},"132401642604224": {"num_run": 33, "core": 6},"132401558718144": {"num_run": 33, "core": 3},"132401548232384": {"num_run": 33, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,28,28,256]}],"output_size" : "200704","parameter_size" : "1039","activation_size" : "401408","node_index" : "135","input_type_shape" : [{"int8":[1,28,28,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :128472,"ph" : "X","name" :"/layer3/layer3.0/conv1/Conv_quant_token_81_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" 
:128473,"ph" : "X","name" :"/layer3/layer3.0/conv2/Conv_quant_token_84_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1198,"ts" :128473,"ph" : "X","name" :"/layer3/layer3.0/conv2/Conv_quant_token_84_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 797, "Wait": 367, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 35, "core": 30},"132401808279232": {"num_run": 35, "core": 4},"132401663575744": {"num_run": 35, "core": 16},"132401797793472": {"num_run": 35, "core": 26},"132401787307712": {"num_run": 35, "core": 12},"132401776821952": {"num_run": 35, "core": 17},"132401692935872": {"num_run": 35, "core": 24},"132401682450112": {"num_run": 35, "core": 29},"132401653089984": {"num_run": 35, "core": 31},"132401642604224": {"num_run": 34, "core": 6},"132401558718144": {"num_run": 34, "core": 3},"132401548232384": {"num_run": 34, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "200704","node_index" : "137","input_type_shape" : [{"int8":[1,28,28,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :129675,"ph" : "X","name" :"/layer3/layer3.0/conv2/Conv_quant_token_84_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :129676,"ph" : "X","name" :"/layer3/layer3.0/conv3/Conv_quant_token_87_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" 
:635173,"dur" :629,"ts" :129676,"ph" : "X","name" :"/layer3/layer3.0/conv3/Conv_quant_token_87_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 435, "Wait": 185, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 36, "core": 30},"132401808279232": {"num_run": 36, "core": 4},"132401663575744": {"num_run": 36, "core": 16},"132401797793472": {"num_run": 36, "core": 26},"132401787307712": {"num_run": 36, "core": 12},"132401776821952": {"num_run": 36, "core": 17},"132401692935872": {"num_run": 36, "core": 24},"132401682450112": {"num_run": 36, "core": 29},"132401653089984": {"num_run": 36, "core": 31},"132401642604224": {"num_run": 35, "core": 6},"132401558718144": {"num_run": 35, "core": 3},"132401548232384": {"num_run": 35, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "50176","node_index" : "139","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :130308,"ph" : "X","name" :"/layer3/layer3.0/conv3/Conv_quant_token_87_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :130309,"ph" : "X","name" :"/layer3/layer3.0/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :81,"ts" :130310,"ph" : "X","name" :"/layer3/layer3.0/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", 
"thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 73, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 37, "core": 30},"132401808279232": {"num_run": 37, "core": 4},"132401663575744": {"num_run": 37, "core": 16},"132401797793472": {"num_run": 37, "core": 26},"132401787307712": {"num_run": 37, "core": 12},"132401776821952": {"num_run": 36, "core": 17},"132401692935872": {"num_run": 36, "core": 24},"132401682450112": {"num_run": 36, "core": 29},"132401653089984": {"num_run": 36, "core": 31},"132401642604224": {"num_run": 35, "core": 6},"132401558718144": {"num_run": 35, "core": 3},"132401548232384": {"num_run": 35, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "15","activation_size" : "401408","node_index" : "37","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :130393,"ph" : "X","name" :"/layer3/layer3.0/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :130394,"ph" : "X","name" :"/layer3/layer3.1/conv1/Conv_quant_token_94_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :538,"ts" :130394,"ph" : "X","name" :"/layer3/layer3.1/conv1/Conv_quant_token_94_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 342, "Wait": 187, "WaitRevoke": 0}, "sub_threads": 
{"132401818764992": {"num_run": 38, "core": 30},"132401808279232": {"num_run": 38, "core": 4},"132401663575744": {"num_run": 38, "core": 16},"132401797793472": {"num_run": 38, "core": 26},"132401787307712": {"num_run": 38, "core": 28},"132401776821952": {"num_run": 37, "core": 17},"132401692935872": {"num_run": 37, "core": 24},"132401682450112": {"num_run": 37, "core": 29},"132401653089984": {"num_run": 37, "core": 31},"132401642604224": {"num_run": 36, "core": 6},"132401558718144": {"num_run": 36, "core": 3},"132401548232384": {"num_run": 36, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "200704","node_index" : "144","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :130934,"ph" : "X","name" :"/layer3/layer3.1/conv1/Conv_quant_token_94_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :130935,"ph" : "X","name" :"/layer3/layer3.1/conv2/Conv_quant_token_97_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1167,"ts" :130935,"ph" : "X","name" :"/layer3/layer3.1/conv2/Conv_quant_token_97_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 797, "Wait": 345, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 39, "core": 30},"132401808279232": {"num_run": 39, "core": 23},"132401663575744": {"num_run": 39, "core": 16},"132401797793472": 
{"num_run": 39, "core": 26},"132401787307712": {"num_run": 39, "core": 28},"132401776821952": {"num_run": 38, "core": 17},"132401692935872": {"num_run": 38, "core": 24},"132401682450112": {"num_run": 38, "core": 29},"132401653089984": {"num_run": 38, "core": 31},"132401642604224": {"num_run": 37, "core": 6},"132401558718144": {"num_run": 37, "core": 3},"132401548232384": {"num_run": 37, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "50176","node_index" : "146","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :132105,"ph" : "X","name" :"/layer3/layer3.1/conv2/Conv_quant_token_97_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :132106,"ph" : "X","name" :"/layer3/layer3.1/conv3/Conv_quant_token_100_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :641,"ts" :132106,"ph" : "X","name" :"/layer3/layer3.1/conv3/Conv_quant_token_100_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 441, "Wait": 191, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 40, "core": 30},"132401808279232": {"num_run": 40, "core": 23},"132401663575744": {"num_run": 40, "core": 16},"132401797793472": {"num_run": 40, "core": 26},"132401787307712": {"num_run": 40, "core": 28},"132401776821952": {"num_run": 39, "core": 17},"132401692935872": {"num_run": 39, "core": 
24},"132401682450112": {"num_run": 39, "core": 29},"132401653089984": {"num_run": 39, "core": 31},"132401642604224": {"num_run": 38, "core": 6},"132401558718144": {"num_run": 38, "core": 3},"132401548232384": {"num_run": 38, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "50176","node_index" : "148","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :132750,"ph" : "X","name" :"/layer3/layer3.1/conv3/Conv_quant_token_100_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :132750,"ph" : "X","name" :"/layer3/layer3.1/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :84,"ts" :132755,"ph" : "X","name" :"/layer3/layer3.1/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 76, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 41, "core": 30},"132401808279232": {"num_run": 41, "core": 23},"132401663575744": {"num_run": 41, "core": 16},"132401797793472": {"num_run": 41, "core": 26},"132401787307712": {"num_run": 41, "core": 28},"132401776821952": {"num_run": 39, "core": 17},"132401692935872": {"num_run": 39, "core": 24},"132401682450112": {"num_run": 39, "core": 29},"132401653089984": {"num_run": 39, "core": 31},"132401642604224": {"num_run": 38, "core": 6},"132401558718144": {"num_run": 38, "core": 
3},"132401548232384": {"num_run": 38, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "15","activation_size" : "401408","node_index" : "41","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :132841,"ph" : "X","name" :"/layer3/layer3.1/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :132842,"ph" : "X","name" :"/layer3/layer3.2/conv1/Conv_quant_token_104_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :499,"ts" :132842,"ph" : "X","name" :"/layer3/layer3.2/conv1/Conv_quant_token_104_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 361, "Wait": 130, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 42, "core": 30},"132401808279232": {"num_run": 42, "core": 23},"132401663575744": {"num_run": 42, "core": 16},"132401797793472": {"num_run": 42, "core": 26},"132401787307712": {"num_run": 42, "core": 28},"132401776821952": {"num_run": 40, "core": 17},"132401692935872": {"num_run": 40, "core": 24},"132401682450112": {"num_run": 40, "core": 29},"132401653089984": {"num_run": 40, "core": 31},"132401642604224": {"num_run": 39, "core": 6},"132401558718144": {"num_run": 39, "core": 3},"132401548232384": {"num_run": 39, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, 
"core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "200704","node_index" : "151","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :133344,"ph" : "X","name" :"/layer3/layer3.2/conv1/Conv_quant_token_104_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :133345,"ph" : "X","name" :"/layer3/layer3.2/conv2/Conv_quant_token_107_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1104,"ts" :133345,"ph" : "X","name" :"/layer3/layer3.2/conv2/Conv_quant_token_107_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 800, "Wait": 280, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 43, "core": 30},"132401808279232": {"num_run": 43, "core": 23},"132401663575744": {"num_run": 43, "core": 16},"132401797793472": {"num_run": 43, "core": 26},"132401787307712": {"num_run": 43, "core": 28},"132401776821952": {"num_run": 41, "core": 17},"132401692935872": {"num_run": 41, "core": 24},"132401682450112": {"num_run": 41, "core": 29},"132401653089984": {"num_run": 41, "core": 31},"132401642604224": {"num_run": 40, "core": 6},"132401558718144": {"num_run": 40, "core": 3},"132401548232384": {"num_run": 40, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "50176","node_index" : 
"153","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :134451,"ph" : "X","name" :"/layer3/layer3.2/conv2/Conv_quant_token_107_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :134453,"ph" : "X","name" :"/layer3/layer3.2/conv3/Conv_quant_token_110_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :605,"ts" :134453,"ph" : "X","name" :"/layer3/layer3.2/conv3/Conv_quant_token_110_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 439, "Wait": 156, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 44, "core": 30},"132401808279232": {"num_run": 44, "core": 23},"132401663575744": {"num_run": 44, "core": 16},"132401797793472": {"num_run": 44, "core": 26},"132401787307712": {"num_run": 44, "core": 28},"132401776821952": {"num_run": 42, "core": 17},"132401692935872": {"num_run": 42, "core": 24},"132401682450112": {"num_run": 42, "core": 29},"132401653089984": {"num_run": 42, "core": 31},"132401642604224": {"num_run": 41, "core": 6},"132401558718144": {"num_run": 41, "core": 3},"132401548232384": {"num_run": 41, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "50176","node_index" : "155","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : 
"CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :135061,"ph" : "X","name" :"/layer3/layer3.2/conv3/Conv_quant_token_110_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :135061,"ph" : "X","name" :"/layer3/layer3.2/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :85,"ts" :135062,"ph" : "X","name" :"/layer3/layer3.2/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 78, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 45, "core": 30},"132401808279232": {"num_run": 45, "core": 23},"132401663575744": {"num_run": 45, "core": 16},"132401797793472": {"num_run": 45, "core": 26},"132401787307712": {"num_run": 45, "core": 28},"132401776821952": {"num_run": 42, "core": 17},"132401692935872": {"num_run": 42, "core": 24},"132401682450112": {"num_run": 42, "core": 29},"132401653089984": {"num_run": 42, "core": 31},"132401642604224": {"num_run": 41, "core": 6},"132401558718144": {"num_run": 41, "core": 3},"132401548232384": {"num_run": 41, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "15","activation_size" : "401408","node_index" : "45","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :135149,"ph" : "X","name" :"/layer3/layer3.2/Add_quant_fence_after","args" : 
{"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :135150,"ph" : "X","name" :"/layer3/layer3.3/conv1/Conv_quant_token_114_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :471,"ts" :135150,"ph" : "X","name" :"/layer3/layer3.3/conv1/Conv_quant_token_114_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 368, "Wait": 94, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 46, "core": 30},"132401808279232": {"num_run": 46, "core": 23},"132401663575744": {"num_run": 46, "core": 16},"132401797793472": {"num_run": 46, "core": 26},"132401787307712": {"num_run": 46, "core": 28},"132401776821952": {"num_run": 43, "core": 17},"132401692935872": {"num_run": 43, "core": 24},"132401682450112": {"num_run": 43, "core": 29},"132401653089984": {"num_run": 43, "core": 31},"132401642604224": {"num_run": 42, "core": 6},"132401558718144": {"num_run": 42, "core": 3},"132401548232384": {"num_run": 42, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "200704","node_index" : "158","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :135623,"ph" : "X","name" :"/layer3/layer3.3/conv1/Conv_quant_token_114_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :135624,"ph" : "X","name" 
:"/layer3/layer3.3/conv2/Conv_quant_token_117_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1092,"ts" :135624,"ph" : "X","name" :"/layer3/layer3.3/conv2/Conv_quant_token_117_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 810, "Wait": 258, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 47, "core": 30},"132401808279232": {"num_run": 47, "core": 23},"132401663575744": {"num_run": 47, "core": 16},"132401797793472": {"num_run": 47, "core": 26},"132401787307712": {"num_run": 47, "core": 28},"132401776821952": {"num_run": 44, "core": 17},"132401692935872": {"num_run": 44, "core": 24},"132401682450112": {"num_run": 44, "core": 29},"132401653089984": {"num_run": 44, "core": 31},"132401642604224": {"num_run": 43, "core": 6},"132401558718144": {"num_run": 43, "core": 3},"132401548232384": {"num_run": 43, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "50176","node_index" : "160","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :136718,"ph" : "X","name" :"/layer3/layer3.3/conv2/Conv_quant_token_117_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :136719,"ph" : "X","name" :"/layer3/layer3.3/conv3/Conv_quant_token_120_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :468,"ts" 
:136719,"ph" : "X","name" :"/layer3/layer3.3/conv3/Conv_quant_token_120_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 434, "Wait": 25, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 48, "core": 30},"132401808279232": {"num_run": 48, "core": 23},"132401663575744": {"num_run": 48, "core": 16},"132401797793472": {"num_run": 48, "core": 26},"132401787307712": {"num_run": 48, "core": 28},"132401776821952": {"num_run": 45, "core": 17},"132401692935872": {"num_run": 45, "core": 24},"132401682450112": {"num_run": 45, "core": 29},"132401653089984": {"num_run": 45, "core": 31},"132401642604224": {"num_run": 44, "core": 6},"132401558718144": {"num_run": 44, "core": 3},"132401548232384": {"num_run": 44, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "50176","node_index" : "162","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :137188,"ph" : "X","name" :"/layer3/layer3.3/conv3/Conv_quant_token_120_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :137190,"ph" : "X","name" :"/layer3/layer3.3/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :80,"ts" :137190,"ph" : "X","name" :"/layer3/layer3.3/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": 
"132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 73, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 49, "core": 30},"132401808279232": {"num_run": 49, "core": 23},"132401663575744": {"num_run": 49, "core": 16},"132401797793472": {"num_run": 49, "core": 26},"132401787307712": {"num_run": 49, "core": 28},"132401776821952": {"num_run": 45, "core": 17},"132401692935872": {"num_run": 45, "core": 24},"132401682450112": {"num_run": 45, "core": 29},"132401653089984": {"num_run": 45, "core": 31},"132401642604224": {"num_run": 44, "core": 6},"132401558718144": {"num_run": 44, "core": 3},"132401548232384": {"num_run": 44, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "15","activation_size" : "401408","node_index" : "49","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :137271,"ph" : "X","name" :"/layer3/layer3.3/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :137272,"ph" : "X","name" :"/layer3/layer3.4/conv1/Conv_quant_token_124_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :516,"ts" :137272,"ph" : "X","name" :"/layer3/layer3.4/conv1/Conv_quant_token_124_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 348, "Wait": 160, "WaitRevoke": 0}, "sub_threads": {"132401818764992": 
{"num_run": 50, "core": 30},"132401808279232": {"num_run": 50, "core": 23},"132401663575744": {"num_run": 50, "core": 16},"132401797793472": {"num_run": 50, "core": 26},"132401787307712": {"num_run": 50, "core": 28},"132401776821952": {"num_run": 46, "core": 17},"132401692935872": {"num_run": 46, "core": 24},"132401682450112": {"num_run": 46, "core": 29},"132401653089984": {"num_run": 46, "core": 31},"132401642604224": {"num_run": 45, "core": 6},"132401558718144": {"num_run": 45, "core": 3},"132401548232384": {"num_run": 45, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "200704","node_index" : "165","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :137790,"ph" : "X","name" :"/layer3/layer3.4/conv1/Conv_quant_token_124_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :137790,"ph" : "X","name" :"/layer3/layer3.4/conv2/Conv_quant_token_127_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :930,"ts" :137791,"ph" : "X","name" :"/layer3/layer3.4/conv2/Conv_quant_token_127_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 786, "Wait": 120, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 51, "core": 30},"132401808279232": {"num_run": 51, "core": 23},"132401663575744": {"num_run": 51, "core": 16},"132401797793472": {"num_run": 51, "core": 
26},"132401787307712": {"num_run": 51, "core": 28},"132401776821952": {"num_run": 47, "core": 17},"132401692935872": {"num_run": 47, "core": 24},"132401682450112": {"num_run": 47, "core": 29},"132401653089984": {"num_run": 47, "core": 31},"132401642604224": {"num_run": 46, "core": 6},"132401558718144": {"num_run": 46, "core": 3},"132401548232384": {"num_run": 46, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "50176","node_index" : "167","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :138723,"ph" : "X","name" :"/layer3/layer3.4/conv2/Conv_quant_token_127_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :138724,"ph" : "X","name" :"/layer3/layer3.4/conv3/Conv_quant_token_130_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :465,"ts" :138724,"ph" : "X","name" :"/layer3/layer3.4/conv3/Conv_quant_token_130_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 431, "Wait": 25, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 52, "core": 30},"132401808279232": {"num_run": 52, "core": 23},"132401663575744": {"num_run": 52, "core": 16},"132401797793472": {"num_run": 52, "core": 26},"132401787307712": {"num_run": 52, "core": 28},"132401776821952": {"num_run": 48, "core": 17},"132401692935872": {"num_run": 48, "core": 24},"132401682450112": 
{"num_run": 48, "core": 29},"132401653089984": {"num_run": 48, "core": 31},"132401642604224": {"num_run": 47, "core": 6},"132401558718144": {"num_run": 47, "core": 3},"132401548232384": {"num_run": 47, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "50176","node_index" : "169","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :139191,"ph" : "X","name" :"/layer3/layer3.4/conv3/Conv_quant_token_130_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :139192,"ph" : "X","name" :"/layer3/layer3.4/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :81,"ts" :139192,"ph" : "X","name" :"/layer3/layer3.4/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 74, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 53, "core": 30},"132401808279232": {"num_run": 53, "core": 23},"132401663575744": {"num_run": 53, "core": 16},"132401797793472": {"num_run": 53, "core": 26},"132401787307712": {"num_run": 53, "core": 28},"132401776821952": {"num_run": 48, "core": 17},"132401692935872": {"num_run": 48, "core": 24},"132401682450112": {"num_run": 48, "core": 29},"132401653089984": {"num_run": 48, "core": 31},"132401642604224": {"num_run": 47, "core": 6},"132401558718144": {"num_run": 47, "core": 3},"132401548232384": {"num_run": 47, 
"core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "15","activation_size" : "401408","node_index" : "53","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :139275,"ph" : "X","name" :"/layer3/layer3.4/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :139276,"ph" : "X","name" :"/layer3/layer3.5/conv1/Conv_quant_token_134_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :485,"ts" :139276,"ph" : "X","name" :"/layer3/layer3.5/conv1/Conv_quant_token_134_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 352, "Wait": 125, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 54, "core": 30},"132401808279232": {"num_run": 54, "core": 23},"132401663575744": {"num_run": 54, "core": 16},"132401797793472": {"num_run": 54, "core": 26},"132401787307712": {"num_run": 54, "core": 28},"132401776821952": {"num_run": 49, "core": 17},"132401692935872": {"num_run": 49, "core": 24},"132401682450112": {"num_run": 49, "core": 29},"132401653089984": {"num_run": 49, "core": 31},"132401642604224": {"num_run": 48, "core": 6},"132401558718144": {"num_run": 48, "core": 3},"132401548232384": {"num_run": 48, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : 
[{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "200704","node_index" : "172","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :139763,"ph" : "X","name" :"/layer3/layer3.5/conv1/Conv_quant_token_134_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :139764,"ph" : "X","name" :"/layer3/layer3.5/conv2/Conv_quant_token_137_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1137,"ts" :139765,"ph" : "X","name" :"/layer3/layer3.5/conv2/Conv_quant_token_137_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 800, "Wait": 311, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 55, "core": 30},"132401808279232": {"num_run": 55, "core": 23},"132401663575744": {"num_run": 55, "core": 16},"132401797793472": {"num_run": 55, "core": 26},"132401787307712": {"num_run": 55, "core": 28},"132401776821952": {"num_run": 50, "core": 17},"132401692935872": {"num_run": 50, "core": 24},"132401682450112": {"num_run": 50, "core": 29},"132401653089984": {"num_run": 50, "core": 31},"132401642604224": {"num_run": 49, "core": 6},"132401558718144": {"num_run": 49, "core": 3},"132401548232384": {"num_run": 49, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,256]}],"output_size" : "50176","parameter_size" : "1039","activation_size" : "50176","node_index" : "174","input_type_shape" : 
[{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[256]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :140905,"ph" : "X","name" :"/layer3/layer3.5/conv2/Conv_quant_token_137_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :140906,"ph" : "X","name" :"/layer3/layer3.5/conv3/Conv_quant_token_140_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :642,"ts" :140906,"ph" : "X","name" :"/layer3/layer3.5/conv3/Conv_quant_token_140_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 439, "Wait": 195, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 56, "core": 30},"132401808279232": {"num_run": 56, "core": 23},"132401663575744": {"num_run": 56, "core": 16},"132401797793472": {"num_run": 56, "core": 26},"132401787307712": {"num_run": 56, "core": 28},"132401776821952": {"num_run": 51, "core": 17},"132401692935872": {"num_run": 51, "core": 24},"132401682450112": {"num_run": 51, "core": 29},"132401653089984": {"num_run": 51, "core": 31},"132401642604224": {"num_run": 50, "core": 6},"132401558718144": {"num_run": 50, "core": 3},"132401548232384": {"num_run": 50, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "4111","activation_size" : "50176","node_index" : "176","input_type_shape" : [{"int8":[1,14,14,256]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[1024]}],"provider" : "CPUExecutionProvider","op_name" : 
"QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :141550,"ph" : "X","name" :"/layer3/layer3.5/conv3/Conv_quant_token_140_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :141552,"ph" : "X","name" :"/layer3/layer3.5/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :82,"ts" :141552,"ph" : "X","name" :"/layer3/layer3.5/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [29767], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 75, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 57, "core": 30},"132401808279232": {"num_run": 57, "core": 23},"132401663575744": {"num_run": 57, "core": 16},"132401797793472": {"num_run": 57, "core": 26},"132401787307712": {"num_run": 57, "core": 28},"132401776821952": {"num_run": 51, "core": 17},"132401692935872": {"num_run": 51, "core": 24},"132401682450112": {"num_run": 51, "core": 29},"132401653089984": {"num_run": 51, "core": 31},"132401642604224": {"num_run": 50, "core": 6},"132401558718144": {"num_run": 50, "core": 3},"132401548232384": {"num_run": 50, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,1024]}],"output_size" : "200704","parameter_size" : "15","activation_size" : "401408","node_index" : "57","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :141636,"ph" : "X","name" :"/layer3/layer3.5/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : 
"Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :141637,"ph" : "X","name" :"/layer4/layer4.0/downsample/downsample.0/Conv_quant_token_153_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :4183,"ts" :141638,"ph" : "X","name" :"/layer4/layer4.0/downsample/downsample.0/Conv_quant_token_153_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 3762, "Wait": 409, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 58, "core": 30},"132401808279232": {"num_run": 58, "core": 23},"132401663575744": {"num_run": 58, "core": 16},"132401797793472": {"num_run": 57, "core": 26},"132401787307712": {"num_run": 57, "core": 28},"132401776821952": {"num_run": 51, "core": 17},"132401692935872": {"num_run": 51, "core": 24},"132401682450112": {"num_run": 51, "core": 29},"132401653089984": {"num_run": 51, "core": 31},"132401642604224": {"num_run": 50, "core": 6},"132401558718144": {"num_run": 50, "core": 3},"132401548232384": {"num_run": 50, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "8207","activation_size" : "200704","node_index" : "185","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[2048]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :145823,"ph" : "X","name" :"/layer4/layer4.0/downsample/downsample.0/Conv_quant_token_153_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :145824,"ph" : "X","name" 
:"/layer4/layer4.0/conv1/Conv_quant_token_144_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1348,"ts" :145824,"ph" : "X","name" :"/layer4/layer4.0/conv1/Conv_quant_token_144_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 825, "Wait": 513, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 59, "core": 30},"132401808279232": {"num_run": 59, "core": 23},"132401663575744": {"num_run": 59, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,14,14,512]}],"output_size" : "100352","parameter_size" : "2063","activation_size" : "200704","node_index" : "179","input_type_shape" : [{"int8":[1,14,14,1024]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :147174,"ph" : "X","name" :"/layer4/layer4.0/conv1/Conv_quant_token_144_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :147175,"ph" : "X","name" :"/layer4/layer4.0/conv2/Conv_quant_token_147_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :3996,"ts" 
:147176,"ph" : "X","name" :"/layer4/layer4.0/conv2/Conv_quant_token_147_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 3719, "Wait": 256, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 60, "core": 30},"132401808279232": {"num_run": 60, "core": 23},"132401663575744": {"num_run": 60, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,512]}],"output_size" : "25088","parameter_size" : "2063","activation_size" : "100352","node_index" : "181","input_type_shape" : [{"int8":[1,14,14,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :151174,"ph" : "X","name" :"/layer4/layer4.0/conv2/Conv_quant_token_147_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :151177,"ph" : "X","name" :"/layer4/layer4.0/conv3/Conv_quant_token_150_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1972,"ts" :151177,"ph" : "X","name" :"/layer4/layer4.0/conv3/Conv_quant_token_150_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": 
"session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 1845, "Wait": 117, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 61, "core": 30},"132401808279232": {"num_run": 61, "core": 23},"132401663575744": {"num_run": 61, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "8207","activation_size" : "25088","node_index" : "183","input_type_shape" : [{"int8":[1,7,7,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[2048]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :153150,"ph" : "X","name" :"/layer4/layer4.0/conv3/Conv_quant_token_150_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :153151,"ph" : "X","name" :"/layer4/layer4.0/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :144,"ts" :153151,"ph" : "X","name" :"/layer4/layer4.0/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": 
{"num_run": 61, "core": 30},"132401808279232": {"num_run": 61, "core": 23},"132401663575744": {"num_run": 61, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "15","activation_size" : "200704","node_index" : "62","input_type_shape" : [{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :153296,"ph" : "X","name" :"/layer4/layer4.0/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :153297,"ph" : "X","name" :"/layer4/layer4.1/conv1/Conv_quant_token_157_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1740,"ts" :153297,"ph" : "X","name" :"/layer4/layer4.1/conv1/Conv_quant_token_157_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 1617, "Wait": 114, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 62, "core": 30},"132401808279232": {"num_run": 62, "core": 23},"132401663575744": {"num_run": 62, "core": 16},"132401797793472": {"num_run": 58, "core": 
26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,512]}],"output_size" : "25088","parameter_size" : "2063","activation_size" : "100352","node_index" : "188","input_type_shape" : [{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :155039,"ph" : "X","name" :"/layer4/layer4.1/conv1/Conv_quant_token_157_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :155040,"ph" : "X","name" :"/layer4/layer4.1/conv2/Conv_quant_token_160_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :3893,"ts" :155040,"ph" : "X","name" :"/layer4/layer4.1/conv2/Conv_quant_token_160_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 3646, "Wait": 230, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 63, "core": 30},"132401808279232": {"num_run": 63, "core": 5},"132401663575744": {"num_run": 63, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": 
{"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,512]}],"output_size" : "25088","parameter_size" : "2063","activation_size" : "25088","node_index" : "190","input_type_shape" : [{"int8":[1,7,7,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :158935,"ph" : "X","name" :"/layer4/layer4.1/conv2/Conv_quant_token_160_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :158936,"ph" : "X","name" :"/layer4/layer4.1/conv3/Conv_quant_token_163_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1929,"ts" :158936,"ph" : "X","name" :"/layer4/layer4.1/conv3/Conv_quant_token_163_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 1837, "Wait": 82, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 64, "core": 30},"132401808279232": {"num_run": 64, "core": 5},"132401663575744": {"num_run": 64, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 
3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "8207","activation_size" : "25088","node_index" : "192","input_type_shape" : [{"int8":[1,7,7,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[2048]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :160868,"ph" : "X","name" :"/layer4/layer4.1/conv3/Conv_quant_token_163_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :160870,"ph" : "X","name" :"/layer4/layer4.1/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :143,"ts" :160870,"ph" : "X","name" :"/layer4/layer4.1/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 64, "core": 30},"132401808279232": {"num_run": 64, "core": 5},"132401663575744": {"num_run": 64, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" 
: [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "15","activation_size" : "200704","node_index" : "66","input_type_shape" : [{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :161015,"ph" : "X","name" :"/layer4/layer4.1/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :161016,"ph" : "X","name" :"/layer4/layer4.2/conv1/Conv_quant_token_167_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1726,"ts" :161016,"ph" : "X","name" :"/layer4/layer4.2/conv1/Conv_quant_token_167_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 1, "DistributionEnqueue": 1, "Run": 1609, "Wait": 107, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 65, "core": 30},"132401808279232": {"num_run": 65, "core": 5},"132401663575744": {"num_run": 65, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,512]}],"output_size" : "25088","parameter_size" : "2063","activation_size" : "100352","node_index" : "195","input_type_shape" : 
[{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :162743,"ph" : "X","name" :"/layer4/layer4.2/conv1/Conv_quant_token_167_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :162744,"ph" : "X","name" :"/layer4/layer4.2/conv2/Conv_quant_token_170_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :4001,"ts" :162744,"ph" : "X","name" :"/layer4/layer4.2/conv2/Conv_quant_token_170_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 3559, "Wait": 425, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 66, "core": 30},"132401808279232": {"num_run": 66, "core": 5},"132401663575744": {"num_run": 66, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,512]}],"output_size" : "25088","parameter_size" : "2063","activation_size" : "25088","node_index" : "197","input_type_shape" : [{"int8":[1,7,7,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[512]}],"provider" : "CPUExecutionProvider","op_name" : 
"QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :166747,"ph" : "X","name" :"/layer4/layer4.2/conv2/Conv_quant_token_170_fence_after","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :166748,"ph" : "X","name" :"/layer4/layer4.2/conv3/Conv_quant_token_173_fence_before","args" : {"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :1999,"ts" :166749,"ph" : "X","name" :"/layer4/layer4.2/conv3/Conv_quant_token_173_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 1765, "Wait": 224, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 67, "core": 30},"132401808279232": {"num_run": 67, "core": 5},"132401663575744": {"num_run": 67, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "8207","activation_size" : "25088","node_index" : "199","input_type_shape" : [{"int8":[1,7,7,512]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]},{"int32":[2048]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168751,"ph" : "X","name" :"/layer4/layer4.2/conv3/Conv_quant_token_173_fence_after","args" : 
{"op_name" : "QLinearConv"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168753,"ph" : "X","name" :"/layer4/layer4.2/Add_quant_fence_before","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :137,"ts" :168753,"ph" : "X","name" :"/layer4/layer4.2/Add_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 67, "core": 30},"132401808279232": {"num_run": 67, "core": 5},"132401663575744": {"num_run": 67, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,7,7,2048]}],"output_size" : "100352","parameter_size" : "15","activation_size" : "200704","node_index" : "70","input_type_shape" : [{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168892,"ph" : "X","name" :"/layer4/layer4.2/Add_quant_fence_after","args" : {"op_name" : "QLinearAdd"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168895,"ph" : "X","name" :"/avgpool/GlobalAveragePool_quant_fence_before","args" : {"op_name" : "QLinearGlobalAveragePool"}}, +{"cat" : 
"Node","pid" :635173,"tid" :635173,"dur" :19,"ts" :168895,"ph" : "X","name" :"/avgpool/GlobalAveragePool_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 67, "core": 30},"132401808279232": {"num_run": 67, "core": 5},"132401663575744": {"num_run": 67, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,1,1,2048]}],"output_size" : "2048","parameter_size" : "10","activation_size" : "100352","node_index" : "71","input_type_shape" : [{"int8":[1,7,7,2048]},{"float":[]},{"int8":[]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QLinearGlobalAveragePool"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168916,"ph" : "X","name" :"/avgpool/GlobalAveragePool_quant_fence_after","args" : {"op_name" : "QLinearGlobalAveragePool"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168919,"ph" : "X","name" :"Transpose_token_193_fence_before","args" : {"op_name" : "Transpose"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :5,"ts" :168920,"ph" : "X","name" :"Transpose_token_193_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", 
"block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 67, "core": 30},"132401808279232": {"num_run": 67, "core": 5},"132401663575744": {"num_run": 67, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,2048,1,1]}],"output_size" : "2048","parameter_size" : "0","activation_size" : "2048","node_index" : "218","input_type_shape" : [{"int8":[1,1,1,2048]}],"provider" : "CPUExecutionProvider","op_name" : "Transpose"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168926,"ph" : "X","name" :"Transpose_token_193_fence_after","args" : {"op_name" : "Transpose"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168927,"ph" : "X","name" :"/Flatten_fence_before","args" : {"op_name" : "Flatten"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :3,"ts" :168927,"ph" : "X","name" :"/Flatten_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 67, "core": 30},"132401808279232": {"num_run": 67, "core": 5},"132401663575744": {"num_run": 67, "core": 16},"132401797793472": {"num_run": 58, "core": 26},"132401787307712": {"num_run": 58, "core": 
28},"132401776821952": {"num_run": 52, "core": 17},"132401692935872": {"num_run": 52, "core": 24},"132401682450112": {"num_run": 52, "core": 29},"132401653089984": {"num_run": 52, "core": 31},"132401642604224": {"num_run": 51, "core": 6},"132401558718144": {"num_run": 51, "core": 3},"132401548232384": {"num_run": 51, "core": 27},"132401537746624": {"num_run": 28, "core": 21},"132401527260864": {"num_run": 28, "core": 25},"132401516775104": {"num_run": 28, "core": 20}}},"output_type_shape" : [{"int8":[1,2048]}],"output_size" : "2048","parameter_size" : "0","activation_size" : "2048","node_index" : "73","input_type_shape" : [{"int8":[1,2048,1,1]}],"provider" : "CPUExecutionProvider","op_name" : "Flatten"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168932,"ph" : "X","name" :"/Flatten_fence_after","args" : {"op_name" : "Flatten"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :168932,"ph" : "X","name" :"/fc/Gemm_quant_fence_before","args" : {"op_name" : "QGemm"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :627,"ts" :168933,"ph" : "X","name" :"/fc/Gemm_quant_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [1], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 59, "Wait": 78, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 68, "core": 30},"132401808279232": {"num_run": 68, "core": 5},"132401663575744": {"num_run": 68, "core": 16},"132401797793472": {"num_run": 59, "core": 26},"132401787307712": {"num_run": 59, "core": 28},"132401776821952": {"num_run": 53, "core": 17},"132401692935872": {"num_run": 53, "core": 24},"132401682450112": {"num_run": 53, "core": 29},"132401653089984": {"num_run": 53, "core": 25},"132401642604224": {"num_run": 52, "core": 6},"132401558718144": {"num_run": 52, "core": 3},"132401548232384": {"num_run": 52, "core": 27},"132401537746624": {"num_run": 29, "core": 
21},"132401527260864": {"num_run": 29, "core": 31},"132401516775104": {"num_run": 29, "core": 20}}},"output_type_shape" : [{"int8":[1,1000]}],"output_size" : "1000","parameter_size" : "2052015","activation_size" : "2048","node_index" : "75","input_type_shape" : [{"int8":[1,2048]},{"float":[]},{"int8":[]},{"int8":[1000,2048]},{"float":[]},{"int8":[]},{"int32":[1000]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "QGemm"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :169562,"ph" : "X","name" :"/fc/Gemm_quant_fence_after","args" : {"op_name" : "QGemm"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :169564,"ph" : "X","name" :"output_DequantizeLinear_fence_before","args" : {"op_name" : "DequantizeLinear"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :7,"ts" :169565,"ph" : "X","name" :"output_DequantizeLinear_kernel_time","args" : {"thread_scheduling_stats" : {"main_thread": {"thread_pool_name": "session-1-intra-op", "thread_id": "132403315773568", "block_size": [], "core": 2, "Distribution": 0, "DistributionEnqueue": 0, "Run": 0, "Wait": 0, "WaitRevoke": 0}, "sub_threads": {"132401818764992": {"num_run": 68, "core": 30},"132401808279232": {"num_run": 68, "core": 5},"132401663575744": {"num_run": 68, "core": 16},"132401797793472": {"num_run": 59, "core": 26},"132401787307712": {"num_run": 59, "core": 28},"132401776821952": {"num_run": 53, "core": 17},"132401692935872": {"num_run": 53, "core": 24},"132401682450112": {"num_run": 53, "core": 29},"132401653089984": {"num_run": 53, "core": 25},"132401642604224": {"num_run": 52, "core": 6},"132401558718144": {"num_run": 52, "core": 3},"132401548232384": {"num_run": 52, "core": 27},"132401537746624": {"num_run": 29, "core": 21},"132401527260864": {"num_run": 29, "core": 31},"132401516775104": {"num_run": 29, "core": 20}}},"output_type_shape" : [{"float":[1,1000]}],"output_size" : "4000","parameter_size" : "5","activation_size" : "1000","node_index" : 
"76","input_type_shape" : [{"int8":[1,1000]},{"float":[]},{"int8":[]}],"provider" : "CPUExecutionProvider","op_name" : "DequantizeLinear"}}, +{"cat" : "Node","pid" :635173,"tid" :635173,"dur" :0,"ts" :169573,"ph" : "X","name" :"output_DequantizeLinear_fence_after","args" : {"op_name" : "DequantizeLinear"}}, +{"cat" : "Session","pid" :635173,"tid" :635173,"dur" :60252,"ts" :109326,"ph" : "X","name" :"SequentialExecutor::Execute","args" : {}}, +{"cat" : "Session","pid" :635173,"tid" :635173,"dur" :60331,"ts" :109268,"ph" : "X","name" :"model_run","args" : {}} +] diff --git a/onnxruntime/test/python/gpnpumode/analyze_json.py b/onnxruntime/test/python/gpnpumode/analyze_json.py new file mode 100644 index 0000000000..1a7a704210 --- /dev/null +++ b/onnxruntime/test/python/gpnpumode/analyze_json.py @@ -0,0 +1,18 @@ +import os +import sys +import numpy as np + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from helper import load_json, json_to_df + +def get_time(jsons): + times = [] + for json in jsons: + cpu_df, gpu_df = json_to_df(load_json(json), lambda x: True) + times.append(cpu_df['duration'].values) + print(np.sum(np.array(times))) + return np.mean(np.array(times)), np.std(np.array(times)) + +cpu_mean_time, cpu_std_time = get_time(['onnxruntime_profile__2025-01-28_21-44-59.json']) +print(f"CPU Time: {cpu_mean_time:8.3f} ± {cpu_std_time:.3f} ms") diff --git a/onnxruntime/test/python/gpnpumode/test_lutop.py b/onnxruntime/test/python/gpnpumode/test_lutop.py new file mode 100644 index 0000000000..6493b2516e --- /dev/null +++ b/onnxruntime/test/python/gpnpumode/test_lutop.py @@ -0,0 +1,114 @@ +# import onnx +# import numpy as np +# import onnxruntime as ort +# from onnx import helper, TensorProto + +# # Define the custom op node +# input_tensor = helper.make_tensor_value_info('input', TensorProto.INT8, [3]) +# lut_tensor = helper.make_tensor_value_info('lut', TensorProto.INT8, [256]) +# output_tensor = 
helper.make_tensor_value_info('output', TensorProto.INT8, [3]) + +# node = helper.make_node( +# 'LookupTable', # Custom op name +# inputs=['input', 'lut'], +# outputs=['output'], +# domain='test.customop' # Custom domain +# ) + +# # Create the graph and model +# graph = helper.make_graph( +# [node], +# 'test_graph', +# [input_tensor, lut_tensor], +# [output_tensor] +# ) + +# # Add opset import for the custom domain +# opset_imports = [ +# helper.make_opsetid("", 13), # Default domain (ONNX) +# helper.make_opsetid("test.customop", 1) # Custom domain +# ] + +# model = helper.make_model(graph, opset_imports=opset_imports, producer_name='custom_op_test') + +# # Save the model +# onnx.save(model, 'test_model.onnx') + +# # Prepare input data +# input_data = np.array([-128, 0, 127], dtype=np.int8) +# lut_data = np.array([127 - i for i in range(256)], dtype=np.int8) # Example LUT: invert values + +# # Run the model with ONNX Runtime +# so = ort.SessionOptions() +# so.register_custom_ops_library('/home/maggies/onnxruntime/build/Linux/Release/libcustom_op_library.so') # Path to your custom op library + +# session = ort.InferenceSession('test_model.onnx', so) +# inputs = {'input': input_data, 'lut': lut_data} +# outputs = session.run(None, inputs) + +# print('Input:', input_data) +# print('LUT:', lut_data) +# print('Output:', outputs[0]) + + +import onnx +import numpy as np +import onnxruntime as ort +from onnx import helper, TensorProto + +# Define the custom op node +input_tensor = helper.make_tensor_value_info('input', TensorProto.INT8, [3]) +output_tensor = helper.make_tensor_value_info('output', TensorProto.INT8, [3]) + +# Example LUT: invert values +lut_data = np.array([127 - i for i in range(256)], dtype=np.int8) + +# Create the LUT tensor attribute +lut_tensor = helper.make_tensor( + name='lut', + data_type=TensorProto.INT8, + dims=[256], + vals=lut_data +) + +node = helper.make_node( + 'LookupTable', # Custom op name + inputs=['input'], + outputs=['output'], + 
domain='test.customop', # Custom domain + lut=lut_tensor # LUT as an attribute +) + +# Create the graph and model +graph = helper.make_graph( + [node], + 'test_graph', + [input_tensor], + [output_tensor] +) + +# Add opset import for the custom domain +opset_imports = [ + helper.make_opsetid("", 13), # Default domain (ONNX) + helper.make_opsetid("test.customop", 1) # Custom domain +] + +model = helper.make_model(graph, opset_imports=opset_imports, producer_name='custom_op_test') + +# Save the model +onnx.save(model, 'test_model.onnx') + +# Prepare input data +input_data = np.array([-128, 0, 127], dtype=np.int8) + +# Run the model with ONNX Runtime +so = ort.SessionOptions() +so.register_custom_ops_library('/home/maggies/onnxruntime/build/Linux/Release/libcustom_op_library.so') # Path to your custom op library + +session = ort.InferenceSession('test_model.onnx', so) +inputs = {'input': input_data} +outputs = session.run(None, inputs) + +print('Input:', input_data) +print('LUT:', lut_data) +print('Output:', outputs[0]) diff --git a/onnxruntime/test/python/gpnpumode/test_qgemm.py b/onnxruntime/test/python/gpnpumode/test_qgemm.py new file mode 100644 index 0000000000..f0f7d44e20 --- /dev/null +++ b/onnxruntime/test/python/gpnpumode/test_qgemm.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- +import unittest +import numpy as np +import onnx +import onnxruntime as ort +from onnx import helper, TensorProto +import os +import sys +import time + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from helper import get_onnx_const, generate_normal_inputs + + +m = 1 +k = 2024 +n = 1000 + + +class TestQGemm(unittest.TestCase): + def setUp(self): + # Create a specific ONNX model with a single QGemm node + self.model_path = "qgemm_model.onnx" + self.create_qgemm_model(self.model_path) + + def create_qgemm_model(self, output_model_path): + a_scale, a_zero_point = 0.2039528638124466, -14 + b_scale, b_zero_point = 0.003937007859349251, 0 + y_scale, y_zero_point = 0.1019764319062233, -6 + + # Define node names + input_a_name = "input_a" + input_a_scale_name = "input_a_scale" + input_a_zp_name = "input_a_zero_point" + input_b_name = "input_b" + input_b_scale_name = "input_b_scale" + input_b_zp_name = "input_b_zero_point" + output_scale_name = "output_scale" + output_zp_name = "output_zero_point" + output_name = "output" + + a_sc = get_onnx_const(input_a_scale_name, a_scale, TensorProto.FLOAT) + a_zp = get_onnx_const(input_a_zp_name, a_zero_point, TensorProto.INT8) + b_sc = get_onnx_const(input_b_scale_name, b_scale, TensorProto.FLOAT) + b_zp = get_onnx_const(input_b_zp_name, b_zero_point, TensorProto.INT8) + y_sc = get_onnx_const(output_scale_name, y_scale, TensorProto.FLOAT) + y_zp = get_onnx_const(output_zp_name, y_zero_point, TensorProto.INT8) + # Define input and output tensors + input_a_tensor = helper.make_tensor_value_info(input_a_name, TensorProto.INT8, [m, k]) + output_tensor = helper.make_tensor_value_info("out", TensorProto.INT8, [m, n]) + b = get_onnx_const(input_b_name, generate_normal_inputs([n, k], np.int8, 0, 32)) + y = get_onnx_const(output_name, generate_normal_inputs([n, ], np.int32, 0, 32)) + + + # Create QLinearAdd node + 
qlinear_add_node = onnx.helper.make_node( + "QGemm", + inputs=[input_a_name, input_a_scale_name, input_a_zp_name, + input_b_name, input_b_scale_name, input_b_zp_name, + output_name, + output_scale_name, output_zp_name], + outputs=["out"], + alpha=0.5, + transA=0, + transB=1, + domain="com.microsoft" + ) + + # Create graph + graph_name = "com.microsoft.QLinearAdd_test" + graph = helper.make_graph( + [qlinear_add_node], + graph_name, + [input_a_tensor], + [output_tensor], + initializer=[a_sc, a_zp, b, b_sc, b_zp, y, y_sc, y_zp], + ) + + # Create model + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("com.microsoft", 1),helper.make_opsetid("", 12)]) + model.ir_version = 8 # use stable onnx ir version + + # Save model + onnx.checker.check_model(model, True) + onnx.save(model, output_model_path) + + def tearDown(self): + # Delete the ONNX file after testing + if os.path.exists(self.model_path): + os.remove(self.model_path) + + def test_qlinearconv_inference(self): + session_options = ort.SessionOptions() + session_options.enable_gpnpu = False + print(f"Flag enable_gpnpu: {session_options.enable_gpnpu}") + + # Create an inference session + session1 = ort.InferenceSession(self.model_path, sess_options=session_options, providers=["CPUExecutionProvider"]) + print(f"Check flag enable_gpnpu: {session1.get_session_options().enable_gpnpu}") + + session_options.enable_gpnpu = True + session2 = ort.InferenceSession(self.model_path, sess_options=session_options, providers=["CPUExecutionProvider"]) + print(f"Check flag enable_gpnpu: {session2.get_session_options().enable_gpnpu}") + + # Get information about both inputs + input_a_info = session1.get_inputs()[0] + # input_b_info = session.get_inputs()[1] + + # print(f"Model input names: {input_a_info.name}") + # print(f"Model input shapes: {input_a_info.shape}") + + # Create random INT8 data matching the input shapes + shape_tuple_a = tuple(dim if isinstance(dim, int) else 1 for dim in input_a_info.shape) + + # 
Generate random data for both inputs + x_data_a = np.random.randint( + low=-128, high=128, size=shape_tuple_a, dtype=np.int8 + ) + + # Create input dictionary with both inputs + input_dict = { + input_a_info.name: x_data_a + } + + # Run inference + output_name1 = session1.get_outputs()[0].name + # print(f"Process ID: {os.getpid()}") + t1 = time.time() + output_data1 = session1.run([output_name1], input_dict)[0] + t2 = time.time() + output_name2 = session2.get_outputs()[0].name + # print(f"Process ID: {os.getpid()}") + t3 = time.time() + output_data2 = session2.run([output_name2], input_dict)[0] + t4 = time.time() + + print("CPU ", t2-t1) + print("GPNPU", t4-t3) + + # Print shapes and types + print(f"Input A data shape: {x_data_a.shape}, dtype: {x_data_a.dtype}") + print(f"Output data shape: {output_data1.shape}, dtype: {output_data1.dtype}") + # print("Output data (truncated):\n", output_data1.flatten()[:50], "...\n") + # print("Output data (truncated):\n", output_data2.flatten()[:50], "...\n") + # print("hi") + difference = output_data1 - output_data2 + max_diff = np.max(np.abs(difference)) + print(max_diff) + + difference = output_data1 - output_data2 + + max_diff = np.max(np.abs(difference)) + + # Check the output shape and type + self.assertEqual(output_data1.shape, (m,n)) + self.assertEqual(output_data1.dtype, np.int8) + self.assertLessEqual(max_diff, 1) + +if __name__ == '__main__': + unittest.main() diff --git a/onnxruntime/test/python/gpnpumode/test_qlinearadd.py b/onnxruntime/test/python/gpnpumode/test_qlinearadd.py new file mode 100644 index 0000000000..9a2ee5f0ba --- /dev/null +++ b/onnxruntime/test/python/gpnpumode/test_qlinearadd.py @@ -0,0 +1,159 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- +import unittest +import numpy as np +import onnx +import onnxruntime as ort +from onnx import helper, TensorProto +import os +import sys + +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +from helper import get_onnx_const, generate_normal_inputs + + +batch_size = 1 +h=128 +w=128 +channels=8 + +class TestQLinearAdd(unittest.TestCase): + def setUp(self): + # Create a specific ONNX model with a single QLinearConv node + self.model_path = "qlinearadd_model.onnx" + self.create_qlinearadd_model(self.model_path) + + def create_qlinearadd_model(self, output_model_path): + a_scale, a_zero_point = 0.2039528638124466, -14 + b_scale, b_zero_point = 0.003937007859349251, 0 + y_scale, y_zero_point = 0.1019764319062233, -6 + + # Create input shapes + input_shape = [batch_size, channels, h, w] + + # Define node names + input_a_name = "input_a" + input_a_scale_name = "input_a_scale" + input_a_zp_name = "input_a_zero_point" + input_b_name = "input_b" + input_b_scale_name = "input_b_scale" + input_b_zp_name = "input_b_zero_point" + output_scale_name = "output_scale" + output_zp_name = "output_zero_point" + output_name = "output" + + a_sc = get_onnx_const(input_a_scale_name, a_scale) + a_zp = get_onnx_const(input_a_zp_name, a_zero_point) + b_sc = get_onnx_const(input_b_scale_name, b_scale) + b_zp = get_onnx_const(input_b_zp_name, b_zero_point) + y_sc = get_onnx_const(output_scale_name, y_scale) + y_zp = get_onnx_const(output_zp_name, y_zero_point) + + # Create QLinearAdd node + qlinear_add_node = onnx.helper.make_node( + "QLinearAdd", + inputs=[ + input_a_name, input_a_scale_name, input_a_zp_name, + input_b_name, input_b_scale_name, input_b_zp_name, + output_scale_name, output_zp_name + ], + outputs=[output_name], + domain="com.microsoft" + ) + + # Define input and output tensors + input_a_tensor = helper.make_tensor_value_info(input_a_name, TensorProto.INT8, input_shape) 
+ b = get_onnx_const(input_b_name, generate_normal_inputs(input_shape, np.int8, 0, 32)) + input_b_tensor = helper.make_tensor_value_info(input_b_name, TensorProto.INT8, input_shape) + output_tensor = helper.make_tensor_value_info(output_name, TensorProto.INT8, input_shape) + + # Create graph + graph_name = "com.microsoft.QLinearAdd_test" + graph = helper.make_graph( + [qlinear_add_node], + graph_name, + [input_a_tensor], + [output_tensor], + initializer=[a_sc, a_zp, b, b_sc, b_zp, y_sc, y_zp], + ) + + # Create model + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("com.microsoft", 1),helper.make_opsetid("", 12)]) + model.ir_version = 8 # use stable onnx ir version + + # Save model + onnx.checker.check_model(model, True) + onnx.save(model, output_model_path) + + def tearDown(self): + # Delete the ONNX file after testing + if os.path.exists(self.model_path): + os.remove(self.model_path) + + def test_qlinearconv_inference(self): + session_options = ort.SessionOptions() + session_options.enable_gpnpu = False + print(f"Flag enable_gpnpu: {session_options.enable_gpnpu}") + + # Create an inference session + session1 = ort.InferenceSession(self.model_path, sess_options=session_options, providers=["CPUExecutionProvider"]) + print(f"Check flag enable_gpnpu: {session1.get_session_options().enable_gpnpu}") + + session_options.enable_gpnpu = True + session2 = ort.InferenceSession(self.model_path, sess_options=session_options, providers=["CPUExecutionProvider"]) + print(f"Check flag enable_gpnpu: {session2.get_session_options().enable_gpnpu}") + + # Get information about both inputs + input_a_info = session1.get_inputs()[0] + # input_b_info = session.get_inputs()[1] + + print(f"Model input names: {input_a_info.name}") + print(f"Model input shapes: {input_a_info.shape}") + + # Create random INT8 data matching the input shapes + shape_tuple_a = tuple(dim if isinstance(dim, int) else 1 for dim in input_a_info.shape) + + # Generate random data for both inputs + 
x_data_a = np.random.randint( + low=-128, high=128, size=shape_tuple_a, dtype=np.int8 + ) + + # Create input dictionary with both inputs + input_dict = { + input_a_info.name: x_data_a + } + + # Run inference + output_name1 = session1.get_outputs()[0].name + print(f"Process ID: {os.getpid()}") + output_data1 = session1.run([output_name1], input_dict)[0] + output_name2 = session2.get_outputs()[0].name + print(f"Process ID: {os.getpid()}") + output_data2 = session2.run([output_name2], input_dict)[0] + + # Print shapes and types + print(f"Input A data shape: {x_data_a.shape}, dtype: {x_data_a.dtype}") + # print(f"Output data shape: {output_data1.shape}, dtype: {output_data1.dtype}") + print("Output data (truncated):\n", output_data1.flatten()[:50], "...\n") + print("Output data (truncated):\n", output_data2.flatten()[:50], "...\n") + # print("hi") + difference = output_data1 - output_data2 + max_diff = np.max(np.abs(difference)) + print(max_diff) + + difference = output_data1 - output_data2 + + max_diff = np.max(np.abs(difference)) + + # Check the output shape and type + self.assertEqual(output_data1.shape, (batch_size, channels, h, w)) + self.assertEqual(output_data1.dtype, np.int8) + self.assertLessEqual(max_diff, 1) + +if __name__ == '__main__': + unittest.main() diff --git a/onnxruntime/test/python/gpnpumode/test_qlinearconv.py b/onnxruntime/test/python/gpnpumode/test_qlinearconv.py new file mode 100644 index 0000000000..0a35136f4f --- /dev/null +++ b/onnxruntime/test/python/gpnpumode/test_qlinearconv.py @@ -0,0 +1,366 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# --------------------------------------------------------------------------
+import unittest
+import numpy as np
+import onnx
+import onnxruntime as ort
+import os
+import sys
+import glob
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from helper import json_to_df, load_json, get_onnx_const, generate_normal_inputs
+
+x_scale, x_zp = 0.018654844, -14
+w_scale, w_zp = 0.044774472, 0
+y_scale, y_zp = 0.023529412, -30
+
+
+def canonicalize_conv_params(kernel, strides, padding, dilation):
+    """Expand scalar conv parameters to lists: kernel/strides/dilation -> 2 items, padding -> 4 items."""
+    kernel = [kernel, kernel] if not isinstance(kernel, (list, tuple)) else kernel
+    assert len(kernel) == 2, f"Unexpected kernel: {kernel!r}"
+
+    strides = [strides, strides] if not isinstance(strides, (list, tuple)) else strides
+    assert len(strides) == 2, f"Unexpected strides: {strides!r}"
+
+    # A scalar (or None, treated as 0) padding is applied to all four sides.
+    padding = (
+        [int(padding or 0), int(padding or 0), int(padding or 0), int(padding or 0)]
+        if not isinstance(padding, (list, tuple))
+        else padding
+    )
+    assert len(padding) == 4, f"Unexpected padding: {padding!r}"
+
+    dilation = [dilation, dilation] if not isinstance(dilation, (list, tuple)) else dilation
+    assert len(dilation) == 2, f"Unexpected dilation: {dilation!r}"
+
+    return kernel, strides, padding, dilation
+
+
+def conv_output_height_width(kernel, strides, padding, dilation, input_dims):
+    """Return (out_h, out_w) of a 2-D convolution over an input of spatial size `input_dims` = (h, w)."""
+    kernel, strides, padding, dilation = canonicalize_conv_params(
+        kernel, strides, padding, dilation
+    )
+    # Per axis: floor((in + pad_begin + pad_end - dilation * (k - 1) - 1) / stride) + 1.
+    # padding is indexed as [h_begin, w_begin, h_end, w_end] (the ONNX `pads` order).
+    out_h = (input_dims[0] + padding[0] + padding[2] - dilation[0] * (kernel[0] - 1) - 1) // strides[0] + 1
+    out_w = (input_dims[1] + padding[1] + padding[3] - dilation[1] * (kernel[1] - 1) - 1) // strides[1] + 1
+    return int(out_h), int(out_w)
+
+
+def get_onnx_linear_conv(
+    op_name,
+    inp,  # Should be a ValueInfo
+    oc,
+    kernel_shape,
+    strides=[1, 1],
+    auto_pad="NOTSET",
+    padding=None,
+    dilations=[1, 1],
+    groups=1,
+    x_scale=1.0,
+    x_zp=0,
+    w_scale=1.0,
+    w_zp=0,
+    y_scale=1.0,
+    y_zp=0,
+    with_bias=True,
+):
+    """Build a QLinearConv node that consumes ValueInfo `inp` and produces `oc` output channels.
+
+    Weights (int8) and bias (int32) are randomly generated initializers.
+    NOTE: `with_bias` is accepted but not consulted; a bias input is always attached.
+    Returns (node, output ValueInfo, list of initializers).
+    """
+    kernel_shape = (
+        [kernel_shape, kernel_shape]
+        if not isinstance(kernel_shape, (list, tuple))
+        else kernel_shape
+    )
+
+    # Without explicit padding, pad kernel_h // 2 on all four sides.
+    if padding is None and auto_pad == "NOTSET":
+        padding = [int(kernel_shape[0]) // 2] * 4
+
+    xs = get_onnx_const(f"{op_name}.x_scale", x_scale)
+    xz = get_onnx_const(f"{op_name}.x_zp", x_zp)
+    ws = get_onnx_const(f"{op_name}.w_scale", w_scale)
+    wz = get_onnx_const(f"{op_name}.w_zp", w_zp)
+    ys = get_onnx_const(f"{op_name}.y_scale", y_scale)
+    yz = get_onnx_const(f"{op_name}.y_zp", y_zp)
+
+    in_dims = [d.dim_value for d in inp.type.tensor_type.shape.dim]
+    # FIXME: Need to take into account padding and what not
+    ic = in_dims[1]
+    if padding:
+        out_height, out_width = conv_output_height_width(
+            kernel_shape, strides, padding, dilations, in_dims[-2:]
+        )
+    else:
+        out_height = in_dims[-2] // strides[-2]
+        out_width = in_dims[-1] // strides[-1]
+    # The batch dimension of the output is fixed to 1.
+    out_dims = [1, oc, out_height, out_width]
+
+    group_size = ic // groups
+    wt_dims = [oc, group_size, kernel_shape[0], kernel_shape[1]]
+    bias_dims = [oc]
+
+    # Random int8 weights and int32 bias initializers.
+    wt = get_onnx_const(f"{op_name}.wt", generate_normal_inputs(wt_dims, np.int8, 0, 32))
+    bias = get_onnx_const(
+        f"{op_name}.bias",
+        generate_normal_inputs(bias_dims, np.int32, 0, 256, -1024, 1024),
+        onnx.TensorProto.INT32,
+    )
+
+    out_name = f"{op_name}.output"
+    out = onnx.helper.make_tensor_value_info(out_name, onnx.TensorProto.INT8, out_dims)
+
+    # QLinearConv input order: x, x_scale, x_zp, w, w_scale, w_zp, y_scale, y_zp, bias.
+    names = [
+        inp.name,
+        f"{op_name}.x_scale",
+        f"{op_name}.x_zp",
+        f"{op_name}.wt",
+        f"{op_name}.w_scale",
+        f"{op_name}.w_zp",
+        f"{op_name}.y_scale",
+        f"{op_name}.y_zp",
+        f"{op_name}.bias",
+    ]
+    initializers = [xs, xz, wt, ws, wz, ys, yz, bias]
+
+    # Pass explicit `pads` only when auto_pad is "NOTSET"; otherwise pass `auto_pad` instead.
+    pad_attr = {"pads": padding} if auto_pad == "NOTSET" else {"auto_pad": auto_pad}
+    conv = onnx.helper.make_node(
+        "QLinearConv",
+        names,
+        [out_name],
+        name=op_name,
+        dilations=dilations,
+        group=groups,
+        strides=strides,
+        kernel_shape=kernel_shape,
+        **pad_attr,
+    )
+
+    return conv, out, initializers
+
+
+def get_onnx(
+    h,
+    w,
+    ic,
+    oc,
+    kernel_size,
+    strides,
+    padding=None,
+    dilation=[1, 1],
+    auto_pad="NOTSET",
+    pad_mode="constant",
+    include_pre_op=False,
+    groups=1,
+):
+    """Build a single-QLinearConv test model for an int8 input of shape (1, ic, h, w).
+
+    Optionally prepends a Clip(0, 6) node (`include_pre_op`) or, for `pad_mode == "reflect"`,
+    an explicit Pad node; in both cases the graph input is "inp.pre" instead of "inp".
+    Quantization parameters come from the module-level x/w/y scale and zero-point constants.
+    Returns the ONNX ModelProto.
+    """
+    kernel_size = (
+        [kernel_size, kernel_size] if not isinstance(kernel_size, (list, tuple)) else kernel_size
+    )
+
+    inp_dims = (1, ic, h, w)
+
+    ops = []
+    inits = []
+    if include_pre_op:
+        op_name = "inp_relu"
+        relu_min = get_onnx_const(f"{op_name}.min", 0, dtype=onnx.TensorProto.INT8)
+        relu_max = get_onnx_const(f"{op_name}.max", 6, dtype=onnx.TensorProto.INT8)
+        inits = [relu_min, relu_max]
+        inp_pre = onnx.helper.make_tensor_value_info("inp.pre", onnx.TensorProto.INT8, inp_dims)
+        inp = onnx.helper.make_tensor_value_info("inp", onnx.TensorProto.INT8, inp_dims)
+
+        relu6 = onnx.helper.make_node(
+            "Clip", ["inp.pre", f"{op_name}.min", f"{op_name}.max"], ["inp"], name=op_name
+        )
+        ops.append(relu6)
+    elif pad_mode == "reflect":
+        # Create a pad node ahead of the conv
+        op_name = "inp_pad"
+        pad_h, pad_w = kernel_size[0] // 2, kernel_size[1] // 2
+        inp_pre = onnx.helper.make_tensor_value_info("inp.pre", onnx.TensorProto.INT8, inp_dims)
+        # The Pad output GROWS by pad_h / pad_w on both sides of each spatial axis.
+        padded_dims = list(inp_dims)
+        padded_dims[-2] += 2 * pad_h
+        padded_dims[-1] += 2 * pad_w
+        inp = onnx.helper.make_tensor_value_info("inp", onnx.TensorProto.INT8, padded_dims)
+        # ONNX Pad layout: begin pads for (N, C, H, W) followed by end pads for (N, C, H, W).
+        inp_pads = get_onnx_const(
+            "inp.pads",
+            np.array([0, 0, pad_h, pad_w, 0, 0, pad_h, pad_w], dtype=np.int64),
+            onnx.TensorProto.INT64,
+        )
+        inits = [inp_pads]
+        pad = onnx.helper.make_node(
+            "Pad", ["inp.pre", "inp.pads"], ["inp"], name=op_name, mode=pad_mode
+        )
+        # Padding is done by the Pad node, so the conv itself must not pad again.
+        padding = [0, 0, 0, 0]
+        ops.append(pad)
+    else:
+        inp = onnx.helper.make_tensor_value_info("inp", onnx.TensorProto.INT8, inp_dims)
+
+    conv, outp, conv_inits = get_onnx_linear_conv(
+        "conv_0",
+        inp,
+        oc,
+        kernel_size,
+        strides,
+        auto_pad=auto_pad,
+        padding=padding,
+        dilations=dilation,
+        groups=groups,
+        x_scale=x_scale,
+        x_zp=x_zp,
+        w_scale=w_scale,
+        w_zp=w_zp,
+        y_scale=y_scale,
+        y_zp=y_zp,
+    )
+    ops.append(conv)
+    inits = inits + conv_inits
+
+    graph_input = (inp_pre if include_pre_op or pad_mode == "reflect" else inp,)
+    graph = onnx.helper.make_graph(
+        ops,
+        "test_conv",
+        graph_input,
+        [outp],
+        initializer=inits,
+    )
+
+    model = onnx.helper.make_model(
+        graph,
+        opset_imports=[
+            onnx.helper.make_opsetid("com.microsoft", 1),
+            onnx.helper.make_opsetid("", 12),
+        ],
+    )
+    return model
+
+
+class TestQLinearConv(unittest.TestCase):
+    """Compare QLinearConv outputs and kernel timings with `enable_gpnpu` off vs. on."""
+    def setUp(self):
+        # Create a specific ONNX model with a single QLinearConv node
+        self.model_path = "qlinearconv_model.onnx"
+        self.create_qlinearconv_model(self.model_path)
+        self.cpu_jsons = []
+        self.gpnpu_jsons = []
+
+    def create_qlinearconv_model(self, model_path):
+        """Save a (1, 8, 128, 128) -> 64-channel, 3x3, stride-1 QLinearConv model to `model_path`."""
+        h = 128
+        w = 128
+        ic = 8
+        oc = 64
+        kernel_size = 3
+        strides = [1, 1]
+        model_def = get_onnx(h, w, ic, oc, kernel_size, strides)
+        onnx.save(model_def, model_path)
+
+    def tearDown(self):
+        # Delete the ONNX file and the profiling JSON files after testing
+        if os.path.exists(self.model_path):
+            os.remove(self.model_path)
+        # Remove only the profiles this test recorded; a bare "*.json" glob hits unrelated files.
+        for json_file in self.cpu_jsons + self.gpnpu_jsons:
+            if os.path.exists(json_file):
+                os.remove(json_file)
+
+    def performance_and_accuracy_test(self, num_iterations=100):
+        """Run both sessions on the same random int8 input and require outputs within 1 LSB."""
+        for _ in range(num_iterations):
+            # CPU Session
+            session_options_cpu = ort.SessionOptions()
+            session_options_cpu.enable_gpnpu = False
+            session_options_cpu.enable_profiling = True
+            session_options_cpu.profile_file_prefix = "cpu"
+            session_cpu = ort.InferenceSession(
+                self.model_path,
+                sess_options=session_options_cpu,
+                providers=["CPUExecutionProvider"]
+            )
+
+            # GPNPU Session
+            session_options_gpnpu = ort.SessionOptions()
+            session_options_gpnpu.enable_gpnpu = True
+            session_options_gpnpu.enable_profiling = True
+            session_options_gpnpu.profile_file_prefix = "gpnpu"
+            session_gpnpu = ort.InferenceSession(
+                self.model_path,
+                sess_options=session_options_gpnpu,
+                providers=["CPUExecutionProvider"]
+            )
+
+            # Prepare input
+            input_a_info = session_cpu.get_inputs()[0]
+            shape_tuple_a = tuple(dim if isinstance(dim, int) else 1 for dim in input_a_info.shape)
+            x_data_a = np.random.randint(
+                low=-128, high=128, size=shape_tuple_a, dtype=np.int8
+            )
+            input_dict = {input_a_info.name: x_data_a}
+
+            # Time and run CPU inference
+            output_cpu = session_cpu.run(
+                [session_cpu.get_outputs()[0].name],
+                input_dict
+            )[0]
+            json_name_cpu = session_cpu.end_profiling()
+            self.cpu_jsons.append(json_name_cpu)
+
+            # Time and run GPNPU inference
+            output_gpnpu = session_gpnpu.run(
+                [session_gpnpu.get_outputs()[0].name],
+                input_dict
+            )[0]
+            json_name_gpnpu = session_gpnpu.end_profiling()
+            self.gpnpu_jsons.append(json_name_gpnpu)
+
+            # Widen to int32 first: an int8 subtraction wraps around and can hide large mismatches.
+            max_diff = np.max(np.abs(output_cpu.astype(np.int32) - output_gpnpu.astype(np.int32)))
+            self.assertLessEqual(max_diff, 1)
+
+    def test_performance_and_accuracy(self):
+        # Run test
+        self.performance_and_accuracy_test(num_iterations=1000)
+        self.json_time_profiling()
+
+    def json_time_profiling(self):
+        """Print mean ± std of the QLinearConv kernel duration for both sessions."""
+        def get_time(jsons):
+            times = []
+            for json_path in jsons:
+                cpu_df, gpu_df = json_to_df(load_json(json_path), lambda x: True)
+                times.append(cpu_df[cpu_df['name'] == 'QLinearConv']['duration'].values[0])
+            return np.mean(np.array(times)), np.std(np.array(times))
+        cpu_mean_time, cpu_std_time = get_time(self.cpu_jsons)
+        gpnpu_mean_time, gpnpu_std_time = get_time(self.gpnpu_jsons)
+        # ONNX Runtime profile events report "dur" in microseconds, not milliseconds.
+        print(f"CPU Time: {cpu_mean_time:8.3f} ± {cpu_std_time:.3f} us")
+        print(f"GPNPU Time: {gpnpu_mean_time:8.3f} ± {gpnpu_std_time:.3f} us")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/onnxruntime/test/python/gpnpumode/test_qlineargap.py b/onnxruntime/test/python/gpnpumode/test_qlineargap.py
new file mode 100644
index 0000000000..3eba389272
--- /dev/null
+++ b/onnxruntime/test/python/gpnpumode/test_qlineargap.py
@@ -0,0 +1,157 @@
+#!/usr/bin/env python
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See License.txt in the project root for
+# license information.
+# --------------------------------------------------------------------------
+import unittest
+import numpy as np
+import onnx
+import onnxruntime as ort
+from onnx import helper, TensorProto
+import os
+import sys
+import time
+import glob
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from helper import json_to_df, load_json
+
+N, C, H, W = 1, 2048, 7, 7
+
+class TestQGemm(unittest.TestCase):
+    """Despite its name, exercises QLinearGlobalAveragePool with `enable_gpnpu` off vs. on."""
+    def setUp(self):
+        # Create a specific ONNX model with a single QLinearGlobalAveragePool node
+        self.model_path = "qlineargap.onnx"
+        self.create_qgemm_model(self.model_path)
+        self.cpu_jsons = []
+        self.gpnpu_jsons = []
+
+    def create_qgemm_model(self, output_model_path):
+        """Save an int8 (N, C, H, W) -> (N, C, 1, 1) QLinearGlobalAveragePool model to `output_model_path`."""
+        # Quantization parameters for X and Y. The zero points are stored as INT8 tensors, so
+        # they must lie in [-128, 127]; the previous value 128 only fits a uint8 tensor.
+        x_scale = 0.1
+        x_zero_point = 0
+        y_scale = 0.2
+        y_zero_point = 0
+
+        # Create tensor for input X (quantized data)
+        X = helper.make_tensor_value_info("X", TensorProto.INT8, [N, C, H, W])
+        Y = helper.make_tensor_value_info("Y", TensorProto.INT8, [N, C, 1, 1])
+
+        # Step 2: Create the QLinearGlobalAveragePool node
+        node = helper.make_node(
+            'QLinearGlobalAveragePool',  # Operator name
+            inputs=['X', 'x_scale', 'x_zero_point', 'y_scale', 'y_zero_point'],  # Input tensors
+            outputs=['Y'],  # Output tensor
+            channels_last=0,  # Attribute indicating whether the channels are last in the shape (1 = True)
+            domain="com.microsoft"
+        )
+
+        # Step 3: Define the scale and zero point tensors for input/output
+        x_scale_tensor = helper.make_tensor('x_scale', TensorProto.FLOAT, [1], [x_scale])
+        x_zero_point_tensor = helper.make_tensor('x_zero_point', TensorProto.INT8, [1], [x_zero_point])
+        y_scale_tensor = helper.make_tensor('y_scale', TensorProto.FLOAT, [1], [y_scale])
+        y_zero_point_tensor = helper.make_tensor('y_zero_point', TensorProto.INT8, [1], [y_zero_point])
+
+        # Step 4: Create the graph (composed of the node and input/output tensors)
+        graph = helper.make_graph(
+            [node],  # List of nodes (here, just our QLinearGlobalAveragePool node)
+            'QLinearGlobalAveragePoolModel',  # Name of the graph
+            [X],  # Inputs
+            [Y],
+            initializer=[x_scale_tensor, x_zero_point_tensor, y_scale_tensor, y_zero_point_tensor]
+        )
+
+        # Step 5: Create the model (version 1)
+        model = helper.make_model(
+            graph,
+            producer_name='onnx-example',
+            opset_imports=[helper.make_opsetid("com.microsoft", 1), helper.make_opsetid('', 12)]  # Operator set version
+        )
+
+        # Step 6: Save the model to file
+        onnx.save(model, output_model_path)
+
+    def tearDown(self):
+        # Delete the ONNX file and only the profiling JSON files this test recorded
+        if os.path.exists(self.model_path):
+            os.remove(self.model_path)
+        for json_file in self.cpu_jsons + self.gpnpu_jsons:
+            if os.path.exists(json_file):
+                os.remove(json_file)
+
+    def performance_and_accuracy_test(self, num_iterations=100):
+        """Run both sessions on the same random int8 input and require outputs within 1 LSB."""
+        for _ in range(num_iterations):
+            # CPU Session
+            session_options_cpu = ort.SessionOptions()
+            session_options_cpu.enable_gpnpu = False
+            session_options_cpu.enable_profiling = True
+            session_options_cpu.profile_file_prefix = "cpu"
+            session_cpu = ort.InferenceSession(
+                self.model_path,
+                sess_options=session_options_cpu,
+                providers=["CPUExecutionProvider"]
+            )
+
+            # GPNPU Session
+            session_options_gpnpu = ort.SessionOptions()
+            session_options_gpnpu.enable_gpnpu = True
+            session_options_gpnpu.enable_profiling = True
+            session_options_gpnpu.profile_file_prefix = "gpnpu"
+            session_gpnpu = ort.InferenceSession(
+                self.model_path,
+                sess_options=session_options_gpnpu,
+                providers=["CPUExecutionProvider"]
+            )
+
+            # Prepare input
+            input_a_info = session_cpu.get_inputs()[0]
+            shape_tuple_a = tuple(dim if isinstance(dim, int) else 1 for dim in input_a_info.shape)
+            x_data_a = np.random.randint(low=-128, high=128, size=shape_tuple_a, dtype=np.int8)
+            input_dict = {input_a_info.name: x_data_a}
+
+            # Time and run CPU inference
+            output_cpu = session_cpu.run(
+                [session_cpu.get_outputs()[0].name],
+                input_dict
+            )[0]
+            json_name_cpu = session_cpu.end_profiling()
+            self.cpu_jsons.append(json_name_cpu)
+
+            # Time and run GPNPU inference
+            output_gpnpu = session_gpnpu.run(
+                [session_gpnpu.get_outputs()[0].name],
+                input_dict
+            )[0]
+            json_name_gpnpu = session_gpnpu.end_profiling()
+            self.gpnpu_jsons.append(json_name_gpnpu)
+
+            # Widen to int32 first: an int8 subtraction wraps around and can hide large mismatches.
+            max_diff = np.max(np.abs(output_cpu.astype(np.int32) - output_gpnpu.astype(np.int32)))
+            self.assertLessEqual(max_diff, 1)
+
+    def test_performance_and_accuracy(self):
+        # Run test
+        self.performance_and_accuracy_test(num_iterations=1)
+        self.json_time_profiling()
+
+    def json_time_profiling(self):
+        """Print mean ± std kernel duration in microseconds (the unit of ORT profile "dur")."""
+        def get_time(jsons):
+            times = []
+            for json_path in jsons:
+                cpu_df, gpu_df = json_to_df(load_json(json_path), lambda x: True)
+                times.extend(cpu_df[cpu_df['name'] == 'QLinearGlobalAveragePool']['duration'].values)
+            return np.mean(np.array(times)), np.std(np.array(times))
+        cpu_mean_time, cpu_std_time = get_time(self.cpu_jsons)
+        gpnpu_mean_time, gpnpu_std_time = get_time(self.gpnpu_jsons)
+        print(f"CPU Time: {cpu_mean_time:8.3f} ± {cpu_std_time:.3f} us")
+        print(f"GPNPU Time: {gpnpu_mean_time:8.3f} ± {gpnpu_std_time:.3f} us")
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/onnxruntime/test/python/gpnpumode/test_resnet50.py b/onnxruntime/test/python/gpnpumode/test_resnet50.py
new file mode 100644
index 0000000000..1232e279e0
--- /dev/null
+++ b/onnxruntime/test/python/gpnpumode/test_resnet50.py
@@ -0,0 +1,67 @@
+import numpy as np
+import onnxruntime as ort
+import time
+import os
+import sys
+# from tvm.contrib.epu.chimera_job.chimera_job import ChimeraJob
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from helper import json_to_df, load_json
+
+print(np.__version__)
+def run_ort(x_data, flag, onnx_file_path="resnet_50.onnx"):
+    """Run `onnx_file_path` once on the CPU EP with enable_gpnpu=`flag`; return the flattened first output."""
+    # Create an inference session
+    session_options = ort.SessionOptions()
+    session_options.enable_gpnpu = flag
+    # session_options.enable_profiling = True
+    session_options.intra_op_num_threads = 16
+    session_options.profile_file_prefix = str(16)+"gpnpu"
+    session = ort.InferenceSession(onnx_file_path, sess_options = session_options, providers=["CPUExecutionProvider"])
+    # Inspect the model's input to get the name and shape
+    inp_info = session.get_inputs()[0]
+    input_name = inp_info.name
+    input_shape = inp_info.shape  # e.g. [1, 8, 128, 128]
+    # print(f"Model input name: {input_name}")- 377
+    # print(f"Model input shape: {input_shape}")
+
+    # If any dimension is None or 'batch size' is variable, adjust accordingly
+    shape_tuple = tuple(dim if isinstance(dim, int) else 1 for dim in input_shape)
+
+    # Run inference
+    output_name = session.get_outputs()[0].name
+    t1 = time.time()
+    output_data = session.run([output_name], {input_name: x_data})[0]
+    # name = session.end_profiling()
+    t2 = time.time()
+
+    # print(t2-t1)
+    # Print shapes and types
+    # print(f"Input data shape: {x_data.shape}, dtype: {x_data.dtype}")
+    # print(f"Output data shape: {output_data.shape}, dtype: {output_data.dtype}")
+    # print("Output data (truncated):\n", output_data.flatten()[:50], "...\n")
+    return output_data.flatten()
+
+if __name__ == "__main__":
+    # total = 0
+    # n = 1
+    # name = ""
+    # for num in range(4, 20, 4):
+    #     total = 0
+    #     for i in range(n):
+    #         t, name = run_qlinearconv_model(num)
+    #         total += t
+
+
+    # cpu_df, gpu_df = json_to_df(load_json(name), lambda x: True)
+    # print(str(num) + " - " + str(round(total/n*1000)) + " " + str(round(np.sum(cpu_df["duration"])/1000)))
+    x_data = np.random.rand(1, 3, 224, 224).astype(np.float32)
+    print(x_data)
+    ort_cpu = run_ort(x_data, False)
+    ort_gpnpu = run_ort(x_data, True)
+    np.save("ort_cpu.npy", ort_cpu)
+    np.save("ort_gpnpu.npy", ort_gpnpu)
+
+    # output_tvm = run_tvm(x_data)
+    # Max absolute element-wise difference: abs() must wrap the subtraction, not just ort_cpu.
+    print(np.max(np.abs(ort_cpu - ort_gpnpu)))
diff --git a/onnxruntime/test/python/helper.py b/onnxruntime/test/python/helper.py
index 2a2c3fc9b4..1ce79cc2cf 100644
--- a/onnxruntime/test/python/helper.py
+++ b/onnxruntime/test/python/helper.py
@@ -1,5 +1,9 @@
 import os
-
+import fnmatch
+import json
+import numpy as np
+import onnx
+import pandas as pd
 
 def get_name(name):
     if os.path.exists(name):
@@ -13,3 +17,153 @@ def get_name(name):
         if os.path.exists(res):
             return res
     raise FileNotFoundError(f"Unable to find '{name}' or '{rel}' or '{res}'")
+
+def get_onnx_const(name, val, dtype=None):
+    """Create an ONNX TensorProto named `name` from a NumPy array or a Python scalar.
+
+    For an ndarray, the element type and dims are taken from the array and `dtype` is ignored.
+    For a scalar, `dtype` defaults to INT8 for Python ints and FLOAT otherwise; the tensor has no dims.
+    """
+    if isinstance(val, np.ndarray):
+        # onnx.mapping is deprecated (ONNX 1.13+); use the helper conversion instead.
+        dtype = onnx.helper.np_dtype_to_tensor_dtype(val.dtype)
+        dims = val.shape
+    else:
+        if not dtype:
+            dtype = onnx.TensorProto.INT8 if isinstance(val, int) else onnx.TensorProto.FLOAT
+        dims = ()
+        val = [val]
+
+    return onnx.helper.make_tensor(name=name, data_type=dtype, dims=dims, vals=val)
+
+def generate_normal_inputs(shape, dtype, mu=0, sigma=32, a_min=-127, a_max=127):
+    """Sample N(mu, sigma) values of `shape`, round, saturate to [a_min, a_max], then cast to `dtype`."""
+    # Clip BEFORE the cast: casting first lets out-of-range samples wrap (e.g. 200 -> -56 as int8),
+    # and the later clip would then keep the wrapped value instead of saturating.
+    return np.clip(np.rint(np.random.normal(mu, sigma, shape)), a_min, a_max).astype(dtype)
+
+def load_json(profile_path):
+    """Load a JSON trace file; accepts a bare event list or a dict holding it under "traceEvents"."""
+    with open(profile_path, encoding="utf-8") as file_obj:
+        data = json.load(file_obj)
+    if isinstance(data, dict):
+        data = data["traceEvents"]
+    return data
+
+def _shape_to_string(shape):
+    """Format a list of single-entry {type: dims} dicts as "type(d0xd1),type(d0)"."""
+    res = ""
+    for dict_obj in shape:
+        if len(dict_obj) > 1:
+            raise ValueError("Unhandled type in _shape_to_string()")
+        key = next(iter(dict_obj.keys()))
+        value = next(iter(dict_obj.values()))
+        if len(res) != 0:
+            res += ","
+        res += f'{key}({"x".join(str(v) for v in value)})'
+    return res
+
+def json_to_df(data, filter_matcher):
+    """Split trace events into per-op CPU rows and GPU kernel rows.
+
+    Events lacking "cat", "dur" or "args" are skipped. Events with cat == "Kernel" become GPU rows;
+    other events are kept only if their name ends with "kernel_time" and become CPU rows.
+    Returns (cpu_df, gpu_df), each with a constant "count" column of 1.
+    """
+    cpu_entries = []
+    gpu_entries = []
+
+    most_recent_kernel_launch_event = None
+    num_missing_kernel_launch_events = 0
+    total_kernel_events = 0
+
+    for item in data:
+        cat = item.get("cat")
+        if cat is None:
+            continue
+        dur = item.get("dur")
+        if dur is None:
+            continue
+        arg = item.get("args")
+        if arg is None:
+            continue
+        op_name = arg.get("op_name")
+
+        name = item["name"]
+
+        if not filter_matcher(name) and op_name is not None and not filter_matcher(op_name):
+            continue
+
+        if cat != "Kernel" and not name.endswith("kernel_time"):
+            continue
+        if name.endswith("kernel_time"):
+            most_recent_kernel_launch_event = item
+
+        block_x = arg.get("block_x", -1)
+        block_y = arg.get("block_y", -1)
+        block_z = arg.get("block_z", -1)
+        grid_x = arg.get("grid_x", -1)
+        grid_y = arg.get("grid_y", -1)
+        grid_z = arg.get("grid_z", -1)
+
+        if cat == "Kernel":
+            gpu_entries.append(
+                {
+                    "name": name,
+                    "duration": dur,
+                    "dimensions": f"b{block_x}x{block_y}x{block_z},g{grid_x}x{grid_y}x{grid_z}",
+                    "op_name": op_name,
+                    "input_type_shape": (
+                        _shape_to_string(most_recent_kernel_launch_event["args"]["input_type_shape"])
+                        if most_recent_kernel_launch_event is not None
+                        else "unknown"
+                    ),
+                }
+            )
+            total_kernel_events += 1
+            if gpu_entries[-1]["input_type_shape"] == "unknown" and "hipMem" not in gpu_entries[-1]["name"]:
+                num_missing_kernel_launch_events += 1
+        else:
+            cpu_entries.append(
+                {
+                    "name": item["args"]["op_name"],
+                    "duration": dur,
+                    "input_type_shape": _shape_to_string(item["args"]["input_type_shape"]),
+                    "output_type_shape": _shape_to_string(item["args"]["output_type_shape"]),
+                }
+            )
+
+    if num_missing_kernel_launch_events > 0:
+        print(
+            f"WARNING: Could not resolve shapes for {num_missing_kernel_launch_events} of {total_kernel_events} kernels."
+        )
+
+    cpu_df = pd.DataFrame(cpu_entries)
+    gpu_df = pd.DataFrame(gpu_entries)
+    cpu_df["count"] = 1
+    gpu_df["count"] = 1
+    return cpu_df, gpu_df
+
+def construct_filter_matcher(args):
+    """Return a predicate built from `args.filter` (a list of plain names and/or fnmatch patterns).
+
+    With no filters every item matches; otherwise an item matches if it equals a plain name
+    or matches any pattern that contains a wildcard character (*, ?, [ or ]).
+    """
+    if args.filter is None or len(args.filter) == 0:
+        return lambda x: True
+    filter_list = args.filter
+    concrete_filter_set = set()
+    fnmatch_filter_set = set()
+    for pattern in filter_list:
+        if "*" in pattern or "?" in pattern or "[" in pattern or "]" in pattern:
+            fnmatch_filter_set.add(pattern)
+        else:
+            concrete_filter_set.add(pattern)
+
+    def _match_item(item):
+        if item in concrete_filter_set:
+            return True
+        return any(fnmatch.fnmatch(item, pattern) for pattern in fnmatch_filter_set)
+
+    return _match_item
diff --git a/onnxruntime/test/python/quantization/calibration.cache b/onnxruntime/test/python/quantization/calibration.cache
new file mode 100644
index 0000000000..592d63ff76
--- /dev/null
+++ b/onnxruntime/test/python/quantization/calibration.cache
@@ -0,0 +1 @@
+td 1.100000023841858
diff --git a/onnxruntime/test/python/quantization/calibration.flatbuffers b/onnxruntime/test/python/quantization/calibration.flatbuffers
new file mode 100644
index 0000000000..9bbe626650
Binary files /dev/null and b/onnxruntime/test/python/quantization/calibration.flatbuffers differ
diff --git a/onnxruntime/test/python/quantization/calibration.json b/onnxruntime/test/python/quantization/calibration.json
new file mode 100644
index 0000000000..30a0221f0d
--- /dev/null
+++ b/onnxruntime/test/python/quantization/calibration.json
@@ -0,0 +1 @@
+{"CLS": "TensorsData", "data": {"td": {"lowest": {"data": [0.10000000149011612], "dtype": "float32", "CLS": "numpy.array"}, "highest": {"data": [1.100000023841858], "dtype": "float32", "CLS": "numpy.array"}, "CLS": "TensorData"}}, "calibration_method": {"CLS": "CalibrationMethod", "value": "CalibrationMethod.MinMax"}}
\ No newline at end of file
diff --git a/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc b/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc
index ebef441350..b63c35ad5f 100644
--- a/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc
+++ b/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc
@@ -1,6 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
- +#include #define ORT_API_MANUAL_INIT #include "onnxruntime_cxx_api.h" #undef ORT_API_MANUAL_INIT @@ -322,6 +322,117 @@ struct AttrTesterStringOp : Ort::CustomOpBase& input, + const Ort::Custom::Tensor& lut, + Ort::Custom::Tensor& output) { + auto input_shape = input.Shape(); + auto input_data = input.Data(); + auto lut_data = lut.Data(); + auto output_data = output.Allocate(input_shape); + int32_t ind = 0; + for (int64_t i = 0; i < input.NumberOfElement(); ++i) { + ind = static_cast(input_data[i]) + 128; + output_data[i] = lut_data[ind]; + } +} + + +struct LookupTableKernel { + std::vector lut_values; + + // Initialize from kernel info - this is called during CreateKernel + void Init(const OrtApi* api, const OrtKernelInfo* info) { + // Get default allocator + OrtAllocator* allocator; + CUSTOM_ENFORCE(api->GetAllocatorWithDefaultOptions(&allocator) == nullptr, + "Failed to get default allocator"); + + // Get the lookup table tensor attribute + OrtValue* lut_tensor = nullptr; + CUSTOM_ENFORCE(api->KernelInfoGetAttribute_tensor(info, "lut", allocator, &lut_tensor) == nullptr, + "Failed to get lut tensor attribute"); + + OrtTensorTypeAndShapeInfo* shape_info; + CUSTOM_ENFORCE(api->GetTensorTypeAndShape(lut_tensor, &shape_info) == nullptr, + "Failed to get tensor shape info"); + + ONNXTensorElementDataType tensor_type; + CUSTOM_ENFORCE(api->GetTensorElementType(shape_info, &tensor_type) == nullptr, + "Failed to get tensor element type"); + + size_t num_elements; + CUSTOM_ENFORCE(api->GetTensorShapeElementCount(shape_info, &num_elements) == nullptr, + "Failed to get tensor element count"); + + CUSTOM_ENFORCE(num_elements == 256, "Lookup table must contain exactly 256 values"); + + lut_values.resize(256); + + if (tensor_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8) { + const int8_t* tensor_data; + CUSTOM_ENFORCE(api->GetTensorMutableData(lut_tensor, (void**)&tensor_data) == nullptr, + "Failed to get tensor data"); + + for (size_t i = 0; i < 256; ++i) { + lut_values[i] =
tensor_data[i]; + } + } + else if (tensor_type == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64) { + const int64_t* tensor_data; + CUSTOM_ENFORCE(api->GetTensorMutableData(lut_tensor, (void**)&tensor_data) == nullptr, + "Failed to get tensor data"); + + for (size_t i = 0; i < 256; ++i) { + lut_values[i] = static_cast(tensor_data[i]); + } + } + else { + CUSTOM_ENFORCE(false, "Unsupported tensor type for LUT attribute"); + } + + api->ReleaseTensorTypeAndShapeInfo(shape_info); + api->ReleaseValue(lut_tensor); + } + + void Compute(OrtKernelContext* context) { + Ort::KernelContext ctx(context); + auto input = ctx.GetInput(0); + const auto* input_data = input.GetTensorData(); + auto dimensions = input.GetTensorTypeAndShapeInfo().GetShape(); + auto output = ctx.GetOutput(0, dimensions); + auto* output_data = output.GetTensorMutableData(); + const size_t size = output.GetTensorTypeAndShapeInfo().GetElementCount(); + + for (size_t i = 0; i < size; i++) { + uint8_t index = static_cast(input_data[i]) + 128; + output_data[i] = lut_values[index]; + } + } +}; + +struct LookupTableOp : Ort::CustomOpBase { + void* CreateKernel(const OrtApi& api, const OrtKernelInfo* info) const { + auto kernel = std::make_unique(); + kernel->Init(&api, info); + return kernel.release(); + } + + const char* GetName() const { return "LookupTable"; } + const char* GetExecutionProviderType() const { return "CPUExecutionProvider"; } + size_t GetInputTypeCount() const { return 1; } + ONNXTensorElementDataType GetInputType(size_t) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; } + size_t GetOutputTypeCount() const { return 1; } + ONNXTensorElementDataType GetOutputType(size_t) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8; } + + static Ort::Status InferOutputShape(Ort::ShapeInferContext& ctx) { + auto input_shape = ctx.GetInputShape(0); + ctx.SetOutputShape(0, input_shape); + return Ort::Status{nullptr}; + } +}; + + + void RegisterOps(Ort::CustomOpDomain& domain) { static const std::unique_ptr
c_CustomOpOne{Ort::Custom::CreateLiteCustomOp("CustomOpOne", "CPUExecutionProvider")}; static const std::unique_ptr c_CustomOpTwo{Ort::Custom::CreateLiteCustomOp("CustomOpTwo", "CPUExecutionProvider", KernelTwo)}; @@ -337,6 +448,8 @@ void RegisterOps(Ort::CustomOpDomain& domain) { static const std::unique_ptr c_AtterTesterIntFloat{Ort::Custom::CreateLiteCustomOp("AttrTesterIntFloat", "CPUExecutionProvider", AttrTesterIntFloatCompute, AttrTesterIntFloatShapeInfer)}; static const AttrTesterStringOp c_AtterTesterString; + static const LookupTableOp c_LookupTable; + #if !defined(DISABLE_FLOAT8_TYPES) static const CustomOpOneFloat8 c_CustomOpOneFloat8; static const std::unique_ptr c_FilterFloat8{Ort::Custom::CreateLiteCustomOp("FilterFloat8", "CPUExecutionProvider", FilterFloat8)}; @@ -354,6 +467,8 @@ void RegisterOps(Ort::CustomOpDomain& domain) { domain.Add(c_CopyTensorArrayCombined.get()); domain.Add(c_AtterTesterIntFloat.get()); domain.Add(&c_AtterTesterString); + domain.Add(&c_LookupTable); + #if !defined(DISABLE_FLOAT8_TYPES) domain.Add(&c_CustomOpOneFloat8);
diff --git a/validation/qlinearconv_model.onnx b/validation/qlinearconv_model.onnx
new file mode 100644
index 0000000000..2a474788eb
Binary files /dev/null and b/validation/qlinearconv_model.onnx differ
diff --git a/validation/validate.py b/validation/validate.py
new file mode 100644
index 0000000000..e668797297
--- /dev/null
+++ b/validation/validate.py
@@ -0,0 +1,81 @@
+import numpy as np
+import onnxruntime as ort
+import time
+import os
+import sys
+from tvm.contrib.epu.chimera_job.chimera_job import ChimeraJob
+
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+
+print(np.__version__)
+def run_ort(flag, x_data, onnx_file_path="/Users/maggies/Desktop/resnet50_512_1024_int8_opset11.onnx"):
+    """Run `onnx_file_path` once on the CPU EP with enable_gpnpu=`flag` (profiling on); return the flattened first output."""
+    # Create an inference session
+    session_options = ort.SessionOptions()
+    session_options.enable_gpnpu = flag
+    session_options.enable_profiling = True
+    session_options.intra_op_num_threads = 16
+    session_options.profile_file_prefix = str(16)+"gpnpu"
+    session = ort.InferenceSession(onnx_file_path, sess_options = session_options, providers=["CPUExecutionProvider"])
+    # Inspect the model's input to get the name and shape
+    inp_info = session.get_inputs()[0]
+    input_name = inp_info.name
+    input_shape = inp_info.shape  # e.g. [1, 8, 128, 128]
+    # print(f"Model input name: {input_name}")- 377
+    # print(f"Model input shape: {input_shape}")
+
+    # If any dimension is None or 'batch size' is variable, adjust accordingly
+    shape_tuple = tuple(dim if isinstance(dim, int) else 1 for dim in input_shape)
+
+    # Run inference
+    output_name = session.get_outputs()[0].name
+    t1 = time.time()
+    output_data = session.run([output_name], {input_name: x_data})[0]
+    name = session.end_profiling()
+    t2 = time.time()
+
+    # print(t2-t1)
+    # Print shapes and types
+    # print(f"Input data shape: {x_data.shape}, dtype: {x_data.dtype}")
+    # print(f"Output data shape: {output_data.shape}, dtype: {output_data.dtype}")
+    # print("Output data (truncated):\n", output_data.flatten()[:50], "...\n")
+
+    return output_data.flatten()
+
+def run_tvm(img_input, model_path):
+    """Compile `model_path` with ChimeraJob, run it on `img_input`, and return the flattened output named '495'."""
+    # Execute retina net with CGC
+    cgc_job = ChimeraJob(model_p=model_path, macs_per_pe=8, quiet_iss=False)
+    cgc_job.analyze_network()
+    cgc_job.compile(quiet=True)
+    print("compile finished!")
+
+    outputs = cgc_job.run_inference_harness(inputs={"input": img_input})
+    # return outputs
+    return outputs['495'].flatten()
+
+if __name__ == "__main__":
+    # total = 0
+    # n = 1
+    # name = ""
+    # for num in range(4, 20, 4):
+    #     total = 0
+    #     for i in range(n):
+    #         t, name = run_qlinearconv_model(num)
+    #         total += t
+
+
+    # cpu_df, gpu_df = json_to_df(load_json(name), lambda x: True)
+    # print(str(num) + " - " + str(round(total/n*1000)) + " " + str(round(np.sum(cpu_df["duration"])/1000)))
+    # rand() yields floats in [0, 1), which all truncate to 0 as int8; draw real int8 samples instead.
+    x_data = np.random.randint(-128, 128, size=(1, 8, 128, 128), dtype=np.int8)
+    # print(x_data)
+    output_ort_gpnpu = run_ort(True, x_data, "qlinearconv_model.onnx")
+    output_ort_cpu = run_ort(False, x_data, "qlinearconv_model.onnx")
+    np.save("gpnpu.npy", output_ort_gpnpu)
+    np.save("cpu.npy", output_ort_cpu)
+    output_tvm = run_tvm(x_data, "qlinearconv_model.onnx")
+    print(output_tvm)
+    # run_tvm returns a flattened ndarray, which has no .keys(); report its shape instead.
+    print(output_tvm.shape)
+    np.save("tvm.npy", output_tvm)