Add opt-in op-name tracing (IPEX_DISP_OP) and profiling hooks (IPEX_PROFILE_OP)

EikanWang · EikanWang · commit 85c8f18813c0 · 2020-09-02T20:44:09.000-07:00
diff --git a/cmake/CPU.cmake b/cmake/CPU.cmake
@@ -30,6 +30,14 @@ ELSE()
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2 -DNDEBUG")
 ENDIF()
 
+IF(IPEX_DISP_OP)  # any CMake true value (1/ON/YES/TRUE) compiles in the printf of each generated op's name
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIPEX_DISP_OP")
+ENDIF()
+
+IF(IPEX_PROFILE_OP)  # any CMake true value (1/ON/YES/TRUE) compiles in the RECORD_FUNCTION profiler scopes
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DIPEX_PROFILE_OP")
+ENDIF()
+
 # ---[ Build flags
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_CXX_STANDARD 14)
diff --git a/scripts/cpu/gen-dense-cpu-ops.py b/scripts/cpu/gen-dense-cpu-ops.py
@@ -517,16 +517,19 @@ def is_conv_overrideable_func(fname):
             # Gen definition code for cpp file
             code = '{} {{\n'.format(cpp_func_str_cpp)
 
-            # Gen profile info
-            code += '#if defined(_DEBUG)\n'
+            # Gen OP Name
+            code += '#if defined(IPEX_DISP_OP)\n'
             code += '  printf("{}::{}\\n");\n'.format(_IPEX_OP_FUNC_NS, cpp_sig.def_name)
             code += '#endif\n'
+
+            # Gen profile info
             profiler_inputs = []
             for param in cpp_sig.input_params:
                 if param.core_type in ['Tensor', 'Scalar']:
                     profiler_inputs.append(param.name)
+            code += '#if defined(IPEX_PROFILE_OP)\n'
             code += '  RECORD_FUNCTION("{ns}::{name}", std::vector<c10::IValue>({{{input_names}}}), torch::autograd::Node::peek_at_next_sequence_nr());\n'.format(ns=_IPEX_OP_FUNC_NS, name=cpp_sig.def_name, input_names=', '.join(profiler_inputs))
-
+            code += '#endif\n'
 
             if is_conv_overrideable_func(cpp_sig.def_name):
                 code += '  return AtenIpexCPUDev::dil_{}({});\n'.format(cpp_sig.def_name, ', '.join([param.name for param in cpp_sig.input_params]))
diff --git a/scripts/cpu/gen-sparse-cpu-ops.py b/scripts/cpu/gen-sparse-cpu-ops.py
@@ -81,6 +81,8 @@ class AtenIpexCPUSparse {{
 #include <ATen/CPUGenerator.h>
 #include <c10/util/Exception.h>
 #include <c10/util/Logging.h>
+#include <torch/csrc/autograd/function.h>
+#include <torch/csrc/autograd/record_function.h>
 
 #include "aten_ipex_bridge.h"
 #include "ipex_sparse_tensor_impl.h"
@@ -405,6 +407,21 @@ def gen_code(self):
 
                 # Gen definition code for cpp file
                 code += '{} {{\n'.format(cpp_func_str_cpp)
+
+                # Gen OP Name
+                code += '#if defined(IPEX_DISP_OP)\n'
+                code += '  printf("{}::{}\\n");\n'.format(_IPEX_OP_FUNC_NS, cpp_sparse_sig.def_name)
+                code += '#endif\n'
+
+                # Gen profile info
+                profiler_inputs = []
+                for param in cpp_sparse_sig.input_params:
+                    if param.core_type in ['Tensor', 'Scalar']:
+                        profiler_inputs.append(param.name)
+                code += '#if defined(IPEX_PROFILE_OP)\n'
+                code += '  RECORD_FUNCTION("{ns}::{name}", std::vector<c10::IValue>({{{input_names}}}), torch::autograd::Node::peek_at_next_sequence_nr());\n'.format(ns=_IPEX_OP_FUNC_NS, name=cpp_sparse_sig.def_name, input_names=', '.join(profiler_inputs))
+                code += '#endif\n'
+
                 code += self.gen_fallback_prepare_code(cpp_sparse_sig)
                 code += self.gen_fallback_code(cpp_sparse_sig)
                 code += self.gen_fallback_post_code(cpp_sparse_sig)
diff --git a/setup.py b/setup.py
@@ -193,6 +193,12 @@ def build_extension(self, ext):
             '-DPYTHON_INCLUDE_DIR=' + python_include_dir,
         ]
 
+    if _check_env_flag("IPEX_DISP_OP"):
+      cmake_args += ['-DIPEX_DISP_OP=1']
+
+    if _check_env_flag("IPEX_PROFILE_OP"):
+      cmake_args += ['-DIPEX_PROFILE_OP=1']
+
     if _check_env_flag("USE_SYCL"):
       cmake_args += ['-DUSE_SYCL=1']
 
diff --git a/torch_ipex/csrc/cpu/CustomOPs.h b/torch_ipex/csrc/cpu/CustomOPs.h
@@ -9,13 +9,17 @@
 #include <c10/util/Optional.h>
 #include <torch/csrc/autograd/custom_function.h>
 #include <torch/csrc/autograd/function.h>
+#include <torch/csrc/autograd/record_function.h>
 #include <torch/csrc/autograd/variable.h>
 #include <torch/script.h>
 
 class NewLinearOp : public torch::autograd::Function<NewLinearOp> {
 public:
   static at::Tensor _forward(at::Tensor input, at::Tensor weight,
                              at::Tensor bias = at::Tensor()) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("IPEXLinearOp::_forward", std::vector<c10::IValue>({input, weight, bias}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
     try {
       if (torch_ipex::check_auto_dnnl() &&
           input.device().type() == c10::DeviceType::DPCPP) {
@@ -48,13 +52,19 @@ class NewLinearOp : public torch::autograd::Function<NewLinearOp> {
   static at::Tensor forward(torch::autograd::AutogradContext *ctx,
                             at::Tensor input, at::Tensor weight,
                             at::Tensor bias = at::Tensor()) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("IPEXLinearOp::forward", std::vector<c10::IValue>({input, weight, bias}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
     ctx->save_for_backward({input, weight, bias});
     return _forward(input, weight, bias);
   }
 
   static torch::autograd::tensor_list
   backward(torch::autograd::AutogradContext *ctx,
            torch::autograd::tensor_list grad_outputs) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("IPEXLinearOp::backward", std::vector<c10::IValue>({}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
     auto saved = ctx->get_saved_variables();
     at::Tensor input = saved[0];
     at::Tensor weight = saved[1];
@@ -149,6 +159,9 @@ class NewMaxPool2dOp : public torch::autograd::Function<NewMaxPool2dOp> {
   _forward(at::Tensor input, at::IntArrayRef kernel_size,
            at::IntArrayRef stride, at::IntArrayRef padding,
            at::IntArrayRef dilation, bool ceil_mode) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("IPEXMaxPool2dOp::_forward", std::vector<c10::IValue>({input}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
     try {
       if (torch_ipex::check_auto_dnnl() &&
           input.device().type() == c10::DeviceType::DPCPP) {
@@ -187,6 +200,9 @@ class NewMaxPool2dOp : public torch::autograd::Function<NewMaxPool2dOp> {
                             at::Tensor input, at::IntArrayRef kernel_size,
                             at::IntArrayRef stride, at::IntArrayRef padding,
                             at::IntArrayRef dilation, bool ceil_mode) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("IPEXMaxPool2dOp::forward", std::vector<c10::IValue>({input}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
     ctx->saved_data["kernel_size"] = kernel_size;
     ctx->saved_data["stride"] = stride;
     ctx->saved_data["padding"] = padding;
@@ -203,6 +219,9 @@ class NewMaxPool2dOp : public torch::autograd::Function<NewMaxPool2dOp> {
   static torch::autograd::tensor_list
   backward(torch::autograd::AutogradContext *ctx,
            torch::autograd::tensor_list grad_outputs) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("IPEXMaxPool2dOp::backward", std::vector<c10::IValue>({}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
     auto saved = ctx->get_saved_variables();
     at::Tensor input = saved[0];
     at::Tensor indices = saved[1];
@@ -263,6 +282,9 @@ class NewMaxPool3dOp : public torch::autograd::Function<NewMaxPool3dOp> {
   _forward(at::Tensor input, at::IntArrayRef kernel_size,
            at::IntArrayRef stride, at::IntArrayRef padding,
            at::IntArrayRef dilation, bool ceil_mode) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("IPEXMaxPool3dOp::_forward", std::vector<c10::IValue>({input}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
     try {
       if (torch_ipex::check_auto_dnnl() &&
           input.device().type() == c10::DeviceType::DPCPP) {
@@ -298,6 +320,9 @@ class NewMaxPool3dOp : public torch::autograd::Function<NewMaxPool3dOp> {
                             at::Tensor input, at::IntArrayRef kernel_size,
                             at::IntArrayRef stride, at::IntArrayRef padding,
                             at::IntArrayRef dilation, bool ceil_mode) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("IPEXMaxPool3dOp::forward", std::vector<c10::IValue>({input}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
     ctx->saved_data["kernel_size"] = kernel_size;
     ctx->saved_data["stride"] = stride;
     ctx->saved_data["padding"] = padding;
@@ -314,6 +339,9 @@ class NewMaxPool3dOp : public torch::autograd::Function<NewMaxPool3dOp> {
   static torch::autograd::tensor_list
   backward(torch::autograd::AutogradContext *ctx,
            torch::autograd::tensor_list grad_outputs) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("IPEXMaxPool3dOp::backward", std::vector<c10::IValue>({}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
     auto saved = ctx->get_saved_variables();
     at::Tensor input = saved[0];
     at::Tensor indices = saved[1];
@@ -372,6 +400,9 @@ class NewApaptiveAvgPoolingOp
     : public torch::autograd::Function<NewApaptiveAvgPoolingOp> {
 public:
   static at::Tensor _forward(at::Tensor input, at::IntArrayRef output_size) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("IPEXApaptiveAvgPoolingOp::_forward", std::vector<c10::IValue>({input}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
     try {
       if (torch_ipex::check_auto_dnnl() && input.device().type() == c10::DeviceType::DPCPP) {
         auto src_dil_type = torch_ipex::cpu::dbl::comm::try_gen_dil_tensor(input).get_data_type();
@@ -397,13 +428,19 @@ class NewApaptiveAvgPoolingOp
 
   static at::Tensor forward(torch::autograd::AutogradContext *ctx,
                             at::Tensor input, at::IntArrayRef output_size) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("IPEXApaptiveAvgPoolingOp::forward", std::vector<c10::IValue>({input}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
     ctx->save_for_backward({input});
     return _forward(input, output_size);
   }
 
   static torch::autograd::tensor_list
   backward(torch::autograd::AutogradContext *ctx,
            torch::autograd::tensor_list grad_outputs) {
+#if defined(IPEX_PROFILE_OP)
+    RECORD_FUNCTION("IPEXApaptiveAvgPoolingOp::backward", std::vector<c10::IValue>({}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
     auto saved = ctx->get_saved_variables();
     at::Tensor input = saved[0];
 
diff --git a/torch_ipex/csrc/cpu/DevOPs.cpp b/torch_ipex/csrc/cpu/DevOPs.cpp
@@ -26,7 +26,7 @@
 namespace torch_ipex {
 namespace cpu {
 
-#if defined(_DEBUG)
+#if defined(IPEX_DISP_OP)
 #define DEBUG(fmt) printf(fmt);
 #else
 #define DEBUG(fmt)
@@ -78,7 +78,7 @@ at::Tensor AtenIpexCPUDev::dil_convolution(
     dbl::conv::prepack_conv_weights(input, dil_input,
       weight, stride, padding, dilation, groups);
   }
-  
+
   dil_weight = dbl::comm::try_gen_dil_tensor(weight);
 
   if (bias.defined()) {
@@ -360,7 +360,7 @@ at::Tensor& dil_add_common(
   IPEX_CHECK(self.sizes().equals(other.sizes()),
       "dil add not support broadcast yet");
   if (check_auto_mix_int8_fp32()) {
-    // for accuracy, reorder int8 to fp32 
+    // for accuracy, reorder int8 to fp32
     dbl::comm::reorder_to_dtype(self, at::kFloat);
     dbl::comm::reorder_to_dtype(other, at::kFloat);
   } else {
@@ -824,7 +824,7 @@ at::Tensor AtenIpexCPUDev::dil_linear(
   if (check_auto_mix_int8_fp32() && check_int8_calibration()) {
     insert_or_updata_observer({self}, {aten_output}, "Linear");
   }
- 
+
   if (self.dim() > 2) {
     auto input_size = self.sizes();
     std::vector<int64_t> output_size(input_size.begin(), input_size.end() - 1);
@@ -1027,7 +1027,7 @@ std::tuple<at::Tensor, at::Tensor, at::Tensor> AtenIpexCPUDev::dil_native_batch_
       dil::batch_normalization_forward_inference::compute(
           x, w, b, y, eps, input_scales, output_scales);
     }
- 
+
     auto aten_output = dbl::comm::gen_aten_tensor_by(std::move(y));
 
     if (check_auto_mix_int8_fp32() && check_int8_calibration()) {
@@ -1421,7 +1421,7 @@ at::Tensor& AtenIpexCPUDev::dil_relu_(at::Tensor& input) {
     dil::algorithm::eltwise_relu,
     dil::prop_kind::forward_training,
     /*alpha*/ 0.0);
- 
+
   TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dil_self.is_public_format() || check_tensor_own_whole_storage(input));
   dbl::comm::sync_shape_from_dil_to_aten(input, dil_self);
   return input;
diff --git a/torch_ipex/csrc/cpu/FusionOPs.cpp b/torch_ipex/csrc/cpu/FusionOPs.cpp
@@ -5,6 +5,8 @@
 #include <ATen/InferSize.h>
 #include <c10/util/Exception.h>
 #include <c10/util/Logging.h>
+#include <torch/csrc/autograd/function.h>
+#include <torch/csrc/autograd/record_function.h>
 
 #include <limits>
 
@@ -220,6 +222,9 @@ at::Tensor AtenIpexJITDev::dil_convolution_swish(
     at::IntArrayRef padding,
     at::IntArrayRef dilation,
     int64_t groups) {
+#if defined(IPEX_PROFILE_OP)
+  RECORD_FUNCTION("AtenIpexJITDev::dil_convolution_swish", std::vector<c10::IValue>({input, weight, bias}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
   return dil_convolution_outplace_fusion(
     input,
     weight,
@@ -239,6 +244,9 @@ at::Tensor AtenIpexJITDev::dil_convolution_sigmoid(
     at::IntArrayRef padding,
     at::IntArrayRef dilation,
     int64_t groups) {
+#if defined(IPEX_PROFILE_OP)
+  RECORD_FUNCTION("AtenIpexJITDev::dil_convolution_sigmoid", std::vector<c10::IValue>({input, weight, bias}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
   return dil_convolution_outplace_fusion(
     input,
     weight,
@@ -260,6 +268,9 @@ at::Tensor AtenIpexJITDev::dil_convolution_clamp(
     int64_t groups,
     float lower_bound,
     float upper_bound) {
+#if defined(IPEX_PROFILE_OP)
+  RECORD_FUNCTION("AtenIpexJITDev::dil_convolution_clamp", std::vector<c10::IValue>({input, weight, bias}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
   return dil_convolution_outplace_fusion(
     input,
     weight,
@@ -279,6 +290,9 @@ at::Tensor AtenIpexJITDev::dil_convolution_relu(
     at::IntArrayRef padding,
     at::IntArrayRef dilation,
     int64_t groups) {
+#if defined(IPEX_PROFILE_OP)
+  RECORD_FUNCTION("AtenIpexJITDev::dil_convolution_relu", std::vector<c10::IValue>({input, weight, bias}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
   return dil_convolution_outplace_fusion(
     input,
     weight,
@@ -302,6 +316,9 @@ at::Tensor AtenIpexJITDev::dil_convolution_elu(
     float alpha,
     at::Scalar scale,
     at::Scalar input_scale) {
+#if defined(IPEX_PROFILE_OP)
+  RECORD_FUNCTION("AtenIpexJITDev::dil_convolution_elu", std::vector<c10::IValue>({input, weight, bias}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
   auto scale_value = scale.to<float>();
   auto input_scale_value = input_scale.to<float>();
   return dil_convolution_outplace_fusion(
@@ -325,6 +342,9 @@ at::Tensor& AtenIpexJITDev::dil_convolution_sum(
     int64_t groups,
     at::Tensor& accumu,
     at::Scalar alpha) {
+#if defined(IPEX_PROFILE_OP)
+  RECORD_FUNCTION("AtenIpexJITDev::dil_convolution_sum", std::vector<c10::IValue>({input, weight, bias}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
   auto scale = alpha.to<float>();
   return dil_convolution_inplace_fusion(
     input,
@@ -349,6 +369,9 @@ at::Tensor& AtenIpexJITDev::dil_convolution_sum_relu(
     int64_t groups,
     at::Tensor& accumu,
     at::Scalar alpha) {
+#if defined(IPEX_PROFILE_OP)
+  RECORD_FUNCTION("AtenIpexJITDev::dil_convolution_sum_relu", std::vector<c10::IValue>({input, weight, bias}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
   auto scale = alpha.to<float>();
   return dil_convolution_inplace_fusion(
     input,
@@ -367,6 +390,9 @@ at::Tensor AtenIpexJITDev::dil_linear_fuse_relu(
     const at::Tensor& self,
     const at::Tensor& weight,
     const at::Tensor& bias) {
+#if defined(IPEX_PROFILE_OP)
+  RECORD_FUNCTION("AtenIpexJITDev::dil_linear_fuse_relu", std::vector<c10::IValue>({self, weight, bias}), torch::autograd::Node::peek_at_next_sequence_nr());
+#endif
   IPEX_CHECK(self.dim() >= 2,
       "dil_linear: input needs to has dim at least 2, input dim ", self.dim());
   auto input_contiguous = self.is_contiguous() ? self : self.contiguous();