Mark parameter tensors when moving a model to the dpcpp device

chunyuan-w · chunyuan-w · commit 487a75e65ce0 · 2020-09-08T15:46:10.000+08:00
diff --git a/intel_pytorch_extension_py/ops/__init__.py b/intel_pytorch_extension_py/ops/__init__.py
@@ -6,3 +6,4 @@
 from .mlp import * 
 from .jit import *
 from .save import *
+from .to import *
diff --git a/intel_pytorch_extension_py/ops/to.py b/intel_pytorch_extension_py/ops/to.py
@@ -0,0 +1,26 @@
+import torch
+import _torch_ipex as core
+
+torch_to = torch.nn.Module.to  # keep the stock Module.to so the wrapper below can delegate to it
+
+def apply(m, fn):  # post-order walk: fn on every descendant of m, then on m itself; returns m
+    for child in m.children():
+        apply(child, fn)
+    fn(m)
+    return m
+
+def to(module, *args, **kwargs):
+    """Run the stock ``Module.to``, then tag every parameter when the target device is dpcpp."""
+    m = torch_to(module, *args, **kwargs)
+
+    # Only the parsed device is needed; indexing ignores however many other fields are returned.
+    device = torch._C._nn._parse_to(*args, **kwargs)[0]
+    if device is None or device.type != "dpcpp":
+        return m
+
+    # parameters() already recurses into sub-modules: one pass tags each tensor exactly once.
+    for param in m.parameters():
+        core.set_parameter_tensor(param.data)
+    return m
+
+torch.nn.Module.to = to  # monkey-patch: importing this module reroutes every nn.Module.to() call
diff --git a/tests/cpu/test_bf16_lazy_reorder.py b/tests/cpu/test_bf16_lazy_reorder.py
@@ -79,6 +79,55 @@ def _gen_op(seed, op, is_bn=False, is_forward=True):
     
     return op_cpu, op_auto_mix_inference, op_auto_mix_train, op_man_bf16, op_auto_mix_train_bf16
 
+class CascadedConvBnSumRelu(nn.Module):
+    """Test model: conv-bn-relu-conv-bn branch plus a conv-bn shortcut, summed in place, then ReLU."""
+
+    def __init__(self, in_channels, mid_channels, out_channels, **kwargs):
+        super(CascadedConvBnSumRelu, self).__init__()
+        conv, bn = torch.nn.Conv2d, torch.nn.BatchNorm2d
+        self.conv = conv(in_channels, mid_channels, bias=False, **kwargs)
+        self.conv1 = conv(mid_channels, out_channels, bias=False, padding=1, **kwargs)
+        self.conv2 = conv(in_channels, out_channels, bias=False, **kwargs)
+        self.bn = bn(mid_channels, eps=0.001)
+        self.bn1 = bn(out_channels, eps=0.001)
+        self.bn2 = bn(out_channels, eps=0.001)
+
+    def forward(self, x):
+        # Main branch: conv -> bn -> relu -> conv1 -> bn1.
+        main = F.relu(self.bn(self.conv(x)), inplace=True)
+        main = self.bn1(self.conv1(main))
+        # Shortcut branch: conv2 -> bn2, fed by the same input x.
+        shortcut = self.bn2(self.conv2(x))
+        return F.relu(main.add_(shortcut), inplace=True)
+
+def apply(m, fn, args):  # post-order walk: fn(sub, args) for every descendant, then fn(m, args)
+    for child in m.children():
+        apply(child, fn, args)
+    fn(m, args)
+
+class TestTo(TestCase):
+    """Module.to must tag parameters for conversions that target `device` and leave others untagged."""
+
+    def test_to(self):
+        torch.manual_seed(int(get_rand_seed()))
+        m = CascadedConvBnSumRelu(3, 64, 32, kernel_size=3, stride=1)
+
+        def check_param(t, is_param):
+            # Every parameter reachable from t must carry (or lack) the parameter tag.
+            expect = self.assertTrue if is_param else self.assertFalse
+            for param in t.parameters():
+                expect(ipex.core.is_parameter_tensor(param.data))
+
+        # (converted copy of the model, whether its parameters are expected to be tagged)
+        cases = [
+            (copy.deepcopy(m).to("cpu"), False),
+            (copy.deepcopy(m).to(torch.bfloat16), False),
+            (copy.deepcopy(m).to(device), True),
+            (copy.deepcopy(m).to(device=device, dtype=torch.bfloat16), True),
+        ]
+        for converted, is_param in cases:
+            apply(converted, check_param, is_param)
+
 class TestConv(TestCase):
     def test_Conv2d_with_cpu(self):
         rand_seed = int(get_rand_seed())
diff --git a/torch_ipex/csrc/cpu/ShadeDataContext.h b/torch_ipex/csrc/cpu/ShadeDataContext.h
@@ -16,6 +16,8 @@ enum SHADE_DATA_TYPE {CPU_RAW, DIL};
 
 enum MIX_PREC_TYPE {NONE, MIX_BF16_FP32, MIX_INT8_FP32};
 
+enum SHADE_TENSOR_TAG{PARAM, OTHER};  // PARAM: set by setParameterTensor(); OTHER: default of a new ShadeDataContext
+
 #define SANITY_CHECK_SHADE_DATA_CONTEXT(THIS) \
   { \
     if (THIS->data_type == SHADE_DATA_TYPE::DIL) { \
@@ -52,12 +54,14 @@ struct ShadeDataContext {
 
   SHADE_DATA_TYPE    data_type;    ///< Memory buffer type
   MIX_PREC_TYPE      mix_prec_type; ///< Record if the aten tensor is mix-precision
+  SHADE_TENSOR_TAG   shade_tensor_tag; ///< Record if the tensor is a PARAMETER (in mix-precision, never reorder a PARAMETER to bf16)
 
   ShadeDataContext() : dil_tensor(),
                        cpu_raw_data(nullptr),
                        cpu_del_fun(nullptr),
                        data_type(SHADE_DATA_TYPE::CPU_RAW),
-                       mix_prec_type(MIX_PREC_TYPE::NONE) {}
+                       mix_prec_type(MIX_PREC_TYPE::NONE),
+                       shade_tensor_tag(SHADE_TENSOR_TAG::OTHER) {}
 
   ~ShadeDataContext() {
     SANITY_CHECK_SHADE_DATA_CONTEXT(this);
@@ -216,6 +220,49 @@ struct ShadeDataContext {
     return res;
   }
 
+  /**
+   * Check if the input aten tensor is a parameter.
+   * Tensors that are not on the DPCPP device are never reported as parameters.
+   *
+   * @param tensor input aten tensor
+   * @return true if the tensor's storage context is tagged SHADE_TENSOR_TAG::PARAM
+   */
+  static inline bool isParameterTensor(const at::Tensor &tensor) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.has_storage());
+
+    if (tensor.device().type() != c10::DeviceType::DPCPP) {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.device().type() == c10::DeviceType::CPU);
+      return false;
+    }
+
+    auto *shade_data_context = static_cast<ShadeDataContext*>(tensor.storage().data_ptr().get_context());
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(shade_data_context != nullptr);  // catch a missing context before it is read
+    SANITY_CHECK_SHADE_DATA_CONTEXT(shade_data_context);
+    auto shade_tensor_tag = shade_data_context->shade_tensor_tag;
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY((shade_tensor_tag == SHADE_TENSOR_TAG::OTHER) || (shade_tensor_tag == SHADE_TENSOR_TAG::PARAM));
+    return shade_tensor_tag == SHADE_TENSOR_TAG::PARAM;
+  }
+
+  /**
+   * Set the shade_tensor_tag of the input aten tensor to PARAM.
+   * Tensors that are not on the DPCPP device are left untouched.
+   *
+   * @param tensor input aten tensor
+   */
+  static inline void setParameterTensor(const at::Tensor &tensor) {
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.has_storage());
+
+    // TODO: if device is cpu, this function should not be called; until then it is a no-op.
+    if (tensor.device().type() != c10::DeviceType::DPCPP) {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(tensor.device().type() == c10::DeviceType::CPU);
+      return;
+    }
+
+    auto *shade_data_context = static_cast<ShadeDataContext*>(tensor.storage().data_ptr().get_context());
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(shade_data_context != nullptr);  // catch a missing context before the write
+    shade_data_context->shade_tensor_tag = SHADE_TENSOR_TAG::PARAM;
+  }
+
 };
 
 }  // namespace cpu
diff --git a/torch_ipex/csrc/init_python_bindings.cpp b/torch_ipex/csrc/init_python_bindings.cpp
@@ -40,6 +40,14 @@ void setAutoDNNL(bool val) {
   AutoOptConfig::singleton().set_auto_dnnl(val);
 }
 
+void setParameterTensor(const at::Tensor &tensor) {  // bound to Python as "set_parameter_tensor"
+  cpu::ShadeDataContext::setParameterTensor(tensor);
+}
+
+bool isParameterTensor(const at::Tensor &tensor) {  // bound to Python as "is_parameter_tensor"
+  return cpu::ShadeDataContext::isParameterTensor(tensor);
+}
+
 /// **** Only for unit test ****
 bool isDilTensor(const at::Tensor &tensor) {
   return cpu::ShadeDataContext::isDilTensor(tensor);
@@ -125,6 +133,8 @@ void InitIpexModuleBindings(py::module m) {
   m.def("is_fp32_dil_tensor", &isFP32DilTensor);
   m.def("get_dil_tensor_sizes", &getDilStorageSizes);
   m.def("get_dil_tensor_strides", &getDilStorageStrides);
+  m.def("set_parameter_tensor", &setParameterTensor);
+  m.def("is_parameter_tensor", &isParameterTensor);
   m.def("enable_jit_opt", []() { AutoOptConfig::singleton().set_jit_fuse(true); });
   m.def("disable_jit_opt", []() { AutoOptConfig::singleton().set_jit_fuse(false); });
   m.def("get_jit_opt", []() { return AutoOptConfig::singleton().get_jit_fuse(); });