
Commit e68a649

haozhe.zhu committed
enable bf16 copy_
1 parent 264ceaa · commit e68a649

File tree

6 files changed: 75 additions & 9 deletions


scripts/cpu/gen-dense-cpu-ops.py

Lines changed: 18 additions & 8 deletions
@@ -74,11 +74,13 @@
     'aten::index_select(Tensor self, int dim, Tensor index) -> Tensor',
     'aten::_unsafe_view(Tensor self, int[] size) -> Tensor',
     'aten::native_layer_norm(Tensor input, Tensor? weight, Tensor? bias, int M, int N, float eps) -> (Tensor, Tensor, Tensor)',
-    'aten::native_layer_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, int M, int N, bool[3] output_mask) -> (Tensor, Tensor, Tensor)'
+    'aten::native_layer_norm_backward(Tensor grad_out, Tensor input, Tensor mean, Tensor rstd, Tensor? weight, int M, int N, bool[3] output_mask) -> (Tensor, Tensor, Tensor)',
+    'aten::copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)',
 ]
 
 _FN_IPEX_FUNCS_WITH_SIMPLE_ATEN_SIG = [
     'aten::index_select(Tensor self, int dim, Tensor index) -> Tensor',
+    'aten::copy_(Tensor(a!) self, Tensor src, bool non_blocking=False) -> Tensor(a!)',
 ]
 
 _SHALLOW_FALLBACK_TO_CPU_TENSOR_LIST = 'shallowFallbackToCPUTensorList'
@@ -321,7 +323,10 @@ def is_out_func(fname):
         if fname.endswith('_'):
             assert len(dnnl_tensor_param_vars) > 0
             code += '    if (dbl::chk::dnnl_inplace_support_the_tensors(dnnl_input_tensors)) {\n'
-            code += '      return AtenIpexCPUDev::dil_{}({});\n'.format(fname, ', '.join(list(param_vars)))
+            if self.is_ipex_func(aten_func_sig_str):
+                code += self.gen_ipex_func_code(fname, param_vars)
+            else:
+                code += '      return AtenIpexCPUDev::dil_{}({});\n'.format(fname, ', '.join(list(param_vars)))
             code += '    }\n' # Check support tensors
         else:
             param_seq_str_vec = []
@@ -331,12 +336,7 @@ def is_out_func(fname):
             code += '    if (dbl::chk::dnnl_support_the_tensors(dnnl_input_tensors)) {\n'
 
             if self.is_ipex_func(aten_func_sig_str):
-                code += '      auto _result = AtenIpexCPUDev::dil_{}({});\n'.format(fname, ', '.join(param_seq_str_vec))
-                code += '      if (is_ipex_func_success()) {\n'
-                code += '        return _result;\n'
-                code += '      } else {\n'
-                code += '        reset_ipex_func_status();\n'
-                code += '      }\n'
+                code += self.gen_ipex_func_code(fname, param_seq_str_vec)
             else:
                 code += '      return AtenIpexCPUDev::dil_{}({});\n'.format(fname, ', '.join(param_seq_str_vec))
 
@@ -351,6 +351,16 @@ def is_out_func(fname):
 
         return code
 
+    def gen_ipex_func_code(self, fname, param_vars):
+        code = ''
+        code += '      auto _result = AtenIpexCPUDev::dil_{}({});\n'.format(fname, ', '.join(param_vars))
+        code += '      if (is_ipex_func_success()) {\n'
+        code += '        return _result;\n'
+        code += '      } else {\n'
+        code += '        reset_ipex_func_status();\n'
+        code += '      }\n'
+        return code
+
     def gen_fallback_prepare_code(self, cpp_sig):
         code = ''
         op_check_code = ''
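
What gen_ipex_func_code emits is just the try-then-fall-back block that the deleted inline lines used to build by hand. The snippet below is a standalone reproduction of the helper, copied from the format strings above (the whitespace inside the emitted C++ strings is approximate, since the diff view collapses indentation), so you can see what it generates for the newly registered copy_ overload:

    # Illustrative reproduction of gen_ipex_func_code; indentation inside the
    # emitted C++ strings is approximate.
    def gen_ipex_func_code(fname, param_vars):
        code = ''
        code += '      auto _result = AtenIpexCPUDev::dil_{}({});\n'.format(fname, ', '.join(param_vars))
        code += '      if (is_ipex_func_success()) {\n'
        code += '        return _result;\n'
        code += '      } else {\n'
        code += '        reset_ipex_func_status();\n'
        code += '      }\n'
        return code

    print(gen_ipex_func_code('copy_', ['self', 'src', 'non_blocking']))
    # Prints a block that calls AtenIpexCPUDev::dil_copy_(self, src, non_blocking),
    # returns its result when is_ipex_func_success() reports success, and otherwise
    # calls reset_ipex_func_status() so the wrapper falls through to the CPU path.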

tests/cpu/test_bf16_lazy_reorder.py

Lines changed: 25 additions & 0 deletions
@@ -1956,6 +1956,31 @@ def test_save_and_load(self):
         torch.save(output_dpcpp, 'tensor_dpcpp.pt')
         self.assertEqual(torch.load('tensor.pt'), torch.load('tensor_dpcpp.pt'))
 
+class TestCopy_(TestCase):
+    def test_copy_(self):
+        rand_seed = int(get_rand_seed())
+        print("{} rand sed: {}".format(sys._getframe().f_code.co_name, rand_seed))
+        torch.manual_seed(rand_seed)
+        self_auto_mix = torch.randn(3, 4, 5, dtype=torch.float32, device=device) * 10
+        self_man_mix = (torch.randn(3, 4, 5, device=device) * 10).to(torch.bfloat16)
+        src_auto_mix = torch.randn(3, 4, 5, dtype=torch.float32, device=device) * 10
+        copy_src_auto_mix = copy.deepcopy(src_auto_mix).to(device=device)
+        copy_src_man_mix = copy.deepcopy(src_auto_mix).to(device=device).to(torch.bfloat16)
+
+        with AutoDNNL(True), AutoMixPrecision(False):
+            res_man_bf16 = copy_src_man_mix + copy_src_man_mix
+            self.assertEqual(res_man_bf16.dtype, torch.bfloat16)
+            self_man_mix.copy_(res_man_bf16)
+            self.assertEqual(self_man_mix.dtype, torch.bfloat16)
+
+        with AutoMixPrecision(True):
+            res_auto_mix = copy_src_auto_mix + copy_src_auto_mix
+            self.assertEqual(res_auto_mix.dtype, torch.float)
+            self.assertTrue(ipex.core.is_bf16_dil_tensor(res_auto_mix))
+            self_auto_mix.copy_(res_auto_mix)
+            self.assertTrue(ipex.core.is_bf16_dil_tensor(self_auto_mix))
+            self.assertEqual(self_auto_mix.dtype, torch.float)
+            self.assertEqual(self_auto_mix, self_man_mix.float())
 
 if __name__ == '__main__':
     test = unittest.main()
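
In short, TestCopy_ exercises both sides of the new path: with AutoMixPrecision off, copy_ between explicitly bf16 tensors keeps the bf16 dtype; with it on, copying a bf16 DIL result into an fp32 tensor leaves the aten dtype at float while the underlying DIL buffer becomes bf16, and the two results match. A minimal sketch of the auto-mix case, assuming the helpers the test module already imports (torch, ipex, AutoDNNL, AutoMixPrecision, device) are in scope:

    # Minimal auto-mix copy_ sketch; AutoDNNL, AutoMixPrecision, device and ipex
    # come from the surrounding test module, as in TestCopy_ above.
    dst = torch.randn(3, 4, 5, dtype=torch.float32, device=device)
    src = torch.randn(3, 4, 5, dtype=torch.float32, device=device)

    with AutoDNNL(True), AutoMixPrecision(True):
        bf16_res = src + src                       # the add runs in bf16 under auto-mix
        dst.copy_(bf16_res)                        # dispatches to AtenIpexCPUDev::dil_copy_
        assert dst.dtype == torch.float            # aten dtype is unchanged...
        assert ipex.core.is_bf16_dil_tensor(dst)   # ...but the DIL buffer is now bf16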

torch_ipex/csrc/cpu/DevOPs.cpp

Lines changed: 26 additions & 0 deletions
@@ -2042,5 +2042,31 @@ at::Tensor AtenIpexCPUDev::dil_shuffle(const at::Tensor & self, at::IntArrayRef
   return dbl::comm::gen_aten_tensor_by(std::move(y));
 }
 
+at::Tensor& AtenIpexCPUDev::dil_copy_(
+    at::Tensor & self,
+    const at::Tensor & src,
+    bool non_blocking) {
+  DEBUG("AtenIpexCPUDev::dil_copy_\n");
+  torch_ipex::reset_ipex_func_status();
+
+  IPEX_CHECK(
+    self.device().type() == c10::DeviceType::DPCPP &&
+    src.device().type() == c10::DeviceType::DPCPP,
+    "IPEX copy only work on DPCPP tensor");
+  if (ShadeDataContext::isDilTensor(src) && ShadeDataContext::isTensorMixPrecision(src)) {
+    IPEX_CHECK(check_tensor_own_whole_storage(self), "IPEX copy only works while self tensor own the whole storage");
+    auto dil_src = dbl::comm::try_gen_dil_tensor(src);
+    IPEX_CHECK(dil_src.get_data_type() == dil::data_type::bf16);
+    auto new_buffer_desc = dil_src.get_desc();
+    dil::tensor dil_buffer{new_buffer_desc};
+    dil_src.reorder_to(dil_buffer);
+    dbl::comm::equip_dil_buffer(self, dil_buffer);
+    return self;
+  }
+  // TODO: We need add more LP here
+  torch_ipex::set_ipex_func_status(torch_ipex::IPEXFuncStatus::IPEX_FALLBACK);
+  return self;
+}
+
 } // namespace cpu
 } // namespace torch_ipex
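
dil_copy_ only claims the case where src already carries a bf16 DIL buffer: it reorders that buffer into a fresh dil::tensor built from the same descriptor and equips self with it, so the data stays in bf16 throughout. Every other combination (the TODO notes that more LP, i.e. low-precision, cases are still to be added) sets IPEX_FALLBACK, and the generated wrapper then re-routes the call to the stock aten copy_. A hedged sketch of that fallback behavior as observed from Python, again assuming the test module's helpers are in scope:

    # Plain fp32 copy: src has no bf16 DIL buffer, so dil_copy_ flags IPEX_FALLBACK
    # and the call lands on the regular aten copy_; values are copied as usual.
    a = torch.randn(3, 4, 5, dtype=torch.float32, device=device)
    b = torch.zeros(3, 4, 5, dtype=torch.float32, device=device)

    with AutoDNNL(True), AutoMixPrecision(False):
        b.copy_(a)
        assert b.dtype == torch.float32
        assert torch.equal(a.to('cpu'), b.to('cpu'))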

torch_ipex/csrc/cpu/DevOPs.h

Lines changed: 2 additions & 0 deletions
@@ -82,6 +82,8 @@ class AtenIpexCPUDev {
   static at::Tensor dil_index_select(const at::Tensor & self, int64_t dim, const at::Tensor & index);
   static at::Tensor dil__unsafe_view(const at::Tensor & self, at::IntArrayRef size);
   static at::Tensor dil_shuffle(const at::Tensor & self, at::IntArrayRef view_shape, int64_t dim0, int64_t dim1);
+  static at::Tensor& dil_copy_(at::Tensor & self, const at::Tensor & src, bool non_blocking);
+
 };
 
 } // namespace cpu

torch_ipex/csrc/cpu/bf16/Bridge.cpp

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@ namespace bf16 {
 
 at::Tensor gen_consistent_tensor(const at::Tensor & self) {
   // Reorder dil buffer to public because aten tensor does not support blocked format
+  if (!ShadeDataContext::isDilTensor(self)) {
+    return bridge::shallowFallbackToCPUTensor(self);
+  }
   dbl::comm::reorder_to_public(self, /*keep data type*/true);
 
   dil::tensor& self_dil_storage = ShadeDataContext::getDilStorage(self);

torch_ipex/csrc/cpu/bf16/DevOPs.cpp

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ namespace bf16 {
 
 at::Tensor index_select(const at::Tensor & self, int64_t dim, const at::Tensor & index) {
   auto&& _tensor = bf16::gen_consistent_tensor(self);
-  auto&& _ipex_index = bridge::shallowFallbackToCPUTensor(index);
+  auto&& _ipex_index = bf16::gen_consistent_tensor(index);
   auto&& _ipex_result = at::index_select(_tensor, dim, _ipex_index);
   return bf16::gen_mix_prec_tensor(_ipex_result);
 }
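
These last two bf16 changes go together: index_select now feeds its index tensor through bf16::gen_consistent_tensor, and since an index is usually a plain int64 tensor with no DIL buffer behind it, that is presumably why the helper gained the early branch that shallow-falls-back instead of unconditionally asking ShadeDataContext for DIL storage. A small usage sketch, under the same assumption that the test module's helpers are in scope:

    # index_select on a bf16 DIL tensor with a plain int64 index tensor; the index
    # side is what the new non-DIL branch in gen_consistent_tensor handles.
    x = torch.randn(4, 5, dtype=torch.float32, device=device)
    idx = torch.tensor([0, 2], dtype=torch.int64, device=device)

    with AutoDNNL(True), AutoMixPrecision(True):
        y = x + x                               # bf16 DIL tensor under auto-mix
        out = torch.index_select(y, 0, idx)     # routed through bf16::index_select
        assert out.shape == (2, 5)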
