Commit 0e74442

Override Ops for failed UTs caused by torch-xpu-ops (#4961) (#4981)
* fix UTs
* fix format
* add device check
1 parent 1d2371c commit 0e74442

15 files changed: +148 additions, -48 deletions

csrc/gpu/aten/operators/BatchNorm.cpp

Lines changed: 96 additions & 0 deletions
@@ -5208,6 +5208,90 @@ std::tuple<Tensor, Tensor, Tensor, Tensor> batch_norm_backward_reduce(
       grad_output, input, mean, invstd, weight_opt, input_g, weight_g, bias_g);
 }
 
+#ifdef USE_OVERRIDE_OP
+// Rename below functions because they have overload with the same name
+// and can't be registered.
+std::tuple<Tensor, Tensor, Tensor> _native_batch_norm_legit_(
+    const Tensor& self,
+    const c10::optional<Tensor>& weight_opt,
+    const c10::optional<Tensor>& bias_opt,
+    Tensor& running_mean,
+    Tensor& running_var,
+    bool train,
+    double momentum,
+    double epsilon) {
+  return at::AtenIpexTypeXPU::_native_batch_norm_legit(
+      self,
+      weight_opt,
+      bias_opt,
+      running_mean,
+      running_var,
+      train,
+      momentum,
+      epsilon);
+}
+
+std::tuple<Tensor, Tensor, Tensor> _native_batch_norm_legit_no_state(
+    const Tensor& self,
+    const c10::optional<Tensor>& weight_opt,
+    const c10::optional<Tensor>& bias_opt,
+    bool train,
+    double momentum,
+    double epsilon) {
+  return at::AtenIpexTypeXPU::_native_batch_norm_legit(
+      self, weight_opt, bias_opt, train, momentum, epsilon);
+}
+
+std::tuple<Tensor&, Tensor&, Tensor&> _native_batch_norm_legit_out_(
+    const Tensor& self,
+    const c10::optional<Tensor>& weight_opt,
+    const c10::optional<Tensor>& bias_opt,
+    Tensor& running_mean,
+    Tensor& running_var,
+    bool train,
+    double momentum,
+    double epsilon,
+    Tensor& output,
+    Tensor& save_mean,
+    Tensor& save_invstd) {
+  return at::AtenIpexTypeXPU::_native_batch_norm_legit_out(
+      self,
+      weight_opt,
+      bias_opt,
+      running_mean,
+      running_var,
+      train,
+      momentum,
+      epsilon,
+      output,
+      save_mean,
+      save_invstd);
+}
+
+std::tuple<Tensor&, Tensor&, Tensor&> _native_batch_norm_legit_no_state_out(
+    const Tensor& self,
+    const c10::optional<Tensor>& weight_opt,
+    const c10::optional<Tensor>& bias_opt,
+    bool train,
+    double momentum,
+    double epsilon,
+    Tensor& output,
+    Tensor& save_mean,
+    Tensor& save_invstd) {
+  return at::AtenIpexTypeXPU::_native_batch_norm_legit_out(
+      self,
+      weight_opt,
+      bias_opt,
+      train,
+      momentum,
+      epsilon,
+      output,
+      save_mean,
+      save_invstd);
+}
+
+#endif
+
 } // namespace AtenIpexTypeXPU
 } // namespace at

@@ -5223,6 +5307,18 @@ IPEX_TORCH_LIBRARY_IMPL(aten, XPU, m) {
   m.impl(
       "native_batch_norm_backward",
      TORCH_FN((&at::AtenIpexTypeXPU::native_batch_norm_backward)));
+  m.impl(
+      "_native_batch_norm_legit",
+      TORCH_FN((&at::AtenIpexTypeXPU::_native_batch_norm_legit_)));
+  m.impl(
+      "_native_batch_norm_legit.out",
+      TORCH_FN((&at::AtenIpexTypeXPU::_native_batch_norm_legit_out_)));
+  m.impl(
+      "_native_batch_norm_legit.no_stats",
+      TORCH_FN((&at::AtenIpexTypeXPU::_native_batch_norm_legit_no_state)));
+  m.impl(
+      "_native_batch_norm_legit.no_stats_out",
+      TORCH_FN((&at::AtenIpexTypeXPU::_native_batch_norm_legit_no_state_out)));
 }
 
 } // namespace
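
The overridden entry points can be smoke-tested from Python through the aten op. A minimal sketch, assuming an XPU-enabled build of intel_extension_for_pytorch and an available "xpu" device (the re-enabled test_batch_norm_legit_simple below exercises the same path):

import torch
import intel_extension_for_pytorch  # noqa: F401  # registers the XPU kernels

x = torch.randn(1, 2, 3, 3, device="xpu")
weight = torch.ones(2, device="xpu")
bias = torch.zeros(2, device="xpu")
running_mean = torch.zeros(2, device="xpu")
running_var = torch.ones(2, device="xpu")

# Stateful overload: resolves to _native_batch_norm_legit_ above.
out, save_mean, save_invstd = torch.ops.aten._native_batch_norm_legit(
    x, weight, bias, running_mean, running_var, True, 0.1, 1e-5
)

# Stats-free overload (.no_stats): resolves to _native_batch_norm_legit_no_state.
out2, save_mean2, save_invstd2 = torch.ops.aten._native_batch_norm_legit(
    x, weight, bias, True, 0.1, 1e-5
)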

csrc/gpu/aten/operators/EmbeddingBag.cpp

Lines changed: 16 additions & 0 deletions
@@ -4,6 +4,9 @@
 #include <core/Memory.h>
 #include <runtime/Utils.h>
 #include <torch/torch.h>
+#ifdef USE_OVERRIDE_OP
+#include "utils/CustomOperatorRegistration.h"
+#endif
 #include <utils/DPCPP.h>
 
 #include "BitonicMergeSort.h"

@@ -1294,3 +1297,16 @@ Tensor _embedding_bag_per_sample_weights_backward(
 
 } // namespace AtenIpexTypeXPU
 } // namespace at
+
+#ifdef USE_OVERRIDE_OP
+namespace {
+
+IPEX_TORCH_LIBRARY_IMPL(aten, XPU, m) {
+  m.impl("_embedding_bag", TORCH_FN((&at::AtenIpexTypeXPU::_embedding_bag)));
+  m.impl(
+      "_embedding_bag_forward_only",
+      TORCH_FN((&at::AtenIpexTypeXPU::_embedding_bag_forward_only)));
+}
+
+} // namespace
+#endif
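
torch.nn.EmbeddingBag lowers to aten::_embedding_bag, so the override can be exercised directly from Python. A minimal sketch, assuming an XPU device is available:

import torch
import intel_extension_for_pytorch  # noqa: F401

bag = torch.nn.EmbeddingBag(10, 3, mode="sum").to("xpu")
indices = torch.tensor([1, 2, 4, 5, 4, 3, 2, 9], device="xpu")
offsets = torch.tensor([0, 4], device="xpu")  # two bags of four indices each

# Forward dispatches to aten::_embedding_bag (or _embedding_bag_forward_only
# when no gradients are required), now served by the kernels registered above.
out = bag(indices, offsets)
print(out.shape)  # torch.Size([2, 3])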

csrc/gpu/aten/operators/GatedLinearUnit.cpp

Lines changed: 16 additions & 0 deletions
@@ -3,6 +3,9 @@
 #include <ATen/OpMathType.h>
 #include <ATen/TensorUtils.h>
 #include <runtime/Utils.h>
+#ifdef USE_OVERRIDE_OP
+#include "utils/CustomOperatorRegistration.h"
+#endif
 #include <utils/DPCPP.h>
 
 #include "Loops.h"

@@ -208,3 +211,16 @@ Tensor glu_backward_jvp(
 
 } // namespace AtenIpexTypeXPU
 } // namespace at
+
+#ifdef USE_OVERRIDE_OP
+namespace {
+
+IPEX_TORCH_LIBRARY_IMPL(aten, XPU, m) {
+  m.impl("glu_backward", TORCH_FN((&at::AtenIpexTypeXPU::glu_backward)));
+  m.impl(
+      "glu_backward.grad_input",
+      TORCH_FN((&at::AtenIpexTypeXPU::glu_backward_out)));
+}
+
+} // namespace
+#endif
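
glu_backward is reached through autograd, so a backward pass through F.glu covers the new registrations. A minimal sketch, assuming an XPU device:

import torch
import torch.nn.functional as F
import intel_extension_for_pytorch  # noqa: F401

x = torch.randn(4, 6, device="xpu", requires_grad=True)
# glu splits the chosen dim into halves a, b and computes a * sigmoid(b).
y = F.glu(x, dim=-1)
# The backward pass dispatches to aten::glu_backward, registered above.
y.sum().backward()
print(x.grad.shape)  # torch.Size([4, 6])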

csrc/gpu/aten/operators/Indexing.cpp

Lines changed: 6 additions & 0 deletions
@@ -1439,6 +1439,9 @@ Tensor& index_select_out(
     int64_t dim,
     const Tensor& index,
     Tensor& out) {
+  TORCH_CHECK(self.is_xpu(), "self must be a XPU tensor.");
+  TORCH_CHECK(out.is_xpu(), "out must be a XPU tensor.");
+
   IPEX_DISPATCH_ALL_TYPES_AND_COMPLEX_AND5(
       at::ScalarType::Half,
       at::ScalarType::BFloat16,

@@ -2334,6 +2337,9 @@ Tensor& index_out(
 IPEX_TORCH_LIBRARY_IMPL(aten, XPU, m) {
   m.impl(
       "_index_put_impl_", TORCH_FN((&at::AtenIpexTypeXPU::_index_put_impl_)));
+  m.impl("index_select", TORCH_FN((&at::AtenIpexTypeXPU::index_select)));
+  m.impl(
+      "index_select.out", TORCH_FN((&at::AtenIpexTypeXPU::index_select_out)));
   m.impl("nonzero", TORCH_FN((&at::AtenIpexTypeXPU::nonzero)));
   m.impl("nonzero.out", TORCH_FN((&at::AtenIpexTypeXPU::nonzero_out)));
 }
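
Both overloads, plus the new device check, can be exercised from Python. A minimal sketch, assuming an XPU device; the error path shown is what the new TORCH_CHECKs are intended to produce:

import torch
import intel_extension_for_pytorch  # noqa: F401

src = torch.randn(5, 4, device="xpu")
idx = torch.tensor([0, 2, 4], device="xpu")

picked = torch.index_select(src, 0, idx)   # aten::index_select
out = torch.empty(3, 4, device="xpu")
torch.index_select(src, 0, idx, out=out)   # aten::index_select.out

# The added TORCH_CHECKs should surface a clear error, rather than a kernel
# crash, when `out` lives on the wrong device:
try:
    torch.index_select(src, 0, idx, out=torch.empty(3, 4))  # CPU `out`
except RuntimeError as err:
    print(err)  # "out must be a XPU tensor."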

scripts/tools/torchgen/yaml/xpu_functions.yaml

Lines changed: 14 additions & 14 deletions
@@ -18,10 +18,20 @@ supported:
 # - col2im.out
 # - im2col
 # - im2col.out
-# - sort
-# - sort.stable
-# - sort.values
-# - sort.values_stable
+# - _embedding_bag
+# - _embedding_bag_forward_only
+# - _native_batch_norm_legit
+# - _native_batch_norm_legit.out
+# - _native_batch_norm_legit.no_stats
+# - _native_batch_norm_legit.no_stats_out
+# - glu_backward
+# - glu_backward.grad_input
+# - index_select
+# - index_select.out
+# - sort
+# - sort.stable
+# - sort.values
+# - sort.values_stable
 ################## override below ops due to performance issues
 # - convolution_overrideable
 # - convolution_backward_overrideable

@@ -82,9 +92,7 @@ supported:
 # - cumsum.out # newly added
 - _dirichlet_grad
 # - _efficientzerotensor
-# - _embedding_bag
 - _embedding_bag_dense_backward
-# - _embedding_bag_forward_only
 - _embedding_bag_per_sample_weights_backward
 - _empty_affine_quantized
 - _empty_per_channel_affine_quantized

@@ -191,10 +199,6 @@ supported:
 # - batch_norm_stats
 - batch_norm_stats.out
 # - batch_norm_update_stats
-# - _native_batch_norm_legit
-# - _native_batch_norm_legit.out
-# - _native_batch_norm_legit.no_stats
-# - _native_batch_norm_legit.no_stats_out
 # - batch_norm_backward_elemt
 # - batch_norm_backward_reduce
 # - bernoulli_.Tensor

@@ -316,8 +320,6 @@ supported:
 # - logit.out
 # - glu
 # - glu.out
-# - glu_backward
-# - glu_backward.grad_input
 - glu_backward_jvp
 - glu_jvp
 # - gt.Scalar

@@ -355,8 +357,6 @@ supported:
 - _unsafe_index.Tensor
 # - index_fill_.int_Scalar
 # - index_fill_.int_Tensor
-# - index_select
-# - index_select.out
 # - index_add.out
 # - inverse
 # - inverse.out

tests/gpu/examples/test_batch_norm.py

Lines changed: 0 additions & 3 deletions
@@ -593,9 +593,6 @@ def test_batch_norm_update_stats_simple(self):
         self.assertEqual(save_mean_cpu, save_mean_dpcpp.to(cpu_device))
         self.assertEqual(save_var_cpu, save_var_dpcpp.to(cpu_device))
 
-    @pytest.mark.skip(
-        reason="PT2.5: TensorAccessor expected 1 dims but tensor has 4",
-    )
     def test_batch_norm_legit_simple(self):
         input_cpu = torch.randn(1, 2, 3, 3, dtype=torch.float, device=cpu_device)
         n_input = input_cpu.size(1)

tests/gpu/examples/test_cat_array.py

Lines changed: 0 additions & 3 deletions
@@ -135,9 +135,6 @@ def test_cat_block_layout(self, dtype=torch.float):
     @pytest.mark.skipif(
         torch.xpu.device_count() == 1, reason="doesn't support with one device"
     )
-    @pytest.mark.skip(
-        reason="PT2.5: Native API failed. Native API returns: -36 (PI_ERROR_INVALID_QUEUE) -36 (PI_ERROR_INVALID_QUEUE)",
-    )
     def test_cat_multi_device(self, dtype=torch.float):
         x_cpu1 = torch.randn([1, 2, 28, 28], device=cpu_device)
         x_cpu2 = torch.randn([1, 2, 28, 28], device=cpu_device)

tests/gpu/examples/test_conv.py

Lines changed: 0 additions & 3 deletions
@@ -639,7 +639,6 @@ def test_group_conv3d_channels_last(self, dtype=torch.float):
         not torch.xpu.has_channels_last_1d() or torch.xpu.using_onednn_layout(),
         reason="doesn't enable channels last 1d or channels last does not support onednn block format",
     )
-    @pytest.mark.skip(reason="PT2.5: Tensor-likes are not close!")
     def test_channels_last_1d_fwd(self, dtype=torch.float):
         shapes = [
             (2, 2, 3),

@@ -708,7 +707,6 @@ def test_channels_last_1d_fwd(self, dtype=torch.float):
         not torch.xpu.has_channels_last_1d() or torch.xpu.using_onednn_layout(),
         reason="doesn't enable channels last 1d or channels last does not support onednn block format",
     )
-    @pytest.mark.skip(reason="PT2.5: Tensor-likes are not close!")
     def test_channels_last_1d_bwd(self, dtype=torch.float):
         shapes = [
             (1, 7, 15000),

@@ -978,7 +976,6 @@ def test_conv2d_bia_bf16_input_bf16_bia(self, dtype=torch.float):
         not torch.xpu.has_channels_last_1d() or torch.xpu.using_onednn_layout(),
         reason="doesn't enable channels last 1d or channels last does not support onednn block format",
     )
-    @pytest.mark.skip(reason="PT2.5: Tensor-likes are not close!")
     def test_channels_last_1d_bwd_no_grad(self, dtype=torch.float):
         shapes = [
             (1, 7, 15000),

tests/gpu/examples/test_embedding_bag.py

Lines changed: 0 additions & 3 deletions
@@ -61,9 +61,6 @@ def test_embedding_bag_all(self, dtype=torch.float32):
             rtol=1e-5,
         )
 
-    @pytest.mark.skip(
-        reason="PT2.5: Assertion `vec_idx < num_row` failed",
-    )
     def test_embeddingbag_out_of_bounds(self):
         stderr = TestCase.runWithPytorchAPIUsageStderr(
             f"""\

tests/gpu/examples/test_fp8_index_select.py

Lines changed: 0 additions & 5 deletions
@@ -5,13 +5,8 @@
     cast_to_fp8,
 )
 
-import pytest
-
 
 class TestTorchMethod(TestCase):
-    @pytest.mark.skip(
-        reason="PT2.5: 'index_select_xpu' not implemented for 'Float8_e4m3fn'"
-    )
     def test_index_select(self, dtype=torch.float):
         dim_size = 10
         dims = 3
