
Commit c44e040

[XPU] fix fleet unittests (#68542)
* [XPU] fix fleet unittests
* [XPU] fix fleet unittests
* refine: use new default parameter
* revert unnecessary modifications.
* revert unnecessary modifications.
* fix cmakelist
* revert unnecessary modifications.
* fix cmakelist for recompute ut.
1 parent f1c54e9 commit c44e040

14 files changed: +111 -28 lines changed
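The recurring change across the Python files below is the device default: instead of hard-coding device="gpu", the sharding wrappers now derive the default from the build. A minimal sketch of that pattern, using only core.is_compiled_with_xpu() as it appears in the diff (the print is illustrative):

from paddle.framework import core

# Build-derived default device, as used in the new default parameters below:
# an XPU build defaults to "xpu", every other build keeps "gpu".
default_device = "xpu" if core.is_compiled_with_xpu() else "gpu"
print(default_device)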

paddle/phi/api/lib/data_transform.cc

Lines changed: 32 additions & 0 deletions
@@ -143,6 +143,33 @@ phi::DenseTensor CastDataType(const phi::GPUContext& dev_ctx,
 }
 #endif
 
+#ifdef PADDLE_WITH_XPU
+phi::DenseTensor CastDataType(const phi::XPUContext& dev_ctx,
+                              const phi::DenseTensor& tensor,
+                              DataType dtype) {
+  switch (tensor.dtype()) {
+    case DataType::FLOAT32:
+      return phi::Cast<float>(dev_ctx, tensor, dtype);
+    case DataType::FLOAT64:
+      return phi::Cast<double>(dev_ctx, tensor, dtype);
+    case DataType::INT32:
+      return phi::Cast<int32_t>(dev_ctx, tensor, dtype);
+    case DataType::INT64:
+      return phi::Cast<int64_t>(dev_ctx, tensor, dtype);
+    case DataType::FLOAT16:
+      return phi::Cast<phi::dtype::float16>(dev_ctx, tensor, dtype);
+    case DataType::BOOL:
+      return phi::Cast<bool>(dev_ctx, tensor, dtype);
+    case DataType::UINT8:
+      return phi::Cast<uint8_t>(dev_ctx, tensor, dtype);
+    default:
+      PADDLE_THROW(common::errors::Unimplemented(
+          "Data type (%s) is not supported when casting data type.",
+          tensor.dtype()));
+  }
+}
+#endif
+
 inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor,
                                       DataType dtype) {
   auto& pool = phi::DeviceContextPool::Instance();
@@ -161,6 +188,11 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor,
     auto* dev_ctx = static_cast<phi::GPUContext*>(pool.Get(tensor.place()));
     return CastDataType(*dev_ctx, tensor, dtype);
 #endif
+#ifdef PADDLE_WITH_XPU
+  } else if (tensor.place().GetType() == phi::AllocationType::XPU) {
+    auto* dev_ctx = static_cast<phi::XPUContext*>(pool.Get(tensor.place()));
+    return CastDataType(*dev_ctx, tensor, dtype);
+#endif
 #ifdef PADDLE_WITH_CUSTOM_DEVICE
   } else if (tensor.place().GetType() == phi::AllocationType::CUSTOM) {
     phi::DenseTensor out;
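For context, a rough Python-level sketch of the kind of call that can reach the new XPU cast path on an XPU build. Whether a given op actually goes through TransDataType depends on the dtype its kernel requests, so treat this as illustrative rather than a guaranteed trigger:

import paddle

if paddle.is_compiled_with_xpu():
    paddle.device.set_device("xpu:0")  # assumes at least one XPU card is visible
    x = paddle.ones([2, 3], dtype="float32")
    # Dtype conversion on an XPU tensor now has a dedicated CastDataType
    # overload instead of falling through to the unimplemented-type error.
    y = paddle.cast(x, "float16")
    print(y.dtype, y.place)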

python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py

Lines changed: 7 additions & 1 deletion
@@ -74,7 +74,7 @@ def __init__(
         optim,
         group=None,
         offload=False,
-        device="gpu",
+        device="xpu" if core.is_compiled_with_xpu() else "gpu",
         pretrain_sync_models=True,
         dp_group=None,
         **kw,
@@ -590,6 +590,12 @@ def _step(self):
                         )
                         .cast(dtype=param.dtype)
                     )
+                elif self._default_device == "xpu":
+                    param.set_value(
+                        self._master_params[param.name]
+                        .to("xpu:" + str(self.dev_id))
+                        .cast(dtype=param.dtype)
+                    )
                 else:
                     param.set_value(
                         self._master_params[param.name]

python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py

Lines changed: 2 additions & 1 deletion
@@ -31,6 +31,7 @@
 from paddle import nn
 from paddle.distributed import collective
 from paddle.distributed.utils.log_utils import get_logger
+from paddle.framework import core
 
 from .group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2
 from .group_sharded_storage import GradStorage
@@ -66,7 +67,7 @@ def __init__(
         sync_buffers=False,
         buffer_max_size=2**23,  # 8MB
         auto_refresh_trainable=True,
-        device="gpu",
+        device="xpu" if core.is_compiled_with_xpu() else "gpu",
         dp_group=None,
     ):
         super().__init__()

python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py

Lines changed: 7 additions & 3 deletions
@@ -104,7 +104,7 @@ def __init__(
         optimizer,
         group=None,
         sync_buffers=False,
-        device="gpu",
+        device="xpu" if core.is_compiled_with_xpu() else "gpu",
         segment_size=2**20,
         pretrain_sync_models=True,
         offload=False,
@@ -310,7 +310,10 @@ def _clear_gradients(self):
                     paddle.CustomPlace(self._default_device, DEV_ID), True
                 )
             else:
-                tmp_var = param.cuda(DEV_ID)
+                # both GPU and XPU
+                tmp_var = param.to(
+                    self._default_device + ":" + (str)(DEV_ID)
+                )
 
             if (
                 tmp_var.dtype == Type.fp32.value
@@ -1197,7 +1200,8 @@ def _cpu2device(param):
     if DEV in paddle.device.get_all_custom_device_type():
         tmp_p = param.fw_storage._copy_to(paddle.CustomPlace(DEV, DEV_ID), True)
     else:
-        tmp_p = param.fw_storage.cuda(DEV_ID)
+        # both GPU and XPU
+        tmp_p = param.fw_storage.to(DEV + ":" + (str)(DEV_ID))
     if (
         tmp_p.dtype == Type.fp32.value
         and param2dtype[param.name] == Type.fp16.value
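The stage-3 changes replace the CUDA-only Tensor.cuda(DEV_ID) with the device-string form of Tensor.to, which covers both GPU and XPU. A small self-contained illustration of the pattern (device id 0 is assumed for the example; the real code uses the rank's DEV_ID):

import paddle

dev = "xpu" if paddle.is_compiled_with_xpu() else "gpu"
dev_id = 0  # single-card example only

t = paddle.zeros([4], dtype="float32")
t_dev = t.to(dev + ":" + str(dev_id))  # moves to "gpu:0" or "xpu:0" alike
print(t_dev.place)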

python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py

Lines changed: 2 additions & 0 deletions
@@ -167,6 +167,8 @@ def _dygraph_clip(self, params_grads):
             global_norm_var = global_norm_var._copy_to(
                 paddle.CustomPlace(dev_type, dev_id), True
             )
+        elif dev_type == "xpu":
+            global_norm_var = global_norm_var.to(self._device)
         else:
             global_norm_var = global_norm_var.cuda(dev_id)
 
test/collective/fleet/CMakeLists.txt

Lines changed: 15 additions & 4 deletions
@@ -61,8 +61,13 @@ if((WITH_ROCM) AND LOCAL_ALL_PLAT)
     "PADDLE_DIST_UT_PORT=21204;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
   )
 endif()
-if(WITH_NCCL OR WITH_RCCL)
-  if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
+if(WITH_NCCL
+   OR WITH_RCCL
+   OR WITH_XPU_BKCL)
+  if((WITH_GPU
+      OR WITH_ROCM
+      OR WITH_XPU)
+     AND LOCAL_ALL_PLAT)
     bash_test_modules(
       test_parallel_dygraph_mp_layers
       START_BASH
@@ -608,13 +613,19 @@ if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
   set_tests_properties(test_imperative_auto_mixed_precision_for_eager
                        PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST")
 endif()
-if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
+if((WITH_GPU
+    OR WITH_ROCM
+    OR WITH_XPU)
+   AND LOCAL_ALL_PLAT)
   py_test_modules(
     test_dygraph_recompute_for_eager MODULES test_dygraph_recompute_for_eager
     ENVS
     "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
 endif()
-if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
+if((WITH_GPU
+    OR WITH_ROCM
+    OR WITH_XPU)
+   AND LOCAL_ALL_PLAT)
   py_test_modules(
     test_dygraph_recompute MODULES test_dygraph_recompute ENVS
     "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")

test/collective/fleet/dygraph_group_sharded_stage2.py

Lines changed: 12 additions & 4 deletions
@@ -99,7 +99,9 @@ def train_mlp(
     scale_fn_test=False,
 ):
     if sharding_stage != "dp":
-        group = paddle.distributed.new_group([0, 1], backend="nccl")
+        group = paddle.distributed.new_group(
+            [0, 1], backend="bkcl" if paddle.is_compiled_with_xpu() else "nccl"
+        )
     if opt_group:
         optimizer = optimizer_setting(
             model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group
@@ -149,7 +151,7 @@ def train_mlp(
         )
 
     if sharding_stage == 2:
-        model.to(device="gpu")
+        model.to(device="xpu" if paddle.is_compiled_with_xpu() else "gpu")
 
     for eop in range(epoch):
         model.train()
@@ -210,7 +212,10 @@ def test_dp_stage2():
     )
     for i in range(len(dp_params)):
         np.testing.assert_allclose(
-            dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6
+            dp_params[i].numpy(),
+            stage2_params[i].numpy(),
+            rtol=1e-6,
+            atol=1e-8 if paddle.is_compiled_with_xpu() else 0,
         )
 
     # stage2 accumulate grad
@@ -232,7 +237,10 @@ def test_dp_stage2():
     )
     for i in range(len(dp_params)):
         np.testing.assert_allclose(
-            dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6
+            dp_params[i].numpy(),
+            stage2_params[i].numpy(),
+            rtol=1e-6,
+            atol=1e-8 if paddle.is_compiled_with_xpu() else 0,
        )
 
     # save/load model
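The looser check on XPU adds an absolute tolerance; assert_allclose passes when |actual - desired| <= atol + rtol * |desired|, so a small atol only matters for entries near zero, where a purely relative tolerance is effectively impossible to satisfy. A toy illustration with made-up values:

import numpy as np

a = np.array([1e-9, 1.0])
b = np.array([0.0, 1.0])

# With rtol only this would fail: |1e-9 - 0| is not <= 1e-6 * |0|.
# np.testing.assert_allclose(a, b, rtol=1e-6)

# Passes once atol=1e-8 absorbs the near-zero difference.
np.testing.assert_allclose(a, b, rtol=1e-6, atol=1e-8)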

test/collective/fleet/dygraph_group_sharded_stage2_comm_overlap.py

Lines changed: 16 additions & 5 deletions
@@ -98,7 +98,9 @@ def train_mlp(
     test_minimize=False,
 ):
     if sharding_stage != "dp":
-        group = paddle.distributed.new_group([0, 1], backend="nccl")
+        group = paddle.distributed.new_group(
+            [0, 1], backend="bkcl" if paddle.is_compiled_with_xpu() else "nccl"
+        )
     if opt_group:
         optimizer = optimizer_setting(
             model=model, use_pure_fp16=use_pure_fp16, opt_group=opt_group
@@ -140,7 +142,7 @@ def train_mlp(
         )
 
     if sharding_stage == 2:
-        model.to(device="gpu")
+        model.to(device="xpu" if paddle.is_compiled_with_xpu() else "gpu")
 
     for eop in range(epoch):
         model.train()
@@ -166,7 +168,10 @@ def train_mlp(
         optimizer.step()
         optimizer.clear_grad()
 
-    paddle.device.cuda.synchronize()
+    if paddle.is_compiled_with_xpu():
+        paddle.device.xpu.synchronize()
+    else:
+        paddle.device.cuda.synchronize()
 
     if save_model:
         return model, optimizer
@@ -201,7 +206,10 @@ def test_dp_stage2():
     )
     for i in range(len(dp_params)):
         np.testing.assert_allclose(
-            dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6
+            dp_params[i].numpy(),
+            stage2_params[i].numpy(),
+            rtol=1e-6,
+            atol=1e-8 if paddle.is_compiled_with_xpu() else 0,
         )
 
     # stage2 accumulate grad
@@ -223,7 +231,10 @@ def test_dp_stage2():
     )
     for i in range(len(dp_params)):
         np.testing.assert_allclose(
-            dp_params[i].numpy(), stage2_params[i].numpy(), rtol=1e-6
+            dp_params[i].numpy(),
+            stage2_params[i].numpy(),
+            rtol=1e-6,
+            atol=1e-8 if paddle.is_compiled_with_xpu() else 0,
         )
 
     # save/load model
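The backend-aware synchronization above recurs in these tests; if it were ever factored out, a small helper along these lines would do (a sketch only, not part of the commit, using just the two synchronize calls already shown in the diff):

import paddle

def device_synchronize():
    # Wait for all queued work on the current card, whichever backend was built in.
    if paddle.is_compiled_with_xpu():
        paddle.device.xpu.synchronize()
    else:
        paddle.device.cuda.synchronize()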

test/collective/fleet/dygraph_group_sharded_stage2_offload.py

Lines changed: 4 additions & 1 deletion
@@ -94,7 +94,10 @@ def train_mlp(model, offload=False, test=False):
 
     for dtype in optimizer.param_storages:
         for dst_rank, param_storage in optimizer.param_storages[dtype].items():
-            param_storage.to(device="gpu", dtype=dtype)
+            param_storage.to(
+                device="xpu" if paddle.is_compiled_with_xpu() else "gpu",
+                dtype=dtype,
+            )
 
     return model.parameters()
 
test/collective/fleet/dygraph_group_sharded_stage3.py

Lines changed: 2 additions & 3 deletions
@@ -366,10 +366,9 @@ def test_stage2_stage3():
     )
 
     # bfp16
-    nccl_version = core.nccl_version()
-
     if (
-        nccl_version >= 21000
+        paddle.is_compiled_with_xpu()
+        or core.nccl_version() >= 21000
         and paddle.device.cuda.get_device_properties().major >= 8
     ):
         stage2_params = train_mlp(
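One subtlety in the new condition: Python's and binds tighter than or, so it reads as is_compiled_with_xpu() or (nccl_version() >= 21000 and major >= 8), and or short-circuits, meaning the CUDA-only queries never run on an XPU build. A toy check of that precedence and short-circuit behavior (names are illustrative stand-ins):

def cuda_only_check():
    raise RuntimeError("would query CUDA device properties")

on_xpu = True  # stands in for paddle.is_compiled_with_xpu() on an XPU build
# A or B and C parses as A or (B and C); with A truthy, B and C are skipped.
assert on_xpu or (cuda_only_check() and cuda_only_check())
print("short-circuited, CUDA-only checks never ran")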
