
Commit bf7747e

rahulsingh-intel authored and guangyey committed
Test generalization for multiple accelerator devices (pytorch#139184)
Motivation: Generalize unit tests so that they can be executed on CUDA and non-CUDA devices. Dependency: pytorch#133209, now merged. An earlier pytorch#135242 existed for these changes and was closed due to incorrect commits; I have incorporated the changes suggested in its review comments. @kwen2501 @zeshengzong Please review the changes.
Pull Request resolved: pytorch#139184
Approved by: https://github.com/kwen2501
Co-authored-by: Yu, Guangye <[email protected]>
1 parent 2e1ea85 commit bf7747e

24 files changed (+414 / -379 lines)
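Before the per-file diffs, the pattern the commit applies everywhere is worth stating once: resolve the available accelerator a single time, then route tensor placement and backend utilities through that handle instead of hard-coding torch.cuda. The snippet below is a minimal sketch of that idea, not the test-suite helper itself; the real tests import get_devtype from torch.testing._internal.common_fsdp, and the CUDA-or-CPU fallback here is an assumption for illustration.

import torch

def get_devtype_sketch() -> str:
    # Stand-in for the internal get_devtype() helper: prefer CUDA when
    # present, otherwise fall back to CPU (the real helper also knows
    # about other accelerators such as hpu/xpu).
    return "cuda" if torch.cuda.is_available() else "cpu"

device_type = torch.device(get_devtype_sketch())

# Placement goes through the resolved device rather than .cuda(), and
# per-backend utilities are reached via torch.get_device_module(...),
# exactly as the hunks below do.
x = torch.randn(4, 4, device=device_type)
if device_type.type != "cpu":
    torch.get_device_module(device_type.type).reset_peak_memory_stats()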

test/distributed/fsdp/test_checkpoint_wrapper.py

Lines changed: 15 additions & 11 deletions
@@ -16,13 +16,16 @@
     OffloadWrapper,
 )
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
+from torch.testing._internal.common_fsdp import get_devtype
 from torch.testing._internal.common_utils import run_tests, TestCase
 from torch.utils.checkpoint import checkpoint
 
 
 _SAVED_PREFIX = "_saved_"
 GRAD_FN_NEXT_FUNCTIONS = "next_functions"
 
+device_type = torch.device(get_devtype())
+
 
 class CheckpointWrapperTest(TestCase):
     def test_load_activation_checkpointed_module(self):
@@ -130,7 +133,7 @@ def get_ctx_mgrs():
         m(torch.randn(2, 1)).sum().backward()
         self.assertEqual(2, count)
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA")
+    @unittest.skip
     def test_checkpoint_wrapper_parity(self):
         """
         Tests that using checkpoint_wrapper or the functional
@@ -155,9 +158,11 @@ def __init__(
                self.use_reentrant = use_reentrant
                wrp = partial(
                    checkpoint_wrapper,
-                   checkpoint_impl=CheckpointImpl.REENTRANT
-                   if use_reentrant
-                   else CheckpointImpl.NO_REENTRANT,
+                   checkpoint_impl=(
+                       CheckpointImpl.REENTRANT
+                       if use_reentrant
+                       else CheckpointImpl.NO_REENTRANT
+                   ),
                )
                for _ in range(self.n):
                    l = nn.Sequential(
@@ -184,12 +189,12 @@ def test(use_checkpointing, use_wrapper, use_reentrant):
                use_checkpointing,
                use_wrapper=use_wrapper,
                use_reentrant=use_reentrant,
-            ).cuda()
-            x = torch.randn(10000, 256, requires_grad=True).cuda()
-            torch.cuda.reset_peak_memory_stats()
+            ).to(device_type.type)
+            x = torch.randn(10000, 256, requires_grad=True).to(device_type.type)
+            torch.get_device_module(device_type.type).reset_peak_memory_stats()
             loss = a(x).sum()
             loss.backward()
-            return torch.cuda.max_memory_allocated()
+            return torch.get_device_module(device_type.type).max_memory_allocated()
 
         functional_no_reentrant = test(
             use_checkpointing=True, use_wrapper=False, use_reentrant=False
@@ -333,13 +338,12 @@ def test_fqn(self):
         for fqn, _ in lin.named_parameters():
             self.assertTrue(fqn in state_dict, msg=f"{fqn} not in state_dict.")
 
-    @unittest.skipIf(not torch.cuda.is_available(), "Test requires CUDA")
     def test_checkpoint_wrapper_cpu_offload(self):
         model = nn.Sequential(
             nn.Linear(10, 10),
             nn.Linear(10, 10),
             nn.Linear(10, 10),
-        ).cuda()
+        ).to(device_type.type)
 
         # Patch saved_tensor_hooks to make the unpack keep the tensor on CPU for
         # testing, otherwise the tensor access during the DFS will cause orig
@@ -358,7 +362,7 @@ def testing_cpu_offload_unpack_hook(packed):
 
         model = offload_wrapper(model)
 
-        inp = torch.randn(3, 10, device="cuda")
+        inp = torch.randn(3, 10, device=device_type.type)
         loss = model(inp).sum()
 
         # All autograd saved tensors should be offloaded to CPU.
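The parity test above now measures peak memory through the resolved backend module rather than torch.cuda. Below is a condensed, hedged sketch of that measurement, with the tensor shape taken from the hunk and the helper name being hypothetical.

import torch
import torch.nn as nn

def peak_memory_after_backward(model: nn.Module, device: torch.device) -> int:
    # Reset the backend's peak-memory counter, run forward/backward, and
    # return the high-water mark, mirroring the rewritten test() helper.
    torch.get_device_module(device.type).reset_peak_memory_stats()
    x = torch.randn(10000, 256, requires_grad=True).to(device.type)
    model(x).sum().backward()
    return torch.get_device_module(device.type).max_memory_allocated()

# Usage sketch, assuming an accelerator is available and device_type is the
# module-level handle defined from get_devtype():
# peak = peak_memory_after_backward(nn.Linear(256, 256).to(device_type.type), device_type)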

test/distributed/fsdp/test_distributed_checkpoint.py

Lines changed: 3 additions & 3 deletions
@@ -8,10 +8,10 @@
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP, StateDictType
 from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
 from torch.distributed.fsdp.wrap import enable_wrap, wrap
+from torch.testing._internal.common_device_type import instantiate_device_type_tests
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import FSDPTest, SkipModel
 from torch.testing._internal.common_utils import (
-    instantiate_parametrized_tests,
     parametrize,
     run_tests,
     TEST_WITH_DEV_DBG_ASAN,
@@ -85,7 +85,7 @@ def test_distributed_checkpoint(self, state_dict_type) -> None:
         # TODO: add resharding test case.
 
 
-instantiate_parametrized_tests(TestDistributedCheckpoint)
-
+devices = ("cuda", "hpu")
+instantiate_device_type_tests(TestDistributedCheckpoint, globals(), only_for=devices)
 if __name__ == "__main__":
     run_tests()
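The switch from instantiate_parametrized_tests to instantiate_device_type_tests is what fans one test class out per backend. Here is a self-contained sketch of that mechanism, using a hypothetical class and test name rather than the ones in this file.

import torch
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import run_tests, TestCase

class DeviceFanoutExample(TestCase):  # hypothetical class for illustration
    def test_tensor_lands_on_device(self, device):
        # Generated tests receive the device string ("cuda:0", "hpu:0", ...).
        t = torch.ones(2, device=device)
        self.assertEqual(t.device.type, torch.device(device).type)

# Creates per-backend classes such as DeviceFanoutExampleCUDA for the device
# types registered with the framework, filtered by only_for.
devices = ("cuda", "hpu")
instantiate_device_type_tests(DeviceFanoutExample, globals(), only_for=devices)

if __name__ == "__main__":
    run_tests()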

test/distributed/fsdp/test_fsdp_apply.py

Lines changed: 12 additions & 0 deletions
@@ -6,11 +6,13 @@
 import torch.distributed as dist
 import torch.nn as nn
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
+from torch.testing._internal.common_device_type import instantiate_device_type_tests
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
 from torch.testing._internal.common_fsdp import (
     DEVICEInitMode,
     FSDPInitMode,
     FSDPTest,
+    get_devtype,
     NestedWrappedModule,
     TransformerWithSharedParams,
 )
@@ -28,6 +30,8 @@
     )
     sys.exit(0)
 
+device_type = torch.device(get_devtype())
+
 
 class TestApply(FSDPTest):
     @property
@@ -67,37 +71,45 @@ def _check_apply(self, fsdp):
     def test_nested_module_apply(self):
         """Tests that ``apply()`` modifies parameter values in-place on a
         non-FSDP-root nested FSDP-wrapped model."""
+        fsdp_kwargs = {"device_id": device_type.type}
         nested_wrapped_module = NestedWrappedModule.init(
             self.process_group,
             FSDPInitMode.RECURSIVE,
             DEVICEInitMode.DEVICE_AFTER,
+            fsdp_kwargs=fsdp_kwargs,
         )
         self._check_apply(nested_wrapped_module)
 
     @skip_if_lt_x_gpu(2)
     def test_transformer_module_apply(self):
         """Tests that ``apply()`` modifies parameter values in-place on an
         FSDP-wrapped transformer model with shared parameters."""
+        fsdp_kwargs = {"device_id": device_type.type}
         transformer = TransformerWithSharedParams.init(
             self.process_group,
             FSDPInitMode.RECURSIVE,
             DEVICEInitMode.DEVICE_AFTER,
+            fsdp_kwargs=fsdp_kwargs,
         )
         self._check_apply(transformer)
 
     @skip_if_lt_x_gpu(2)
     def test_apply_in_summon_raises_error(self):
         """Tests that calling ``apply()`` on an FSDP instance inside the
         ``summon_full_params()`` context raises an error."""
+        fsdp_kwargs = {"device_id": device_type.type}
         transformer = TransformerWithSharedParams.init(
             self.process_group,
             FSDPInitMode.RECURSIVE,
             DEVICEInitMode.DEVICE_AFTER,
+            fsdp_kwargs=fsdp_kwargs,
         )
         with transformer.summon_full_params(transformer):
             with self.assertRaisesRegex(ValueError, "expected to be in states"):
                 transformer.apply(self._init_linear_weights)
 
 
+devices = ("cuda", "hpu")
+instantiate_device_type_tests(TestApply, globals(), only_for=devices)
 if __name__ == "__main__":
     run_tests()
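The recurring change in this file is threading the resolved device into FSDP construction through fsdp_kwargs instead of leaving the CUDA default implicit. Below is a hedged sketch of what that amounts to at the FSDP call site; it assumes the test helpers forward fsdp_kwargs to the FSDP constructor and that a process group is already initialized, as it is inside FSDPTest.

import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

# device_id tells FSDP which accelerator to place and shard the module on,
# replacing the earlier implicit torch.cuda.current_device() assumption.
# device_type is the module-level handle defined from get_devtype() above.
fsdp_kwargs = {"device_id": device_type.type}
wrapped = FSDP(nn.Linear(8, 8), **fsdp_kwargs)  # requires an initialized default process group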

test/distributed/fsdp/test_fsdp_backward_prefetch.py

Lines changed: 7 additions & 6 deletions
@@ -16,10 +16,12 @@
 )
 from torch.distributed.fsdp.wrap import ModuleWrapPolicy
 from torch.testing._internal.common_distributed import skip_if_lt_x_gpu
-from torch.testing._internal.common_fsdp import FSDPTest
+from torch.testing._internal.common_fsdp import FSDPTest, get_devtype
 from torch.testing._internal.common_utils import run_tests, TEST_WITH_DEV_DBG_ASAN
 
 
+device_type = torch.device(get_devtype())
+
 NUM_ITERS = 2
 DECODER_PARAM_FQNS = [
     "decoder.layers.{index}.self_attn.in_proj_weight",
@@ -81,14 +83,13 @@ def world_size(self):
     def _dist_train(self, backward_prefetch=BackwardPrefetch.BACKWARD_PRE):
         rank = self.rank
         orig_get_handle_to_prefetch = _get_handle_to_prefetch
-
         torch.manual_seed(0)
         policy = ModuleWrapPolicy(
             {nn.TransformerEncoderLayer, nn.TransformerDecoderLayer}
         )
         model = FSDP(
-            nn.Transformer(d_model=1024, nhead=8, device="cuda"),
-            device_id=torch.cuda.current_device(),
+            nn.Transformer(d_model=1024, nhead=8, device=device_type),
+            device_id=device_type.type,
             auto_wrap_policy=policy,
             use_orig_params=True,
             backward_prefetch=backward_prefetch,
@@ -97,8 +98,8 @@ def _dist_train(self, backward_prefetch=BackwardPrefetch.BACKWARD_PRE):
 
         # prepare input
         torch.manual_seed(rank + 1)
-        src = torch.randn((10, 1, 1024), device="cuda")
-        tgt = torch.randn((20, 1, 1024), device="cuda")
+        src = torch.randn((10, 1, 1024), device=device_type)
+        tgt = torch.randn((20, 1, 1024), device=device_type)
 
         # monkey patch
         all_handle_fqns: List[List[str]] = []
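The same substitution closes out the backward-prefetch test: the transformer, its FSDP wrapper, and the inputs are all created against the resolved device. A condensed sketch of that setup follows, with the dimensions copied from the hunks; it assumes the process group has already been initialized by the test harness.

import torch
import torch.nn as nn
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.wrap import ModuleWrapPolicy

policy = ModuleWrapPolicy({nn.TransformerEncoderLayer, nn.TransformerDecoderLayer})
model = FSDP(
    nn.Transformer(d_model=1024, nhead=8, device=device_type),  # built directly on the accelerator
    device_id=device_type.type,
    auto_wrap_policy=policy,
    use_orig_params=True,
)
src = torch.randn((10, 1, 1024), device=device_type)
tgt = torch.randn((20, 1, 1024), device=device_type)
model(src, tgt).sum().backward()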
