
Commit c4565c3

RohitRathore1 authored and pytorchmergebot committed
[distributed] Replace 164 assert statements in fsdp directory (pytorch#165235)
Replace assert statements with explicit if/raise patterns across 20 files:

- _optim_utils.py (38 asserts)
- _flat_param.py (25 asserts)
- _fully_shard/_fsdp_param.py (23 asserts)
- sharded_grad_scaler.py (12 asserts)
- fully_sharded_data_parallel.py (11 asserts)
- wrap.py (10 asserts)
- _state_dict_utils.py (9 asserts)
- _fully_shard/_fsdp_param_group.py (8 asserts)
- _runtime_utils.py (6 asserts)
- _init_utils.py (6 asserts)
- 10 additional files (16 asserts)

This prevents the checks from being silently disabled when Python runs with the -O flag.

Partially fixes pytorch#164878

Pull Request resolved: pytorch#165235
Approved by: https://github.com/albanD
1 parent 6918f17 · commit c4565c3

20 files changed: +595 -328 lines
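
For context on why the change matters: CPython strips `assert` statements when the interpreter runs with the -O flag (`__debug__` becomes False), so validation written as a bare assert silently disappears in optimized runs, while an explicit raise always executes. Below is a minimal sketch of the pattern this commit applies throughout; the `check_world_size_*` helpers are illustrative examples, not FSDP APIs.

# Illustrative sketch only: these helpers are made up for the example.

def check_world_size_with_assert(world_size):
    # Compiled away under `python -O` (__debug__ is False), so the check can silently vanish.
    assert world_size is not None, "Expected world_size to not be None"
    return world_size


def check_world_size_explicit(world_size):
    # Runs unconditionally, regardless of the -O flag.
    if world_size is None:
        raise AssertionError("Expected world_size to not be None")
    return world_size

Raising AssertionError explicitly keeps the exception type that existing callers expect while making the check unconditional.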

torch/distributed/fsdp/_common_utils.py

Lines changed: 8 additions & 6 deletions
@@ -203,9 +203,10 @@ def _module_handle(state: _FSDPState, module: nn.Module) -> Optional["FlatParamH
         # handles, meaning no entry in `_fully_sharded_module_to_handles`
         if state._handle is None:
             return None
-        assert module in state._fully_sharded_module_to_handle, (
-            f"Expects a fully sharded module but got {module} on rank {state.rank}"
-        )
+        if module not in state._fully_sharded_module_to_handle:
+            raise AssertionError(
+                f"Expects a fully sharded module but got {module} on rank {state.rank}"
+            )
         return state._fully_sharded_module_to_handle[module]
     else:
         # NOTE: This assumes `module` is a `FullyShardedDataParallel` instance.
@@ -258,9 +259,10 @@ def _named_parameters_with_duplicates(
     This API is required as some modules overwrite `named_parameters()` but do not support
     `remove_duplicate`.
     """
-    assert "remove_duplicate" not in kwargs, (
-        "_named_parameters_with_duplicates cannot be used with `remove_duplicate` argument."
-    )
+    if "remove_duplicate" in kwargs:
+        raise AssertionError(
+            "_named_parameters_with_duplicates cannot be used with `remove_duplicate` argument."
+        )
     kwargs["remove_duplicate"] = False
     try:
         ret = list(module.named_parameters(**kwargs))

torch/distributed/fsdp/_debug_utils.py

Lines changed: 8 additions & 6 deletions
@@ -39,11 +39,12 @@ def reset(cls) -> None:
     @classmethod
     @contextmanager
     def profile(cls, profile_type: str) -> Iterator[None]:
-        assert profile_type not in cls.profiling, (
-            f"{profile_type} is already being profiled. "
-            "SimpleProfiler does not support profiling multiple instances at "
-            "the same time. "
-        )
+        if profile_type in cls.profiling:
+            raise AssertionError(
+                f"{profile_type} is already being profiled. "
+                "SimpleProfiler does not support profiling multiple instances at "
+                "the same time. "
+            )
 
         cls.profiling.add(profile_type)
         begin = time.monotonic()
@@ -129,7 +130,8 @@ def module_fn(
 
         if handle:
             param = handle.flat_param
-            assert isinstance(param, flat_param_file.FlatParameter)
+            if not isinstance(param, flat_param_file.FlatParameter):
+                raise AssertionError(f"Expected FlatParameter, got {type(param)}")
             global_fqns = [
                 clean_tensor_name(prefix + name) for name in param._fqns
             ]  # prefixed from the top level `model` (i.e. including `prefix`)

torch/distributed/fsdp/_exec_order_utils.py

Lines changed: 2 additions & 1 deletion
@@ -214,7 +214,8 @@ def _check_order(self, handle: FlatParamHandle, is_training: bool) -> None:
             # parameters
             # TODO (awgu): Since every module has at most one handle in the
             # current implementation, this should never raise the error.
-            assert self.world_size is not None  # mypy
+            if self.world_size is None:
+                raise AssertionError("Expected world_size to not be None")
             if not torch.distributed._functional_collectives.is_torchdynamo_compiling():
                 # TODO(voz): Don't graph break on this - dynamo hates the n1 != n2
                 # tensor comparison control flow.

torch/distributed/fsdp/_flat_param.py

Lines changed: 85 additions & 39 deletions
@@ -360,7 +360,8 @@ class FlatParameter(nn.Parameter, metaclass=_FlatParameterMeta):
     _is_padding_mask: list[bool]
 
     def __new__(cls, data=None, requires_grad=True):
-        assert cls is FlatParameter, "subclasses FlatParameter not supported"
+        if cls is not FlatParameter:
+            raise AssertionError("subclasses FlatParameter not supported")
         r = nn.Parameter.__new__(nn.Parameter, data, requires_grad)  # type: ignore[call-arg]
         r._is_flat_param = True  # type: ignore[attr-defined]
         return r
@@ -398,11 +399,26 @@ def _init_metadata(
         Args:
             See the Attributes in the class docstring.
         """
-        assert len(param_infos) == len(shapes)
-        assert len(param_infos) == len(strides)
-        assert len(param_infos) == len(contiguities)
-        assert len(param_infos) == len(fqns)
-        assert len(param_infos) == len(param_extensions)
+        if len(param_infos) != len(shapes):
+            raise AssertionError(
+                f"Expected param_infos length {len(param_infos)} to match shapes length {len(shapes)}"
+            )
+        if len(param_infos) != len(strides):
+            raise AssertionError(
+                f"Expected param_infos length {len(param_infos)} to match strides length {len(strides)}"
+            )
+        if len(param_infos) != len(contiguities):
+            raise AssertionError(
+                f"Expected param_infos length {len(param_infos)} to match contiguities length {len(contiguities)}"
+            )
+        if len(param_infos) != len(fqns):
+            raise AssertionError(
+                f"Expected param_infos length {len(param_infos)} to match fqns length {len(fqns)}"
+            )
+        if len(param_infos) != len(param_extensions):
+            raise AssertionError(
+                f"Expected param_infos length {len(param_infos)} to match param_extensions length {len(param_extensions)}"
+            )
         self._num_params = len(param_infos)
         self._param_infos = param_infos
         self._shapes = shapes
@@ -418,22 +434,32 @@ def _init_metadata(
                 numels_without_padding.append(numel)
         self._numels = tuple(numels_without_padding)
         self._numels_with_padding = tuple(numels)
-        assert len(self._numels) == self._num_params
+        if len(self._numels) != self._num_params:
+            raise AssertionError(
+                f"Expected _numels length {len(self._numels)} to equal _num_params {self._num_params}"
+            )
 
         self._shared_param_infos = tuple(shared_param_infos)
         self._modules = {pi.module for pi in self._param_infos}.union(
             {spi.module for spi in self._shared_param_infos}
         )
-        assert (params is None) == (shared_params is None)
-        if params is not None:
-            assert shared_params is not None and len(shared_params) == len(
-                shared_param_infos
+        if (params is None) != (shared_params is None):
+            raise AssertionError(
+                "Expected params and shared_params to both be None or both be not None"
             )
+        if params is not None:
+            if shared_params is None or len(shared_params) != len(shared_param_infos):
+                raise AssertionError(
+                    f"Expected shared_params to be not None and have length {len(shared_param_infos)}, got {shared_params}"
+                )
             self._params = []
             for param, is_padding in zip(params, is_padding_mask):
                 if not is_padding:
                     self._params.append(param)
-            self._shared_params = shared_params
+            if shared_params is not None:
+                self._shared_params = shared_params
+            else:
+                self._shared_params = []
         # Mark the original parameters to avoid flattening them into
         # another `FlatParameter` during recursive construction
         for param in chain(self._params, self._shared_params):
@@ -579,7 +605,8 @@ def __init__(
         # before `_init_flat_param()`, which performs the actual validation
         self._orig_param_dtype = params[0].dtype
         self._init_param_reduce_dtypes(mp_param_dtype, mp_reduce_dtype)
-        assert self._fwd_bwd_param_dtype is not None  # mypy
+        if self._fwd_bwd_param_dtype is None:
+            raise AssertionError("Expected _fwd_bwd_param_dtype to be not None")  # mypy
         self._aligned_numel = (
             _get_aligned_numel(unsharded_dtype=self._fwd_bwd_param_dtype)
             if align_addresses
@@ -807,7 +834,8 @@ def _validate_tensors_to_flatten(
             dtype = tensor.dtype
             flat_param_requires_grad = flat_param_requires_grad or tensor.requires_grad
             device = tensor.device
-        assert flat_param_requires_grad is not None, "Requires non-empty `tensors` list"
+        if flat_param_requires_grad is None:
+            raise AssertionError("Requires non-empty `tensors` list")
         return dtype, flat_param_requires_grad, device
 
     def flatten_tensors(
@@ -908,8 +936,10 @@ def _init_param_reduce_dtypes(
         else:
             self._fwd_bwd_param_dtype = mp_param_dtype or self._orig_param_dtype
             self._reduce_dtype = mp_reduce_dtype or self._orig_param_dtype
-        assert self._fwd_bwd_param_dtype is not None
-        assert self._reduce_dtype is not None
+        if self._fwd_bwd_param_dtype is None:
+            raise AssertionError("Expected _fwd_bwd_param_dtype to be not None")
+        if self._reduce_dtype is None:
+            raise AssertionError("Expected _reduce_dtype to be not None")
 
     ###################################
     # SHARD INITIALIZATION & METADATA #
@@ -985,9 +1015,10 @@ def _init_shard_metadata(
         shard_param_infos = self._get_shard_metadata(
             unsharded_start_idx, unsharded_end_idx
         )
-        assert len(shard_param_infos) == flat_param._num_params, (
-            f"Expects length {flat_param._num_params} but got {len(shard_param_infos)}"
-        )
+        if len(shard_param_infos) != flat_param._num_params:
+            raise AssertionError(
+                f"Expects length {flat_param._num_params} but got {len(shard_param_infos)}"
+            )
         flat_param._shard_param_infos = shard_param_infos  # type: ignore[attr-defined]
         flat_param._shard_numel_padded = numel_padded  # type: ignore[attr-defined]
 
@@ -1003,9 +1034,10 @@ def _get_shard_metadata(
         unsharded flat parameter specifying the shard.
         """
         flat_param_offsets = self._get_flat_param_offsets()
-        assert len(flat_param_offsets) == len(self.flat_param._numels_with_padding), (
-            f"Expected {len(self.flat_param._numels_with_padding)} but got {len(flat_param_offsets)}"
-        )
+        if len(flat_param_offsets) != len(self.flat_param._numels_with_padding):
+            raise AssertionError(
+                f"Expected {len(self.flat_param._numels_with_padding)} but got {len(flat_param_offsets)}"
+            )
         shard_param_infos: list[_ShardParamInfo] = []
         sharded_flat_param_numel = unsharded_end_idx - unsharded_start_idx + 1
         # `unsharded_param_start_idx` and `unsharded_param_end_idx` are indices
@@ -1033,12 +1065,13 @@ def _get_shard_metadata(
                         unsharded_start_idx - unsharded_param_start_idx
                     )
                     offset_in_shard = 0
-                assert (
+                if not (
                     offset_in_shard >= 0 and offset_in_shard < sharded_flat_param_numel
-                ), (
-                    f"Invalid `offset_in_shard` of {offset_in_shard} for "
-                    f"sharded flat parameter with {sharded_flat_param_numel} numel"
-                )
+                ):
+                    raise AssertionError(
+                        f"Invalid `offset_in_shard` of {offset_in_shard} for "
+                        f"sharded flat parameter with {sharded_flat_param_numel} numel"
+                    )
                 intra_param_end_idx = (
                     min(unsharded_param_end_idx, unsharded_end_idx)
                     - unsharded_param_start_idx
@@ -1082,9 +1115,10 @@ def _get_unpadded_shard(
         else:
             chunk = chunks[rank]
             numel_to_pad = chunks[0].numel() - chunk.numel()
-            assert numel_to_pad >= 0, (
-                "Chunk's size should be at most the first chunk's size"
-            )
+            if numel_to_pad < 0:
+                raise AssertionError(
+                    "Chunk's size should be at most the first chunk's size"
+                )
         return chunk, numel_to_pad
 
     @staticmethod
@@ -1115,12 +1149,16 @@ def _get_sharded_size(tensor: Tensor, rank: int, world_size: int) -> torch.Size:
         This requires ``tensor`` to have 1D shape and ensures that the returned
         shape is 1D.
         """
-        assert len(tensor.shape) == 1, f"{tensor.shape}"
+        if len(tensor.shape) != 1:
+            raise AssertionError(f"Expected 1D tensor shape, got {tensor.shape}")
         unpadded_sharded_tensor, numel_to_pad = FlatParamHandle._get_unpadded_shard(
             tensor, rank, world_size
        )
         unpadded_sharded_size = unpadded_sharded_tensor.size()
-        assert len(unpadded_sharded_size) == 1, f"{unpadded_sharded_size}"
+        if len(unpadded_sharded_size) != 1:
+            raise AssertionError(
+                f"Expected 1D unpadded_sharded_size, got {unpadded_sharded_size}"
+            )
         return torch.Size([unpadded_sharded_size[0] + numel_to_pad])
 
     def _get_flat_param_offsets(self) -> list[tuple[int, int]]:
@@ -2059,7 +2097,7 @@ def _use_unsharded_grad_views(self) -> None:
             _p_assert(
                 hasattr(module, param_name),
                 f"{module_name + '.' + param_name if module_name else param_name} is missing",
-            )  # did not save FQN info in `_shared_param_infos`
+            )
             param = getattr(module, param_name)
             prim_param = getattr(prim_module, prim_param_name)
             if (
@@ -2130,7 +2168,8 @@ def _use_sharded_views(self) -> None:
                 offset = shard_param_info.offset_in_shard
                 numel_in_shard = shard_param_info.numel_in_shard
                 param.data = flat_param[offset : offset + numel_in_shard]
-        assert self.flat_param._shared_params is not None
+        if self.flat_param._shared_params is None:
+            raise AssertionError("Expected _shared_params to be not None")
         for i, (
             param,
             (param_name, module, _, prim_param_name, prim_module, _),
@@ -2194,7 +2233,8 @@ def _use_sharded_grad_views(self) -> None:
                 )
             else:
                 param.grad = None
-        assert flat_param._shared_params is not None
+        if flat_param._shared_params is None:
+            raise AssertionError("Expected _shared_params to be not None")
         for param, (_, _, _, prim_param_name, prim_module, _) in zip(
             flat_param._shared_params, flat_param._shared_param_infos
         ):
@@ -2408,7 +2448,8 @@ def _writeback_tensor(
             dst_tensor[offset : offset + expected_shape.numel()].copy_(src_tensor)
         else:
             dst_tensor[offset : offset + expected_shape.numel()].zero_()
-            assert self.flat_param._is_grad_none_mask is not None
+            if self.flat_param._is_grad_none_mask is None:
+                raise AssertionError("Expected _is_grad_none_mask to be not None")
             self.flat_param._is_grad_none_mask[tensor_index] = True
 
     def _reset_flat_param_grad_info_if_needed(self):
@@ -2427,7 +2468,8 @@ def _reset_flat_param_grad_info_if_needed(self):
         if not self._use_orig_params:
             return
         flat_param = self.flat_param
-        assert flat_param._params is not None  # mypy
+        if flat_param._params is None:
+            raise AssertionError("Expected _params to be not None")  # mypy
         all_grad_none = True
         requires_grad = False
         for param in flat_param._params:
@@ -2571,12 +2613,16 @@ def _reset_is_grad_none(self) -> None:
             "Expects to only be called in the post-backward after gradient computation",
         )
         flat_param = self.flat_param
-        assert flat_param._params is not None  # mypy
+        if flat_param._params is None:
+            raise AssertionError("Expected _params to be not None")  # mypy
         for i, param in enumerate(flat_param._params):  # type: ignore[arg-type]
             # As long as the parameter requires gradient, it should receive a
             # meaningful gradient (even if the gradient happens to be zeros)
             if param.requires_grad:
-                assert flat_param._is_grad_none_mask is not None  # mypy
+                if flat_param._is_grad_none_mask is None:
+                    raise AssertionError(
+                        "Expected _is_grad_none_mask to be not None"
+                    )  # mypy
                 flat_param._is_grad_none_mask[i] = False
 
     #######################

torch/distributed/fsdp/_fsdp_extensions.py

Lines changed: 2 additions & 1 deletion
@@ -161,7 +161,8 @@ def _ext_pre_load_state_dict_transform(
     if fsdp_extension is not None:
         return fsdp_extension.pre_load_state_dict_transform(tensor)
 
-    assert type(tensor) is ShardedTensor
+    if type(tensor) is not ShardedTensor:
+        raise AssertionError(f"Expected ShardedTensor, got {type(tensor)}")
     shards = tensor.local_shards()
     return (tensor, shards)

torch/distributed/fsdp/_fully_shard/_fsdp_collectives.py

Lines changed: 8 additions & 4 deletions
@@ -502,9 +502,10 @@ def foreach_reduce(
     ):
         if (shard_dim := fsdp_param.fsdp_placement.dim) == 0:
             continue
-        assert unsharded_grad.size(shard_dim) % world_size == 0, (
-            f"Shard({shard_dim}) requires even sharding: {unsharded_grad.size()=} {world_size=}"
-        )
+        if unsharded_grad.size(shard_dim) % world_size != 0:
+            raise AssertionError(
+                f"Shard({shard_dim}) requires even sharding: {unsharded_grad.size()=} {world_size=}"
+            )
         chunks = torch.chunk(unsharded_grad, world_size, dim=shard_dim)
         unsharded_grads[i] = torch.cat(chunks, dim=0)
 
@@ -621,7 +622,10 @@ def foreach_reduce(
             # ensure that the D2H copy finishes before the optimizer
             fsdp_param.grad_offload_event = post_reduce_stream.record_event()
         if to_accumulate_grad:
-            assert isinstance(fsdp_param.sharded_param.grad, DTensor)
+            if not isinstance(fsdp_param.sharded_param.grad, DTensor):
+                raise AssertionError(
+                    f"Expected fsdp_param.sharded_param.grad to be DTensor, got {type(fsdp_param.sharded_param.grad)}"
+                )
             fsdp_param.sharded_param.grad._local_tensor += new_sharded_grad
         else:
             new_sharded_dtensor_grad = fsdp_param.to_sharded_dtensor(

torch/distributed/fsdp/_fully_shard/_fsdp_common.py

Lines changed: 4 additions & 3 deletions
@@ -17,9 +17,10 @@
 
 
 def detect_compiled_autograd():
-    assert not torch.compiler.is_compiling(), (
-        "`detect_compiled_autograd()` is designed to be called in eager mode"
-    )
+    if torch.compiler.is_compiling():
+        raise AssertionError(
+            "`detect_compiled_autograd()` is designed to be called in eager mode"
+        )
     global _compiled_autograd_enabled
     import torch._dynamo.compiled_autograd as ca
 