ROCm
diff --git a/torch/nn/attention/__init__.py b/torch/nn/attention/__init__.py
Lines changed: 1 addition & 1 deletion
diff --git a/torch/nn/attention/_utils.py b/torch/nn/attention/_utils.py
Lines changed: 1 addition & 1 deletion
diff --git a/torch/nn/attention/bias.py b/torch/nn/attention/bias.py
Lines changed: 2 additions & 2 deletions
diff --git a/torch/nn/attention/experimental/_paged_attention.py b/torch/nn/attention/experimental/_paged_attention.py
Lines changed: 1 addition & 1 deletion
diff --git a/torch/nn/attention/flex_attention.py b/torch/nn/attention/flex_attention.py
Lines changed: 6 additions & 6 deletions
diff --git a/torch/nn/backends/thnn.py b/torch/nn/backends/thnn.py
Lines changed: 1 addition & 1 deletion
diff --git a/torch/nn/cpp.py b/torch/nn/cpp.py
Lines changed: 6 additions & 6 deletions
diff --git a/torch/nn/modules/module.py b/torch/nn/modules/module.py
Lines changed: 1 addition & 1 deletion
diff --git a/torch/nn/parallel/data_parallel.py b/torch/nn/parallel/data_parallel.py
Lines changed: 1 addition & 1 deletion
diff --git a/torch/nn/parameter.py b/torch/nn/parameter.py
Lines changed: 5 additions & 5 deletions
@@ -90,7 +90,7 @@ def _cur_sdpa_kernel_backends(with_priority: bool = False):
     return backends
 
 
-def _sdpa_kernel(backends: Iterable, set_priority: bool = False):
+def _sdpa_kernel(backends: Iterable, set_priority: bool = False) -> None:
     for name, val in _backend_names.items():
         enabled = getattr(SDPBackend, val) in backends
         getattr(torch._C, f"_set_sdp_use_{name}")(enabled)
 
@@ -40,7 +40,7 @@ def _validate_sdpa_input(
     dropout_p=0.0,
     is_causal=False,
     scale=None,
-):
+) -> None:
     if query.dtype != key.dtype or query.dtype != value.dtype:
         raise ValueError(
             f"Expected query, key, and value to have the same dtype, "
 
@@ -117,7 +117,7 @@ class CausalBias(torch.Tensor):
     .. warning:: This class is a prototype and subject to change.
     """
 
-    def __init__(self, variant: CausalVariant, seq_len_q: int, seq_len_kv: int):
+    def __init__(self, variant: CausalVariant, seq_len_q: int, seq_len_kv: int) -> None:
         """
         Initializes the CausalBias instance with a specified variant and sequence lengths.
 
@@ -296,7 +296,7 @@ def __torch_function__(cls, func, types, args=(), kwargs=None):
             return cls._dispatch(*args, **kwargs)
         return super().__torch_function__(func, types, args, kwargs)
 
-    def __repr__(self):  # type:ignore[override]
+    def __repr__(self) -> str:  # type:ignore[override]
         return self._materialize().__repr__()
 
 
 
@@ -40,7 +40,7 @@ def __init__(
         page_size: int,
         max_batch_size: int,
         device: str = "cuda",
-    ):
+    ) -> None:
         # number of pages
         self.n_pages = n_pages
 
 
@@ -550,7 +550,7 @@ def __init__(
         full_q_indices: Optional[Tensor],
         BLOCK_SIZE: tuple[int, int],
         mask_mod: _mask_mod_signature,
-    ):
+    ) -> None:
         if kv_indices.dim() < 2:
             raise RuntimeError("BlockMask must have at least 2 dimensions")
         assert kv_num_blocks is not None, "kv_num_blocks must be provided"
@@ -682,7 +682,7 @@ def shape(self):
         *batch_dims, _, _ = self.kv_indices.shape
         return tuple(batch_dims) + self.seq_lengths
 
-    def __str__(self):
+    def __str__(self) -> str:
         s = f"BlockMask(shape={self.shape}, sparsity={self.sparsity():.2f}%, \n"
         mask_str = self.to_string().strip()
         s += mask_str
@@ -760,7 +760,7 @@ def causal_mask(b, h, q_idx, kv_idx):
             compute_q_blocks=self.q_indices is not None,
         )
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         def shape_or_none(x: Optional[torch.Tensor]):
             return x.shape if x is not None else None
 
@@ -864,7 +864,7 @@ def create_block_vis(*batch_idx):
 
             vis = ", ".join(reversed(descriptors)) + "\n"
 
-            def summarize_section(section):
+            def summarize_section(section) -> str:
                 percentage = section.float().mean().item()
                 if percentage == 1:
                     return "█"
@@ -1289,15 +1289,15 @@ def _apply_kernel_options(
     return kernel_options
 
 
-def _validate_embed_dim(query: Tensor, key: Tensor, value: Tensor):
+def _validate_embed_dim(query: Tensor, key: Tensor, value: Tensor) -> None:
     if query.size(-1) != key.size(-1):
         raise ValueError(
             f"Expect query and key/value to have the same embedding dimension "
             f"but got E={query.size(-1)} and E={key.size(-1)}."
         )
 
 
-def _validate_device(query: Tensor, key: Tensor, value: Tensor):
+def _validate_device(query: Tensor, key: Tensor, value: Tensor) -> None:
     """TODO: Remove once non cuda/cpu devices support is added
     We only need to check query since we have already that q,k,v are on the same device
     """
 
@@ -2,5 +2,5 @@
 # this is for historical pickle deserialization, it is not used otherwise
 
 
-def _get_thnn_function_backend():
+def _get_thnn_function_backend() -> None:
     pass
@@ -14,7 +14,7 @@ class OrderedDictWrapper:
     so using properties does not work.
     """
 
-    def __init__(self, cpp_module, attr):
+    def __init__(self, cpp_module, attr) -> None:
         self.cpp_module = cpp_module
         self.attr = attr
 
@@ -37,10 +37,10 @@ def values(self):
     def __iter__(self):
         return self.cpp_dict.__iter__()
 
-    def __len__(self):
+    def __len__(self) -> int:
         return self.cpp_dict.__len__()
 
-    def __contains__(self, key):
+    def __contains__(self, key) -> bool:
         return self.cpp_dict.__contains__(key)
 
     def __getitem__(self, key):
@@ -50,7 +50,7 @@ def __getitem__(self, key):
 class ModuleWrapper(nn.Module):
     """A subclass of ``torch.nn.Module`` that wraps a C++ frontend module and delegates all access."""
 
-    def __init__(self, cpp_module):
+    def __init__(self, cpp_module) -> None:
         # Assign before the super class constructor so ``self.training`` can be
         # assigned to in the super class constructor.
         self.cpp_module = cpp_module
@@ -83,8 +83,8 @@ def training(self):
         return self.cpp_module.training
 
     @training.setter
-    def training(self, mode):
+    def training(self, mode) -> None:
         self.cpp_module.train(mode)
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return self.cpp_module.__repr__()
@@ -3040,7 +3040,7 @@ def _replicate_for_data_parallel(self):
 
         return replica
 
-    def compile(self, *args, **kwargs):
+    def compile(self, *args, **kwargs) -> None:
         """
         Compile this Module's forward using :func:`torch.compile`.
 
 
@@ -30,7 +30,7 @@ def _check_balance(device_ids: Sequence[Union[int, torch.device]]) -> None:
     device_ids = [_get_device_index(x, True) for x in device_ids]
     dev_props = _get_devices_properties(device_ids)
 
-    def warn_imbalance(get_prop):
+    def warn_imbalance(get_prop) -> bool:
         values = [get_prop(props) for props in dev_props]
         min_pos, min_val = min(enumerate(values), key=operator.itemgetter(1))
         max_pos, max_val = max(enumerate(values), key=operator.itemgetter(1))
 
@@ -18,7 +18,7 @@
 # Metaclass to combine _TensorMeta and the instance check override for Parameter.
 class _ParameterMeta(torch._C._TensorMeta):
     # Make `isinstance(t, Parameter)` return True for custom tensor instances that have the _is_param flag.
-    def __instancecheck__(self, instance):
+    def __instancecheck__(self, instance) -> bool:
         if self is Parameter:
             if isinstance(instance, torch.Tensor) and getattr(
                 instance, "_is_param", False
@@ -82,7 +82,7 @@ def __deepcopy__(self, memo):
             return result
 
     # pyrefly: ignore [bad-override]
-    def __repr__(self):
+    def __repr__(self) -> str:
         return "Parameter containing:\n" + super().__repr__()
 
     def __reduce_ex__(self, proto):
@@ -125,7 +125,7 @@ class UninitializedTensorMixin:
         torch._has_compatible_shallow_copy_type,
     ]
 
-    def materialize(self, shape, device=None, dtype=None):
+    def materialize(self, shape, device=None, dtype=None) -> None:
         r"""Create a Parameter or Tensor with the same properties of the uninitialized one.
 
         Given a shape, it materializes a parameter in the same device
@@ -163,7 +163,7 @@ def share_memory_(self):
             "`module.share_memory()`."
         )
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         return f"<{self.__class__.__name__}>"
 
     def __reduce_ex__(self, proto):
@@ -235,7 +235,7 @@ def __deepcopy__(self, memo):
 # Metaclass to combine _TensorMeta and the instance check override for Buffer.
 class _BufferMeta(torch._C._TensorMeta):
     # Make `isinstance(t, Buffer)` return True for custom tensor instances that have the _is_buffer flag.
-    def __instancecheck__(self, instance):
+    def __instancecheck__(self, instance) -> bool:
         if self is Buffer:
             if isinstance(instance, torch.Tensor) and getattr(
                 instance, "_is_buffer", False