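Update critical typing issues: replace deprecated typing aliases (List, Tuple, Dict, Optional, Union) with built-in generics and `X | Y` unions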

skyw · skyw · commit 770cf646c9a0 · 2025-12-22T11:25:51.000-08:00
Signed-off-by: Hao Wu <skyw@nvidia.com>
diff --git a/emerging_optimizers/psgd/psgd.py b/emerging_optimizers/psgd/psgd.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
-from typing import Callable, List, Tuple, override
+from typing import Callable, override
 
 import torch
 from torch.optim.optimizer import ParamsT
@@ -154,7 +154,7 @@ def step(self, closure: Callable[[], float] | None = None) -> float | None:
 def _init_psgd_kron_states(
     grad: torch.Tensor,
     precond_init_scale: float = 1.0,
-) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
     """Initialize the Kronecker factor matrices and Lipschitz constants.
 
     Args:
@@ -165,8 +165,8 @@ def _init_psgd_kron_states(
         q_list: List of Kronecker factors.
         lip_const_list: List of Lipschitz constants for the Kronecker factors.
     """
-    q_list: List[torch.Tensor] = []
-    lip_const_list: List[torch.Tensor] = []
+    q_list: list[torch.Tensor] = []
+    lip_const_list: list[torch.Tensor] = []
 
     # Create identity matrices scaled by precond_init_scale for each dimension
     for size in grad.shape:
@@ -177,13 +177,13 @@ def _init_psgd_kron_states(
 
 
 def _update_precond_procrustes(
-    q_list: List[torch.Tensor],
-    lip_const_list: List[torch.Tensor],
+    q_list: list[torch.Tensor],
+    lip_const_list: list[torch.Tensor],
     exp_avg: torch.Tensor,
     damping_noise_scale: float = 1e-9,
     precond_lr: float = 0.1,
     beta_lip: float = 0.9,
-) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
+) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
     r"""Update the Kron preconditioner Q using procrustes step and uniformization.
 
     Args:
@@ -201,8 +201,8 @@ def _update_precond_procrustes(
     dampened_momentum = exp_avg + (damping_noise_scale + 1e-7 * exp_avg.abs()) * torch.randn_like(exp_avg)
     pg = psgd_kron_contractions.apply_preconditioner(q_list, dampened_momentum)
     total_numel = pg.numel()
-    updated_q_list: List[torch.Tensor] = []
-    updated_lip_const_list: List[torch.Tensor] = []
+    updated_q_list: list[torch.Tensor] = []
+    updated_lip_const_list: list[torch.Tensor] = []
     for dim, q in enumerate(q_list):
         # compute gradient covariance
         precond_grad_cov = psgd_kron_contractions.partial_contraction(pg, pg, dim)
@@ -229,7 +229,7 @@ def _update_matrix_preconditioner(
     total_numel: int,
     precond_lr: float,
     beta_lip: float,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     r"""Update matrix-structured preconditioner with adaptive Lipschitz constant.
 
     Args:
@@ -259,7 +259,7 @@ def _update_1d_preconditioner(
     total_numel: int,
     precond_lr: float,
     beta_lip: float,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     r"""Update 1D preconditioner with adaptive Lipschitz constant.
 
     Args:
diff --git a/emerging_optimizers/psgd/psgd_utils.py b/emerging_optimizers/psgd/psgd_utils.py
@@ -12,8 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List
-
 import torch
 
 
@@ -25,7 +23,7 @@
 
 
 @torch.compile  # type: ignore[misc]
-def uniformize_q_in_place(Q_list: List[torch.Tensor]) -> None:
+def uniformize_q_in_place(Q_list: list[torch.Tensor]) -> None:
     """Balance the dynamic ranges of kronecker factors in place to prevent numerical underflow or overflow.
 
     Each tensor in `Q_list` is rescaled so that its maximum absolute entry
diff --git a/emerging_optimizers/scalar_optimizers/adam.py b/emerging_optimizers/scalar_optimizers/adam.py
@@ -12,8 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Tuple
-
 import torch
 
 
@@ -28,7 +26,7 @@ def calculate_adam_update(
     grad: torch.Tensor,
     exp_avg: torch.Tensor,
     exp_avg_sq: torch.Tensor,
-    betas: Tuple[float, float],
+    betas: tuple[float, float],
     correct_bias: bool,
     use_nesterov: bool,
     step: int,
diff --git a/emerging_optimizers/scalar_optimizers/lion.py b/emerging_optimizers/scalar_optimizers/lion.py
@@ -12,8 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional
-
 import torch
 
 
@@ -28,7 +26,7 @@ def calculate_lion_update(
     grad: torch.Tensor,
     exp_avg: torch.Tensor,
     momentum_beta: float,
-    momentum_beta2: Optional[float] = None,
+    momentum_beta2: float | None = None,
 ) -> torch.Tensor:
     """Performs the Lion update.
 
diff --git a/emerging_optimizers/soap/soap.py b/emerging_optimizers/soap/soap.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 from functools import partial
 from itertools import chain
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable
 
 
 # TODO(@boxiangw): remove this once bump to python 3.12
@@ -86,14 +86,14 @@ def __init__(
         self,
         params: ParamsT,
         lr: float,
-        betas: Tuple[float, float] = (0.9, 0.95),
+        betas: tuple[float, float] = (0.9, 0.95),
         shampoo_beta: float = 0.95,
         eps: float = 1e-8,
         weight_decay: float = 0.01,
         *,
         weight_decay_method: opt_mixin.WeightDecayT = "decoupled",
         use_nesterov: bool = False,
-        precondition_frequency: Union[int, Callable[[int], int]] = 1,
+        precondition_frequency: int | Callable[[int], int] = 1,
         adam_warmup_steps: int = 0,
         precondition_1d: bool = False,
         correct_bias: bool = True,
@@ -293,7 +293,7 @@ def step(self, closure: Callable[[], float] | None = None) -> float | None:
 def init_kronecker_factors(
     grad: torch.Tensor,
     precondition_1d: bool = False,
-) -> List[torch.Tensor]:
+) -> list[torch.Tensor]:
     """Initializes the kronecker factor matrices for the SOAP optimizer.
 
     This function creates the initial Kronecker factor matrices (L and R) used for
@@ -338,7 +338,7 @@ def init_kronecker_factors(
         >>> print(precond_2d[1].shape)  # (20, 20)
 
     """
-    kronecker_factor_list: List[torch.Tensor] = []
+    kronecker_factor_list: list[torch.Tensor] = []
 
     if grad.dim() == 1:
         if not precondition_1d:
@@ -358,7 +358,7 @@ def init_kronecker_factors(
 
 @torch.no_grad()  # type: ignore[misc]
 def update_kronecker_factors(
-    kronecker_factor_list: List[torch.Tensor],
+    kronecker_factor_list: list[torch.Tensor],
     grad: torch.Tensor,
     shampoo_beta: float,
     precondition_1d: bool = False,
@@ -414,10 +414,10 @@ def update_kronecker_factors(
 
 @torch.no_grad()  # type: ignore[misc]
 def update_kronecker_factors_kl_shampoo(
-    kronecker_factor_list: List[torch.Tensor],
+    kronecker_factor_list: list[torch.Tensor],
     grad: torch.Tensor,
     shampoo_beta: float,
-    eigenbasis_list: List[torch.Tensor],
+    eigenbasis_list: list[torch.Tensor],
     eps: float,
     eigval_exp: float = -1.0,
 ) -> None:
@@ -457,16 +457,16 @@ def update_kronecker_factors_kl_shampoo(
 
 @torch.no_grad()  # type: ignore[misc]
 def update_eigenbasis_and_momentum(
-    kronecker_factor_list: List[torch.Tensor],
-    eigenbasis_list: List[torch.Tensor],
+    kronecker_factor_list: list[torch.Tensor],
+    eigenbasis_list: list[torch.Tensor],
     exp_avg_sq: torch.Tensor,
     momentum: torch.Tensor,
     use_eigh: bool = False,
     use_adaptive_criteria: bool = False,
-    adaptive_update_tolerance: Optional[float] = None,
+    adaptive_update_tolerance: float | None = None,
     power_iter_steps: int = 1,
     convert_to_float: bool = True,
-) -> Tuple[List[torch.Tensor], torch.Tensor, torch.Tensor]:
+) -> tuple[list[torch.Tensor], torch.Tensor, torch.Tensor]:
     """Updates the eigenbases using QR decomposition and power iteration or eigh.
 
     This function performs an update of the eigenbases (QL and QR)
@@ -559,8 +559,8 @@ def update_eigenbasis_and_momentum(
 @torch.compile  # type: ignore[misc]
 def precondition(
     grad: torch.Tensor,
-    eigenbasis_list: Optional[List[torch.Tensor]] = None,
-    dims: Optional[List[List[int]]] = None,
+    eigenbasis_list: list[torch.Tensor] | None = None,
+    dims: list[list[int]] | None = None,
 ) -> torch.Tensor:
     """Projects the gradient to and from the eigenbases of the kronecker factor matrices.
 
@@ -610,7 +610,7 @@ def precondition(
 def _is_eigenbasis_update_step(
     step: int,
     adam_warmup_steps: int,
-    precondition_frequency: Union[int, Callable[[int], int]],
+    precondition_frequency: int | Callable[[int], int],
 ) -> bool:
     """Checks if amortized computation of the eigenbasis should be recomputed.
 
diff --git a/emerging_optimizers/soap/soap_utils.py b/emerging_optimizers/soap/soap_utils.py
@@ -12,27 +12,30 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Optional, Tuple
+from typing import TypeAlias
 
 import torch
 
 from emerging_optimizers.utils import eig as eig_utils
 
 
+TensorList: TypeAlias = list[torch.Tensor]
+
+
 __all__ = [
     "get_eigenbasis_eigh",
     "get_eigenbasis_qr",
 ]
 
 
 def get_eigenbasis_eigh(
-    kronecker_factor_list: List[torch.Tensor],
+    kronecker_factor_list: TensorList,
     convert_to_float: bool = True,
-    eigenbasis_list: Optional[List[torch.Tensor]] = None,
+    eigenbasis_list: TensorList | None = None,
     use_adaptive_criteria: bool = False,
-    adaptive_update_tolerance: Optional[float] = None,
-    eps: Optional[float] = None,
-) -> List[torch.Tensor]:
+    adaptive_update_tolerance: float | None = None,
+    eps: float | None = None,
+) -> TensorList:
     """Computes the eigenbases of the preconditioner using torch.linalg.eigh decomposition.
 
     Args:
@@ -66,7 +69,7 @@ def get_eigenbasis_eigh(
         adaptive_update_tolerance = 1e-7
 
     # cast the kronecker factor matrices to float32 if convert_to_float is True
-    casted_matrix_list: List[torch.Tensor] = []
+    casted_matrix_list: TensorList = []
     for kronecker_factor in kronecker_factor_list:
         if kronecker_factor.numel() == 0:
             casted_matrix_list.append(torch.empty(0, device=kronecker_factor.device))
@@ -76,7 +79,7 @@ def get_eigenbasis_eigh(
         else:
             casted_matrix_list.append(kronecker_factor)
 
-    updated_eigenbasis_list: List[torch.Tensor] = []
+    updated_eigenbasis_list: TensorList = []
 
     # use adaptive early exit criteria
     if use_adaptive_criteria and eigenbasis_list is not None:
@@ -112,14 +115,14 @@ def get_eigenbasis_eigh(
 
 
 def get_eigenbasis_qr(
-    kronecker_factor_list: List[torch.Tensor],
-    eigenbasis_list: List[torch.Tensor],
+    kronecker_factor_list: TensorList,
+    eigenbasis_list: TensorList,
     exp_avg_sq: torch.Tensor,
     convert_to_float: bool = True,
     use_adaptive_criteria: bool = False,
-    adaptive_update_tolerance: Optional[float] = None,
+    adaptive_update_tolerance: float | None = None,
     power_iter_steps: int = 1,
-) -> Tuple[List[torch.Tensor], torch.Tensor]:
+) -> tuple[TensorList, torch.Tensor]:
     """Updates the eigenbases of the preconditioner using power iteration and QR.
 
     Computes using multiple rounds of power iteration followed by QR decomposition (orthogonal iteration).
@@ -175,8 +178,8 @@ def get_eigenbasis_qr(
     if adaptive_update_tolerance is None:
         adaptive_update_tolerance = 1e-7
 
-    casted_matrix_list: List[torch.Tensor] = []
-    casted_eigenbasis_list: List[torch.Tensor] = []
+    casted_matrix_list: TensorList = []
+    casted_eigenbasis_list: TensorList = []
     for kronecker_factor, eigenbasis in zip(kronecker_factor_list, eigenbasis_list, strict=True):
         # If the tensor is empty, propagate an empty tensor to the output lists.
         if kronecker_factor.numel() == 0:
@@ -195,7 +198,7 @@ def get_eigenbasis_qr(
     if convert_to_float and exp_avg_sq.dtype != torch.float:
         exp_avg_sq = exp_avg_sq.to(torch.float)
 
-    updated_eigenbasis_list: List[torch.Tensor] = []
+    updated_eigenbasis_list: TensorList = []
     for ind, (kronecker_factor, eigenbasis) in enumerate(zip(casted_matrix_list, casted_eigenbasis_list, strict=True)):
         if kronecker_factor.numel() == 0:
             updated_eigenbasis_list.append(torch.empty(0, device=kronecker_factor.device))
diff --git a/emerging_optimizers/utils/eig.py b/emerging_optimizers/utils/eig.py
@@ -12,8 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional, Tuple
-
 import torch
 from absl import logging
 from torch import Tensor
@@ -30,8 +28,8 @@
 def eigh_with_fallback(
     x: Tensor,
     force_double: bool = False,
-    eps: Optional[float] = None,
-    output_dtype: Optional[torch.dtype] = None,
+    eps: float | None = None,
+    output_dtype: torch.dtype | None = None,
 ) -> tuple[Tensor, Tensor]:
     r"""torch.linalg.eigh() function with double precision fallback
 
@@ -190,7 +188,7 @@ def orthogonal_iteration(
     exp_avg_sq: torch.Tensor,
     convert_to_float: bool,
     power_iter_steps: int,
-) -> Tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor]:
     """Computes the eigenbases of the preconditioner using power iteration and QR decomposition.
 
     This function performs multiple rounds of power iteration followed by QR decomposition
@@ -275,7 +273,7 @@ def _is_diagonal(x: Tensor) -> bool:
     return not x.triu(diagonal=1).any() and not x.tril(diagonal=-1).any()
 
 
-def _try_handle_diagonal_matrix(x: Tensor) -> Optional[tuple[Tensor, Tensor]]:
+def _try_handle_diagonal_matrix(x: Tensor) -> tuple[Tensor, Tensor] | None:
     """Checks if matrix A is diagonal and returns its eigenvalues/vectors in ascending order if so.
 
     Args:
diff --git a/emerging_optimizers/utils/precondition_schedules.py b/emerging_optimizers/utils/precondition_schedules.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 import math
 from abc import ABC, abstractmethod
-from typing import Dict
 
 
 __all__ = [
@@ -160,7 +159,7 @@ class StepSchedule(PreconditionSchedule):
         })
     """
 
-    def __init__(self, schedule_dict: Dict[int, int], start_step: int = 0):
+    def __init__(self, schedule_dict: dict[int, int], start_step: int = 0):
         """Initialize with a dictionary mapping steps to frequencies.
 
         Args: