Skip to content

Commit 318cf5f

Browse files
committed
add PSGD PRO, fix existing PSGD
1 parent 50c49d4 commit 318cf5f

File tree

4 files changed

+291
-25
lines changed

4 files changed

+291
-25
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ Muon, MuonLaProp, OrthoLaProp, LaPropOrtho
7070
SOAP, PaLMSOAP, PrecondScheduleSOAP, PrecondSchedulePaLMSOAP, SOAPNAdam, SOAPAdEMAMix, ForeachSOLP
7171

7272
**PSGD (Kronecker):**
73-
PSGDKron, CachedPSGDKron, DelayedPSGD, CachedDelayedPSGDKron, PurePSGD, NewtonPSGDKron, NewtonHybrid2PSGDKron
73+
PSGDPRO, PSGDKron, CachedPSGDKron, DelayedPSGD, CachedDelayedPSGDKron, PurePSGD, NewtonPSGDKron, NewtonHybrid2PSGDKron
7474

7575
`Newton`-PSGD requires a closure passed to `step()`.
7676

heavyball/__init__.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,82 @@ class NewtonHybrid2PSGDKron(ForeachCachedNewtonPSGD):
10231023
hvp_interval = 2
10241024

10251025

1026+
class PSGDPRO(C.BaseOpt):
    """
    PSGD with Q0.5EQ1.5 (PRO/Procrustes) preconditioner update.
    Solve-free alternative to standard PSGD-Kron (EQ method).
    Reference: https://github.com/lixilinx/psgd_torch
    """

    # Class-level defaults; overridable per-instance via the matching
    # ``C.use_default`` constructor arguments below.
    cached: bool = False
    exp_avg_input: bool = True

    def __init__(
        self,
        params,
        lr=0.001,
        beta=None,
        betas=(0.9, 0.999),
        weight_decay=0.0,
        preconditioner_update_probability=C.use_default,
        max_size_triangular=2048,
        min_ndim_triangular=2,
        memory_save_mode=None,
        momentum_into_precond_update=True,
        warmup_steps: int = 0,
        merge_dims: bool = False,
        split: bool = False,
        foreach: bool = True,
        q_dtype="float32",
        stochastic_schedule: bool = False,
        storage_dtype: str = "float32",
        mars: bool = False,
        caution: bool = False,
        mars_gamma: float = 0.0025,
        cached: Optional[bool] = C.use_default,
        exp_avg_input: Optional[bool] = C.use_default,
        gradient_clipping: C.str_or_fn = C.use_default,
        update_clipping: C.str_or_fn = C.use_default,
        precond_grad_accum: bool = False,
        lower_bound_beta: float = 0.9,
        dampening: float = 2**-13,
        precond_update_power_iterations: int = 2,
        precond_init_scale=None,
        precond_init_scale_scale: float = 1,
        precond_init_scale_power: Optional[float] = None,
        precond_lr: float = 0.1,
        compile_step: bool = C.use_default,
        promote: bool = C.use_default,
        ecc: str | None = None,
        param_ecc: str | None = None,
        **kwargs,
    ):
        # Resolve use_default sentinels against the class-level defaults.
        cached = C.default(cached, self.cached)
        exp_avg_input = C.default(exp_avg_input, self.exp_avg_input)
        update_clipping = C.default(update_clipping, utils.trust_region_clip_)

        params, defaults = C._build_defaults(locals())
        # PRO keeps Q as full (square) factors and is not inverse-free.
        defaults["store_triu_as_line"] = False
        defaults["inverse_free"] = False

        # Probability schedule controlling how often the preconditioner updates.
        self.precond_schedule = C.default(
            defaults.pop("preconditioner_update_probability"), utils.precond_update_prob_schedule()
        )

        super().__init__(
            params,
            defaults,
            foreach,
            gradient_clipping,
            update_clipping,
            False,
            # Optional exp-avg (momentum) input stage, then the PRO scaling step.
            fns=(
                *(C.exp_avg,) * exp_avg_input,
                functools.partial(C.scale_by_psgd_pro, cached=cached),
            ),
        )
1100+
1101+
10261102
class ForeachPSGDLRA(C.BaseOpt):
10271103
"""
10281104
Originally from Evan Walters and Omead Pooladzandi, 2024

heavyball/chainable.py

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -792,6 +792,27 @@ def _init_psgd_kron(state, group, update, grad, param, cached: bool = False, pro
792792
state["Q_cache"] = [torch.empty_like(q) for q in Q]
793793

794794

795+
def _init_psgd_pro_kron(state, group, update, grad, param, cached: bool = False, prob: Optional[callable] = None):
    """Initialize per-parameter state for the PSGD-PRO Kronecker preconditioner.

    Builds the Kronecker factors ``Q`` from the gradient's shape, one float64
    running-lower-bound accumulator per factor, and a scalar step counter.
    With ``cached=True`` an uninitialized cache buffer per factor is added.
    """
    Q = utils.init_Q_exprs(
        grad,
        group["precond_init_scale"],
        group["precond_init_scale_scale"],
        group["precond_init_scale_power"],
        group["max_size_triangular"],
        group["min_ndim_triangular"],
        group["memory_save_mode"],
        None,
        None,
        dtype=getattr(torch, group["q_dtype"]),
    )
    state["Q"] = Q
    # float64 keeps the lower-bound EMA stable regardless of q_dtype.
    state["running_lower_bound"] = [torch.zeros((1,), device=q.device, dtype=torch.float64) for q in Q]
    state["step"] = torch.zeros((), device=param.device, dtype=torch.float64)
    if not cached:
        return
    state["Q_cache"] = [torch.empty_like(q) for q in Q]
814+
815+
795816
def _init_psgd_lra(state, group, update, grad, param, cached: bool = False, prob: Optional[callable] = None):
796817
state["U"], state["V"], state["d"] = utils.init_lra(
797818
grad,
@@ -1094,6 +1115,56 @@ def _update_psgd_precond(
10941115
return None
10951116

10961117

1118+
def _update_psgd_pro_precond(
    cached,
    Q_cache,
    group,
    param,
    grad,
    Q,
    running_lower_bound,
    step,
    prob: Optional[callable] = None,
) -> None:
    """Run one PRO preconditioner update and, when caching, refresh ``Q_cache``.

    No-op while ``group["is_preconditioning"]`` is false. Sets
    ``group["is_cached"]`` so the grad-precondition step knows whether the
    cached product is valid.
    """
    if prob is None:
        prob = utils.precond_update_prob_schedule()

    if not group["is_preconditioning"]:
        return

    utils.psgd_pro_update_precond(
        grad,
        group["precond_lr"],
        Q,
        running_lower_bound,
        group["lower_bound_beta"],
        group["precond_update_power_iterations"],
        group["dampening"],
    )

    # Caching only pays off when precond updates are infrequent (< 50% of steps).
    float_prob = prob if isinstance(prob, float) else prob(group["step"])
    group["is_cached"] = should_use_cache = cached and float_prob < 0.5

    if not should_use_cache or not cached:
        return

    for idx, (cache_buf, factor) in enumerate(zip(Q_cache, Q)):
        if cache_buf is None:
            # Lazily allocate: diagonal factors cache elementwise q*q,
            # dense factors cache the square matrix Q^T Q.
            if factor.ndim == 1:
                cache_buf = torch.empty_like(factor)
            else:
                cache_buf = torch.empty(
                    factor.shape[0], factor.shape[0], device=factor.device, dtype=factor.dtype
                )
            Q_cache[idx] = cache_buf
        if factor.ndim == 2:
            torch.matmul(factor.T, factor, out=cache_buf)
        else:
            torch.mul(factor, factor, out=cache_buf)
1166+
1167+
10971168
def _cached_psgd_precond_grad(group, update, Q, Q_cache, grad):
10981169
kwargs = {"ea": update, "caution": group["caution"], "grad": grad}
10991170
if group.get("is_cached", False) and Q_cache[0] is not None:
@@ -1297,6 +1368,51 @@ def update_by_delayed_psgd(
12971368
raise SkipUpdate from None
12981369

12991370

1371+
@needs_full_param
@SqueezeGrad
@PrecondGradAccumGuard
@general_guard("Q", "Q_cache", "running_lower_bound", "step", init_fn=_init_psgd_pro_kron, skip_first=False)
@no_state_no_foreach
def scale_by_psgd_pro(
    group,
    update,
    grad,
    param,
    update_to_precond,
    Q,
    Q_cache,
    running_lower_bound: List[Tensor],
    step: Tensor,
    cached: bool = False,
    prob: Optional[callable] = None,
):
    """Chainable transform: PRO-update the preconditioner, then return the
    preconditioned ``update`` (via the cached path when the cache is valid)."""
    _update_psgd_pro_precond(cached, Q_cache, group, param, update_to_precond, Q, running_lower_bound, step, prob)
    return _cached_psgd_precond_grad(group, update, Q, Q_cache, grad)
1391+
1392+
1393+
@needs_full_param
@SqueezeGrad
@PrecondGradAccumGuard
@general_guard("Q", "Q_cache", "running_lower_bound", "step", init_fn=_init_psgd_pro_kron, skip_first=False)
@no_state_no_foreach
def update_by_psgd_pro(
    group,
    update,
    grad,
    param,
    update_to_precond,
    Q,
    Q_cache,
    running_lower_bound: List[Tensor],
    step: Tensor,
    cached: bool = False,
    prob: Optional[callable] = None,
):
    """Fused variant of ``scale_by_psgd_pro``: applies the preconditioned step
    directly to ``param`` and aborts the rest of the chain via SkipUpdate."""
    _update_psgd_pro_precond(cached, Q_cache, group, param, update_to_precond, Q, running_lower_bound, step, prob)
    # NOTE(review): `update` is passed both as the update and in the grad slot —
    # mirrors the non-PRO fused path; confirm against _fused_cached_psgd_precond_grad.
    _fused_cached_psgd_precond_grad(group, update, param, update, Q, Q_cache)
    raise SkipUpdate from None
1414+
1415+
13001416
def palm_beta2(state, group, update, grad, param):
13011417
beta2 = 1 - group["step"] ** -group["beta2_scale"]
13021418
group["betas"] = (utils.get_beta1(group), beta2)

heavyball/utils.py

Lines changed: 98 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2525,9 +2525,10 @@ def lra_precond(U: Tensor, V: Tensor, d: Tensor, g: Tensor):
25252525

25262526
@decorator_knowngood
def dampen_grad(g: Tensor, damp: float = 2**-13):
    # https://github.com/lixilinx/psgd_torch/blob/89b4cead31b7ad1494c4cf4dc39f4cbf920ff14d/psgd.py
    v = torch.randn_like(g)
    # Elementwise damping: constant floor plus an eps-scaled |g| term, so large
    # entries receive proportionally larger noise.
    damping = damp + torch.finfo(g.dtype).eps * g.abs()
    return v, g + damping * v
25312532

25322533

25332534
@decorator_knowngood
@@ -2768,6 +2769,44 @@ def max_singular_value(A: Tensor, max_svd: int = 0, use_cholesky: bool = False,
27682769
return max_singular_value_power_iter(A, None, iterations=power_iter)
27692770

27702771

2772+
@decorator_knowngood
def max_eigenvalue_spd(A_outer: Tensor, power_iter: int = 4) -> Tensor:
    """Power iteration for the largest eigenvalue of a symmetric positive (semi)definite matrix.

    Exploits A = A^T: A^T A = A^2, so v -> A^T(Av) == v -> A(Av), saving a transpose.
    Uses x @ A.mT (gemm transB=true) for faster BLAS dispatch than A.mv(x).
    """
    # Scalars / vectors: the "matrix" is diagonal-like, so max entry suffices.
    if A_outer.ndim < 2:
        return A_outer.max()
    # Start from the row with the largest norm; its norm also scales A to
    # keep the iteration numerically tame.
    x_norm, max_idx = A_outer.norm(dim=1).max(dim=0)
    x_norm = promote(x_norm)

    def _inner():
        x = A_outer.index_select(0, max_idx).flatten().contiguous()
        A = stochastic_round_(A_outer / x_norm)
        x = x / x_norm

        def _mv(x):
            # Two right-multiplies by A.mT == applying A twice (A symmetric).
            return promote((x.to(A.dtype) @ A.mT) @ A.mT)

        for _ in range(power_iter):
            x = F.normalize(_mv(x), dim=0)
        # Rayleigh quotient of A^2, so sqrt recovers lambda_max(A); undo scaling.
        return (x @ _mv(x)).to(x_norm.dtype).sqrt() * x_norm

    # Guard against the all-zero matrix (x_norm == 0 would divide by zero).
    return cond(x_norm > 0, _inner, lambda: x_norm.squeeze().clone()).squeeze()
2795+
2796+
2797+
@decorator_knowngood
def procrustes_step(Q: Tensor, max_step_size: float = 1 / 8) -> None:
    """One in-place step of the Procrustes-style update on square factor Q.

    Moves Q along R = (Q^T - Q) (the skew-symmetric residual), with a step
    size from a trace-based quadratic model clamped to ``max_step_size``.
    NOTE(review): presumably this drives Q toward symmetry per the PRO method —
    confirm against the psgd_torch reference.
    """
    R = (Q.T - Q).contiguous()
    # Normalize by R's spectral norm; smallest_normal guards divide-by-zero.
    R_norm = max_singular_value(R, power_iter=2) + torch.finfo(R.dtype).smallest_normal
    R = R / R_norm
    RQ = R @ Q
    RRQ = R @ RQ
    tr_RQ = RQ.diagonal().sum()
    tr_RRQ = RRQ.diagonal().sum()
    # Quadratic model step: a = -tr(RQ)/tr(RRQ) when curvature is negative,
    # otherwise fall back to the maximum allowed step.
    a = torch.where(tr_RRQ < 0, torch.clamp(-tr_RQ / tr_RRQ, max=max_step_size), max_step_size)
    # Second-order in-place update: Q += a*RQ + 0.5*a^2*RRQ.
    Q.add_(a * (RQ + 0.5 * a * RRQ))
2808+
2809+
27712810
@decorator_knowngood
27722811
def clamped_max_singular_value(
27732812
A: Tensor, min: float, max_svd: int = 0, use_cholesky: bool = False, power_iter: int = 16
@@ -2927,22 +2966,11 @@ def _chebychef_coeff(degree: int, device, eps: float = 1e-8):
29272966
return coeff0.float(), coeffs.float()
29282967

29292968

2930-
@decorator_knowngood
2931-
def _psgd_default_preconditioner_grad(
2932-
terms: List[Tuple[Tensor, Tensor]],
2933-
Q: List[Tensor],
2934-
) -> List[Tensor]:
2935-
out = []
2936-
for q, (x, y) in zip(Q, terms):
2937-
x = promote(x)
2938-
y = promote(y)
2939-
update = x - y
2940-
if q.ndim < 2:
2941-
update = promote(q) * update
2942-
else:
2943-
update = (promote(q) @ update).triu()
2944-
out.append(update)
2945-
return out
2969+
def _update_lb(ell: Tensor, lb_state: Tensor, beta: Tensor) -> Tensor:
    """EMA-tracked lower bound: relax ``lb_state`` toward ``ell`` with factor
    (1 - beta), never dropping below the fresh estimate; state updated in place."""
    cur = promote(ell)
    prev = promote(lb_state)
    relaxed = prev + (cur - prev) * (1 - beta)
    bound = cur.maximum(relaxed)
    copy_stochastic_(lb_state, bound)
    return bound
29462974

29472975

29482976
@decorator
@@ -2965,15 +2993,61 @@ def psgd_update_precond(
29652993
precond_lr, beta2, lower_bount_beta = scalar_guard(precond_lr, beta2, lower_bount_beta, G)
29662994

29672995
A, conjB = psgd_calc_A_and_conjB(G, Q, V)
2968-
terms = [(compiled_einsum(exprG, A, A), compiled_einsum(exprG, conjB, conjB)) for exprG in exprGs]
2969-
del A, conjB, V
2970-
updates = _psgd_default_preconditioner_grad(terms, Q)
2971-
_psgd_precond_update_(
2972-
updates, oq, running_lower_bound, lower_bount_beta, precond_lr, store_triu_as_line, power_iter
2973-
)
2996+
del V
2997+
2998+
for oq_i, q, exprG, lb_state in zip(oq, Q, exprGs, running_lower_bound):
2999+
term1 = promote(compiled_einsum(exprG, A, A))
3000+
term2 = promote(compiled_einsum(exprG, conjB, conjB))
3001+
3002+
if q.ndim < 2:
3003+
ell = _update_lb((term1 + term2).max(), lb_state, lower_bount_beta)
3004+
update = promote(q) * (term1 - term2)
3005+
else:
3006+
ell = _update_lb(max_eigenvalue_spd(term1 + term2, power_iter=power_iter), lb_state, lower_bount_beta)
3007+
update = (term1 - term2).triu() @ promote(q)
3008+
if store_triu_as_line:
3009+
update = triu_to_line([update])[0][1]
3010+
3011+
real_oq = oq_i[1] if isinstance(oq_i, tuple) else oq_i
3012+
copy_stochastic_(real_oq, promote(real_oq) - update / ell * precond_lr)
29743013
return None
29753014

29763015

3016+
@decorator
def psgd_pro_update_precond(
    G: Tensor,
    precond_lr: float,
    Q: List[Tensor],
    running_lower_bound: List[Tensor],
    lower_bount_beta: float,  # [sic] spelling kept — matches sibling psgd_update_precond
    power_iter: int,
    dampening: float,
) -> None:
    """Update Kronecker product preconditioner Q with Q0.5EQ1.5 (PRO) method."""
    psgd_balance_Q(Q)
    exprGs = calcG_expr(ndim_tuple(Q), G.ndim)
    precond_lr, lower_bount_beta = scalar_guard(precond_lr, lower_bount_beta, G)

    # Noise injection matching dampen_grad's elementwise damping scheme.
    damping = dampening + torch.finfo(G.dtype).eps * G.abs()
    Pg = psgd_precond_grad(G + damping * torch.randn_like(G), Q)

    total_numel = G.numel()
    for q, exprG, lb_state in zip(Q, exprGs, running_lower_bound):
        # Gram-like term of the preconditioned gradient along this factor's axes.
        term1 = promote(compiled_einsum(exprG, Pg, Pg))
        q_ = promote(q)

        if q.ndim < 2:
            # Diagonal factor: target trace per element; max(1, ...) guards empty q.
            term2 = total_numel / max(1, q.numel())
            ell = _update_lb(term1.max() + term2, lb_state, lower_bount_beta)
            copy_stochastic_(q, q_ - q_ * (term1 - term2) / ell * precond_lr)
        else:
            # Dense factor: step normalized by a lower bound on the curvature's
            # largest eigenvalue, then re-symmetrize via a Procrustes step.
            term2 = total_numel / q.shape[0]
            ell = _update_lb(max_eigenvalue_spd(term1, power_iter=power_iter) + term2, lb_state, lower_bount_beta)
            copy_stochastic_(q, q_ - (term1 @ q_ - term2 * q_) / ell * precond_lr)
            procrustes_step(q)
    del Pg
3049+
3050+
29773051
@decorator_knowngood
def bf16_matmul(x: Tensor, y: Tensor):
    """Matmul in promoted precision, cast back to x's dtype."""
    return (promote(x) @ promote(y)).to(x.dtype)

0 commit comments

Comments
 (0)