Commit 88a890c

committed
polishing dict learning. kernelizing rss_grad. optimizing bcd.
1 parent fb33d83 commit 88a890c

8 files changed: +102 −64 lines changed


torch_staintools/augmentor/factory.py

Lines changed: 8 additions & 4 deletions
@@ -58,12 +58,16 @@ def build(method: AUG_TYPE_SUPPORTED,
     aug_method: Callable
     match method:
         case 'macenko' | 'vahadane':
-            return Augmentor.build(method=method, concentration_method=concentration_method,
-                                   rng=rng, target_stain_idx=target_stain_idx,
+            return Augmentor.build(method=method,
+                                   concentration_method=concentration_method,
+                                   rng=rng,
+                                   target_stain_idx=target_stain_idx,
                                    sigma_alpha=sigma_alpha,
-                                   sigma_beta=sigma_beta, luminosity_threshold=luminosity_threshold,
+                                   sigma_beta=sigma_beta,
+                                   luminosity_threshold=luminosity_threshold,
                                    use_cache=use_cache,
                                    regularizer=regularizer,
-                                   cache_size_limit=cache_size_limit, device=device, load_path=load_path)
+                                   cache_size_limit=cache_size_limit,
+                                   device=device, load_path=load_path)
         case _:
             raise NotImplementedError(f"{method} not implemented.")

torch_staintools/constants/config.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ class _Config:
     DICT_POSITIVE_CODE: bool = True


-CONFIG = _Config()
+CONFIG: _Config = _Config()

torch_staintools/constants/param.py

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ class _Param:
    OPTIM_SPARSE_DEFAULT_MAX_ITER: float = 50


-PARAM = _Param()
+PARAM: _Param = _Param()

torch_staintools/functional/concentration/implementation.py

Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@
 @dataclass(frozen=True)
 class ConcentCfg:
     algorithm: METHOD_FACTORIZE = 'fista'
-    regularizer: float = CONFIG.OPTIM_DEFAULT_SPARSE_LAMBDA
+    regularizer: float = PARAM.OPTIM_DEFAULT_SPARSE_LAMBDA
     rng: Optional[torch.Generator] = None
     maxiter: int = PARAM.OPTIM_SPARSE_DEFAULT_MAX_ITER
     lr: Optional[float] = None
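
The one-line fix above matters beyond style: `regularizer` is a dataclass field default, so the lookup runs when the module is imported. Assuming `OPTIM_DEFAULT_SPARSE_LAMBDA` is defined only on `_Param` (the two constants files above show the split between `CONFIG` and `PARAM`), reading it through `CONFIG` would raise immediately. A minimal sketch with a hypothetical value:

    from dataclasses import dataclass

    @dataclass(frozen=True)
    class _Config:
        DICT_POSITIVE_CODE: bool = True

    @dataclass(frozen=True)
    class _Param:
        OPTIM_DEFAULT_SPARSE_LAMBDA: float = 0.1  # hypothetical value for illustration

    CONFIG: _Config = _Config()
    PARAM: _Param = _Param()

    print(PARAM.OPTIM_DEFAULT_SPARSE_LAMBDA)   # 0.1
    print(CONFIG.OPTIM_DEFAULT_SPARSE_LAMBDA)  # AttributeError: _Config has no such attribute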

torch_staintools/functional/optimization/dict_learning.py

Lines changed: 50 additions & 31 deletions
@@ -5,20 +5,24 @@
 from .sparse_util import METHOD_SPARSE, validate_code, initialize_dict, collate_params
 import torch
 import torch.nn.functional as F
-from typing import Optional, cast
+from typing import Optional, cast, Tuple
 from ..eps import get_eps
 from torch_staintools.constants import CONFIG
 
+
+@torch.compile
 def update_dict_cd(dictionary: torch.Tensor, x: torch.Tensor, code: torch.Tensor,
                    positive: bool = True,
-                   dead_thresh=1e-7, rng: torch.Generator = None):
+                   dead_thresh=1e-7,
+                   rng: torch.Generator = None) -> Tuple[torch.Tensor, torch.Tensor]:
     """Update the dictionary (stain matrix) using Block Coordinate Descent algorithm.
 
     Can satisfy the positive constraint of dictionaries if specified.
-
+    Side effects: code is updated inplace.
 
     Args:
-        dictionary: Tensor of shape (n_features, n_components) Value of the dictionary at the previous iteration.
+        dictionary: Tensor of shape (n_features, n_components).
+            Value of the dictionary at the previous iteration.
         x: Tensor of shape (n_samples, n_components)
             Sparse coding of the data against which to optimize the dictionary.
         code: Tensor of shape (n_samples, n_components)
@@ -28,7 +32,7 @@ def update_dict_cd(dictionary: torch.Tensor, x: torch.Tensor, code: torch.Tensor
         rng: torch.Generator for initialization of dictionary and code.
 
     Returns:
-
+        torch.Tensor, torch.Tensor, corresponding to the weight and the updated code.
     """
     n_components = dictionary.size(1)
 
@@ -38,10 +42,18 @@
     for k in range(n_components):
         d_k = dictionary[:, k]
         z_k = code[:, k]
-        update_term = torch.outer(z_k, d_k)
-        # Update k'th atom
-        R += update_term
-        new_d_k = torch.mv(R.T, z_k)
+
+        # vanilla. new_d = (R + z*d^T)^T * z
+        # new_d = R^T*z + (d*z^T)*z = R^T*z + d*(z^T*z)
+        # update_term = torch.outer(z_k, d_k)
+        # R += update_term
+        # new_d_k = torch.mv(R.T, z_k)  # target
+
+        # R^T*z
+        rtz = torch.mv(R.T, z_k)
+        ztz = torch.dot(z_k, z_k)
+        new_d_k = rtz + (d_k * ztz)
+
         if positive:
             new_d_k = torch.clamp(new_d_k, min=0)
 
@@ -60,14 +72,22 @@ def update_dict_cd(dictionary: torch.Tensor, x: torch.Tensor, code: torch.Tensor
         d_k_standard = new_d_k / (d_norm + get_eps(dictionary))
         d_k_final = torch.where(is_dead, d_k_random, d_k_standard)
         z_k_final = torch.where(is_dead, torch.zeros_like(z_k), z_k)
+
+        # fused
+        # must be done before updating the dict
+        r_delta = torch.outer(z_k, d_k) - torch.outer(z_k_final, d_k_final)
+
         dictionary[:, k] = d_k_final
         code[:, k] = z_k_final
-        R -= torch.outer(z_k_final, d_k_final)
 
-    return dictionary
+        # R -= torch.outer(z_k_final, d_k_final)
+        R += r_delta
+
+    return dictionary, code
 
 
-def update_dict_ridge(x, code, lambd=1e-4):
+@torch.compile
+def update_dict_ridge(x: torch.Tensor, code: torch.Tensor, lambd: float) -> Tuple[torch.Tensor, torch.Tensor]:
     """Update an (unconstrained) dictionary with ridge regression
 
     This is equivalent to a Newton step with the (L2-regularized) squared
@@ -80,17 +100,17 @@ def update_dict_ridge(x, code, lambd=1e-4):
         lambd: weight decay parameter
 
     Returns:
-
+        torch.Tensor, torch.Tensor, corresponding to the weight and the unmodified code.
     """
 
     rhs = torch.mm(code.T, x)
     M = torch.mm(code.T, code)
     M.diagonal().add_(lambd * x.size(0))
     L = torch.linalg.cholesky(M)
-    V = torch.cholesky_solve(rhs, L).T
+    weight = torch.cholesky_solve(rhs, L).T
 
-    V = F.normalize(V, dim=0, eps=1e-12)
-    return V
+    weight = F.normalize(weight, dim=0, eps=1e-12)
+    return weight, code
 
 
 def sparse_code(x: torch.Tensor,
@@ -118,7 +138,6 @@ def sparse_code(x: torch.Tensor,
         raise ValueError("invalid algorithm parameter '{}'.".format(algorithm))
     return z
 
-
 def dict_learning_loop(x: torch.Tensor,
                        z0: torch.Tensor,
                        weight: torch.Tensor,
@@ -135,7 +154,6 @@ def dict_learning_loop(x: torch.Tensor,
 
     for _ in range(steps):
         # infer sparse coefficients and compute loss
-
         z = sparse_code(x, weight, alpha, z0, algorithm=cast(METHOD_SPARSE, algorithm),
                         lr=lr, maxiter=maxiter, tol=tol,
                         positive_code=CONFIG.DICT_POSITIVE_CODE).contiguous()
@@ -145,36 +163,37 @@
         if CONFIG.DICT_PERSIST_CODE:
             z0 = z
         else:
-            z0 = validate_code(algorithm, init, None, weight, x, rng)
+            z0 = validate_code(algorithm, init, z0=None, x=x, weight=weight, rng=rng)
 
         # update dictionary
         if CONFIG.DICT_POSITIVE_DICTIONARY:
-            weight = update_dict_cd(weight, x, z, positive=True, rng=rng)
+            weight, z = update_dict_cd(weight, x, z, positive=True, rng=rng)
         else:
-            weight = update_dict_ridge(x, z, lambd=lambd_ridge)
+            weight, z = update_dict_ridge(x, z, lambd=lambd_ridge)
 
     return weight
 
 
 def dict_learning(x: torch.Tensor,
                   n_components: int,
                   algorithm: METHOD_SPARSE,
-                  *, alpha: float = 1e-1,
-                  lambd_ridge: float = 1e-2,
-                  steps: int = 60,
-                  rng: torch.Generator = None,
-                  init: Optional[str] = 'zero',
-                  lr: Optional[float] = None,
-                  maxiter: int = 50,
-                  tol: float = 1e-5, ):
+                  *, alpha: float,
+                  lambd_ridge: float,
+                  steps: int,
+                  rng: Optional[torch.Generator],
+                  init: Optional[str],
+                  lr: Optional[float],
+                  maxiter: int,
+                  tol: float, ):
     n_samples, n_features = x.shape
+    # pixel x c
     x = x.contiguous()
-
+    # c x stain
     weight = initialize_dict(n_features=n_features, n_components=n_components, device=x.device,
                              rng=rng, positive_dict=CONFIG.DICT_POSITIVE_DICTIONARY)
 
     # initialize
-    z0 = validate_code(algorithm, init, None, weight, x, rng)
+    z0 = validate_code(algorithm, init, z0=None, x=x, weight=weight, rng=rng)
     assert z0 is not None
     lr, alpha, tol = collate_params(z0, x, lr, weight, alpha, tol)
     return dict_learning_loop(x, z0, weight, alpha, algorithm, lambd_ridge=lambd_ridge,
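
Two things are going on in the `update_dict_cd` rewrite above: the atom update is kernelized via the identity (R + z dᵀ)ᵀ z = Rᵀ z + d (zᵀ z), so no n × c outer product or extra pass over `R` is needed to read off the new atom, and the two rank-1 patches of `R` are fused into a single net delta that must be formed from the pre-update columns, since `d_k`/`z_k` are views. Both updaters also now return a `(weight, code)` pair so `dict_learning_loop` can unpack either branch uniformly. A standalone check of the identity and the view-aliasing point (sizes are illustrative, not the library API):

    import torch

    n, c = 512, 3                                   # pixels x channels (illustrative)
    R = torch.randn(n, c)                           # residual matrix
    d = torch.randn(c)                              # one dictionary atom
    z = torch.randn(n)                              # matching code column

    # vanilla: materialize R + z d^T, then read the new atom off it
    vanilla = torch.mv((R + torch.outer(z, d)).T, z)
    # kernelized: R^T z + d (z^T z), no n x c outer product
    fast = torch.mv(R.T, z) + d * torch.dot(z, z)
    assert torch.allclose(vanilla, fast, atol=1e-5)

    # why r_delta is computed before the writes: columns are views, not copies
    M = torch.eye(3)
    col = M[:, 0]                                   # view into M
    M[:, 0] = torch.full((3,), 5.0)
    print(col)                                      # tensor([5., 5., 5.]) -- the view follows the write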

torch_staintools/functional/optimization/solver.py

Lines changed: 35 additions & 19 deletions
@@ -56,9 +56,16 @@ def cd_update(z, b):
     return z
 
 def rss_grad(z_k: torch.Tensor, x: torch.Tensor, weight: torch.Tensor):
+    # kernelize it?
     resid = torch.matmul(z_k, weight.T) - x
     return torch.matmul(resid, weight)
 
+def rss_grad_fast(z_k: torch.Tensor, hessian: torch.Tensor, b: torch.Tensor):
+    return torch.mm(z_k, hessian) - b
+
+def _grad_precompute(x: torch.Tensor, weight: torch.Tensor):
+    # return Hessian and bias
+    return torch.mm(weight.T, weight), torch.mm(x, weight)
 
 def softshrink(x: torch.Tensor, lambd: torch.Tensor) -> torch.Tensor:
     lambd = lambd.clamp_min(0)
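
This answers the `# kernelize it?` note: for the residual sum of squares ½‖z Wᵀ − x‖², the gradient (z Wᵀ − x) W expands to z (Wᵀ W) − x W, so the k × k Gram matrix (`hessian`) and the n × k bias (`b`) can be formed once and reused every iteration instead of touching the n × c data twice per step. A standalone equivalence check (illustrative sizes):

    import torch

    n, c, k = 1024, 3, 2                        # pixels, channels, stains (illustrative)
    x = torch.randn(n, c)
    weight = torch.randn(c, k)
    z = torch.randn(n, k)

    g_direct = (z @ weight.T - x) @ weight      # rss_grad: touches the n x c data each call
    hessian, b = weight.T @ weight, x @ weight  # _grad_precompute: computed once
    g_fast = z @ hessian - b                    # rss_grad_fast
    assert torch.allclose(g_direct, g_fast, atol=1e-5)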
@@ -67,8 +74,8 @@ def softshrink(x: torch.Tensor, lambd: torch.Tensor) -> torch.Tensor:
 
 def ista_step(
         z: torch.Tensor,
-        x: torch.Tensor,
-        weight: torch.Tensor,
+        hessian: torch.Tensor,
+        b: torch.Tensor,
         alpha: torch.Tensor,
         lr: torch.Tensor,
         positive: bool,
@@ -77,8 +84,10 @@ def ista_step(
 
     Args:
         z: code. num_pixels x num_stain
-        x: OD space. num_pixels x num_channel
-        weight: init from stain matrix --> num_channel x num_stain
+        # x: OD space. num_pixels x num_channel
+        # weight: init from stain matrix --> num_channel x num_stain
+        hessian: precomputed wtw
+        b: precomputed xw
         alpha: tensor form of the ista penalizer
         lr: tensor form of step size
         positive: if force z to be positive
@@ -88,7 +97,8 @@ def ista_step(
 
 
     z_k_safe = torch.nan_to_num(z, nan=0.0, posinf=0.0, neginf=0.0)
-    g = rss_grad(z_k_safe, x, weight)  # same shape as z
+    # g = rss_grad(z_k_safe, x, weight)  # same shape as z
+    g = rss_grad_fast(z_k_safe, hessian, b)
     g_safe = torch.nan_to_num(g, nan=0.0, posinf=0.0, neginf=0.0)
 
     # guard lr
@@ -108,15 +118,15 @@ def fista_step(
         z: torch.Tensor,
         y: torch.Tensor,
         t: torch.Tensor,
-        x: torch.Tensor,
-        weight: torch.Tensor,
+        hessian: torch.Tensor,
+        b: torch.Tensor,
         alpha: torch.Tensor,
         lr: torch.Tensor,
         positive_code: bool,
         tol: float
 ):
 
-    z_next = ista_step(y, x, weight, alpha, lr, positive_code)
+    z_next = ista_step(y, hessian, b, alpha, lr, positive_code)
     delta = z_next - z
     diff = delta.abs().sum()
     just_finished = diff <= tol
@@ -128,12 +138,12 @@
 
 
 @torch.compile
-def ista_loop(z: torch.Tensor, x: torch.Tensor, weight: torch.Tensor,
+def ista_loop(z: torch.Tensor, hessian: torch.Tensor, b: torch.Tensor,
               alpha: torch.Tensor, lr: torch.Tensor,
               tol: float, maxiter: int, positive_code: bool):
     is_converged = torch.tensor(False, device=z.device, dtype=torch.bool)
     for _ in range(maxiter):
-        z_next = ista_step(z, x, weight, alpha, lr, positive_code)
+        z_next = ista_step(z, hessian, b, alpha, lr, positive_code)
         # check convergence
         diff = (z - z_next).abs().sum()
         just_finished = diff <= tol
@@ -146,8 +156,8 @@
 @torch.compile
 def fista_loop(
         z: torch.Tensor,
-        x: torch.Tensor,
-        weight: torch.Tensor,
+        hessian: torch.Tensor,
+        b: torch.Tensor,
         alpha: torch.Tensor,
         lr: torch.Tensor,
         tol: float,
@@ -158,8 +168,10 @@ def fista_loop(
 
     Args:
         z: Initial guess
-        x: Data input (OD space)
-        weight: Dictionary matrix
+        # x: Data input (OD space)
+        # weight: Dictionary matrix
+        hessian: precomputed wtw
+        b: precomputed xw
         alpha: Regularization strength
         lr: Learning rate
         maxiter: Maximum iterations
@@ -176,7 +188,7 @@ def fista_loop(
     for i in range(maxiter):
 
         z_next, y_next, t_next, just_finished = fista_step(z, y, t,
-                                                           x, weight,
+                                                           hessian, b,
                                                            alpha, lr,
                                                            positive_code, tol)
 
@@ -212,8 +224,10 @@ def ista(x: torch.Tensor, z0: torch.Tensor,
     z0 = z0.contiguous()
     x = x.contiguous()
     weight = weight.contiguous()
-
-    return ista_loop(z0, x, weight, alpha, lr, tol, maxiter, positive_code)
+    hessian, b = _grad_precompute(x, weight)
+    # hessian = hessian.contiguous()
+    # b = b.contiguous()
+    return ista_loop(z0, hessian, b, alpha, lr, tol, maxiter, positive_code)
 
 
 def fista(x: torch.Tensor, z0: torch.Tensor,
@@ -240,5 +254,7 @@ def fista(x: torch.Tensor, z0: torch.Tensor,
     z0 = z0.contiguous()
     x = x.contiguous()
     weight = weight.contiguous()
-
-    return fista_loop(z0, x, weight, alpha, lr, tol, maxiter, positive_code)
+    hessian, b = _grad_precompute(x, weight)
+    # hessian = hessian.contiguous()
+    # b = b.contiguous()
+    return fista_loop(z0, hessian, b, alpha, lr, tol, maxiter, positive_code)
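
With the precompute hoisted into `ista()`/`fista()`, the compiled loops never revisit the pixel matrix: each step multiplies by the small k × k Gram matrix instead of running two data-sized products. A self-contained sketch of the same pattern — not the library API, and the step-size/penalty choices are illustrative:

    import torch
    import torch.nn.functional as F

    def ista_sketch(x: torch.Tensor, weight: torch.Tensor,
                    alpha: float = 0.1, maxiter: int = 50) -> torch.Tensor:
        # solve min_z 0.5 * ||z @ weight.T - x||^2 + alpha * ||z||_1
        hessian = weight.T @ weight                    # (k, k), computed once
        b = x @ weight                                 # (n, k), computed once
        lr = 1.0 / torch.linalg.eigvalsh(hessian)[-1]  # 1/L, L = Lipschitz const. of the gradient
        z = torch.zeros_like(b)
        for _ in range(maxiter):
            g = z @ hessian - b                        # kernelized gradient, no n x c product here
            z = F.softshrink(z - lr * g, lambd=float(lr * alpha))
        return z

    x = torch.rand(1024, 3)      # OD pixels (illustrative)
    weight = torch.rand(3, 2)    # stain matrix (illustrative)
    print(ista_sketch(x, weight).shape)   # torch.Size([1024, 2])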

torch_staintools/functional/optimization/sparse_util.py

Lines changed: 4 additions & 4 deletions
@@ -1,4 +1,4 @@
-from typing import Optional, Literal, get_args, Tuple
+from typing import Optional, Literal, get_args, Tuple, cast
 import torch
 from torch.nn import functional as F
 from torch_staintools.constants import PARAM
 
@@ -81,14 +81,14 @@
 
 
 def validate_code(algorithm: METHOD_SPARSE,
-                  init: str, z0: Optional[torch.Tensor],
-                  x: torch.Tensor, weight, rng):
+                  init: Optional[MODE_INIT], z0: Optional[torch.Tensor],
+                  x: torch.Tensor, weight: torch.Tensor, rng):
     # initialize code variable
     n_samples = x.size(0)
     n_components = weight.size(1)
     init = _init_defaults.get(algorithm, 'zero') if init is None else init
     if z0 is None:
-        z0 = initialize_code(x, weight, mode=init, rng=rng)
+        z0 = initialize_code(x, weight, mode=cast(MODE_INIT, init), rng=rng)
     assert z0.shape == (n_samples, n_components)
     return z0
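
One more thing the keyword-argument call sites in dict_learning.py above buy: `validate_code` declares `z0, x, weight` in that order, while the old positional calls passed `None, weight, x`, which appears to have silently swapped data and dictionary (both are plain tensors, so nothing crashes at the call). A minimal illustration of the hazard, using a hypothetical helper rather than the library's:

    import torch

    def shape_check(x: torch.Tensor, weight: torch.Tensor) -> tuple:
        # expects x: (n_samples, n_features), weight: (n_features, n_components)
        return x.size(0), weight.size(1)

    x = torch.rand(1024, 3)      # data
    weight = torch.rand(3, 2)    # dictionary

    print(shape_check(weight, x))           # (3, 3) -- positional swap goes unnoticed
    print(shape_check(x=x, weight=weight))  # (1024, 2) -- keywords make the intent explicit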
