Skip to content

Commit 0e9b3fe

Browse files
committed
Replace Gram matrix V projection with factored form
Replace dV -= dV @ (V_high^T @ V_high) with the factored form dV -= (dV @ V_high^T) @ V_high. Under FSDP2 this replaces an (M, M) all-reduce with a (k_high, M) all-gather — M/k_high fewer bytes (2x for square weights, 7x for down_proj) and all-gather is cheaper per byte. Add opt-in caching of the all-gathered V_high (OSFT_CACHE_V=1, default off). V_high is frozen, so the cache is exact. Includes bench_v_proj.py benchmark and 10 new tests.
1 parent 4d6dc87 commit 0e9b3fe

File tree

3 files changed

+580
-13
lines changed

3 files changed

+580
-13
lines changed

benchmarks/bench_v_proj.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
"""Benchmark: Gram vs factored V projection under FSDP2-style sharding.
2+
3+
Measures the V projection step of project_gradient_to_orthogonal_space()
4+
with Llama-8B shapes. Compares three modes:
5+
6+
Gram: all-reduce (M, M) Gram matrix, then dV -= dV @ G
7+
Factored: all-gather (k_high, M) V_high, then dV -= (dV @ V^T) @ V
8+
Cached: reuse V_high from prior step (no comm), same matmuls
9+
10+
Usage:
11+
python bench_v_proj.py # requires 2 GPUs
12+
"""
13+
14+
import os
15+
import time
16+
17+
import torch
18+
import torch.distributed as dist
19+
import torch.multiprocessing as mp
20+
21+
22+
def bench_target(name, k_high, k_low, M, P, dev, n_iters=100):
    """Time the three V-projection variants for one OSFT target shape.

    Args:
        name: Label for the target (e.g. "q_proj").
        k_high: Global count of frozen high-rank singular vectors.
        k_low: Global count of trainable low-rank singular vectors.
        M: Column dimension of V (the weight's second dimension).
        P: Shard count (world size); k_high and k_low split P ways.
        dev: CUDA device string for this rank.
        n_iters: Timed iterations per variant.

    Returns:
        Dict with per-variant mean step times (ms), per-step comm byte
        counts, and a float32 max |Gram - factored| correctness check.
    """
    shard_k_high = k_high // P
    shard_k_low = k_low // P

    # V_high rows are orthonormal in the real code (they come from an SVD);
    # build an equivalent basis via QR in float32, then cast to bf16.
    q_mat = torch.linalg.qr(torch.randn(M, k_high, device=dev))[0]
    V_full_f32 = q_mat.T[:k_high]  # (k_high, M) orthonormal rows
    V_full = V_full_f32.to(torch.bfloat16)

    # This rank's dim-0 shard of V_high, plus a reusable gradient shard.
    row0 = rank * shard_k_high
    shard_V = V_full[row0:row0 + shard_k_high].contiguous()
    shard_dV = torch.randn(shard_k_low, M, device=dev, dtype=torch.bfloat16)
    dV_orig = shard_dV.clone()

    # Destination buffer for the all-gathered V_high.
    V_gathered = torch.empty_like(V_full)

    def gram_step():
        # dV -= dV @ (V^T V): all-reduce the (M, M) Gram matrix.
        shard_dV.copy_(dV_orig)
        gram = torch.mm(shard_V.T, shard_V)
        dist.all_reduce(gram)
        shard_dV.add_(torch.mm(shard_dV, gram), alpha=-1.0)

    def factored_step():
        # dV -= (dV @ V^T) @ V: all-gather the (k_high, M) V_high.
        shard_dV.copy_(dV_orig)
        dist.all_gather_into_tensor(V_gathered, shard_V)
        proj = torch.mm(shard_dV, V_gathered.T)
        shard_dV.addmm_(proj, V_gathered, alpha=-1.0)

    def cached_step():
        # Same matmuls as factored, reusing the already-gathered V_high.
        shard_dV.copy_(dV_orig)
        proj = torch.mm(shard_dV, V_gathered.T)
        shard_dV.addmm_(proj, V_gathered, alpha=-1.0)

    def time_variant(step):
        # Align all ranks, then time n_iters back-to-back steps; one final
        # sync so the mean includes all queued GPU work.
        torch.cuda.synchronize()
        dist.barrier()
        start = time.perf_counter()
        for _ in range(n_iters):
            step()
        torch.cuda.synchronize()
        return (time.perf_counter() - start) / n_iters * 1000

    # Warm both comm paths before timing (cached reuses factored's buffer).
    for _ in range(10):
        gram_step()
        factored_step()
    torch.cuda.synchronize()
    dist.barrier()

    gram_ms = time_variant(gram_step)
    fact_ms = time_variant(factored_step)
    cached_ms = time_variant(cached_step)

    # Correctness: verify in float32 (bf16 accumulation differs by ~5%).
    dV32 = dV_orig.float()
    via_gram = dV32 - dV32 @ (V_full_f32.T @ V_full_f32)
    via_factored = dV32 - (dV32 @ V_full_f32.T) @ V_full_f32
    max_diff = (via_factored - via_gram).abs().max().item()

    return {
        "name": name, "k_high": k_high, "M": M,
        "gram_bytes": M * M * 2, "fact_bytes": k_high * M * 2,
        "gram_ms": gram_ms, "fact_ms": fact_ms, "cached_ms": cached_ms,
        "max_diff": max_diff,
    }
103+
104+
105+
# Module-level rank, overwritten by run() in each spawned process;
# bench_target() reads it to pick this process's shard of V_high.
rank = 0
107+
108+
109+
def run(rank_, world_size):
    """Per-process entry point: init NCCL, run benchmarks, report on rank 0.

    Args:
        rank_: Rank supplied by mp.spawn; published to the module-level
            ``rank`` global that bench_target() reads for shard indexing.
        world_size: Total number of processes/GPUs.
    """
    global rank
    rank = rank_
    os.environ.update(MASTER_ADDR="localhost", MASTER_PORT="29500",
                      RANK=str(rank), WORLD_SIZE=str(world_size))
    dist.init_process_group("nccl")
    torch.cuda.set_device(rank)
    # Identical seed on every rank so V_full matches across shards.
    torch.manual_seed(42)
    dev = f"cuda:{rank}"

    # Llama-8B shapes, URR=0.5: (name, k_high, k_low, M)
    targets = [
        ("down_proj", 2048, 2048, 14336),  # (4096, 14336) → ratio = 7x
        ("q_proj", 2048, 2048, 4096),      # (4096, 4096) → ratio = 2x
    ]
    results = [
        bench_target(name, k_high, k_low, M, world_size, dev)
        for name, k_high, k_low, M in targets
    ]

    if rank == 0:
        gpu = torch.cuda.get_device_name(0)
        print(f"V projection benchmark — {world_size}× {gpu}, Llama-8B shapes, bf16")
        print("=" * 70)
        print()
        for stats in results:
            ratio = stats["M"] / stats["k_high"]
            print(f" {stats['name']:10s} V_high ({stats['k_high']}, {stats['M']})"
                  f" M/k_high = {ratio:.0f}x")
            print(f" {'':10s} Gram Factored Cached")
            print(f" {'':10s} ---------- ---------- ----------")
            print(f" {'comm':10s} all-reduce all-gather none")
            print(f" {'bytes':10s} {stats['gram_bytes']/1e6:>7.0f} MB {stats['fact_bytes']/1e6:>7.0f} MB {0:>7.0f} MB")
            print(f" {'time':10s} {stats['gram_ms']:>7.2f} ms {stats['fact_ms']:>7.2f} ms {stats['cached_ms']:>7.2f} ms")
            print(f" {'speedup':10s} {'—':>10s} {stats['gram_ms']/stats['fact_ms']:>7.1f}x {stats['gram_ms']/stats['cached_ms']:>7.1f}x")
            ok = "ok" if stats["max_diff"] < 1e-4 else "FAIL"
            print(f" {'correct':10s} — f32 max diff = {stats['max_diff']:.1e} ({ok})")
            print()

        # Aggregate: 32 layers × 7 targets (4 q/k/v/o + 2 gate/up + 1 down).
        # NOTE(review): gate/up share q_proj's V shape here; k/v are not
        # benchmarked separately, so q_proj stands in for all six — confirm
        # this approximation is intended.
        square = next(r for r in results if r["name"] == "q_proj")
        down = next(r for r in results if r["name"] == "down_proj")
        gram_tot = (6 * square["gram_ms"] + down["gram_ms"]) * 32
        fact_tot = (6 * square["fact_ms"] + down["fact_ms"]) * 32
        cached_tot = (6 * square["cached_ms"] + down["cached_ms"]) * 32
        print(" Aggregate (32 layers × 7 targets = 224):")
        print(f" {'':10s} Gram Factored Cached")
        print(f" {'total':10s} {gram_tot:>7.0f} ms {fact_tot:>7.0f} ms {cached_tot:>7.0f} ms")
        print(f" {'speedup':10s} {'—':>10s} {gram_tot/fact_tot:>7.1f}x {gram_tot/cached_tot:>7.1f}x")
        print()

    dist.destroy_process_group()
163+
164+
165+
if __name__ == "__main__":
    # One process per GPU; mp.spawn supplies the rank as run()'s first arg.
    world_size = 2
    mp.spawn(run, args=(world_size,), nprocs=world_size, join=True)

src/mini_trainer/osft_utils.py

Lines changed: 93 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,20 @@
3030
os.getenv("OSFT_CACHE_CLEAR_INTERVAL", 5)
3131
) # Clear GPU cache every N parameters during matrix reconstruction
3232

33+
# Opt-in: cache the all-gathered (full) V_high to avoid repeating the
34+
# all-gather on every step. V_high is frozen (requires_grad=False), so the
35+
# cache is exact.
36+
#
37+
# V projection uses the factored form: dV -= (dV @ V_high^T) @ V_high.
38+
# Under FSDP2, V_high is dim-0 sharded, so the factored form requires an
39+
# all-gather of V_high (k_high × M). Caching stores the all-gathered result.
40+
#
41+
# Default OFF because the cache is REPLICATED on every FSDP2 rank (not
42+
# sharded), adding ~5.1 GB per rank for Llama-8B (all 224 targets in bf16).
43+
# For Llama-70B+ the cache exceeds GPU memory. Set to "1" only after
44+
# confirming sufficient memory headroom.
45+
OSFT_CACHE_V = os.getenv("OSFT_CACHE_V", "0") == "1"
46+
3347

3448
def _supports_use_batch() -> bool:
3549
"""Check if torch.distributed send/recv_object_list support the use_batch parameter (PyTorch 2.9+)."""
@@ -514,11 +528,34 @@ def reconstruct_weight_matrix(
514528
return reconstructed
515529

516530

517-
def project_gradient_to_orthogonal_space(svd_dict: SVDDecompositionDict):
531+
def project_gradient_to_orthogonal_space(
532+
svd_dict: SVDDecompositionDict,
533+
cache_holder: "nn.Module | None" = None,
534+
):
518535
"""
519-
Projects the gradient of the low-rank parameters (U_low, V_low) to be orthogonal to the frozen high-rank subspace.
536+
Projects the gradient of the low-rank parameters (U_low, V_low) to be
537+
orthogonal to the frozen high-rank subspace.
538+
539+
Both projections use the factored form:
540+
dU -= U_high @ (U_high^T @ dU)
541+
dV -= (dV @ V_high^T) @ V_high
520542
521-
This step ensures that learning new tasks does not interfere with previously learned representations by enforcing an orthogonality constraint.
543+
Under FSDP2, U_high is dim-0 sharded along N (the large dimension), so
544+
U_high^T @ dU contracts over the sharded dim → partial sum → all-reduce
545+
of a small (k_high, k_low) matrix.
546+
547+
V_high is dim-0 sharded along k_high (the small dimension), so the
548+
factored form requires an all-gather of V_high to get the full
549+
(k_high, M) tensor. This is M/k_high fewer bytes than the Gram
550+
matrix all-reduce (k_high × M vs M × M) — 2x for square weights,
551+
7x for down_proj where k_high = min(N, M) × (1 - URR).
552+
553+
Args:
554+
svd_dict: Dictionary containing the SVD decomposition components.
555+
cache_holder: Optional module on which to cache the all-gathered
556+
V_high. V_high is frozen, so the cache is exact. When provided
557+
and OSFT_CACHE_V is enabled, V_high is all-gathered once and
558+
reused on subsequent steps.
522559
523560
TODO(osilkin): Add mixed-precision gradients here
524561
"""
@@ -551,21 +588,37 @@ def project_gradient_to_orthogonal_space(svd_dict: SVDDecompositionDict):
551588
else:
552589
dU.copy_(local_dU)
553590

554-
# Repeat projection for V_low using V_high
591+
# Project V_low gradients: dV -= (dV @ V_high^T) @ V_high
592+
# All-gather V_high from FSDP2 shards (or use cache) — see docstring for cost analysis.
555593
if svd_dict["V_low"].grad is not None:
556594
dV = svd_dict["V_low"].grad
557595
local_V_high = getattr(V_high, "to_local", lambda: V_high)()
558596
local_dV = getattr(dV, "to_local", lambda: dV)()
559597

560-
# Compute Gram matrix G = V_high^T @ V_high for global projection across row-sharded V_high
561-
# Assumes column dimension is consistent across ranks (row sharding over singular vectors)
562-
G_local = torch.mm(local_V_high.transpose(0, 1), local_V_high)
563-
if dist.is_initialized() and dist.get_world_size() > 1:
564-
dist.all_reduce(G_local, op=dist.ReduceOp.SUM)
598+
# V_high is frozen — reuse cached all-gathered tensor when available.
599+
can_cache = OSFT_CACHE_V and cache_holder is not None
600+
cached = getattr(cache_holder, "_osft_v_high_full", None) if can_cache else None
565601

566-
# Apply projection: dV = dV - dV @ G (use local shard of dV)
567-
update = torch.mm(local_dV, G_local)
568-
local_dV.add_(update, alpha=-1.0)
602+
if cached is not None:
603+
V_high_full = cached
604+
else:
605+
if dist.is_initialized() and dist.get_world_size() > 1:
606+
world_size = dist.get_world_size()
607+
V_high_full = torch.empty(
608+
local_V_high.shape[0] * world_size, local_V_high.shape[1],
609+
dtype=local_V_high.dtype, device=local_V_high.device,
610+
)
611+
dist.all_gather_into_tensor(V_high_full, local_V_high)
612+
else:
613+
V_high_full = local_V_high
614+
if can_cache:
615+
# .detach() ensures plain Tensor, not nn.Parameter — avoids
616+
# nn.Module.__setattr__ registering it into state_dict.
617+
cache_holder._osft_v_high_full = V_high_full.detach()
618+
619+
# Two local matmuls — no (M, M) intermediate
620+
coeff = torch.mm(local_dV, V_high_full.transpose(0, 1)) # (k_low/P, k_high)
621+
local_dV.addmm_(coeff, V_high_full, alpha=-1.0) # (k_low/P, M)
569622

570623
if hasattr(dV, "_local_tensor"):
571624
dV._local_tensor.copy_(local_dV)
@@ -966,6 +1019,10 @@ def _reset_osft_metadata(self):
9661019
self.osft_paramspec_registry = {}
9671020
self._osft_handles = {}
9681021
self.osft_params = {}
1022+
# Clear any cached all-gathered V_high — V_high changes on reinit.
1023+
for module in self.modules():
1024+
if hasattr(module, "_osft_v_high_full"):
1025+
del module._osft_v_high_full
9691026

9701027
@staticmethod
9711028
def _load_non_distributed(
@@ -1982,7 +2039,14 @@ def project_gradients(self):
19822039
with the high-rank subspace encoding prior task knowledge.
19832040
19842041
This method should be called after backpropagation and before optimizer step.
2042+
2043+
When ``OSFT_CACHE_V=1`` is set, the all-gathered V_high tensor is
2044+
cached on each module after the first step. V_high is frozen, so
2045+
the cache is exact. This eliminates per-step V all-gather traffic.
2046+
Default is off because the cache is replicated on every FSDP2 rank
2047+
(~5.1 GB for Llama-8B, infeasible for 70B+).
19852048
"""
2049+
caches_populated_this_call = 0
19862050
for module in self.modules():
19872051
# Only process real OSFT-attached linear modules, not the top-level container
19882052
if (
@@ -1991,11 +2055,27 @@ def project_gradients(self):
19912055
and hasattr(module, "osft_S_high")
19922056
and hasattr(module, "osft_V_high")
19932057
):
2058+
had_cache = hasattr(module, "_osft_v_high_full")
19942059
try:
19952060
svd_dict = self.get_svd_dict_for_module(module)
19962061
except ValueError as err:
19972062
raise ValueError(f"error in projecting gradients for module: {module}") from err
1998-
project_gradient_to_orthogonal_space(svd_dict)
2063+
project_gradient_to_orthogonal_space(svd_dict, cache_holder=module)
2064+
if not had_cache and hasattr(module, "_osft_v_high_full"):
2065+
caches_populated_this_call += 1
2066+
2067+
if caches_populated_this_call > 0:
2068+
total_bytes = sum(
2069+
module._osft_v_high_full.nelement() * module._osft_v_high_full.element_size()
2070+
for module in self.modules()
2071+
if hasattr(module, "_osft_v_high_full")
2072+
)
2073+
log_rank_0(
2074+
f"Cached {caches_populated_this_call} V_high tensors "
2075+
f"({total_bytes / 1e9:.2f} GB). "
2076+
f"Subsequent steps skip V all-gathers. "
2077+
f"Set OSFT_CACHE_V=0 to disable."
2078+
)
19992079

20002080
def prepare_state_dict_for_save(self, state_dict):
20012081
"""Reconstruct dense weights into ``state_dict`` for saving with memory optimization."""

0 commit comments

Comments
 (0)