 from torch import Tensor, nn
 from torch.nn import functional as F, grad

-from truegrad.functional import add, einsum, matmul, mul, reshape
+from truegrad.functional import add, chunk, einsum, matmul, mul, reshape, split, transpose

 _torch_functional = {k: getattr(F, k) for k in dir(F)}
 _torch = {k: getattr(torch, k) for k in dir(torch)}
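The widened import pulls in the functional wrappers that the hunks below substitute for the corresponding Tensor methods. A minimal plain-torch sketch of what each wrapper call is taken to compute here; the equivalence is inferred from how the calls are used in this diff, not from the truegrad API, so treat it as an assumption:

    import torch

    E = 4
    w = torch.randn(3 * E, E)

    # chunk(w, 3, 0) is used below where w.chunk(3) was: three equal slices along dim 0
    ref_chunk = w.chunk(3, 0)
    # split(w, [E, E * 2], 0) is used below where w.split([E, E * 2]) was
    ref_split = w.split([E, E * 2], 0)
    # transpose(weight, (0, 1)) is used below to swap the two weight dimensions,
    # i.e. what weight.transpose(0, 1) computes (assumed, based on the linear() fix)
    ref_t = w.transpose(0, 1)

    print([t.shape for t in ref_chunk], [t.shape for t in ref_split], ref_t.shape)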
@@ -551,7 +551,7 @@ def leaky_relu_(input: Tensor, negative_slope: float = 0.01):

 @call_torch
 def linear(input: Tensor, weight: Tensor, bias: Optional[Tensor]):
-    input = matmul(input, weight)
+    input = matmul(input, transpose(weight, (0, 1)))
     if bias is None:
         return input
     return add(input, bias)
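For context: torch.nn.functional.linear computes input @ weight.T with weight stored as [out_features, in_features], so multiplying by the untransposed weight does not match the standard nn.Linear convention; the added transpose(weight, (0, 1)) restores it. A plain-torch sanity check of that convention (illustrative only, no truegrad involved):

    import torch
    from torch.nn import functional as F

    x = torch.randn(2, 3)        # batch of inputs, in_features = 3
    weight = torch.randn(5, 3)   # [out_features, in_features], as nn.Linear stores it
    bias = torch.randn(5)

    # F.linear computes x @ weight.T + bias, so the matmul needs the transposed weight
    expected = F.linear(x, weight, bias)
    manual = x @ weight.transpose(0, 1) + bias
    assert torch.allclose(expected, manual)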
@@ -641,21 +641,21 @@ def _in_projection_packed(
     if k is v:
         if q is k:
             # self-attention
-            return linear(q, w, b).chunk(3, dim=-1)
+            return linear(q, w, b).chunk(3, -1)
         else:
             # encoder-decoder attention
-            w_q, w_kv = w.split([E, E * 2])
+            w_q, w_kv = split(w, [E, E * 2], 0)
             if b is None:
                 b_q = b_kv = None
             else:
-                b_q, b_kv = b.split([E, E * 2])
-            return (linear(q, w_q, b_q),) + linear(k, w_kv, b_kv).chunk(2, dim=-1)
+                b_q, b_kv = split(b, [E, E * 2], 0)
+            return (linear(q, w_q, b_q),) + linear(k, w_kv, b_kv).chunk(2, -1)
     else:
-        w_q, w_k, w_v = w.chunk(3)
+        w_q, w_k, w_v = chunk(w, 3, 0)
         if b is None:
             b_q = b_k = b_v = None
         else:
-            b_q, b_k, b_v = b.chunk(3)
+            b_q, b_k, b_v = chunk(b, 3, 0)
     return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)

|
@@ -965,7 +965,7 @@ def multi_head_attention_forward(query: Tensor, key: Tensor, value: Tensor, embe
         if in_proj_bias is None:
             b_q = b_k = b_v = None
         else:
-            b_q, b_k, b_v = in_proj_bias.chunk(3)
+            b_q, b_k, b_v = chunk(in_proj_bias, 3, 0)
         q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)

     # prep attention mask
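Same substitution for the packed in_proj_bias. Tensor.chunk defaults to dim=0, so the explicit 0 in chunk(in_proj_bias, 3, 0) preserves the original partitioning; a short plain-torch check (illustrative only):

    import torch

    in_proj_bias = torch.randn(3 * 8)   # packed q/k/v bias, embed_dim = 8
    # Tensor.chunk defaults to dim=0, so the explicit 0 passed to the wrapper
    # keeps the original partitioning of the packed bias
    assert all(torch.equal(a, b)
               for a, b in zip(in_proj_bias.chunk(3), in_proj_bias.chunk(3, 0)))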