from __future__ import annotations

from functools import partial
from typing import Literal, Callable

import torch
from torch import nn
from torch.nn import Module, Linear
from torch.autograd import Function
import torch.nn.functional as F

from torch.utils._pytree import tree_flatten, tree_unflatten, tree_map
from torch.func import functional_call, vjp, vmap

from einops import einsum, rearrange, repeat, reduce

# helper functions

def exists(v):
    return v is not None

def default(v, d):
    return v if exists(v) else d

# distance used for gradient agreement
# they found cosine distance to work the best, at a threshold of ~0.96

def l2norm(t):
    return F.normalize(t, p = 2, dim = -1)

def cosine_sim_distance(grads):
    grads = rearrange(grads, 'b ... -> b (...)')
    normed = l2norm(grads)
    dist = einsum(normed, normed, 'i d, j d -> i j')
    return 1. - dist
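
# a quick sanity check of the distance above (illustrative sketch, not part of the library):
# two aligned gradients sit at distance ~0 while an opposing one sits at ~2, so a distance
# threshold just below 1. accepts any pair of gradients with even slight positive agreement

def _demo_cosine_sim_distance():
    g = torch.randn(8)
    grads = torch.stack((g, 2 * g, -g))   # two aligned gradients and one opposing
    return cosine_sim_distance(grads)     # ~0. between the first two rows, ~2. against the last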

def filter_gradients_by_agreement(
    grads,
    threshold,
    strategy: Literal[
        'accept_max_neighbors',
        'accept_min_neighbors'
    ] = 'accept_max_neighbors',
    accept_batch_frac = 0.2
):
    """ filter per-sample gradients by their agreement (cosine distance) with the rest of the batch """

    batch = grads.shape[0]

    dist = cosine_sim_distance(grads) # (batch, batch) pairwise cosine distance between flattened gradients

    accept_mask = dist < threshold

    num_neighbors_within_dist = accept_mask.sum(dim = -1)

    if (num_neighbors_within_dist == 1).all():
        return torch.zeros_like(grads)

    # take the most naive approach

    if strategy == 'accept_max_neighbors':
        # accept the gradient with the most neighbors within the threshold, along with those neighbors

        center_ind = num_neighbors_within_dist.argmax(dim = -1)

        accept_mask = accept_mask[center_ind]

    elif strategy == 'accept_min_neighbors':
        # reject any gradient that does not have at least `batch * accept_batch_frac` similar gradients within the same batch

        accept_mask = num_neighbors_within_dist >= max(batch * accept_batch_frac, 2)
    else:
        raise ValueError(f'unknown strategy {strategy}')

    if not accept_mask.any():
        return torch.zeros_like(grads)

    if accept_mask.all():
        return grads

    renorm_scale = batch / accept_mask.sum().item()

    # filter out the gradients

    grads[~accept_mask] = 0.

    # renormalize based on how many accepted

    grads *= renorm_scale

    return grads
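
# an illustrative sketch of the filtering above (hypothetical demo, not part of the library):
# under the default 'accept_max_neighbors' strategy, the two agreeing gradients are kept and
# rescaled by batch / num_accepted = 3 / 2, while the disagreeing one is zeroed out

def _demo_filter_gradients_by_agreement():
    g = torch.randn(4)
    grads = torch.stack((g, g + 1e-2 * torch.randn(4), -g))        # two near-identical gradients and one opposing
    return filter_gradients_by_agreement(grads, threshold = 0.97)  # last row is all zeros, first two scaled by 1.5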

# custom autograd function - filters per-sample gradients by agreement during the backward pass

class GAF(Function):

    @staticmethod
    def forward(ctx, tree_spec, *tree_nodes):

        package = tree_unflatten(tree_nodes, tree_spec)

        net = package['net']
        params, buffers = package['params_buffers']
        filter_gradients_fn = package['filter_gradients_fn']
        inp_tensor, args, kwargs = package['inputs']

        batch = inp_tensor.shape[0]

        # functional forward of the wrapped module, vmapped over the batch

        def fn(params, buffers, inp_tensor):
            return functional_call(net, (params, buffers), (inp_tensor, *args), kwargs)

        fn = vmap(fn, in_dims = (0, None, 0))

        # give every sample its own copy of the parameters, so the vjp below yields per-sample gradients

        params = {name: repeat(t, '... -> b ...', b = batch) for name, t in params.items()}

        output, vjpfunc = vjp(fn, params, buffers, inp_tensor)

        ctx._saved_info_for_backwards = (vjpfunc, filter_gradients_fn, args, kwargs)
        return output

    @staticmethod
    def backward(ctx, do):

        vjp_func, filter_gradients_fn, args, kwargs = ctx._saved_info_for_backwards

        # gradients w.r.t. the batch-expanded parameters (per-sample), the buffers, and the input

        dparams, dbuffers, dinp = vjp_func(do)

        # filter each parameter's per-sample gradients by intra-batch agreement, then sum them back down to the parameter shape

        filtered_dparams = {name: reduce(filter_gradients_fn(dparam), 'b ... -> ...', 'sum') for name, dparam in dparams.items()}

        # mirror the structure of the package flattened in GAFWrapper.forward, so every returned gradient lines up with its corresponding forward input

        package = dict(
            net = None,
            params_buffers = (filtered_dparams, dbuffers),
            inputs = (dinp, tree_map(lambda _: None, args), tree_map(lambda _: None, kwargs)),
            filter_gradients_fn = None
        )

        tree_nodes, _ = tree_flatten(package)

        output = (None, *tree_nodes)
        return output

gaf_function = GAF.apply

# main wrapper class

class GAFWrapper(Module):
    """
    a wrapper for a neural network that filters its gradients by their agreement within each batch - per-sample within one batch, not across machines as in the paper
    """
    def __init__(
        self,
        net: Module,
        filter_distance_thres = 0.97,
        filter_gradients = True,
        filter_gradients_fn: Callable | None = None
    ):
        super().__init__()

        self.net = net

        # gradient agreement filtering related

        self.filter_gradients = filter_gradients
        self.filter_distance_thres = filter_distance_thres

        if not exists(filter_gradients_fn):
            filter_gradients_fn = partial(filter_gradients_by_agreement, threshold = filter_distance_thres)

        self.filter_gradients_fn = filter_gradients_fn

    def forward(
        self,
        inp_tensor,
        *args,
        **kwargs
    ):
        only_one_dim_or_no_batch = inp_tensor.ndim == 1 or inp_tensor.shape[0] == 1

        if not self.filter_gradients or only_one_dim_or_no_batch:
            return self.net(inp_tensor, *args, **kwargs)

        params = dict(self.net.named_parameters())
        buffers = dict(self.net.named_buffers())

        # package the module, params / buffers, inputs, and filtering function into pytree leaves
        # so they can be passed through the custom autograd Function above

        package = dict(
            net = self.net,
            params_buffers = (params, buffers),
            inputs = (inp_tensor, args, kwargs),
            filter_gradients_fn = self.filter_gradients_fn
        )

        tree_nodes, tree_spec = tree_flatten(package)

        out = gaf_function(tree_spec, *tree_nodes)
        return out
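
# quick usage sketch - the toy two-layer net, mse loss, and shapes below are illustrative assumptions, not part of the library

if __name__ == '__main__':
    net = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 1))
    wrapped = GAFWrapper(net, filter_distance_thres = 0.97)

    x = torch.randn(8, 16)
    target = torch.randn(8, 1)

    loss = F.mse_loss(wrapped(x), target)
    loss.backward()   # per-sample gradients are filtered by intra-batch agreement before being accumulated into .grad

    print(net[0].weight.grad.shape)   # torch.Size([32, 16]) - same shape as the parameter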