from __future__ import annotations

from functools import partial
from typing import Literal, Callable

import torch
from torch import nn
from torch.nn import Module, Linear
from torch.autograd import Function
import torch.nn.functional as F

from torch.utils._pytree import tree_flatten, tree_unflatten, tree_map
from torch.func import functional_call, vjp, vmap

from einops import einsum, rearrange, repeat, reduce

# helper functions

def exists(v):
    return v is not None

def default(v, d):
    return v if exists(v) else d

# distance used for gradient agreement
# they found cosine distance to work the best, at a threshold of ~0.96

def l2norm(t):
    return F.normalize(t, p = 2, dim = -1)

def cosine_sim_distance(grads):
    grads = rearrange(grads, 'b ... -> b (...)')
    normed = l2norm(grads)
    dist = einsum(normed, normed, 'i d, j d -> i j')
    return 1. - dist
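
# a quick sanity check of the distance above (illustrative sketch, not part of the library):
# two aligned gradients sit at distance ~0 while an opposing one sits at ~2, so a distance
# threshold just below 1. accepts any pair of gradients with even slight positive agreement

def _demo_cosine_sim_distance():
    g = torch.randn(8)
    grads = torch.stack((g, 2 * g, -g))   # two aligned gradients and one opposing
    return cosine_sim_distance(grads)     # ~0. between the first two rows, ~2. against the last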

def filter_gradients_by_agreement(
    grads,
    threshold,
    strategy: Literal[
        'accept_max_neighbors',
        'accept_min_neighbors'
    ] = 'accept_max_neighbors',
    accept_batch_frac = 0.2
):
    """ filter per-sample gradients by their agreement (cosine distance) with the rest of the batch """

    batch = grads.shape[0]

    dist = cosine_sim_distance(grads) # (batch, batch) pairwise cosine distance between flattened gradients

    accept_mask = dist < threshold

    num_neighbors_within_dist = accept_mask.sum(dim = -1)

    if (num_neighbors_within_dist == 1).all():
        return torch.zeros_like(grads)

    # take the most naive approach

    if strategy == 'accept_max_neighbors':
        # accept the gradient with the most neighbors within the threshold, along with those neighbors

        center_ind = num_neighbors_within_dist.argmax(dim = -1)

        accept_mask = accept_mask[center_ind]

    elif strategy == 'accept_min_neighbors':
        # reject any gradient that does not have at least `batch * accept_batch_frac` similar gradients within the same batch

        accept_mask = num_neighbors_within_dist >= max(batch * accept_batch_frac, 2)
    else:
        raise ValueError(f'unknown strategy {strategy}')

    if not accept_mask.any():
        return torch.zeros_like(grads)

    if accept_mask.all():
        return grads

    renorm_scale = batch / accept_mask.sum().item()

    # filter out the gradients

    grads[~accept_mask] = 0.

    # renormalize based on how many accepted

    grads *= renorm_scale

    return grads
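
# an illustrative sketch of the filtering above (hypothetical demo, not part of the library):
# under the default 'accept_max_neighbors' strategy, the two agreeing gradients are kept and
# rescaled by batch / num_accepted = 3 / 2, while the disagreeing one is zeroed out

def _demo_filter_gradients_by_agreement():
    g = torch.randn(4)
    grads = torch.stack((g, g + 1e-2 * torch.randn(4), -g))        # two near-identical gradients and one opposing
    return filter_gradients_by_agreement(grads, threshold = 0.97)  # last row is all zeros, first two scaled by 1.5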

# custom autograd function - filters per-sample gradients by agreement during the backward pass

class GAF(Function):

    @staticmethod
    def forward(ctx, tree_spec, *tree_nodes):

        package = tree_unflatten(tree_nodes, tree_spec)

        net = package['net']
        params, buffers = package['params_buffers']
        filter_gradients_fn = package['filter_gradients_fn']
        inp_tensor, args, kwargs = package['inputs']

        batch = inp_tensor.shape[0]

        # functional forward of the wrapped module, vmapped over the batch

        def fn(params, buffers, inp_tensor):
            return functional_call(net, (params, buffers), (inp_tensor, *args), kwargs)

        fn = vmap(fn, in_dims = (0, None, 0))

        # give every sample its own copy of the parameters, so the vjp below yields per-sample gradients

        params = {name: repeat(t, '... -> b ...', b = batch) for name, t in params.items()}

        output, vjpfunc = vjp(fn, params, buffers, inp_tensor)

        ctx._saved_info_for_backwards = (vjpfunc, filter_gradients_fn, args, kwargs)
        return output

    @staticmethod
    def backward(ctx, do):

        vjp_func, filter_gradients_fn, args, kwargs = ctx._saved_info_for_backwards

        # gradients w.r.t. the batch-expanded parameters (per-sample), the buffers, and the input

        dparams, dbuffers, dinp = vjp_func(do)

        # filter each parameter's per-sample gradients by intra-batch agreement, then sum them back down to the parameter shape

        filtered_dparams = {name: reduce(filter_gradients_fn(dparam), 'b ... -> ...', 'sum') for name, dparam in dparams.items()}

        # mirror the structure of the package flattened in GAFWrapper.forward, so every returned gradient lines up with its corresponding forward input

        package = dict(
            net = None,
            params_buffers = (filtered_dparams, dbuffers),
            inputs = (dinp, tree_map(lambda _: None, args), tree_map(lambda _: None, kwargs)),
            filter_gradients_fn = None
        )

        tree_nodes, _ = tree_flatten(package)

        output = (None, *tree_nodes)
        return output

gaf_function = GAF.apply

# main wrapper class

class GAFWrapper(Module):
    """
    a wrapper for a neural network that filters its gradients by their agreement within each batch - per-sample within one batch, not across machines as in the paper
    """
    def __init__(
        self,
        net: Module,
        filter_distance_thres = 0.97,
        filter_gradients = True,
        filter_gradients_fn: Callable | None = None
    ):
        super().__init__()

        self.net = net

        # gradient agreement filtering related

        self.filter_gradients = filter_gradients
        self.filter_distance_thres = filter_distance_thres

        if not exists(filter_gradients_fn):
            filter_gradients_fn = partial(filter_gradients_by_agreement, threshold = filter_distance_thres)

        self.filter_gradients_fn = filter_gradients_fn

    def forward(
        self,
        inp_tensor,
        *args,
        **kwargs
    ):
        only_one_dim_or_no_batch = inp_tensor.ndim == 1 or inp_tensor.shape[0] == 1

        if not self.filter_gradients or only_one_dim_or_no_batch:
            return self.net(inp_tensor, *args, **kwargs)

        params = dict(self.net.named_parameters())
        buffers = dict(self.net.named_buffers())

        # package the module, params / buffers, inputs, and filtering function into pytree leaves
        # so they can be passed through the custom autograd Function above

        package = dict(
            net = self.net,
            params_buffers = (params, buffers),
            inputs = (inp_tensor, args, kwargs),
            filter_gradients_fn = self.filter_gradients_fn
        )

        tree_nodes, tree_spec = tree_flatten(package)

        out = gaf_function(tree_spec, *tree_nodes)
        return out
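
# quick usage sketch - the toy two-layer net, mse loss, and shapes below are illustrative assumptions, not part of the library

if __name__ == '__main__':
    net = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 1))
    wrapped = GAFWrapper(net, filter_distance_thres = 0.97)

    x = torch.randn(8, 16)
    target = torch.randn(8, 1)

    loss = F.mse_loss(wrapped(x), target)
    loss.backward()   # per-sample gradients are filtered by intra-batch agreement before being accumulated into .grad

    print(net[0].weight.grad.shape)   # torch.Size([32, 16]) - same shape as the parameter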