
Commit 09c63e6

Add Model Breadcrumbs merge method (#228)
Implements the method described in [Model Breadcrumbs: Scaling Multi-Task Model Merging with Sparse Masks](https://arxiv.org/abs/2312.06795).
1 parent 020d557 commit 09c63e6

6 files changed (+119 −12 lines)

README.md

Lines changed: 23 additions & 10 deletions
@@ -116,16 +116,18 @@ Several examples of merge configurations are available in [`examples/`](examples/).

 A quick overview of the currently supported merge methods:

 | Method                                                                                            | `merge_method` value | Multi-Model | Uses base model |
 | ------------------------------------------------------------------------------------------------ | -------------------- | ----------- | --------------- |
 | Linear ([Model Soups](https://arxiv.org/abs/2203.05482))                                          | `linear`             | ✅          | ❌              |
 | SLERP                                                                                             | `slerp`              | ❌          | ✅              |
 | [Task Arithmetic](https://arxiv.org/abs/2212.04089)                                               | `task_arithmetic`    | ✅          | ✅              |
 | [TIES](https://arxiv.org/abs/2306.01708)                                                          | `ties`               | ✅          | ✅              |
 | [DARE](https://arxiv.org/abs/2311.03099) [TIES](https://arxiv.org/abs/2306.01708)                 | `dare_ties`          | ✅          | ✅              |
 | [DARE](https://arxiv.org/abs/2311.03099) [Task Arithmetic](https://arxiv.org/abs/2212.04089)      | `dare_linear`        | ✅          | ✅              |
 | Passthrough                                                                                       | `passthrough`        | ❌          | ❌              |
+| [Model Breadcrumbs](https://arxiv.org/abs/2312.06795)                                             | `breadcrumbs`        | ✅          | ✅              |
+| [Model Breadcrumbs](https://arxiv.org/abs/2312.06795) + [TIES](https://arxiv.org/abs/2306.01708)  | `breadcrumbs_ties`   | ✅          | ✅              |
 | [Model Stock](https://arxiv.org/abs/2403.19522)                                                   | `model_stock`        | ✅          | ✅              |

 ### Linear

@@ -168,6 +170,17 @@ Parameters: same as [TIES](#ties) for `dare_ties`, or [Linear](#linear) for `dare_linear`.

 `passthrough` is a no-op that simply passes input tensors through unmodified. It is meant to be used for layer-stacking type merges where you have only one input model. Useful for frankenmerging.

+### [Model Breadcrumbs](https://arxiv.org/abs/2312.06795)
+
+An extension of task arithmetic that discards both small and extremely large differences from the base model. As with DARE, the Model Breadcrumbs algorithm can be used with (`breadcrumbs_ties`) or without (`breadcrumbs`) the sign consensus algorithm of TIES.
+
+Parameters: same as [Linear](#linear), plus:
+
+- `density` - fraction of weights in differences from the base model to retain
+- `gamma` - fraction of largest magnitude differences to remove
+
+Note that `gamma` corresponds with the parameter `β` described in the paper, while `density` is the final density of the sparsified tensors (related to `γ` and `β` by `density = 1 - γ - β`). For good default values, try `density: 0.9` and `gamma: 0.01`.
+
 ### [Model Stock](https://arxiv.org/abs/2403.19522)

 Uses some neat geometric properties of fine-tuned models to compute good weights for linear interpolation. Requires at least three models, including a base model.
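To make the `density`/`gamma` bookkeeping concrete, here is a small sketch (not part of the commit; the tensor and values are illustrative) of how the two knobs translate into element counts, mirroring the arithmetic in `magnitude_outliers` further down:

```python
import torch

# Toy stand-in for a task vector (difference from the base model).
delta = torch.randn(10, 100)  # 1000 elements

density, gamma = 0.9, 0.01  # the defaults suggested above

num_elems = delta.numel()             # 1000
n_top = int(gamma * num_elems)        # 10 largest-magnitude diffs dropped (the paper's β)
target_n = int(density * num_elems)   # 900 diffs kept
n_bot = num_elems - target_n - n_top  # 90 smallest diffs dropped (the paper's γ)

assert target_n + n_top + n_bot == num_elems  # i.e. density = 1 - γ - β
```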

mergekit/merge_methods/__init__.py

Lines changed: 14 additions & 0 deletions
@@ -61,6 +61,20 @@ def get(method: str) -> MergeMethod:
             default_normalize=False,
             default_rescale=True,
         )
+    elif method == "breadcrumbs":
+        return GeneralizedTaskArithmeticMerge(
+            consensus_method=None,
+            sparsification_method=SparsificationMethod.magnitude_outliers,
+            default_normalize=False,
+            default_rescale=False,
+        )
+    elif method == "breadcrumbs_ties":
+        return GeneralizedTaskArithmeticMerge(
+            consensus_method=ConsensusMethod.sum,
+            sparsification_method=SparsificationMethod.magnitude_outliers,
+            default_normalize=False,
+            default_rescale=False,
+        )
     elif method == "model_stock":
         return ModelStockMerge()
     raise RuntimeError(f"Unimplemented merge method {method}")
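Illustrative only: with this registry in place, the two new method names resolve to configured `GeneralizedTaskArithmeticMerge` instances, and anything unregistered hits the fall-through `RuntimeError` (`breadcrumbs_dare` below is a hypothetical name used just to show the failure path):

```python
from mergekit.merge_methods import get

breadcrumbs = get("breadcrumbs")            # magnitude-outlier sparsification, no sign consensus
breadcrumbs_ties = get("breadcrumbs_ties")  # same sparsification, plus the TIES sum consensus

try:
    get("breadcrumbs_dare")  # hypothetical, not registered
except RuntimeError as e:
    print(e)  # Unimplemented merge method breadcrumbs_dare
```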

mergekit/merge_methods/generalized_task_arithmetic.py

Lines changed: 14 additions & 1 deletion
@@ -52,10 +52,18 @@ def parameters(self) -> List[ConfigParameterDef]:
         ]

     def tensor_parameters(self) -> List[ConfigParameterDef]:
-        return [
+        res = [
             ConfigParameterDef(name="weight", required=True),
             ConfigParameterDef(name="density", required=False, default_value=1.0),
         ]
+        if self.sparsification_method == SparsificationMethod.magnitude_outliers:
+            res.append(
+                ConfigParameterDef(
+                    name="gamma",
+                    default_value=0.01,
+                )
+            )
+        return res

     def make_task(
         self,

@@ -111,11 +119,16 @@ def execute(
         # sparsify
         if self.method.sparsification_method:
             for tv_info in tvs:
+                kwargs = {}
+                if "gamma" in tv_info:
+                    kwargs["gamma"] = tv_info["gamma"]
+
                 tv_info["delta"] = sparsify(
                     tv_info["delta"],
                     density=tv_info["density"],
                     method=self.method.sparsification_method,
                     rescale=self.rescale,
+                    **kwargs,
                 )

         deltas = torch.stack([tv["delta"] for tv in tvs], dim=0)
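One visible effect of the `tensor_parameters` change: `gamma` only becomes a configurable per-tensor parameter when the magnitude-outlier sparsifier is selected, which is why `execute` forwards it conditionally rather than always. A quick sketch (assuming the registry from `mergekit/merge_methods/__init__.py` above):

```python
from mergekit.merge_methods import get

# TIES uses plain magnitude sparsification, so no gamma parameter is exposed.
ties_params = [p.name for p in get("ties").tensor_parameters()]
# Breadcrumbs uses magnitude_outliers, so gamma appears with its 0.01 default.
breadcrumbs_params = [p.name for p in get("breadcrumbs").tensor_parameters()]

assert ties_params == ["weight", "density"]
assert breadcrumbs_params == ["weight", "density", "gamma"]
```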

mergekit/sparsify.py

Lines changed: 47 additions & 1 deletion
@@ -21,6 +21,7 @@
 class SparsificationMethod(str, Enum):
     magnitude = "magnitude"
     random = "random"
+    magnitude_outliers = "magnitude_outliers"


 def rescale_sum(tensor: torch.Tensor, mask: torch.Tensor):

@@ -41,7 +42,7 @@ def magnitude(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tensor:
     if density >= 1:
         return tensor

-    k = int(density * tensor.view(-1).shape[0])
+    k = int(density * tensor.numel())

     assert k > 0, "not gonna zero out the whole tensor buddy"
     mask = torch.zeros_like(tensor)

@@ -59,6 +60,48 @@ def magnitude(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tensor:
     return res


+def magnitude_outliers(
+    tensor: torch.Tensor, density: float, rescale: bool, gamma: float = 0.01
+):
+    """Masks out the smallest values in addition to large outliers.
+
+    The `gamma` proportion of the largest-magnitude weights is removed first,
+    then the smallest weights are removed until the desired density is reached.
+
+    Args:
+        tensor (torch.Tensor): The tensor to sparsify.
+        density (float): The proportion of weights to retain.
+        gamma (float): Fraction of largest-magnitude weights to remove.
+    """
+    if density >= 1:
+        return tensor
+
+    num_elems = tensor.numel()
+    target_n = int(density * num_elems)
+    n_top = int(gamma * num_elems)
+    n_bot = num_elems - target_n - n_top
+
+    if n_bot < 0:
+        # cut down on the number of large weights to remove in
+        # order to hit the target density
+        n_top += n_bot
+        n_bot = 0
+
+    w = tensor.abs().view(-1)
+    if w.device.type == "cpu":
+        w = w.float()
+    indices = torch.sort(w, descending=False).indices
+    mask = torch.zeros_like(tensor)
+
+    # explicit end index, so the slice stays valid when n_top == 0
+    mask.view(-1)[indices[n_bot : num_elems - n_top]] = 1
+
+    if rescale:
+        res = rescale_sum(tensor, mask)
+    else:
+        res = tensor * mask
+    return res
+
+
 def bernoulli(tensor: torch.Tensor, density: float, rescale: bool) -> torch.Tensor:
     if density >= 1:
         return tensor

@@ -82,11 +125,14 @@ def sparsify(
     tensor: torch.Tensor,
     density: float,
     method: SparsificationMethod,
+    gamma: float = 0,
     rescale: bool = False,
 ) -> torch.Tensor:
     if method == SparsificationMethod.magnitude:
         return magnitude(tensor, density=density, rescale=rescale)
     elif method == SparsificationMethod.random:
         return bernoulli(tensor, density=density, rescale=rescale)
+    elif method == SparsificationMethod.magnitude_outliers:
+        return magnitude_outliers(tensor, density=density, rescale=rescale, gamma=gamma)
     else:
         raise NotImplementedError(method)
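As a quick sanity check of the new dispatch path, in the spirit of the tests below (a sketch, not part of the commit):

```python
import torch

from mergekit.sparsify import SparsificationMethod, sparsify

t = torch.randn(1000)
out = sparsify(
    t, density=0.9, method=SparsificationMethod.magnitude_outliers, gamma=0.01
)

# The 1% largest-magnitude and 9% smallest-magnitude entries were zeroed out.
assert torch.count_nonzero(out) == int(t.numel() * 0.9)
# The single largest-magnitude entry is among the removed outliers.
assert out[t.abs().argmax()] == 0
```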

tests/test_basic_merges.py

Lines changed: 6 additions & 0 deletions
@@ -105,6 +105,12 @@ def test_task_arithmetic_merge(self, model_a, model_b, model_c):
         )
         run_and_check_merge(config)

+    def test_breadcrumbs_merge(self, model_a, model_b, model_c):
+        config = self.two_model_config(
+            model_a, model_b, merge_method="breadcrumbs", base_model=model_c
+        )
+        run_and_check_merge(config)
+
     def test_ties_merge(self, model_a, model_b, model_c):
         config = self.two_model_config(
             model_a,

tests/test_sparsify.py

Lines changed: 15 additions & 0 deletions
@@ -28,6 +28,21 @@ def test_partial_density(self, sample_tensor):
         )
         assert torch.count_nonzero(result) == sample_tensor.view(-1).shape[0] // 2

+    def test_outliers(self, sample_tensor):
+        for gamma_0 in [0.1, 0.2, 0.5, 1.0]:
+            for density in [0.1, 0.3, 0.5, 0.6, 0.9, 1.0]:
+                sparsity = 1 - density
+                gamma = gamma_0 * sparsity
+                result = sparsify(
+                    sample_tensor,
+                    density=density,
+                    method=SparsificationMethod.magnitude_outliers,
+                    gamma=gamma,
+                )
+                assert torch.count_nonzero(result) == int(
+                    sample_tensor.view(-1).shape[0] * density
+                )
+

 class TestBernoulli:
     NUM_ITERATIONS = 1000
