
Commit ca96e86

Add uniform_random gate mode to mergekit-moe (#303)
To better match initialization of `nn.Linear`.
1 parent 09c63e6

2 files changed: +16 −1 lines changed


mergekit/moe/config.py

Lines changed: 4 additions & 1 deletion
```diff
@@ -41,10 +41,13 @@ class MoEMergeConfig(BaseModel):
 
     base_model: ModelReference
     experts: List[Expert]
-    gate_mode: str = "hidden"  # possible values: "hidden", "cheap_embed", "random"
+    gate_mode: str = (
+        "hidden"  # possible values: "hidden", "cheap_embed", "random", "uniform_random"
+    )
     # "hidden" uses hidden state vectors for the given prompts for each layer
     # "cheap_embed" uses the average of token embeddings for the prompts, same for each layer
     # "random" is random
+    # "uniform_random" matches default initialization for torch.nn.Linear
     dtype: Optional[str] = None
     experts_per_token: int = 2
     shared_experts: Optional[List[Expert]] = None
```
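For context, `gate_mode` is a plain field on the pydantic `MoEMergeConfig` model above, so the new mode is selected from a mergekit-moe merge config. A minimal sketch, assuming the YAML layout mergekit-moe documents for this model; the model names are placeholders, and the `positive_prompts` entries are shown for completeness even though the random gate modes do not read them:

```yaml
base_model: mistralai/Mistral-7B-v0.1      # placeholder base model
gate_mode: uniform_random                  # the mode added by this commit
dtype: bfloat16
experts_per_token: 2
experts:
  - source_model: teknium/OpenHermes-2.5-Mistral-7B   # placeholder expert
    positive_prompts: ["chat assistant"]              # unused by random modes
  - source_model: WizardLM/WizardMath-7B-V1.1         # placeholder expert
    positive_prompts: ["math problems"]               # unused by random modes
```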

mergekit/moe/router.py

Lines changed: 12 additions & 0 deletions
```diff
@@ -14,6 +14,7 @@
 # along with this program. If not, see http://www.gnu.org/licenses/.
 
 import logging
+import math
 from typing import Dict, List, Union
 
 import torch
@@ -99,6 +100,17 @@ def get_gate_params(
         return torch.randn(
             (model_cfg.num_hidden_layers, len(experts), model_cfg.hidden_size)
         )
+    elif mode == "uniform_random":
+        in_features = model_cfg.hidden_size
+        scale = math.sqrt(1.0 / in_features)
+        return (
+            torch.rand(
+                (model_cfg.num_hidden_layers, len(experts), model_cfg.hidden_size)
+            )
+            * 2
+            * scale
+            - scale
+        )
     elif mode == "cheap_embed":
         embed = model_ref.lazy_loader(lazy_unpickle=lazy_unpickle).get_tensor(
             "model.embed_tokens.weight"
```