Fix initialization error: remove JointGateUpMLPBridge and map GPT-OSS experts to MLPBridge

degenfabian · degenfabian · commit 680d4e7a817e · 2025-08-16T05:17:20.000+02:00
diff --git a/transformer_lens/model_bridge/__init__.py b/transformer_lens/model_bridge/__init__.py
@@ -23,7 +23,6 @@
     EmbeddingBridge,
     NormalizationBridge,
     JointQKVAttentionBridge,
-    JointGateUpMLPBridge,
     LinearBridge,
     MLPBridge,
     MoEBridge,
@@ -50,7 +49,6 @@
     "EmbeddingBridge",
     "NormalizationBridge",
     "JointQKVAttentionBridge",
-    "JointGateUpMLPBridge",
     "LinearBridge",
     "MLPBridge",
     "MoEBridge",
diff --git a/transformer_lens/model_bridge/generalized_components/__init__.py b/transformer_lens/model_bridge/generalized_components/__init__.py
@@ -24,9 +24,6 @@
 from transformer_lens.model_bridge.generalized_components.joint_qkv_attention import (
     JointQKVAttentionBridge,
 )
-from transformer_lens.model_bridge.generalized_components.joint_gate_up_mlp import (
-    JointGateUpMLPBridge,
-)
 from transformer_lens.model_bridge.generalized_components.unembedding import (
     UnembeddingBridge,
 )
@@ -37,7 +34,6 @@
     "EmbeddingBridge",
     "NormalizationBridge",
     "JointQKVAttentionBridge",
-    "JointGateUpMLPBridge",
     "LinearBridge",
     "MLPBridge",
     "MoEBridge",
diff --git a/transformer_lens/model_bridge/generalized_components/joint_gate_up_mlp.py b/transformer_lens/model_bridge/generalized_components/joint_gate_up_mlp.py
diff --git a/transformer_lens/model_bridge/supported_architectures/gpt_oss.py b/transformer_lens/model_bridge/supported_architectures/gpt_oss.py
@@ -2,14 +2,11 @@
 
 from typing import Any
 
-import torch
-
 from transformer_lens.model_bridge.architecture_adapter import ArchitectureAdapter
 from transformer_lens.model_bridge.generalized_components import (
     AttentionBridge,
     BlockBridge,
     EmbeddingBridge,
-    JointGateUpMLPBridge,
     LinearBridge,
     MLPBridge,
     NormalizationBridge,
@@ -46,17 +43,8 @@ def __init__(self, cfg: Any) -> None:
                         name="mlp",
                         submodules={
                             "router": LinearBridge(name="router"),
-                            "experts": BlockBridge(
+                            "experts": MLPBridge(
                                 name="experts",
-                                submodules={
-                                    "gate_up": JointGateUpMLPBridge(
-                                        name="gate_up_proj",
-                                        gate_up_config={
-                                            "split_gate_up_matrix": self.split_gate_up_matrix
-                                        },
-                                    ),
-                                    "down": LinearBridge(name="down_proj"),
-                                },
                             ),
                         },
                     ),
@@ -65,29 +53,3 @@ def __init__(self, cfg: Any) -> None:
             "ln_final": NormalizationBridge(name="model.norm"),
             "unembed": UnembeddingBridge(name="lm_head"),
         }
-
-    def split_gate_up_matrix(
-        self, original_mlp_component: Any
-    ) -> tuple[torch.nn.Linear, torch.nn.Linear]:
-        gate_up_weight = original_mlp_component.gate_up_proj
-        gate_up_bias = original_mlp_component.gate_up_proj_bias
-
-        # In GPT-OSS, all the gate projection weights lie at even indices,
-        # all the up projection weights lie at odd indices
-        gate_weight = gate_up_weight[..., ::2]
-        up_weight = gate_up_weight[..., 1::2]
-
-        gate_bias = gate_up_bias[..., ::2]
-        up_bias = gate_up_bias[..., 1::2]
-
-        gate_projection = torch.nn.Linear(gate_weight.shape[0], gate_weight.shape[1], bias=True)
-
-        gate_projection.weight = torch.nn.Parameter(gate_weight)
-        gate_projection.bias = torch.nn.Parameter(gate_bias)
-
-        up_projection = torch.nn.Linear(up_weight.shape[0], up_weight.shape[1])
-
-        up_projection.weight = torch.nn.Parameter(up_weight)
-        up_projection.bias = torch.nn.Parameter(up_bias)
-
-        return gate_projection, up_projection