
Commit 635db73

transposed w2 to have reduction dim be innermost dim
1 parent f68e81e commit 635db73

5 files changed (+22, -19 lines)

mixtral-moe/generate.py

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ def device_sync(device):
 
 torch._inductor.config.coordinate_descent_tuning = True
 torch._inductor.config.triton.unique_kernel_names = True
-torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future
+# torch._inductor.config.fx_graph_cache = True # Experimental feature to reduce compilation times, will be on by default in future
 
 
 # support running without installing as a package
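The change here only comments the experimental cache flag out rather than removing it. If the FX graph cache is still wanted, a guarded opt-in (a sketch, not part of this commit) keeps older PyTorch builds that lack the flag working:

# Sketch only, not part of the commit: re-enable the experimental FX graph cache
# when the installed PyTorch build exposes the flag the diff comments out.
import torch._inductor.config

if hasattr(torch._inductor.config, "fx_graph_cache"):
    torch._inductor.config.fx_graph_cache = True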

mixtral-moe/model.py

Lines changed: 7 additions & 6 deletions
@@ -183,21 +183,22 @@ def forward(self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Optiona
         y = self.wo(y)
         return y
 
+import torch.distributed
 
 class ConditionalFeedForward(nn.Module):
     def __init__(self, config):
         super().__init__()
         self.w1 = nn.Parameter(torch.empty(config.num_experts, config.intermediate_size, config.dim))
-        self.w2 = nn.Parameter(torch.empty(config.num_experts, config.intermediate_size, config.dim))
+        self.w2 = nn.Parameter(torch.empty(config.num_experts, config.dim, config.intermediate_size))
         self.w3 = nn.Parameter(torch.empty(config.num_experts, config.intermediate_size, config.dim))
 
     def forward(self, x: Tensor, expert_indices: Tensor) -> Tensor:
-        w1_weights = self.w1[expert_indices].transpose(-1, -2) # [T, A, D, D]
-        w3_weights = self.w3[expert_indices].transpose(-1, -2) # [T, A, D, D]
+        w1_weights = self.w1[expert_indices] # [T, A, D, D]
+        w3_weights = self.w3[expert_indices] # [T, A, D, D]
         w2_weights = self.w2[expert_indices] # [T, A, D, D]
-        x1 = F.silu(torch.einsum('ti,taio -> tao', x, w1_weights))
-        x3 = torch.einsum('ti, taio -> tao', x, w3_weights)
-        expert_outs = torch.einsum('tao, taoi -> tai', (x1 * x3), w2_weights)
+        x1 = F.silu(torch.einsum('ti,taoi -> tao', x, w1_weights))
+        x3 = torch.einsum('ti, taoi -> tao', x, w3_weights)
+        expert_outs = torch.einsum('tao, taio -> tai', (x1 * x3), w2_weights)
         return expert_outs
 
 
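The einsum edit is a relabelling rather than a math change: flipping the subscripts to 'taoi'/'taio' replaces the explicit .transpose(-1, -2) views, and with w2 stored as [num_experts, dim, intermediate_size] the reduction index of the final contraction now lands on the innermost, contiguous dimension of the gathered weights. A minimal sketch (shapes invented, not the Mixtral config) checking that the new w2 layout reproduces the old result:

import torch

E, T, A, D, I = 8, 4, 2, 16, 32                  # experts, tokens, active experts, dim, intermediate
w2_old = torch.randn(E, I, D)                    # old layout: reduction dim I is the outer dim
w2_new = w2_old.transpose(-1, -2).contiguous()   # new layout [E, D, I]: reduction dim I innermost

expert_indices = torch.randint(E, (T, A))
h = torch.randn(T, A, I)                         # stands in for (x1 * x3)

out_old = torch.einsum('tao, taoi -> tai', h, w2_old[expert_indices])
out_new = torch.einsum('tao, taio -> tai', h, w2_new[expert_indices])
print(torch.allclose(out_old, out_new, atol=1e-4))  # True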

mixtral-moe/quantize.py

Lines changed: 9 additions & 9 deletions
@@ -75,11 +75,11 @@ def create_quantized_state_dict(self):
                 cur_state_dict[f"{fqn}.weight"] = int8_weight
                 cur_state_dict[f"{fqn}.scales"] = scales.to(mod.weight.dtype)
             elif isinstance(mod, ConditionalFeedForward):
-                num_experts, intermediate_size, dim = mod.w1.shape
                 for weight_idx in range(0, 3):
                     weight_name = f"w{weight_idx + 1}"
                     scales_name = f"scales{weight_idx + 1}"
                     weight = getattr(mod, weight_name)
+                    num_experts, intermediate_size, dim = weight.shape
 
                     bit8_weight_list = []
                     scales_list = []
@@ -125,20 +125,20 @@ def __init__(self, num_experts, intermediate_size, dim, target_dtype):
         self.target_dtype = target_dtype
 
         self.register_buffer("w1", torch.empty(num_experts, intermediate_size, dim, dtype=target_dtype))
-        self.register_buffer("w2", torch.empty(num_experts, intermediate_size, dim, dtype=target_dtype))
+        self.register_buffer("w2", torch.empty(num_experts, dim, intermediate_size, dtype=target_dtype))
         self.register_buffer("w3", torch.empty(num_experts, intermediate_size, dim, dtype=target_dtype))
 
         self.register_buffer("scales1", torch.empty(num_experts, intermediate_size, dtype=torch.bfloat16))
-        self.register_buffer("scales2", torch.empty(num_experts, intermediate_size, dtype=torch.bfloat16))
+        self.register_buffer("scales2", torch.empty(num_experts, dim, dtype=torch.bfloat16))
         self.register_buffer("scales3", torch.empty(num_experts, intermediate_size, dtype=torch.bfloat16))
 
     def forward(self, x, expert_indices):
-        w1_weights = (self.w1.to(x.dtype)[expert_indices] * self.scales1[expert_indices].to(x.dtype).unsqueeze(-1)).transpose(-1, -2) # [T, A, D, D]
-        w3_weights = (self.w3.to(x.dtype)[expert_indices] * self.scales3[expert_indices].to(x.dtype).unsqueeze(-1)).transpose(-1, -2) # [T, A, D, D]
-        w2_weights = (self.w2.to(x.dtype)[expert_indices] * self.scales2[expert_indices].to(x.dtype).unsqueeze(-1)) # [T, A, D, D]
-        x1 = F.silu(torch.einsum('ti,taio -> tao', x, w1_weights))
-        x3 = torch.einsum('ti, taio -> tao', x, w3_weights)
-        expert_outs = torch.einsum('tao, taoi -> tai', (x1 * x3), w2_weights)
+        w1_weights = self.w1.to(x.dtype)[expert_indices] # [T, A, D, D]
+        w3_weights = self.w3.to(x.dtype)[expert_indices] # [T, A, D, D]
+        w2_weights = self.w2.to(x.dtype)[expert_indices]
+        x1 = F.silu(torch.einsum('ti,taoi -> tao', x, w1_weights) * self.scales1[expert_indices].to(x.dtype))
+        x3 = torch.einsum('ti, taoi -> tao', x, w3_weights) * self.scales3[expert_indices].to(x.dtype)
+        expert_outs = torch.einsum('tao, taio -> tai', (x1 * x3), w2_weights) * self.scales2[expert_indices].to(x.dtype) # [T, A, D, D]
         return expert_outs
 
 
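In the int8 path the per-channel scales now multiply the einsum outputs instead of the gathered weights. scales1/scales3 scale the 'o' index and scales2 the 'i' index, i.e. the output channel of each contraction, so the factor commutes with the reduction and no scaled [T, A, ...] weight copy has to be materialized. A small sketch with invented shapes showing the two orderings agree for the w1 path:

import torch

T, A, D, I, E = 3, 2, 8, 16, 4
x = torch.randn(T, D)
w1 = torch.randint(-128, 128, (E, I, D), dtype=torch.int8)
scales1 = torch.rand(E, I)
expert_indices = torch.randint(E, (T, A))

w = w1.to(x.dtype)[expert_indices]          # [T, A, I, D]
s = scales1[expert_indices].to(x.dtype)     # [T, A, I]

old = torch.einsum('ti,taoi -> tao', x, w * s.unsqueeze(-1))  # scale the weights first
new = torch.einsum('ti,taoi -> tao', x, w) * s                # scale the output instead
print(torch.allclose(old, new, atol=1e-2))                    # True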

mixtral-moe/scripts/convert_hf_checkpoint.py

Lines changed: 4 additions & 2 deletions
@@ -76,9 +76,11 @@ def convert_hf_checkpoint(
             del final_result[key]
             del final_result[key.replace("wq", "wk")]
             del final_result[key.replace("wq", "wv")]
-        if "w1" in key or "w2" in key or "w3" in key:
+        elif "w1" in key or "w3" in key:
             final_result[key] = final_result[key].reshape(config.num_experts, config.intermediate_size, config.dim).contiguous()
-        if "gate" in key:
+        elif "w2" in key:
+            final_result[key] = final_result[key].reshape(config.num_experts, config.dim, config.intermediate_size).contiguous()
+        elif "gate" in key:
             final_result[key] = final_result[key].contiguous()
 
     print(f"Saving checkpoint to {checkpoint_dir / 'model.pth'}")

mixtral-moe/tp.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def shard_qkv(qkv, dim, weight_splits):
9999
def _apply_tp_moe_ffn(mlp: MOEFeedForward) -> None:
100100
mlp.cond_ffn.w1 = nn.Parameter(shard(mlp.cond_ffn.w1, 1), requires_grad=False)
101101
mlp.cond_ffn.w3 = nn.Parameter(shard(mlp.cond_ffn.w3, 1), requires_grad=False)
102-
mlp.cond_ffn.w2 = nn.Parameter(shard(mlp.cond_ffn.w2, 1), requires_grad=False)
102+
mlp.cond_ffn.w2 = nn.Parameter(shard(mlp.cond_ffn.w2, 2), requires_grad=False)
103103

104104
if hasattr(mlp.cond_ffn, "scales1"):
105105
mlp.cond_ffn.scales1 = nn.Parameter(shard(mlp.cond_ffn.scales1, 1), requires_grad=False)
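With w2 now laid out as [num_experts, dim, intermediate_size], the tensor-parallel split over the intermediate dimension moves from dim 1 to dim 2, so every rank holds matching slices of w1/w3 (their dim 1) and w2 (its dim 2). A toy sketch with a stand-in shard helper (not the repo's implementation):

import torch

def shard(x: torch.Tensor, dim: int, rank: int = 0, world_size: int = 2) -> torch.Tensor:
    # simplified stand-in: return this rank's slice of x along `dim`
    assert x.size(dim) % world_size == 0
    return x.chunk(world_size, dim=dim)[rank]

E, D, I = 8, 16, 32
w1 = torch.randn(E, I, D)   # shard the intermediate dim at index 1
w2 = torch.randn(E, D, I)   # shard the intermediate dim at index 2
print(shard(w1, 1).shape, shard(w2, 2).shape)  # both torch.Size([8, 16, 16])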
