 import torch
-from torch import nn
-
-from executorch.examples.models.llama.norm import RMSNorm
 from executorch.examples.models.llama.attention import ForwardOptions
 from executorch.examples.models.llama.feed_forward import FeedForward
 
+from executorch.examples.models.llama.norm import RMSNorm
+from torch import nn
+
 
 class ShortConv(nn.Module):
     def __init__(
@@ -61,10 +61,14 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         # So, assuming prefill is done on an empty cache, concatenating conv_state to the beginning of the sequence acts similarly to
         ## using nn.Conv1d(padding=L_cache-1) (for prefill) without any manual padding.
         ## However, the manual padding has the added benefit of being correct during decode, when the cache is not initialized to 0.
-        Bx = torch.cat([self.conv_state, Bx], dim=-1)  # (batch_size, dim, seq_len + L_cache - 1)
+        Bx = torch.cat(
+            [self.conv_state, Bx], dim=-1
+        )  # (batch_size, dim, seq_len + L_cache - 1)
 
         ## Update the conv_state
-        new_conv_state = Bx[..., -(self.conv.weight.size(-1) - 1) :]  # (batch_size, dim, L_cache - 1)
+        new_conv_state = Bx[
+            ..., -(self.L_cache - 1) :
+        ]  # (batch_size, dim, L_cache - 1)
         with torch.no_grad():
             self.conv_state.copy_(new_conv_state)
 
@@ -83,15 +87,20 @@ def reset_cache(self):
 class ShortConvBlock(nn.Module):
     def __init__(self, dim: int, hidden_dim: int, norm_eps: float):
         super().__init__()
-        # hardcode 3 for now
-        L_cache = 3
-        self.conv = ShortConv(dim, L_cache, bias=False)
+        self.L_cache = 3  # hardcode 3 for now
+        self.conv = ShortConv(dim, self.L_cache, bias=False)
         self.feed_forward = FeedForward(dim, hidden_dim)
         self.ffn_norm = RMSNorm(dim, norm_eps)
         # use attention_norm instead of operator_norm to unify with TransformerBlock
         self.attention_norm = RMSNorm(dim, norm_eps)
 
-    def forward(self, x, freqs_cos=None, freqs_sin=None, _unused_attn_options: ForwardOptions = None):  # x: 1xN
+    def forward(
+        self,
+        x,
+        freqs_cos=None,
+        freqs_sin=None,
+        _unused_attn_options: ForwardOptions = None,
+    ):  # x: 1xN
         h = self.conv.forward(self.attention_norm(x))
         h = x + h
         out = h + self.feed_forward(self.ffn_norm(h))
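
For reference (not part of the diff): a minimal standalone sketch of the equivalence the comments in the forward hunk describe. The dim, L_cache, and seq_len values below are arbitrary, and the depthwise nn.Conv1d is only an assumption about how ShortConv's internal convolution is configured.

# Sketch: prepending a zero conv_state of length L_cache - 1 to an unpadded
# depthwise Conv1d reproduces the prefill output of Conv1d(padding=L_cache - 1).
import torch
from torch import nn

dim, L_cache, seq_len = 4, 3, 7  # arbitrary example sizes
x = torch.randn(1, dim, seq_len)

conv = nn.Conv1d(dim, dim, L_cache, groups=dim, bias=False)

# Padded variant: pad, then keep only the first seq_len (causal) outputs.
padded = nn.functional.conv1d(x, conv.weight, padding=L_cache - 1, groups=dim)[
    ..., :seq_len
]

# Cache variant: prepend a zero-initialized conv_state instead of padding.
conv_state = torch.zeros(1, dim, L_cache - 1)
cached = conv(torch.cat([conv_state, x], dim=-1))

assert torch.allclose(padded, cached, atol=1e-6)
# During decode, conv_state holds the last L_cache - 1 real inputs instead of zeros,
# which is why the explicit concatenation stays correct while static padding would not.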