Skip to content

Commit 84b78f6

Browse files
committed
fix(jax): support jax 0.7+
1 parent 8cda2f7 commit 84b78f6

File tree

13 files changed: +1289 additions, −914 deletions

src/equimo/experimental/text.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
from pathlib import Path
2-
from typing import Callable, Optional, Sequence
2+
from typing import Callable, Optional, Sequence, Tuple
33
from urllib.parse import urlparse
44

55
import equinox as eqx
@@ -136,7 +136,7 @@ class Transformer(eqx.Module):
136136
blocks: A list of `AttentionBlock` instances forming the transformer stack.
137137
"""
138138

139-
blocks: list[AttentionBlock]
139+
blocks: Tuple[AttentionBlock, ...]
140140

141141
def __init__(
142142
self,
@@ -164,7 +164,7 @@ def __init__(
164164

165165
act_layer = get_act(act_layer)
166166

167-
self.blocks = [
167+
self.blocks = tuple(
168168
AttentionBlock(
169169
dim=dim,
170170
num_heads=num_heads,
@@ -173,7 +173,7 @@ def __init__(
173173
key=keys[i],
174174
)
175175
for i in range(depth)
176-
]
176+
)
177177

178178
def __call__(
179179
self,

src/equimo/layers/attention.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1671,7 +1671,7 @@ class RFAttention(eqx.Module):
16711671
eps: float = eqx.field(static=True)
16721672

16731673
qkv: eqx.nn.Conv2d
1674-
aggreg: list[eqx.nn.Conv2d]
1674+
aggreg: Tuple[eqx.nn.Conv2d, ...]
16751675
proj: SingleConvBlock
16761676

16771677
def __init__(
@@ -1709,7 +1709,7 @@ def __init__(
17091709
use_bias=use_bias,
17101710
key=key_qkv,
17111711
)
1712-
self.aggreg = [
1712+
self.aggreg = tuple(
17131713
eqx.nn.Conv2d(
17141714
in_channels=3 * total_dim,
17151715
out_channels=3 * total_dim,
@@ -1720,7 +1720,7 @@ def __init__(
17201720
use_bias=use_bias,
17211721
)
17221722
for scale in scales
1723-
]
1723+
)
17241724
# TODO: test different normalizations
17251725
self.proj = SingleConvBlock(
17261726
in_channels=self.total_dim,

src/equimo/layers/convolution.py

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -409,7 +409,7 @@ class C2f(eqx.Module):
409409

410410
conv1: SingleConvBlock
411411
conv2: SingleConvBlock
412-
blocks: list[ConvBottleneck]
412+
blocks: Tuple[ConvBottleneck, ...]
413413

414414
def __init__(
415415
self,
@@ -445,7 +445,7 @@ def __init__(
445445
key=key_conv2,
446446
)
447447

448-
self.blocks = [
448+
self.blocks = tuple(
449449
ConvBottleneck(
450450
in_channels=self.hidden_channels,
451451
out_channels=self.hidden_channels,
@@ -456,7 +456,7 @@ def __init__(
456456
key=key_blocks[i],
457457
)
458458
for i in range(n)
459-
]
459+
)
460460

461461
def __call__(
462462
self,
@@ -589,7 +589,7 @@ class C3k2(eqx.Module):
589589

590590
conv1: SingleConvBlock
591591
conv2: SingleConvBlock
592-
blocks: list[ConvBottleneck] | list[C3k]
592+
blocks: Tuple[ConvBottleneck, ...] | Tuple[C3k, ...]
593593

594594
def __init__(
595595
self,
@@ -627,7 +627,7 @@ def __init__(
627627
)
628628

629629
if c3k:
630-
self.blocks = [
630+
self.blocks = tuple(
631631
C3k(
632632
in_channels=self.hidden_channels,
633633
out_channels=self.hidden_channels,
@@ -637,9 +637,9 @@ def __init__(
637637
key=key_blocks[i],
638638
)
639639
for i in range(n)
640-
]
640+
)
641641
else:
642-
self.blocks = [
642+
self.blocks = tuple(
643643
ConvBottleneck(
644644
in_channels=self.hidden_channels,
645645
out_channels=self.hidden_channels,
@@ -648,7 +648,7 @@ def __init__(
648648
key=key_blocks[i],
649649
)
650650
for i in range(n)
651-
]
651+
)
652652

653653
def __call__(
654654
self,
@@ -1078,12 +1078,12 @@ class GenericGhostModule(eqx.Module):
10781078
cheap_operation: eqx.nn.Conv2d
10791079

10801080
# Training
1081-
primary_rpr_conv: list[eqx.nn.Conv2d]
1081+
primary_rpr_conv: Tuple[eqx.nn.Conv2d, ...]
10821082
primary_rpr_scale: eqx.nn.Conv2d | eqx.nn.Identity
10831083
primary_shared_norm: eqx.nn.GroupNorm
10841084
primary_activation: Callable
10851085

1086-
cheap_rpr_conv: list[eqx.nn.Conv2d]
1086+
cheap_rpr_conv: Tuple[eqx.nn.Conv2d, ...]
10871087
cheap_rpr_scale: eqx.nn.Conv2d | eqx.nn.Identity
10881088
cheap_shared_norm: eqx.nn.GroupNorm
10891089
cheap_activation: Callable
@@ -1156,7 +1156,7 @@ def __init__(
11561156

11571157
# Primary training branches
11581158
init_num_groups = nearest_power_of_2_divisor(init_channels, 32)
1159-
self.primary_rpr_conv = [
1159+
self.primary_rpr_conv = tuple(
11601160
eqx.nn.Conv2d(
11611161
in_channels=in_channels,
11621162
out_channels=init_channels,
@@ -1167,7 +1167,7 @@ def __init__(
11671167
key=key_ps[i],
11681168
)
11691169
for i in range(num_conv_branches)
1170-
]
1170+
)
11711171
self.primary_rpr_scale = (
11721172
eqx.nn.Conv2d(
11731173
in_channels=in_channels,
@@ -1186,7 +1186,7 @@ def __init__(
11861186

11871187
# Cheap training branches (depthwise)
11881188
newchannels_num_groups = nearest_power_of_2_divisor(new_channels, 32)
1189-
self.cheap_rpr_conv = [
1189+
self.cheap_rpr_conv = tuple(
11901190
eqx.nn.Conv2d(
11911191
in_channels=init_channels,
11921192
out_channels=new_channels,
@@ -1198,7 +1198,7 @@ def __init__(
11981198
key=key_cs[i],
11991199
)
12001200
for i in range(self.num_conv_branches)
1201-
]
1201+
)
12021202
self.cheap_rpr_scale = (
12031203
eqx.nn.Conv2d(
12041204
in_channels=init_channels,
@@ -1344,7 +1344,7 @@ class GhostBottleneck(eqx.Module):
13441344
ghost2: "GenericGhostModule"
13451345

13461346
dw_conv: eqx.nn.Conv2d | eqx.nn.Identity
1347-
dw_rpr_conv: list[eqx.nn.Conv2d] # depthwise conv branches (no bias)
1347+
dw_rpr_conv: Tuple[eqx.nn.Conv2d, ...] # depthwise conv branches (no bias)
13481348
dw_rpr_scale: eqx.nn.Conv2d | eqx.nn.Identity # optional 1x1 depthwise (no bias)
13491349
dw_shared_norm: eqx.nn.GroupNorm | eqx.nn.Identity
13501350

@@ -1393,7 +1393,7 @@ def __init__(
13931393
# Depthwise stage (only if stride > 1)
13941394
if stride > 1:
13951395
# Training-time branches (depthwise, no bias); no activation; shared GN after sum
1396-
self.dw_rpr_conv = [
1396+
self.dw_rpr_conv = tuple(
13971397
eqx.nn.Conv2d(
13981398
in_channels=mid_channels,
13991399
out_channels=mid_channels,
@@ -1405,7 +1405,7 @@ def __init__(
14051405
key=k_dw_list[i],
14061406
)
14071407
for i in range(3)
1408-
]
1408+
)
14091409
# Optional scale branch (1x1, depthwise, stride=stride)
14101410
self.dw_rpr_scale = (
14111411
eqx.nn.Conv2d(

src/equimo/layers/sharing.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import math
2-
from typing import List, Optional
2+
from typing import List, Optional, Tuple
33

44
import equinox as eqx
55
import jax
@@ -72,8 +72,8 @@ class LayerSharing(eqx.Module):
7272

7373
repeat: int = eqx.field(static=True)
7474

75-
loras: List[LoRA]
76-
dropouts: List[eqx.nn.Dropout]
75+
loras: Tuple[LoRA, ...]
76+
dropouts: Tuple[eqx.nn.Dropout, ...]
7777
f: eqx.Module
7878

7979
def __init__(
@@ -93,8 +93,8 @@ def __init__(
9393
keys = jr.split(key, repeat)
9494
self.repeat = repeat
9595

96-
self.dropouts = [eqx.nn.Dropout(drop_rate) for i in range(self.repeat)]
97-
self.loras = [
96+
self.dropouts = tuple(eqx.nn.Dropout(drop_rate) for i in range(self.repeat))
97+
self.loras = tuple(
9898
LoRA(
9999
in_features=dim,
100100
out_features=dim,
@@ -103,7 +103,7 @@ def __init__(
103103
key=keys[i],
104104
)
105105
for i in range(self.repeat)
106-
]
106+
)
107107

108108
self.f = f
109109

src/equimo/models/fastervit.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Callable, List, Literal, Optional
1+
from typing import Callable, List, Literal, Optional, Tuple
22

33
import equinox as eqx
44
import jax
@@ -157,7 +157,7 @@ class BlockChunk(eqx.Module):
157157
window_size: bool = eqx.field(static=True)
158158
do_gt: bool = eqx.field(static=True)
159159

160-
blocks: List[eqx.Module]
160+
blocks: Tuple[eqx.Module, ...]
161161
downsample: eqx.Module
162162
global_tokenizer: Optional[TokenInitializer]
163163

@@ -195,7 +195,7 @@ def __init__(
195195
k for k, v in kwargs.items() if isinstance(v, list) and len(v) == depth
196196
]
197197

198-
self.blocks = []
198+
blocks = []
199199
for i in range(depth):
200200
config = kwargs | {k: kwargs[k][i] for k in keys_to_spread}
201201

@@ -210,14 +210,15 @@ def __init__(
210210
}
211211

212212
wrapper = LayerSharingWithCT if self.is_hat else LayerSharing
213-
self.blocks.append(
213+
blocks.append(
214214
wrapper(
215215
dim=kwargs.get("dim"),
216216
f=block(**config, key=block_subkeys[i]),
217217
repeat=repeat,
218218
key=block_subkeys[i],
219219
),
220220
)
221+
self.blocks = tuple(blocks)
221222

222223
self.downsample = downsampler(dim=kwargs.get("dim"), key=key_ds)
223224

@@ -331,7 +332,7 @@ class FasterViT(eqx.Module):
331332
"""
332333

333334
patch_embed: ConvPatchEmbed
334-
blocks: List[eqx.Module]
335+
blocks: Tuple[eqx.Module, ...]
335336
norm: eqx.Module
336337
head: eqx.Module
337338

@@ -398,7 +399,7 @@ def __init__(
398399
hat = to_list(hat, n_chunks)
399400
attn_layer = to_list(attn_layer, n_chunks)
400401
window_size = to_list(window_size, n_chunks)
401-
self.blocks = [
402+
self.blocks = tuple(
402403
BlockChunk(
403404
block=ConvBlock if i < 2 else HATBlock,
404405
repeat=repeat,
@@ -427,7 +428,7 @@ def __init__(
427428
key=block_subkeys[i],
428429
)
429430
for i, depth in enumerate(depths)
430-
]
431+
)
431432

432433
num_features = int(dim * 2 ** (len(depths) - 1))
433434
self.norm = norm_layer(num_features)

src/equimo/models/lowformer.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
class BlockChunk(eqx.Module):
1717
residuals: list[bool] = eqx.field(static=True)
18-
blocks: list[DSConv | MBConv | LowFormerBlock]
18+
blocks: Tuple[DSConv | MBConv | LowFormerBlock, ...]
1919

2020
def __init__(
2121
self,
@@ -123,7 +123,7 @@ def __init__(
123123
)
124124
residuals.append(False)
125125

126-
self.blocks = blocks
126+
self.blocks = tuple(blocks)
127127
self.residuals = residuals
128128

129129
def __call__(
@@ -146,7 +146,7 @@ def __call__(
146146

147147
class LowFormer(eqx.Module):
148148
input_stem: eqx.nn.Sequential
149-
blocks: list[BlockChunk]
149+
blocks: Tuple[BlockChunk, ...]
150150
head: eqx.nn.Linear | eqx.nn.Identity
151151

152152
def __init__(
@@ -220,7 +220,7 @@ def __init__(
220220
]
221221
)
222222

223-
self.blocks = [
223+
self.blocks = tuple(
224224
BlockChunk(
225225
in_channels=widths[i - 1] if i > 0 else width_stem,
226226
out_channels=widths[i],
@@ -241,7 +241,7 @@ def __init__(
241241
for i, (depth, att_stride, block_type, key_block) in enumerate(
242242
zip(depths, att_strides, block_types, key_blocks)
243243
)
244-
]
244+
)
245245

246246
self.head = (
247247
eqx.nn.Linear(

src/equimo/models/mlla.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import List, Optional
1+
from typing import List, Optional, Tuple
22

33
import equinox as eqx
44
import jax
@@ -38,7 +38,7 @@ class Mlla(eqx.Module):
3838

3939
patch_embed: eqx.Module
4040
pos_drop: eqx.Module
41-
blocks: List[eqx.Module]
41+
blocks: Tuple[eqx.Module, ...]
4242
head: eqx.Module
4343

4444
def __init__(
@@ -52,7 +52,7 @@ def __init__(
5252
patch_size: int = 4,
5353
depths: List[int] = [2, 2, 6, 2],
5454
num_heads: List[int] = [3, 6, 12, 24],
55-
attentions_layers: List[eqx.Module] | eqx.Module = LinearAttention,
55+
attentions_layers: Tuple[eqx.Module, ...] | eqx.Module = LinearAttention,
5656
drop_rate: float = 0.0,
5757
drop_path_rate: float = 0.0,
5858
drop_path_uniform: bool = False,
@@ -101,8 +101,8 @@ def __init__(
101101
)
102102

103103
num_heads = to_list(num_heads, n_chunks)
104-
attentions_layers = to_list(attentions_layers, n_chunks)
105-
self.blocks = [
104+
attentions_layers = tuple(to_list(attentions_layers, n_chunks))
105+
self.blocks = tuple(
106106
BlockChunk(
107107
block=MllaBlock,
108108
repeat=repeat,
@@ -124,7 +124,7 @@ def __init__(
124124
key=block_subkeys[i],
125125
)
126126
for i, depth in enumerate(depths)
127-
]
127+
)
128128

129129
self.norm = eqx.nn.LayerNorm(self.num_features)
130130
self.head = (

Comments (0): this commit has no comments.