Commit 5c65f82

Author: Muhammed Hasan Celik
Commit message: argument fix
Parent: f994085

File tree: 4 files changed (+52, -32 lines)

src/grelu/model/blocks.py
Lines changed: 32 additions & 25 deletions

@@ -34,6 +34,7 @@ class LinearBlock(nn.Module):
         act_func: Name of activation function
         dropout: Dropout probability
         norm: If True, apply layer normalization
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layer
         bias: If True, include bias term.
         dtype: Data type of the weights
         device: Device on which to store the weights
@@ -46,15 +47,15 @@ def __init__(
         act_func: str = "relu",
         dropout: float = 0.0,
         norm: bool = False,
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         bias: bool = True,
         dtype=None,
         device=None,
     ) -> None:
         super().__init__()

         self.norm = Norm(
-            func="layer" if norm else None, in_dim=in_len, **norm_kwargs, dtype=dtype, device=device
+            func="layer" if norm else None, in_dim=in_len, **(norm_kwargs or dict()), dtype=dtype, device=device
         )
         self.linear = nn.Linear(in_len, out_len, bias=bias, dtype=dtype, device=device)
         self.dropout = Dropout(dropout)
@@ -123,7 +124,7 @@ def __init__(
         dropout: float = 0.0,
         norm: bool = True,
         norm_type="batch",
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         residual: bool = False,
         order: str = "CDNRA",
         bias: bool = True,
@@ -152,15 +153,15 @@ def __init__(
                     in_dim=out_channels,
                     dtype=dtype,
                     device=device,
-                    **norm_kwargs,
+                    **(norm_kwargs or dict()),
                 )
             else:
                 self.norm = Norm(
                     norm_type,
                     in_dim=in_channels,
                     dtype=dtype,
                     device=device,
-                    **norm_kwargs,
+                    **(norm_kwargs or dict()),
                 )
         else:
             self.norm = Norm(None)
@@ -231,7 +232,7 @@ class ChannelTransformBlock(nn.Module):
         act_func: Name of the activation function
         dropout: Dropout probability
         norm_type: Type of normalization to apply: 'batch', 'syncbatch', 'layer', 'instance' or None
-        norm_kwargs: Additional arguments to be passed to the normalization layer
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
         order: A string representing the order in which operations are
             to be performed on the input. For example, "CDNA" means that the
             operations will be performed in the order: convolution, dropout,
@@ -250,7 +251,7 @@ def __init__(
         dropout: float = 0.0,
         order: str = "CDNA",
         norm_type="batch",
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         if_equal: bool = False,
         dtype=None,
         device=None,
@@ -274,15 +275,15 @@ def __init__(
                     in_dim=out_channels,
                     dtype=dtype,
                     device=device,
-                    **norm_kwargs,
+                    **(norm_kwargs or dict()),
                 )
             else:
                 self.norm = Norm(
                     "batch",
                     in_dim=in_channels,
                     dtype=dtype,
                     device=device,
-                    **norm_kwargs,
+                    **(norm_kwargs or dict()),
                 )
         else:
             self.norm = Norm(None)
@@ -441,6 +442,7 @@ class ConvTower(nn.Module):
         pool_size: Width of the pooling layers
         dropout: Dropout probability
         norm: If True, apply batch norm
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
         residual: If True, apply residual connection
         order: A string representing the order in which operations are
             to be performed on the input. For example, "CDNRA" means that the
@@ -464,7 +466,7 @@ def __init__(
         dilation_mult: float = 1,
         act_func: str = "relu",
         norm: bool = False,
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         pool_func: Optional[str] = None,
         pool_size: Optional[int] = None,
         residual: bool = False,
@@ -565,15 +567,16 @@ class FeedForwardBlock(nn.Module):
         in_len: Length of the input tensor
         dropout: Dropout probability
         act_func: Name of the activation function
-        kwargs: Additional arguments to be passed to the linear layers
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
+        **kwargs: Additional arguments to be passed to the linear layers
     """

     def __init__(
         self,
         in_len: int,
         dropout: float = 0.0,
         act_func: str = "relu",
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         **kwargs,
     ) -> None:
         super().__init__()
@@ -693,6 +696,7 @@ class TransformerBlock(nn.Module):
         key_len: Length of the key vectors
         value_len: Length of the value vectors.
         pos_dropout: Dropout probability in the positional embeddings
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
         dtype: Data type of the weights
         device: Device on which to store the weights
     """
@@ -710,7 +714,7 @@ def __init__(
         key_len: Optional[int] = None,
         value_len: Optional[int] = None,
         pos_dropout: Optional[float] = None,
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         dtype=None,
         device=None,
     ) -> None:
@@ -795,10 +799,10 @@ class TransformerTower(nn.Module):
         key_len: Length of the key vectors
         value_len: Length of the value vectors.
         pos_dropout: Dropout probability in the positional embeddings
-        attn_dropout: Dropout probability in the output layer
-        ff_droppout: Dropout probability in the linear feed-forward layers
-        flash_attn: If True, uses Flash Attention with Rotational Position Embeddings. key_len, value_len,
-            pos_dropout and n_pos_features are ignored.
+        attn_dropout: Dropout probability in the attention layer
+        ff_dropout: Dropout probability in the feed-forward layers
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
+        flash_attn: If True, uses Flash Attention with Rotational Position Embeddings
         dtype: Data type of the weights
         device: Device on which to store the weights
     """
@@ -814,7 +818,7 @@ def __init__(
         pos_dropout: float = 0.0,
         attn_dropout: float = 0.0,
         ff_dropout: float = 0.0,
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         flash_attn: bool = False,
         dtype=None,
         device=None,
@@ -865,17 +869,19 @@ class UnetBlock(nn.Module):
         in_channels: Number of channels in the input
         y_in_channels: Number of channels in the higher-resolution representation.
         norm_type: Type of normalization to apply: 'batch', 'syncbatch', 'layer', 'instance' or None
-        norm_kwargs: Additional arguments to be passed to the normalization layer
-        device: Device on which to store the weights
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
+        act_func: Name of the activation function
         dtype: Data type of the weights
+        device: Device on which to store the weights
     """

     def __init__(
         self,
         in_channels: int,
         y_in_channels: int,
         norm_type="batch",
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
+        act_func="gelu_borzoi",
         dtype=None,
         device=None,
     ) -> None:
@@ -887,7 +893,7 @@ def __init__(
             norm=True,
             norm_type=norm_type,
             norm_kwargs=norm_kwargs,
-            act_func="gelu_borzoi",
+            act_func=act_func,
             order="NACDR",
             dtype=dtype,
             device=device,
@@ -899,7 +905,7 @@ def __init__(
             norm=True,
             norm_type=norm_type,
             norm_kwargs=norm_kwargs,
-            act_func="gelu_borzoi",
+            act_func=act_func,
             order="NACD",
             if_equal=True,
             dtype=dtype,
@@ -932,16 +938,17 @@ class UnetTower(nn.Module):
         in_channels: Number of channels in the input
         y_in_channels: Number of channels in the higher-resolution representations.
         n_blocks: Number of U-net blocks
+        act_func: Name of the activation function
         kwargs: Additional arguments to be passed to the U-net blocks
     """

     def __init__(
-        self, in_channels: int, y_in_channels: List[int], n_blocks: int, **kwargs
+        self, in_channels: int, y_in_channels: List[int], n_blocks: int, act_func: str = "gelu_borzoi", **kwargs
     ) -> None:
         super().__init__()
         self.blocks = nn.ModuleList()
         for y_c in y_in_channels:
-            self.blocks.append(UnetBlock(in_channels, y_c, **kwargs))
+            self.blocks.append(UnetBlock(in_channels, y_c, act_func=act_func, **kwargs))

     def forward(self, x: Tensor, ys: List[Tensor]) -> Tensor:
         """

src/grelu/model/heads.py
Lines changed: 4 additions & 2 deletions

@@ -27,6 +27,8 @@ class ConvHead(nn.Module):
         norm: If True, batch normalization will be included.
         act_func: Activation function for the convolutional layer
         pool_func: Pooling function.
+        norm: If True, batch normalization will be included.
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layer
         dtype: Data type for the layers.
         device: Device for the layers.
     """
@@ -38,7 +40,7 @@ def __init__(
         act_func: Optional[str] = None,
         pool_func: Optional[str] = None,
         norm: bool = False,
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         dtype=None,
         device=None,
     ) -> None:
@@ -56,7 +58,7 @@ def __init__(
             self.n_tasks,
             act_func=self.act_func,
             norm=self.norm,
-            norm_kwargs=norm_kwargs,
+            norm_kwargs=(norm_kwargs or dict()),
             dtype=dtype,
             device=device,
         )

src/grelu/model/models.py
Lines changed: 9 additions & 2 deletions

@@ -518,17 +518,19 @@ def __init__(
         pos_dropout: float = 0.01,
         attn_dropout: float = 0.05,
         ff_dropout: float = 0.2,
-        norm_kwargs: Optional[dict] = {"eps" : 0.001},
+        norm_kwargs: Optional[dict] = None,
         n_heads: int = 8,
         n_pos_features: int = 32,
         # Head
         crop_len: int = 16,
+        act_func: str = "gelu_borzoi",
         final_act_func: Optional[str] = None,
         final_pool_func: Optional[str] = "avg",
         flash_attn=False,
         dtype=None,
         device=None,
     ) -> None:
+        norm_kwargs = norm_kwargs or {"eps": 0.001}
         super().__init__(
             embedding=BorzoiTrunk(
                 stem_channels=stem_channels,
@@ -548,6 +550,7 @@ def __init__(
                 n_pos_features=n_pos_features,
                 crop_len=crop_len,
                 flash_attn=flash_attn,
+                act_func=act_func,
                 dtype=dtype,
                 device=device,
             ),
@@ -577,10 +580,13 @@ def __init__(
         n_transformers: int = 8,
         # head
         crop_len=0,
+        act_func="gelu_borzoi",
+        norm_kwargs: Optional[dict] = None,
         final_pool_func="avg",
         dtype=None,
         device=None,
     ):
+        norm_kwargs = norm_kwargs or {"eps": 0.001}
         model = BorzoiModel(
             crop_len=crop_len,
             n_tasks=7611,
@@ -595,9 +601,10 @@ def __init__(
             pos_dropout=0.01,
             attn_dropout=0.05,
             ff_dropout=0.2,
-            norm_kwargs={"eps": 0.001},
+            norm_kwargs=norm_kwargs,
             n_heads=8,
             n_pos_features=32,
+            act_func=act_func,
             final_act_func=None,
             final_pool_func=None,
             dtype=dtype,
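
In `BorzoiModel`, the old default argument `{"eps" : 0.001}` moves into the constructor body as `norm_kwargs = norm_kwargs or {"eps": 0.001}`, so the dict is rebuilt on every call instead of being shared. A rough sketch of the idea, using a hypothetical `TinyModel` class rather than the grelu API:

from typing import Optional

class TinyModel:
    # Illustrative only: resolve a per-instance norm config at construction time.
    def __init__(self, norm_kwargs: Optional[dict] = None) -> None:
        self.norm_kwargs = norm_kwargs or {"eps": 0.001}

m1, m2 = TinyModel(), TinyModel()
assert m1.norm_kwargs == {"eps": 0.001}
assert m1.norm_kwargs is not m2.norm_kwargs  # no dict shared across instances

# Caveat of `or`: an explicitly passed empty dict is treated as missing.
assert TinyModel(norm_kwargs={}).norm_kwargs == {"eps": 0.001}

An `if norm_kwargs is None` check would preserve a deliberately passed empty dict, at the cost of a slightly longer line.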

src/grelu/model/trunks/borzoi.py
Lines changed: 7 additions & 3 deletions

@@ -36,6 +36,7 @@ def __init__(
         n_blocks: int,
         norm_type="batch",
         norm_kwargs=None,
+        act_func="gelu_borzoi",
         dtype=None,
         device=None,
     ) -> None:
@@ -71,7 +72,7 @@ def __init__(
                 norm=True,
                 norm_type=norm_type,
                 norm_kwargs=norm_kwargs,
-                act_func="gelu_borzoi",
+                act_func=act_func,
                 order="NACDR",
                 pool_func="max",
                 pool_size=2,
@@ -150,6 +151,7 @@ def __init__(
         flash_attn: bool,
         norm_type="batch",
         norm_kwargs=None,
+        act_func="gelu_borzoi",
         dtype=None,
         device=None,
     ) -> None:
@@ -164,6 +166,7 @@ def __init__(
             n_blocks=n_conv,
             norm_type=norm_type,
             norm_kwargs=norm_kwargs,
+            act_func=act_func,
             dtype=dtype,
             device=device,
         )
@@ -188,14 +191,15 @@ def __init__(
             y_in_channels=[channels, self.conv_tower.filters[-2]],
             norm_type=norm_type,
             norm_kwargs=norm_kwargs,
+            act_func=act_func,
             dtype=dtype,
             device=device,
         )
         self.pointwise_conv = ConvBlock(
             in_channels=channels,
             out_channels=round(channels * 1.25),
             kernel_size=1,
-            act_func="gelu_borzoi",
+            act_func=act_func,
             dropout=0.1,
             norm=True,
             norm_type=norm_type,
@@ -204,7 +208,7 @@ def __init__(
             device=device,
             dtype=dtype,
         )
-        self.act = Activation("gelu_borzoi")
+        self.act = Activation(act_func)
         self.crop = Crop(crop_len=crop_len)

     def forward(self, x: Tensor) -> Tensor:
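
The borzoi.py edits replace the hardcoded "gelu_borzoi" literal with an `act_func` argument that each constructor forwards to its sub-blocks, so the activation can be overridden in one place. A sketch of the plumbing pattern with placeholder modules (`Stem`, `Trunk`, and the small activation registry are assumptions for illustration, not grelu classes):

import torch
from torch import nn

# Placeholder registry standing in for grelu's Activation factory.
ACTIVATIONS = {"relu": nn.ReLU, "gelu": nn.GELU}

class Stem(nn.Module):
    def __init__(self, channels: int, act_func: str = "gelu") -> None:
        super().__init__()
        self.conv = nn.Conv1d(4, channels, kernel_size=15, padding=7)
        self.act = ACTIVATIONS[act_func]()  # built from the parameter, not a literal

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.act(self.conv(x))

class Trunk(nn.Module):
    def __init__(self, channels: int, act_func: str = "gelu") -> None:
        super().__init__()
        # The single act_func argument is forwarded to every sub-block,
        # mirroring how the commit passes it into ConvTower, UnetTower, etc.
        self.stem = Stem(channels, act_func=act_func)
        self.act = ACTIVATIONS[act_func]()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.act(self.stem(x))

# Overriding the activation in one place changes it everywhere downstream.
trunk = Trunk(channels=32, act_func="relu")
out = trunk(torch.randn(2, 4, 128))
print(out.shape)  # torch.Size([2, 32, 128])

Because the new parameters default to "gelu_borzoi", existing callers keep the previous behaviour while gaining the option to swap activations, as UnetTower does by forwarding `act_func=act_func, **kwargs` to each UnetBlock.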

0 commit comments