
Commit a109d12

Merge pull request #153 from MuhammedHasan/better-defaults-borzoi-activation-norm
better-defaults-borzoi-activation-norm
2 parents f994085 + 5c48bf4 commit a109d12

5 files changed: +93, -35 lines

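The changes below follow two patterns. First, every `norm_kwargs: Optional[dict] = dict()` default becomes `None` and is expanded as `**(norm_kwargs or dict())`, avoiding a mutable default argument shared across calls. Second, the Borzoi and U-net classes gain an `act_func` argument defaulting to `'gelu_borzoi'` (a tanh-approximated GELU) instead of hard-coding it. A minimal sketch of the mutable-default pitfall (hypothetical code, not from gReLU):

```python
from typing import Optional

def bad_block(norm_kwargs: dict = dict()):
    # The default dict is created once at definition time and shared by
    # every call, so a mutation here leaks into later calls.
    norm_kwargs.setdefault("eps", 1e-3)
    return norm_kwargs

def good_block(norm_kwargs: Optional[dict] = None):
    # Pattern used throughout this commit: resolve None to a fresh dict.
    norm_kwargs = norm_kwargs or dict()
    norm_kwargs.setdefault("eps", 1e-3)
    return norm_kwargs

print(bad_block() is bad_block())    # True: the same dict object every call
print(good_block() is good_block())  # False: independent dicts per call
```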

src/grelu/model/blocks.py

Lines changed: 35 additions & 26 deletions
@@ -34,6 +34,7 @@ class LinearBlock(nn.Module):
         act_func: Name of activation function
         dropout: Dropout probability
         norm: If True, apply layer normalization
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layer
         bias: If True, include bias term.
         dtype: Data type of the weights
         device: Device on which to store the weights
@@ -46,15 +47,15 @@ def __init__(
         act_func: str = "relu",
         dropout: float = 0.0,
         norm: bool = False,
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         bias: bool = True,
         dtype=None,
         device=None,
     ) -> None:
         super().__init__()
 
         self.norm = Norm(
-            func="layer" if norm else None, in_dim=in_len, **norm_kwargs, dtype=dtype, device=device
+            func="layer" if norm else None, in_dim=in_len, **(norm_kwargs or dict()), dtype=dtype, device=device
         )
         self.linear = nn.Linear(in_len, out_len, bias=bias, dtype=dtype, device=device)
         self.dropout = Dropout(dropout)
@@ -123,7 +124,7 @@ def __init__(
         dropout: float = 0.0,
         norm: bool = True,
         norm_type="batch",
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         residual: bool = False,
         order: str = "CDNRA",
         bias: bool = True,
@@ -152,15 +153,15 @@ def __init__(
                     in_dim=out_channels,
                     dtype=dtype,
                     device=device,
-                    **norm_kwargs,
+                    **(norm_kwargs or dict()),
                 )
             else:
                 self.norm = Norm(
                     norm_type,
                     in_dim=in_channels,
                     dtype=dtype,
                     device=device,
-                    **norm_kwargs,
+                    **(norm_kwargs or dict()),
                 )
         else:
             self.norm = Norm(None)
@@ -231,7 +232,7 @@ class ChannelTransformBlock(nn.Module):
         act_func: Name of the activation function
         dropout: Dropout probability
         norm_type: Type of normalization to apply: 'batch', 'syncbatch', 'layer', 'instance' or None
-        norm_kwargs: Additional arguments to be passed to the normalization layer
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
         order: A string representing the order in which operations are
             to be performed on the input. For example, "CDNA" means that the
             operations will be performed in the order: convolution, dropout,
@@ -250,7 +251,7 @@ def __init__(
         dropout: float = 0.0,
         order: str = "CDNA",
         norm_type="batch",
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         if_equal: bool = False,
         dtype=None,
         device=None,
@@ -274,15 +275,15 @@ def __init__(
                     in_dim=out_channels,
                     dtype=dtype,
                     device=device,
-                    **norm_kwargs,
+                    **(norm_kwargs or dict()),
                 )
             else:
                 self.norm = Norm(
                     "batch",
                     in_dim=in_channels,
                     dtype=dtype,
                     device=device,
-                    **norm_kwargs,
+                    **(norm_kwargs or dict()),
                 )
         else:
             self.norm = Norm(None)
@@ -441,6 +442,7 @@ class ConvTower(nn.Module):
         pool_size: Width of the pooling layers
         dropout: Dropout probability
         norm: If True, apply batch norm
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
         residual: If True, apply residual connection
         order: A string representing the order in which operations are
             to be performed on the input. For example, "CDNRA" means that the
@@ -464,7 +466,7 @@ def __init__(
         dilation_mult: float = 1,
         act_func: str = "relu",
         norm: bool = False,
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         pool_func: Optional[str] = None,
         pool_size: Optional[int] = None,
         residual: bool = False,
@@ -565,15 +567,16 @@ class FeedForwardBlock(nn.Module):
         in_len: Length of the input tensor
         dropout: Dropout probability
         act_func: Name of the activation function
-        kwargs: Additional arguments to be passed to the linear layers
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
+        **kwargs: Additional arguments to be passed to the linear layers
     """
 
     def __init__(
         self,
         in_len: int,
         dropout: float = 0.0,
         act_func: str = "relu",
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         **kwargs,
     ) -> None:
         super().__init__()
@@ -693,6 +696,7 @@ class TransformerBlock(nn.Module):
         key_len: Length of the key vectors
         value_len: Length of the value vectors.
         pos_dropout: Dropout probability in the positional embeddings
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
         dtype: Data type of the weights
         device: Device on which to store the weights
     """
@@ -710,12 +714,12 @@ def __init__(
         key_len: Optional[int] = None,
         value_len: Optional[int] = None,
         pos_dropout: Optional[float] = None,
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         dtype=None,
         device=None,
     ) -> None:
         super().__init__()
-        self.norm = Norm("layer", in_len, **norm_kwargs)
+        self.norm = Norm("layer", in_len, **(norm_kwargs or dict()))
 
         if flash_attn:
             if (
@@ -795,10 +799,10 @@ class TransformerTower(nn.Module):
         key_len: Length of the key vectors
         value_len: Length of the value vectors.
         pos_dropout: Dropout probability in the positional embeddings
-        attn_dropout: Dropout probability in the output layer
-        ff_droppout: Dropout probability in the linear feed-forward layers
-        flash_attn: If True, uses Flash Attention with Rotational Position Embeddings. key_len, value_len,
-            pos_dropout and n_pos_features are ignored.
+        attn_dropout: Dropout probability in the attention layer
+        ff_dropout: Dropout probability in the feed-forward layers
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
+        flash_attn: If True, uses Flash Attention with Rotational Position Embeddings
         dtype: Data type of the weights
         device: Device on which to store the weights
     """
@@ -814,7 +818,7 @@ def __init__(
         pos_dropout: float = 0.0,
         attn_dropout: float = 0.0,
         ff_dropout: float = 0.0,
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         flash_attn: bool = False,
         dtype=None,
         device=None,
@@ -865,17 +869,20 @@ class UnetBlock(nn.Module):
         in_channels: Number of channels in the input
         y_in_channels: Number of channels in the higher-resolution representation.
         norm_type: Type of normalization to apply: 'batch', 'syncbatch', 'layer', 'instance' or None
-        norm_kwargs: Additional arguments to be passed to the normalization layer
-        device: Device on which to store the weights
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
+        act_func: Name of the activation function. Defaults to 'gelu_borzoi' which uses
+            tanh approximation (different from PyTorch's default GELU implementation).
         dtype: Data type of the weights
+        device: Device on which to store the weights
     """
 
     def __init__(
         self,
         in_channels: int,
         y_in_channels: int,
         norm_type="batch",
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
+        act_func="gelu_borzoi",
         dtype=None,
         device=None,
     ) -> None:
@@ -887,7 +894,7 @@ def __init__(
             norm=True,
             norm_type=norm_type,
             norm_kwargs=norm_kwargs,
-            act_func="gelu_borzoi",
+            act_func=act_func,
             order="NACDR",
             dtype=dtype,
             device=device,
@@ -899,7 +906,7 @@ def __init__(
             norm=True,
             norm_type=norm_type,
             norm_kwargs=norm_kwargs,
-            act_func="gelu_borzoi",
+            act_func=act_func,
             order="NACD",
             if_equal=True,
             dtype=dtype,
@@ -932,16 +939,18 @@ class UnetTower(nn.Module):
         in_channels: Number of channels in the input
         y_in_channels: Number of channels in the higher-resolution representations.
         n_blocks: Number of U-net blocks
+        act_func: Name of the activation function. Defaults to 'gelu_borzoi' which uses
+            tanh approximation (different from PyTorch's default GELU implementation).
         kwargs: Additional arguments to be passed to the U-net blocks
     """
 
     def __init__(
-        self, in_channels: int, y_in_channels: List[int], n_blocks: int, **kwargs
+        self, in_channels: int, y_in_channels: List[int], n_blocks: int, act_func: str = "gelu_borzoi", **kwargs
     ) -> None:
         super().__init__()
         self.blocks = nn.ModuleList()
         for y_c in y_in_channels:
-            self.blocks.append(UnetBlock(in_channels, y_c, **kwargs))
+            self.blocks.append(UnetBlock(in_channels, y_c, act_func=act_func, **kwargs))
 
     def forward(self, x: Tensor, ys: List[Tensor]) -> Tensor:
         """

src/grelu/model/heads.py

Lines changed: 4 additions & 2 deletions
@@ -27,6 +27,8 @@ class ConvHead(nn.Module):
         norm: If True, batch normalization will be included.
         act_func: Activation function for the convolutional layer
         pool_func: Pooling function.
+        norm: If True, batch normalization will be included.
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layer
         dtype: Data type for the layers.
         device: Device for the layers.
     """
@@ -38,7 +40,7 @@ def __init__(
         act_func: Optional[str] = None,
         pool_func: Optional[str] = None,
         norm: bool = False,
-        norm_kwargs: Optional[dict] = dict(),
+        norm_kwargs: Optional[dict] = None,
         dtype=None,
         device=None,
     ) -> None:
@@ -56,7 +58,7 @@ def __init__(
             self.n_tasks,
             act_func=self.act_func,
             norm=self.norm,
-            norm_kwargs=norm_kwargs,
+            norm_kwargs=(norm_kwargs or dict()),
             dtype=dtype,
             device=device,
         )
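
A sketch of passing normalization options through `ConvHead` after this change. Only the keyword arguments shown in the diff are confirmed; the leading `n_tasks` and `in_channels` arguments are assumed from the rest of gReLU:

```python
from grelu.model.heads import ConvHead

# norm_kwargs is forwarded to the head's normalization layer as a plain dict;
# passing None (the new default) behaves like an empty dict.
head = ConvHead(
    n_tasks=10,        # assumed argument, not shown in this diff
    in_channels=512,   # assumed argument, not shown in this diff
    norm=True,
    norm_kwargs={"eps": 1e-3},
)
head_default = ConvHead(n_tasks=10, in_channels=512, norm=True)
```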

src/grelu/model/layers.py

Lines changed: 17 additions & 2 deletions
@@ -20,8 +20,15 @@ class Activation(nn.Module):
     A nonlinear activation layer.
 
     Args:
-        func: The type of activation function. Supported values are 'relu',
-            'elu', 'softplus', 'gelu', 'gelu_borzoi', 'gelu_enformer' and 'exp'. If None, will return nn.Identity.
+        func: The type of activation function. Supported values are:
+            - 'relu': Standard ReLU activation
+            - 'elu': Exponential Linear Unit
+            - 'softplus': Softplus activation
+            - 'gelu': Standard GELU activation using PyTorch's default approximation
+            - 'gelu_borzoi': GELU activation using tanh approximation (different from PyTorch's default)
+            - 'gelu_enformer': Custom GELU implementation from Enformer
+            - 'exp': Exponential activation
+            - None: Returns identity function (no activation)
 
     Raises:
         NotImplementedError: If 'func' is not a supported activation function.
@@ -159,6 +166,14 @@ class Norm(nn.Module):
             'syncbatch', 'instance', or 'layer'. If None, will return nn.Identity.
         in_dim: Number of features in the input tensor.
         **kwargs: Additional arguments to pass to the normalization function.
+            Common arguments include:
+            - eps: Small constant added to denominator for numerical stability.
+                Defaults to 1e-5 for all normalization types unless overridden.
+            - momentum: Value used for the running_mean and running_var computation.
+                Defaults to 0.1 for batch and sync batch norm.
+            - affine: If True, adds learnable affine parameters. Defaults to True.
+            - track_running_stats: If True, tracks running mean and variance.
+                Defaults to True for batch and sync batch norm.
     """
 
     def __init__(
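
A sketch exercising the documented activation variants and `Norm` keyword arguments, assuming `Activation` and `Norm` are importable from `grelu.model.layers` as the file path suggests:

```python
import torch
from grelu.model.layers import Activation, Norm

x = torch.linspace(-3, 3, 7)
gelu = Activation("gelu")                # PyTorch's default GELU
gelu_borzoi = Activation("gelu_borzoi")  # tanh-approximated GELU, per the docstring

# The two variants agree closely but not exactly.
print(torch.max(torch.abs(gelu(x) - gelu_borzoi(x))))

# Extra keyword arguments are forwarded to the underlying torch norm layer.
norm = Norm("batch", in_dim=64, eps=1e-3)
print(norm)
```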

src/grelu/model/models.py

Lines changed: 26 additions & 2 deletions
@@ -496,6 +496,10 @@ class BorzoiModel(BaseModel):
             If None, no pooling will be applied at the end.
         flash_attn: If True, uses Flash Attention with Rotational Position Embeddings. key_len, value_len,
             pos_dropout and n_pos_features are ignored.
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers.
+            Defaults to {"eps": 0.001}.
+        act_func: Name of the activation function. Defaults to 'gelu_borzoi' which uses
+            tanh approximation (different from PyTorch's default GELU implementation).
         dtype: Data type for the layers.
         device: Device for the layers.
     """
@@ -518,17 +522,19 @@ def __init__(
         pos_dropout: float = 0.01,
         attn_dropout: float = 0.05,
         ff_dropout: float = 0.2,
-        norm_kwargs: Optional[dict] = {"eps" : 0.001},
+        norm_kwargs: Optional[dict] = None,
         n_heads: int = 8,
         n_pos_features: int = 32,
         # Head
         crop_len: int = 16,
+        act_func: str = "gelu_borzoi",
         final_act_func: Optional[str] = None,
         final_pool_func: Optional[str] = "avg",
         flash_attn=False,
         dtype=None,
         device=None,
     ) -> None:
+        norm_kwargs = norm_kwargs or {"eps": 0.001}
         super().__init__(
             embedding=BorzoiTrunk(
                 stem_channels=stem_channels,
@@ -548,6 +554,7 @@ def __init__(
                 n_pos_features=n_pos_features,
                 crop_len=crop_len,
                 flash_attn=flash_attn,
+                act_func=act_func,
                 dtype=dtype,
                 device=device,
             ),
@@ -567,6 +574,19 @@ def __init__(
 class BorzoiPretrainedModel(BaseModel):
     """
     Borzoi model with published weights (ported from Keras).
+
+    Args:
+        n_tasks: Number of tasks for the model to predict
+        fold: Which fold of the model to load (default=0)
+        n_transformers: Number of transformer blocks to use (default=8)
+        crop_len: Number of positions to crop at either end of the output (default=0)
+        act_func: Name of the activation function. Defaults to 'gelu_borzoi' which uses
+            tanh approximation (different from PyTorch's default GELU implementation).
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers.
+            Defaults to {"eps": 0.001}.
+        final_pool_func: Name of the pooling function to apply to the final output (default="avg")
+        dtype: Data type for the layers
+        device: Device for the layers
     """
 
     def __init__(
@@ -577,10 +597,13 @@ def __init__(
         n_transformers: int = 8,
         # head
         crop_len=0,
+        act_func="gelu_borzoi",
+        norm_kwargs: Optional[dict] = None,
         final_pool_func="avg",
         dtype=None,
         device=None,
     ):
+        norm_kwargs = norm_kwargs or {"eps": 0.001}
         model = BorzoiModel(
             crop_len=crop_len,
             n_tasks=7611,
@@ -595,9 +618,10 @@ def __init__(
             pos_dropout=0.01,
             attn_dropout=0.05,
             ff_dropout=0.2,
-            norm_kwargs={"eps": 0.001},
+            norm_kwargs=norm_kwargs,
             n_heads=8,
             n_pos_features=32,
+            act_func=act_func,
             final_act_func=None,
             final_pool_func=None,
             dtype=dtype,
