Commit c35c016

Merge pull request #147 from johahi/borzoi-small-fixes
Fixed small Borzoi discrepancies, fixes #144
2 parents: cd234d5 + a0288aa · commit c35c016

5 files changed: +40 −18 lines
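For orientation, the two numerical discrepancies addressed here — Borzoi's use of the tanh-approximate GELU and its normalisation epsilon of 0.001 — can be illustrated with plain PyTorch. This is a sketch for context only, not code from the repository:

import torch
import torch.nn as nn

# Discrepancy 1: Borzoi uses the tanh approximation of GELU rather than the exact
# erf-based form; the new 'gelu_borzoi' activation wraps nn.GELU(approximate="tanh").
x = torch.randn(8)
exact = nn.GELU()(x)
tanh_approx = nn.GELU(approximate="tanh")(x)
print((exact - tanh_approx).abs().max())  # small but nonzero difference

# Discrepancy 2: Borzoi's normalisation layers use eps=0.001 instead of PyTorch's
# default of 1e-5; norm_kwargs={"eps": 0.001} now threads this value through the blocks.
ln = nn.LayerNorm(16, eps=0.001)
print(ln.eps)  # 0.001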

src/grelu/model/blocks.py

Lines changed: 18 additions & 10 deletions

@@ -46,14 +46,15 @@ def __init__(
         act_func: str = "relu",
         dropout: float = 0.0,
         norm: bool = False,
+        norm_kwargs: Optional[dict] = dict(),
         bias: bool = True,
         dtype=None,
         device=None,
     ) -> None:
         super().__init__()

         self.norm = Norm(
-            func="layer" if norm else None, in_dim=in_len, dtype=dtype, device=device
+            func="layer" if norm else None, in_dim=in_len, **norm_kwargs, dtype=dtype, device=device
         )
         self.linear = nn.Linear(in_len, out_len, bias=bias, dtype=dtype, device=device)
         self.dropout = Dropout(dropout)
@@ -122,7 +123,7 @@ def __init__(
         dropout: float = 0.0,
         norm: bool = True,
         norm_type="batch",
-        norm_kwargs=None,
+        norm_kwargs: Optional[dict] = dict(),
         residual: bool = False,
         order: str = "CDNRA",
         bias: bool = True,
@@ -142,7 +143,6 @@ def __init__(
             "R",
         ], "The string supplied in order must contain one occurrence each of A, C, D, N and R."
         self.order = order
-        norm_kwargs = norm_kwargs or dict()

         # Create norm
         if norm:
@@ -250,7 +250,7 @@ def __init__(
         dropout: float = 0.0,
         order: str = "CDNA",
         norm_type="batch",
-        norm_kwargs=None,
+        norm_kwargs: Optional[dict] = dict(),
         if_equal: bool = False,
         dtype=None,
         device=None,
@@ -265,7 +265,6 @@ def __init__(
             "N",
         ], "The string supplied in order must contain one occurrence each of A, C, D and N."
         self.order = order
-        norm_kwargs = norm_kwargs or dict()

         # Create batch norm
         if norm:
@@ -465,6 +464,7 @@ def __init__(
         dilation_mult: float = 1,
         act_func: str = "relu",
         norm: bool = False,
+        norm_kwargs: Optional[dict] = dict(),
         pool_func: Optional[str] = None,
         pool_size: Optional[int] = None,
         residual: bool = False,
@@ -507,6 +507,7 @@ def __init__(
             dilation=dilation,
             act_func=act_func,
             norm=norm,
+            norm_kwargs=norm_kwargs,
             residual=residual,
             pool_func=pool_func,
             pool_size=pool_size,
@@ -572,13 +573,15 @@ def __init__(
         in_len: int,
         dropout: float = 0.0,
         act_func: str = "relu",
+        norm_kwargs: Optional[dict] = dict(),
         **kwargs,
     ) -> None:
         super().__init__()
         self.dense1 = LinearBlock(
             in_len,
             in_len * 2,
             norm=True,
+            norm_kwargs=norm_kwargs,
             dropout=dropout,
             act_func=act_func,
             bias=True,
@@ -588,6 +591,7 @@ def __init__(
             in_len * 2,
             in_len,
             norm=False,
+            norm_kwargs=norm_kwargs,
             dropout=dropout,
             act_func=None,
             bias=True,
@@ -706,11 +710,12 @@ def __init__(
         key_len: Optional[int] = None,
         value_len: Optional[int] = None,
         pos_dropout: Optional[float] = None,
+        norm_kwargs: Optional[dict] = dict(),
         dtype=None,
         device=None,
     ) -> None:
         super().__init__()
-        self.norm = Norm("layer", in_len)
+        self.norm = Norm("layer", in_len, **norm_kwargs)

         if flash_attn:
             if (
@@ -752,6 +757,7 @@ def __init__(
             in_len=in_len,
             dropout=ff_dropout,
             act_func="relu",
+            norm_kwargs=norm_kwargs,
             dtype=dtype,
             device=device,
         )
@@ -808,6 +814,7 @@ def __init__(
         pos_dropout: float = 0.0,
         attn_dropout: float = 0.0,
         ff_dropout: float = 0.0,
+        norm_kwargs: Optional[dict] = dict(),
         flash_attn: bool = False,
         dtype=None,
         device=None,
@@ -825,6 +832,7 @@ def __init__(
             key_len=key_len,
             value_len=value_len,
             pos_dropout=pos_dropout,
+            norm_kwargs=norm_kwargs,
             dtype=dtype,
             device=device,
         )
@@ -867,7 +875,7 @@ def __init__(
         in_channels: int,
         y_in_channels: int,
         norm_type="batch",
-        norm_kwargs=None,
+        norm_kwargs: Optional[dict] = dict(),
         dtype=None,
         device=None,
     ) -> None:
@@ -877,10 +885,10 @@ def __init__(
             in_channels,
             1,
             norm=True,
-            act_func="gelu",
-            order="NACDR",
             norm_type=norm_type,
             norm_kwargs=norm_kwargs,
+            act_func="gelu_borzoi",
+            order="NACDR",
             dtype=dtype,
             device=device,
         )
@@ -891,7 +899,7 @@ def __init__(
             norm=True,
             norm_type=norm_type,
             norm_kwargs=norm_kwargs,
-            act_func="gelu",
+            act_func="gelu_borzoi",
             order="NACD",
             if_equal=True,
             dtype=dtype,
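The practical effect of the norm_kwargs plumbing above is that normalisation keyword arguments now reach the Norm layer created inside each block. A minimal sketch, assuming the LinearBlock argument names shown in the hunks (the call itself is illustrative, not taken from the repository's tests):

import torch
from grelu.model.blocks import LinearBlock

# norm=True creates a layer norm inside the block; norm_kwargs is unpacked into it,
# so the Borzoi-style eps of 1e-3 replaces PyTorch's default of 1e-5.
block = LinearBlock(in_len=32, out_len=64, norm=True, norm_kwargs={"eps": 1e-3})
out = block(torch.randn(2, 32))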

src/grelu/model/heads.py

Lines changed: 2 additions & 0 deletions

@@ -38,6 +38,7 @@ def __init__(
         act_func: Optional[str] = None,
         pool_func: Optional[str] = None,
         norm: bool = False,
+        norm_kwargs: Optional[dict] = dict(),
         dtype=None,
         device=None,
     ) -> None:
@@ -55,6 +56,7 @@ def __init__(
             self.n_tasks,
             act_func=self.act_func,
             norm=self.norm,
+            norm_kwargs=norm_kwargs,
             dtype=dtype,
             device=device,
         )

src/grelu/model/layers.py

Lines changed: 3 additions & 1 deletion

@@ -21,7 +21,7 @@ class Activation(nn.Module):

     Args:
         func: The type of activation function. Supported values are 'relu',
-            'elu', 'softplus', 'gelu', 'gelu_enformer' and 'exp'. If None, will return nn.Identity.
+            'elu', 'softplus', 'gelu', 'gelu_borzoi', 'gelu_enformer' and 'exp'. If None, will return nn.Identity.

     Raises:
         NotImplementedError: If 'func' is not a supported activation function.
@@ -36,6 +36,8 @@ def __init__(self, func: str) -> None:
             self.layer = nn.ELU()
         elif func == "gelu":
             self.layer = nn.GELU()
+        elif func == "gelu_borzoi":
+            self.layer = nn.GELU(approximate = 'tanh')
         elif func == "gelu_enformer":
             self.layer = GELU()
         elif func == "softplus":
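For reference, nn.GELU(approximate='tanh') computes the tanh approximation 0.5·x·(1 + tanh(√(2/π)·(x + 0.044715·x³))) rather than the exact erf-based GELU, matching the approximate GELU used by the reference Borzoi implementation that this commit targets. A self-contained check of that identity (plain PyTorch, independent of gReLU):

import math
import torch

def gelu_tanh(x):
    # 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x**3)))
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * x**3)))

x = torch.randn(100)
assert torch.allclose(gelu_tanh(x), torch.nn.GELU(approximate="tanh")(x), atol=1e-6)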

src/grelu/model/models.py

Lines changed: 11 additions & 4 deletions

@@ -515,8 +515,10 @@ def __init__(
         n_transformers: int = 8,
         key_len: int = 64,
         value_len: int = 192,
-        pos_dropout: float = 0.0,
-        attn_dropout: float = 0.0,
+        pos_dropout: float = 0.01,
+        attn_dropout: float = 0.05,
+        ff_dropout: float = 0.2,
+        norm_kwargs: Optional[dict] = {"eps" : 0.001},
         n_heads: int = 8,
         n_pos_features: int = 32,
         # Head
@@ -540,6 +542,8 @@ def __init__(
             value_len=value_len,
             pos_dropout=pos_dropout,
             attn_dropout=attn_dropout,
+            ff_dropout=ff_dropout,
+            norm_kwargs=norm_kwargs,
             n_heads=n_heads,
             n_pos_features=n_pos_features,
             crop_len=crop_len,
@@ -551,6 +555,7 @@ def __init__(
             n_tasks,
             in_channels=round(channels * 1.25),
             norm=False,
+            norm_kwargs=norm_kwargs,
             act_func=final_act_func,
             pool_func=final_pool_func,
             dtype=dtype,
@@ -587,8 +592,10 @@ def __init__(
             n_transformers=8,
             key_len=64,
             value_len=192,
-            pos_dropout=0.0,
-            attn_dropout=0.0,
+            pos_dropout=0.01,
+            attn_dropout=0.05,
+            ff_dropout=0.2,
+            norm_kwargs={"eps": 0.001},
             n_heads=8,
             n_pos_features=32,
             final_act_func=None,
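With the new constructor defaults, a freshly built Borzoi-style model picks up the reference dropout rates and normalisation eps without extra arguments, and all of them remain overridable. A hedged sketch — the class name BorzoiModel and the n_tasks argument are assumed from context, not shown in this diff:

from grelu.model.models import BorzoiModel  # class name assumed, not shown in the hunks above

# Defaults after this commit (previously pos_dropout=0.0, attn_dropout=0.0, and no
# ff_dropout or norm_kwargs parameters).
model = BorzoiModel(
    n_tasks=1,                    # assumed required argument
    pos_dropout=0.01,
    attn_dropout=0.05,
    ff_dropout=0.2,
    norm_kwargs={"eps": 0.001},   # Borzoi-style normalisation eps
)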

src/grelu/model/trunks/borzoi.py

Lines changed: 6 additions & 3 deletions

@@ -71,7 +71,7 @@ def __init__(
             norm=True,
             norm_type=norm_type,
             norm_kwargs=norm_kwargs,
-            act_func="gelu",
+            act_func="gelu_borzoi",
             order="NACDR",
             pool_func="max",
             pool_size=2,
@@ -142,6 +142,7 @@ def __init__(
         value_len: int,
         pos_dropout: float,
         attn_dropout: float,
+        ff_dropout: float,
         n_heads: int,
         n_pos_features: int,
         # Crop
@@ -173,6 +174,8 @@ def __init__(
             value_len=value_len,
             pos_dropout=pos_dropout,
             attn_dropout=attn_dropout,
+            ff_dropout=ff_dropout,
+            norm_kwargs=norm_kwargs,
             n_heads=n_heads,
             n_pos_features=n_pos_features,
             flash_attn=flash_attn,
@@ -192,7 +195,7 @@ def __init__(
             in_channels=channels,
             out_channels=round(channels * 1.25),
             kernel_size=1,
-            act_func="gelu",
+            act_func="gelu_borzoi",
             dropout=0.1,
             norm=True,
             norm_type=norm_type,
@@ -201,7 +204,7 @@ def __init__(
             device=device,
             dtype=dtype,
         )
-        self.act = Activation("gelu")
+        self.act = Activation("gelu_borzoi")
         self.crop = Crop(crop_len=crop_len)

     def forward(self, x: Tensor) -> Tensor:
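One practical note on the new ff_dropout argument threaded through the trunk: like any nn.Dropout, it only acts in training mode, so it changes fine-tuning behaviour but not inference with pretrained weights (the gelu_borzoi and eps changes, by contrast, do alter forward-pass numerics). A plain-PyTorch illustration:

import torch
import torch.nn as nn

drop = nn.Dropout(p=0.2)  # the value now used as the Borzoi feed-forward dropout default
x = torch.ones(6)

drop.train()
print(drop(x))  # roughly 20% of elements zeroed, survivors scaled by 1/0.8

drop.eval()
print(drop(x))  # identical to x: dropout is a no-op at inference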
