Commit 5c48bf4

Author: Muhammed Hasan Celik
Message: doc update
1 parent 98472ad

File tree: 4 files changed, +42 −4 lines

  src/grelu/model/blocks.py
  src/grelu/model/layers.py
  src/grelu/model/models.py
  src/grelu/model/trunks/borzoi.py

src/grelu/model/blocks.py

Lines changed: 4 additions & 2 deletions
@@ -870,7 +870,8 @@ class UnetBlock(nn.Module):
     y_in_channels: Number of channels in the higher-resolution representation.
     norm_type: Type of normalization to apply: 'batch', 'syncbatch', 'layer', 'instance' or None
     norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers
-    act_func: Name of the activation function
+    act_func: Name of the activation function. Defaults to 'gelu_borzoi', which uses the
+        tanh approximation (different from PyTorch's default GELU implementation).
     dtype: Data type of the weights
     device: Device on which to store the weights
 """
@@ -938,7 +939,8 @@ class UnetTower(nn.Module):
     in_channels: Number of channels in the input
     y_in_channels: Number of channels in the higher-resolution representations.
     n_blocks: Number of U-net blocks
-    act_func: Name of the activation function
+    act_func: Name of the activation function. Defaults to 'gelu_borzoi', which uses the
+        tanh approximation (different from PyTorch's default GELU implementation).
     kwargs: Additional arguments to be passed to the U-net blocks
 """
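
The tanh-versus-exact GELU distinction these docstrings call out is easy to see numerically. A minimal sketch, assuming 'gelu_borzoi' corresponds to PyTorch's tanh-approximated GELU (the docstrings describe it that way; the module itself is not shown in this diff):

    import torch
    import torch.nn as nn

    x = torch.linspace(-3, 3, 7)

    # PyTorch's default GELU computes x * Phi(x) with the exact
    # (erf-based) Gaussian CDF.
    exact = nn.GELU()(x)

    # The tanh approximation the docstrings describe for 'gelu_borzoi';
    # in PyTorch this is spelled nn.GELU(approximate="tanh").
    approx = nn.GELU(approximate="tanh")(x)

    # The difference is small but nonzero, which matters when loading
    # weights trained against one specific variant.
    print((exact - approx).abs().max())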

src/grelu/model/layers.py

Lines changed: 17 additions & 2 deletions
@@ -20,8 +20,15 @@ class Activation(nn.Module):
     A nonlinear activation layer.

     Args:
-        func: The type of activation function. Supported values are 'relu',
-            'elu', 'softplus', 'gelu', 'gelu_borzoi', 'gelu_enformer' and 'exp'. If None, will return nn.Identity.
+        func: The type of activation function. Supported values are:
+            - 'relu': Standard ReLU activation
+            - 'elu': Exponential Linear Unit
+            - 'softplus': Softplus activation
+            - 'gelu': Standard GELU activation using PyTorch's default (exact, erf-based) implementation
+            - 'gelu_borzoi': GELU activation using the tanh approximation (different from PyTorch's default)
+            - 'gelu_enformer': Custom GELU implementation from Enformer
+            - 'exp': Exponential activation
+            - None: Returns nn.Identity (no activation)

     Raises:
         NotImplementedError: If 'func' is not a supported activation function.
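
For reference, a plausible name-to-module mapping implied by this docstring (a sketch, not the library's actual implementation; the custom 'gelu_enformer' and 'exp' modules are not defined in this diff and are omitted here):

    import torch.nn as nn

    # Sketch of the mapping the docstring implies.
    ACTIVATIONS = {
        "relu": nn.ReLU(),
        "elu": nn.ELU(),
        "softplus": nn.Softplus(),
        "gelu": nn.GELU(),                           # exact, erf-based
        "gelu_borzoi": nn.GELU(approximate="tanh"),  # tanh approximation
        None: nn.Identity(),                         # no activation
    }
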
@@ -159,6 +166,14 @@ class Norm(nn.Module):
         'syncbatch', 'instance', or 'layer'. If None, will return nn.Identity.
     in_dim: Number of features in the input tensor.
     **kwargs: Additional arguments to pass to the normalization function.
+        Common arguments include:
+        - eps: Small constant added to the denominator for numerical stability.
+            Defaults to 1e-5 for all normalization types unless overridden.
+        - momentum: Value used for the running_mean and running_var computation.
+            Defaults to 0.1 for batch and sync batch norm.
+        - affine: If True, adds learnable affine parameters. Defaults to True.
+        - track_running_stats: If True, tracks running mean and variance.
+            Defaults to True for batch and sync batch norm.
 """

 def __init__(
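
What these kwargs mean in practice is easiest to see on the underlying PyTorch layer. A minimal sketch using batch norm over 64 channels, with the eps=0.001 value that models.py documents as Borzoi's default:

    import torch.nn as nn

    # The documented kwargs, as they are forwarded to the PyTorch layer.
    norm = nn.BatchNorm1d(
        64,                         # in_dim: number of input features
        eps=1e-3,                   # Borzoi's documented norm_kwargs default
        momentum=0.1,               # running-stats update rate
        affine=True,                # learnable scale and shift
        track_running_stats=True,   # keep running mean/var for eval mode
    )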

src/grelu/model/models.py

Lines changed: 17 additions & 0 deletions
@@ -496,6 +496,10 @@ class BorzoiModel(BaseModel):
         If None, no pooling will be applied at the end.
     flash_attn: If True, uses Flash Attention with Rotary Position Embeddings. key_len, value_len,
         pos_dropout and n_pos_features are ignored.
+    norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers.
+        Defaults to {"eps": 0.001}.
+    act_func: Name of the activation function. Defaults to 'gelu_borzoi', which uses the
+        tanh approximation (different from PyTorch's default GELU implementation).
     dtype: Data type for the layers.
     device: Device for the layers.
 """
@@ -570,6 +574,19 @@ def __init__(
 class BorzoiPretrainedModel(BaseModel):
     """
     Borzoi model with published weights (ported from Keras).
+
+    Args:
+        n_tasks: Number of tasks for the model to predict
+        fold: Which fold of the model to load (default=0)
+        n_transformers: Number of transformer blocks to use (default=8)
+        crop_len: Number of positions to crop at either end of the output (default=0)
+        act_func: Name of the activation function. Defaults to 'gelu_borzoi', which uses the
+            tanh approximation (different from PyTorch's default GELU implementation).
+        norm_kwargs: Optional dictionary of keyword arguments to pass to the normalization layers.
+            Defaults to {"eps": 0.001}.
+        final_pool_func: Name of the pooling function to apply to the final output (default="avg")
+        dtype: Data type for the layers
+        device: Device for the layers
     """

     def __init__(
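
A hypothetical usage sketch tying the new defaults together. The argument names come from the docstring above; the values, and the completeness of the argument list, are assumptions:

    from grelu.model.models import BorzoiPretrainedModel

    # Hypothetical instantiation; names are from the docstring, values
    # are illustrative only.
    model = BorzoiPretrainedModel(
        n_tasks=10,                  # illustrative task count
        fold=0,                      # documented default
        act_func="gelu_borzoi",      # documented default (tanh-approx GELU)
        norm_kwargs={"eps": 0.001},  # documented default
    )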

src/grelu/model/trunks/borzoi.py

Lines changed: 4 additions & 0 deletions
@@ -22,6 +22,8 @@ class BorzoiConvTower(nn.Module):
     n_blocks: Number of convolutional/pooling blocks, including the stem
     norm_type: Type of normalization to apply: 'batch', 'syncbatch', 'layer', 'instance' or None
     norm_kwargs: Additional arguments to be passed to the normalization layer
+    act_func: Name of the activation function. Defaults to 'gelu_borzoi', which uses the
+        tanh approximation (different from PyTorch's default GELU implementation).
     dtype: Data type for the layers.
     device: Device for the layers.
 """
@@ -123,6 +125,8 @@ class BorzoiTrunk(nn.Module):
         pos_dropout and n_pos_features are ignored.
     norm_type: Type of normalization to apply: 'batch', 'syncbatch', 'layer', 'instance' or None
     norm_kwargs: Additional arguments to be passed to the normalization layer
+    act_func: Name of the activation function. Defaults to 'gelu_borzoi', which uses the
+        tanh approximation (different from PyTorch's default GELU implementation).
     dtype: Data type for the layers.
     device: Device for the layers.
 """

0 commit comments