Update max position embeddings for special tokens

aditya0by0 · aditya0by0 · commit 4777e0560d1f · 2025-08-16T14:12:12.000+02:00
diff --git a/.gitignore b/.gitignore
@@ -170,3 +170,4 @@ electra_pretrained.ckpt
 .jupyter
 .virtual_documents
 .isort.cfg
+.vscode
diff --git a/chebai_proteins/preprocessing/datasets/deepGO/go_uniprot.py b/chebai_proteins/preprocessing/datasets/deepGO/go_uniprot.py
@@ -105,9 +105,9 @@ class _GOUniProtDataExtractor(_DynamicDataset, ABC):
     # TODO: should we be really allowing all branches for single dataset?
     _ALL_GO_BRANCHES: str = "all"
     _GO_BRANCH_NAMESPACE: Dict[str, str] = {
-        "BP": "biological_process",
-        "MF": "molecular_function",
-        "CC": "cellular_component",
+        "BP": "biological_process",  # largest branch, with 20,000+ GO terms
+        "MF": "molecular_function",  # smaller branch, with 6,000+ GO terms
+        "CC": "cellular_component",  # smallest branch, with 2,000+ GO terms
     }
 
     def __init__(self, go_branch: str, max_sequence_len: int = 1002, **kwargs):
diff --git a/configs/model/electra.yml b/configs/model/electra.yml
@@ -3,8 +3,11 @@ init_args:
   optimizer_kwargs:
     lr: 1e-3
   config:
-    vocab_size: 31
-    max_position_embeddings: 1002
+    vocab_size: 31 # 21 unique + embedding offset (10)
+    # For classification: maximum sequence length (1002; padding also goes up to 1002) + 1 for the CLS token = 1003
+    # For pretraining: maximum sequence length (1002; padding also goes up to 1002) + 10 embedding offset (includes all special tokens) = 1012
+    # Hence, use the max of (classification, pretraining): max_position_embeddings = 1002 + 10 = 1012
+    max_position_embeddings: 1012
     num_attention_heads: 8
     num_hidden_layers: 6
     type_vocab_size: 1
diff --git a/pyproject.toml b/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
 name = "chebai-proteins"
 version = "0.0.2"
 description = "Repository for protein prediction and classification, built on top of the python-chebai codebase"
-authors = [{name="", email=""}]
+authors = []
 readme = "README.md"
 license = { text = "AGPL-3.0" }
 requires-python = ">=3.9, <3.13"