Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
46 changes: 30 additions & 16 deletions openvs/args.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
'''Typed arguments definition for argparse type checking and code completion.'''

import os,sys
from tap import Tap
from typing import Any, Callable, List, Tuple, Union
from typing import Any, Callable, List, Optional, Tuple, Union
from typing_extensions import Literal

class ExtractSmilesArgs(Tap):
Expand All @@ -13,19 +15,28 @@ class ExtractSmilesArgs(Tap):
validatefn: str
datarootdir: str


class VanillaModelArgs(Tap):
    '''Typed args for Vanilla model.'''
    # NOTE(review): Tap (typed-argument-parser) turns each annotated class
    # attribute into a CLI flag and surfaces the bare-string docstrings below
    # as --help text, so they are kept byte-identical — confirm against tap docs.
    nnodes: int = 3000
    '''Neuron nodes number in one layer'''
    nBits: int = 1024
    '''Length of morgan fingerprint vector.'''
    # No default: this field must be supplied on the command line.
    dataset_type: Literal["binaryclass", "multiclass", "regression"]
    '''Predict form.'''
    dropout: float = 0.5
    '''Dropout factor in dropout layer.'''
    nlayers: int = 2
    '''Number of same layers.'''


class TrainArgs(Tap):
'''Typed args for training mode.'''
modelid: str = "0"
i_iter: int = 1
train_datafn: str = None
test_datafn: str = None
validate_datafn: str = None
train_datafn: Optional[str] = None
test_datafn: Optional[str] = None
validate_datafn: Optional[str] = None
hit_ratio: float = 0.0
score_cutoff: float = 0.0
prefix: str = ""
Expand All @@ -35,29 +46,32 @@ class TrainArgs(Tap):
rand_seed: int = 66666
log_frequency: int = 500
weight_class: bool = False
class_weights: List=[1,1,1,1]
class_weights: List[float] = [1, 1, 1, 1]
patience: int = 5
disable_progress_bar : bool = False
disable_progress_bar: bool = False
inferenceDropout: bool = False


class EvalArgs(Tap):
    '''Typed args for evaluation mode.'''
    topNs: List[int] = [10, 100, 1000, 10000]
    thresholds: List[float] = [0.2, 0.35, 0.5]
    target_threshold: Optional[float] = None
    target_recall: float = 0.9  # only used in validation set evaluation
    # No defaults: these two must be supplied on the command line.
    rand_active_prob: float
    dataset_type: Literal["test", "validate"]
    disable_progress_bar: bool = False


class PredictArgs(Tap):
    '''Typed args for predicting mode.'''
    modelfn: Optional[str] = None
    database_type: Optional[str] = None
    database_path: Optional[str] = None
    prediction_path: Optional[str] = None
    disable_progress_bar: bool = True
    '''Whether to disable progress bar.'''
    batch_size: int = 10000
    outfileformat: str = "feather"
    '''Extension name of the output file.'''
    run_platform: str = "auto"  # Literal["gpu", "slurm", "auto"], I need "auto" to be default
    # No default: must be supplied on the command line.
    i_iter: int

21 changes: 14 additions & 7 deletions openvs/models.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
'''Network model implementations to accelerate virtual screening.'''

import os,sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from openvs.args import VanillaModelArgs


class VanillaNet(nn.Module):
def __init__(self, args: VanillaModelArgs ):
'''A classical one-to-one network.'''

def __init__(self, args: VanillaModelArgs):
super().__init__()
nBits = args.nBits
nnodes = args.nnodes
Expand All @@ -24,7 +29,7 @@ def __init__(self, args: VanillaModelArgs ):
self.dropout1 = nn.Dropout(dropoutfreq)
self.dropout2 = nn.Dropout(dropoutfreq)
self.out_activation = nn.Sigmoid()

def forward(self, x):
x = F.relu(self.bn1(self.fc1(x)))
x = self.dropout1(x)
Expand All @@ -36,8 +41,10 @@ def forward(self, x):
x = self.out_activation(x)
return x


class VanillaNet2(nn.Module):
def __init__(self, args: VanillaModelArgs ):
'''A classical one-to-one network.'''
def __init__(self, args: VanillaModelArgs):
super().__init__()
nBits = args.nBits
nnodes = args.nnodes
Expand All @@ -55,7 +62,7 @@ def __init__(self, args: VanillaModelArgs ):
self.bn3 = nn.BatchNorm1d(num_features=nnodes)
self.dropout = nn.Dropout(dropoutfreq)
self.out_activation = nn.Sigmoid()

def forward(self, x):
x = F.relu(self.bn1(self.fc1(x)))
x = self.dropout(x)
Expand All @@ -82,13 +89,13 @@ def __init__(self, args: VanillaModelArgs ):
self.fc_in = nn.Linear(nBits, nnodes)
self.fcs = nn.ModuleList([nn.Linear(nnodes, nnodes) for i in range(self.nlayers)] )
self.fc_out = nn.Linear(nnodes, 1)

self.bn1 = nn.BatchNorm1d(num_features=nnodes)
self.bns = nn.ModuleList([nn.BatchNorm1d(num_features=nnodes) for i in range(self.nlayers)])

self.dropout = nn.Dropout(dropoutfreq)
self.out_activation = nn.Sigmoid()

def forward(self, x):
x = F.relu(self.bn1(self.fc_in(x)))
x = self.dropout(x)
Expand Down
26 changes: 17 additions & 9 deletions openvs/utils/cluster.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,37 @@
'''Clustering algorithms.'''

import os,sys
import numpy as np
import torch
from time import time

def one_to_all_tanimoto(x, X):
def one_to_all_tanimoto(x, X) -> torch.Tensor:
    '''Calculate 1 - Tanimoto similarity between vector x and each row of X.

    If x and X[i] are identical, the returned distance[i] is 0; if they share
    no set bits, it is 1; otherwise it lies between 0 and 1.

    In clustering algorithms, two vectors' `distance` is shorter when they
    are more similar.
    '''
    # c: per-row overlap of X with x; a: per-row sum of X; b: sum of x.
    # For binary fingerprint vectors these are the |A∩B|, |A|, |B| counts.
    c = torch.sum(X * x, dim=1)
    a = torch.sum(X, dim=1)
    b = torch.sum(x)
    # Tanimoto similarity = c / (a + b - c); cast to float so integer
    # (e.g. uint8 fingerprint) inputs do not truncate the division.
    return 1 - c.type(torch.float) / (a + b - c).type(torch.float)


def one_to_all_euclidean(x, X, dist_metric="euclidean") -> torch.Tensor:
    '''Calculate Euclidean distance between vector x and each row of vector set X.

    `dist_metric` is unused; it is kept for signature symmetry with the other
    one-to-all distance helpers so callers can swap them interchangeably.
    '''
    return torch.sqrt(torch.sum((X - x) ** 2, dim=1))


class BestFirstClustering():
def __init__(self, cutoff, dist_metric="tanimoto", dtype=torch.uint8):
def __init__(self, cutoff, dist_metric: str="tanimoto", dtype=torch.uint8):

self.cutoff = cutoff
if dist_metric == "euclidean":
self.cutoff = cutoff
self.one_to_all_d = one_to_all_gpu_euclidean
self.one_to_all_d = one_to_all_euclidean

elif dist_metric == 'tanimoto':
self.cutoff = cutoff
self.one_to_all_d = one_to_all_tanimoto
if torch.cuda.is_available():
self.use_gpu = True
Expand Down
Loading