Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
46 changes: 30 additions & 16 deletions openvs/args.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
'''Typed arguments definition for argparse type checking and code completion.'''

import os,sys
from tap import Tap
from typing import Any, Callable, List, Tuple, Union
from typing import Any, Callable, List, Optional, Tuple, Union
from typing_extensions import Literal

class ExtractSmilesArgs(Tap):
Expand All @@ -13,19 +15,28 @@ class ExtractSmilesArgs(Tap):
validatefn: str
datarootdir: str


class VanillaModelArgs(Tap):
    '''Typed args for Vanilla model.'''
    # NOTE(review): Tap (typed-argument-parser) turns each annotated class
    # attribute into a CLI flag and surfaces the bare-string docstrings below
    # as --help text, so they are kept byte-identical — confirm against tap docs.
    nnodes: int = 3000
    '''Neuron nodes number in one layer'''
    nBits: int = 1024
    '''Length of morgan fingerprint vector.'''
    # No default: this field must be supplied on the command line.
    dataset_type: Literal["binaryclass", "multiclass", "regression"]
    '''Predict form.'''
    dropout: float = 0.5
    '''Dropout factor in dropout layer.'''
    nlayers: int = 2
    '''Number of same layers.'''


class TrainArgs(Tap):
'''Typed args for training mode.'''
modelid: str = "0"
i_iter: int = 1
train_datafn: str = None
test_datafn: str = None
validate_datafn: str = None
train_datafn: Optional[str] = None
test_datafn: Optional[str] = None
validate_datafn: Optional[str] = None
hit_ratio: float = 0.0
score_cutoff: float = 0.0
prefix: str = ""
Expand All @@ -35,29 +46,32 @@ class TrainArgs(Tap):
rand_seed: int = 66666
log_frequency: int = 500
weight_class: bool = False
class_weights: List=[1,1,1,1]
class_weights: List[float] = [1, 1, 1, 1]
patience: int = 5
disable_progress_bar : bool = False
disable_progress_bar: bool = False
inferenceDropout: bool = False


class EvalArgs(Tap):
    '''Typed args for evaluation mode.'''
    topNs: List[int] = [10, 100, 1000, 10000]
    thresholds: List[float] = [0.2, 0.35, 0.5]
    target_threshold: Optional[float] = None
    target_recall: float = 0.9  # only used in validation set evaluation
    # No defaults: these two must be supplied on the command line.
    rand_active_prob: float
    dataset_type: Literal["test", "validate"]
    disable_progress_bar: bool = False


class PredictArgs(Tap):
    '''Typed args for predicting mode.'''
    modelfn: Optional[str] = None
    database_type: Optional[str] = None
    database_path: Optional[str] = None
    prediction_path: Optional[str] = None
    disable_progress_bar: bool = True
    '''Whether to disable progress bar.'''
    batch_size: int = 10000
    outfileformat: str = "feather"
    '''Extension name of the output file.'''
    run_platform: str = "auto"  # Literal["gpu", "slurm", "auto"], I need "auto" to be default
    # No default: must be supplied on the command line.
    i_iter: int

21 changes: 14 additions & 7 deletions openvs/models.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
'''Network model implementations to accelerate virtual screening.'''

import os,sys
import torch
import torch.nn as nn
import torch.nn.functional as F
from openvs.args import VanillaModelArgs


class VanillaNet(nn.Module):
def __init__(self, args: VanillaModelArgs ):
'''A classical one-to-one network.'''

def __init__(self, args: VanillaModelArgs):
super().__init__()
nBits = args.nBits
nnodes = args.nnodes
Expand All @@ -24,7 +29,7 @@ def __init__(self, args: VanillaModelArgs ):
self.dropout1 = nn.Dropout(dropoutfreq)
self.dropout2 = nn.Dropout(dropoutfreq)
self.out_activation = nn.Sigmoid()

def forward(self, x):
x = F.relu(self.bn1(self.fc1(x)))
x = self.dropout1(x)
Expand All @@ -36,8 +41,10 @@ def forward(self, x):
x = self.out_activation(x)
return x


class VanillaNet2(nn.Module):
def __init__(self, args: VanillaModelArgs ):
'''A classical one-to-one network.'''
def __init__(self, args: VanillaModelArgs):
super().__init__()
nBits = args.nBits
nnodes = args.nnodes
Expand All @@ -55,7 +62,7 @@ def __init__(self, args: VanillaModelArgs ):
self.bn3 = nn.BatchNorm1d(num_features=nnodes)
self.dropout = nn.Dropout(dropoutfreq)
self.out_activation = nn.Sigmoid()

def forward(self, x):
x = F.relu(self.bn1(self.fc1(x)))
x = self.dropout(x)
Expand All @@ -82,13 +89,13 @@ def __init__(self, args: VanillaModelArgs ):
self.fc_in = nn.Linear(nBits, nnodes)
self.fcs = nn.ModuleList([nn.Linear(nnodes, nnodes) for i in range(self.nlayers)] )
self.fc_out = nn.Linear(nnodes, 1)

self.bn1 = nn.BatchNorm1d(num_features=nnodes)
self.bns = nn.ModuleList([nn.BatchNorm1d(num_features=nnodes) for i in range(self.nlayers)])

self.dropout = nn.Dropout(dropoutfreq)
self.out_activation = nn.Sigmoid()

def forward(self, x):
x = F.relu(self.bn1(self.fc_in(x)))
x = self.dropout(x)
Expand Down
26 changes: 17 additions & 9 deletions openvs/utils/cluster.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,37 @@
'''Clustering algorithms.'''

import os,sys
import numpy as np
import torch
from time import time

def one_to_all_tanimoto(x, X):
def one_to_all_tanimoto(x, X) -> torch.Tensor:
    '''Calculate 1 - Tanimoto similarity between vector x and each row of X.

    If x and X[i] are identical, the returned distance[i] is 0; if they share
    no set bits, it is 1; otherwise it lies between 0 and 1.

    In clustering algorithms, two vectors' `distance` is shorter when they
    are more similar.
    '''
    # c: per-row overlap of X with x; a: per-row sum of X; b: sum of x.
    # For binary fingerprint vectors these are the |A∩B|, |A|, |B| counts.
    c = torch.sum(X * x, dim=1)
    a = torch.sum(X, dim=1)
    b = torch.sum(x)
    # Tanimoto similarity = c / (a + b - c); cast to float so integer
    # (e.g. uint8 fingerprint) inputs do not truncate the division.
    return 1 - c.type(torch.float) / (a + b - c).type(torch.float)


def one_to_all_euclidean(x, X, dist_metric="euclidean") -> torch.Tensor:
    '''Calculate Euclidean distance between vector x and each row of vector set X.

    `dist_metric` is unused; it is kept for signature symmetry with the other
    one-to-all distance helpers so callers can swap them interchangeably.
    '''
    return torch.sqrt(torch.sum((X - x) ** 2, dim=1))


class BestFirstClustering():
def __init__(self, cutoff, dist_metric="tanimoto", dtype=torch.uint8):
def __init__(self, cutoff, dist_metric: str="tanimoto", dtype=torch.uint8):

self.cutoff = cutoff
if dist_metric == "euclidean":
self.cutoff = cutoff
self.one_to_all_d = one_to_all_gpu_euclidean
self.one_to_all_d = one_to_all_euclidean

elif dist_metric == 'tanimoto':
self.cutoff = cutoff
self.one_to_all_d = one_to_all_tanimoto
if torch.cuda.is_available():
self.use_gpu = True
Expand Down
Loading