Skip to content

Commit 801a628

Browse files
Merge pull request PolusAI#235 from misterbrandonwalker/extract_protein_ligand
extract protein and ligand in same workflow step
2 parents ebdb2e7 + a3a8a34 commit 801a628

File tree

7 files changed

+207
-2
lines changed

7 files changed

+207
-2
lines changed

.github/workflows/docker_build.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ jobs:
2626
rename_residues_mol, combine_structure,
2727
remove_terminal_residue_name_prefixes, molgan,
2828
pdbbind_refined, onionnet-sfct, smina, pdbfixer,
29-
fix_pdb_atom_column, extract_protein, generate_conformers] # No username for pdbind_refined
29+
fix_pdb_atom_column, extract_protein, extract_ligand_protein, generate_conformers] # No username for pdbind_refined
3030
# skip data/ and cwl_adapters/file_format_conversions/biosimspace/
3131
runs-on: [ubuntu-latest]
3232

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#!/usr/bin/env cwl-runner
# CWL adapter for extract_ligand_protein.py: splits a PDB complex into a
# protein PDB file and an (optional) ligand PDB file.
cwlVersion: v1.0

class: CommandLineTool

label: A tool that employs MDAnalysis to extract ligands and protein from a PDB file

doc: |-
  A tool that employs MDAnalysis to extract ligands and protein from a PDB file

baseCommand: ['python', '/extract_ligand_protein.py']

hints:
  DockerRequirement:
    dockerPull: mrbrandonwalker/extract_ligand_protein

inputs:
  input_pdb_path:
    label: Input pdb file path
    doc: |-
      Input pdb file path
      Type: string
      File type: input
      Accepted formats: pdb
      Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/data/utils/cat_protein.pdb
    type: File
    format:
    - edam:format_1476
    inputBinding:
      prefix: --input_pdb_path

  # The two output_* inputs are file *names* (strings) passed to the script;
  # the produced files are collected by the matching globs under `outputs`.
  output_pdb_path:
    label: Output pdb file path
    doc: |-
      Output pdb file path
      Type: string
      File type: output
      Accepted formats: pdb
      Example file: https://github.com/bioexcel/biobb_structure_utils/raw/master/biobb_structure_utils/test/reference/utils/ref_cat_pdb.pdb
    type: string
    format:
    - edam:format_1476
    inputBinding:
      prefix: --output_pdb_path
    default: system.pdb

  output_pdb_ligand_path:
    label: Output pdb ligand file path
    doc: |-
      Output pdb ligand file path
      Type: string
      File type: output
      Accepted formats: pdb
    type: string
    format:
    - edam:format_1476
    inputBinding:
      prefix: --output_pdb_ligand_path
    default: ligand_system.pdb

outputs:
  output_pdb_path:
    label: Output pdb file path
    doc: |-
      Output pdb file path
    type: File
    outputBinding:
      glob: $(inputs.output_pdb_path)
    format: edam:format_1476

  output_pdb_ligand_path:
    label: Output ligand pdb file path
    doc: |-
      Output ligand pdb file path
      Use optional File? since ligand may not exist in complex
    type: File?
    outputBinding:
      glob: $(inputs.output_pdb_ligand_path)
    format: edam:format_1476

$namespaces:
  edam: https://edamontology.org/

$schemas:
- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl

cwl_adapters/extract_protein.cwl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,4 +58,4 @@ $namespaces:
5858
edam: https://edamontology.org/
5959

6060
$schemas:
61-
- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl
61+
- https://raw.githubusercontent.com/edamontology/edamontology/master/EDAM_dev.owl

docker/dockerBuild.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ sudo docker build --no-cache --pull -f Dockerfile_onionnet-sfct -t polusai/onion
3434
sudo docker build --no-cache --pull -f Dockerfile_smina -t cyangnyu/smina .
3535
sudo docker build --no-cache --pull -f Dockerfile_pdbfixer -t ndonyapour/pdbfixer .
3636
sudo docker build --no-cache --pull -f Dockerfile_extract_protein -t ndonyapour/extract_protein .
37+
sudo docker build --no-cache --pull -f Dockerfile_extract_ligand_protein -t mrbrandonwalker/extract_ligand_protein .
3738
sudo docker build --no-cache --pull -f Dockerfile_fix_pdb_atom_column -t ndonyapour/fix_pdb_atom_column .
3839
sudo docker build --no-cache --pull -f Dockerfile_generate_conformers -t ndonyapour/generate_conformers .
3940

dockerPull.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,6 @@ docker pull mrbrandonwalker/diffdock_gpu
2828
docker pull mrbrandonwalker/diffdock_cpu
2929
docker pull ndonyapour/pdbfixer
3030
docker pull ndonyapour/extract_protein
31+
docker pull mrbrandonwalker/extract_ligand_protein
3132
docker pull ndonyapour/fix_pdb_atom_column
3233
docker pull ndonyapour/generate_conformers
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# docker build -f Dockerfile_extract_ligand_protein -t mrbrandonwalker/extract_ligand_protein .
FROM condaforge/mambaforge
# NOT mambaforge-pypy3 (mdanalysis is incompatible with pypy)
# -y: the image build is non-interactive, so never wait on a confirmation prompt.
RUN mamba install -y mdanalysis

# COPY (not ADD): these are plain local files, no archive extraction or URL fetch is wanted.
# NOTE(review): the CWL adapter runs /extract_ligand_protein.py, which assumes the
# base image's working directory is / -- confirm, or set WORKDIR explicitly.
COPY extract_ligand_protein.py .
COPY Dockerfile_extract_ligand_protein .
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
# pylint: disable=no-member
2+
import sys
3+
import os
4+
import argparse
5+
6+
import MDAnalysis as mda
7+
8+
9+
def parse_arguments() -> argparse.Namespace:
    """ This function parses the arguments.

    The input path and the protein output path are mandatory, so a missing
    flag is reported by argparse as a usage error instead of surfacing later
    as a TypeError on a None path. The ligand output path stays optional
    because it is only used when ligand atoms are found.

    Returns:
        argparse.Namespace: The command line arguments
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_pdb_path', type=str, required=True,
                        help='Path to the input PDB file')
    parser.add_argument('--output_pdb_path', type=str, required=True,
                        help='Path of the protein PDB file to write')
    parser.add_argument('--output_pdb_ligand_path', type=str,
                        help='Path of the ligand PDB file to write')
    return parser.parse_args()
21+
22+
23+
def _find_non_ligand_indices(bond_universe) -> set:
    """ Collect indices of non-protein atoms that should not count as ligand

    An atom is collected when it has no bonds at all (treated as salts and
    hydrogen-free waters), or when it is an atom named 'O' with exactly two
    bonds whose partners are all named 'H' (treated as water, together with
    its two partners).

    Args:
        bond_universe: A Universe whose atoms carry bond information

    Returns:
        set: The atom indices to exclude from the ligand selection
    """
    excluded = set()
    for atom in bond_universe.atoms:
        n_bonds = len(atom.bonds)
        if n_bonds == 0:
            excluded.add(atom.index)
        elif atom.name == 'O' and n_bonds == 2:
            # Connectivity is used instead of resname == 'HOH' because PDB
            # files may use different water residue names.
            # NOTE(review): only partners named exactly 'H' match; hydrogens
            # named e.g. 'H1'/'H2' would not -- confirm this is intended.
            partners = atom.bonded_atoms
            if {partner.name for partner in partners} == {'H'}:
                excluded.add(atom.index)
                excluded.update(partner.index for partner in partners)
    return excluded


def extract_ligand_protein(input_pdb_path: str, output_pdb_path: str, output_pdb_ligand_path: str) -> None:
    """ Extract ligand & protein from the PDB file

    The protein selection is written to output_pdb_path. Everything that is
    not protein, minus detected waters and non-bonded atoms, is written to
    output_pdb_ligand_path; no ligand file is written when nothing is left.

    Args:
        input_pdb_path (str): The path to the input pdb file
        output_pdb_path (str): The path to the output pdb file
        output_pdb_ligand_path (str): The path to the output pdb ligand file
    """
    # Load the PDB file and split it with simple atom selections
    universe = mda.Universe(input_pdb_path)
    protein_atoms = universe.select_atoms('protein')
    ligand_atoms = universe.select_atoms('not protein')

    # Bonds are guessed on a second copy of the structure, so the universe
    # that gets written out is left untouched.
    bond_universe = mda.Universe(input_pdb_path)
    try:
        # guess the bonds, since input PDB may not have bonds
        bond_universe.atoms.guess_bonds()
    except ValueError:
        # ValueError: vdw radii for types: AS. These can be defined manually using the keyword 'vdwradii'
        print('Error: Could not guess bonds. Check the input PDB file.')

    has_bonds = True
    try:
        _ = bond_universe.atoms.bonds  # raises NoDataError when no bonds are available
    except mda.exceptions.NoDataError:
        has_bonds = False
        print('No bonds found in the PDB file.')

    # Without bond information nothing can be classified, so nothing is removed.
    excluded = _find_non_ligand_indices(bond_universe) if has_bonds else set()
    if excluded:
        # Both universes are read from the same file, so indices line up.
        index_list = ' '.join(str(index) for index in sorted(excluded))
        ligand_atoms = ligand_atoms.select_atoms(f'not index {index_list}')

    # MDAnalysis opens and writes the target file itself; the AtomGroups
    # already carry their coordinates, so no placeholder Universe is needed.
    protein_atoms.write(output_pdb_path)
    if ligand_atoms.n_atoms > 0:  # will crash if no ligand atoms
        ligand_atoms.write(output_pdb_ligand_path)
96+
97+
98+
def main() -> None:
    """ Command line entry point: split a PDB file into protein and ligand

    Parses the command line, exits with status 1 when the input PDB file
    does not exist, and otherwise runs the extraction.
    """
    cli_args = parse_arguments()
    pdb_in = cli_args.input_pdb_path

    if os.path.exists(pdb_in):
        extract_ligand_protein(pdb_in, cli_args.output_pdb_path, cli_args.output_pdb_ligand_path)
        return

    print(f'Error: Can not find file {pdb_in}')
    sys.exit(1)


if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)