Skip to content

Commit 0804450

Browse files
committed
bioemu_example.tsv added
run_bioemu.py added run_bioemu.sh added bioemu run added in run_PMGen and run_utils.py
1 parent d58c6cc commit 0804450

File tree

6 files changed

+137
-6
lines changed

6 files changed

+137
-6
lines changed

README.md

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
# PMGen
2-
## Still under development, mino bugs and issues might happen
2+
## Still under development, minor bugs and issues might happen
33
**PMGen** (Peptide-MHC Predictive, Modeling and Generative) is a powerful and flexible framework
44
for Peptide-MHC (pMHC) complex modeling, binding prediction, and neoantigen design. It integrates
55
cutting-edge tools such as **PANDORA** for template generation, **AlphaFold (via AFfine)** for
@@ -279,7 +279,25 @@ python run_PMGen.py \
279279
--peptide_random_fix_fraction 0 \
280280
--iterative_peptide_gen 50
281281
```
282-
282+
10. BioEmu sampling on the final iteration in Iterative Peptide Generation.
283+
```bash
284+
python run_PMGen.py \
285+
--mode wrapper \
286+
--run single \
287+
--df data/example/bioemu_exmple.tsv \
288+
--no_pandora \
289+
--output_dir outputs/bioemu_mhcreps_forpaper \
290+
--bioemu_batch_size_100 30 \
291+
--iterative_peptide_gen 5 \
292+
--return_all_outputs \
293+
--fix_anchors \
294+
--peptide_random_fix_fraction 0.6 \
295+
--batch_size 2 \
296+
--peptide_design \
297+
--binder_pred \
298+
--bioemu_run_on_iter 5 \
299+
--run_bioemu
300+
```
283301
### Output
284302

285303
Results are saved in --output_dir with the following structure:

data/example/bioemu_exmple.tsv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
peptide mhc_seq mhc_type anchors id
2+
QHIRCNIPKRIGPSKVATLVPR EEHVIIQAEFYLNPDQSGEFMFDFDGDEIFHVDMAKKETVWRLEEFGRFASFEAQGALANIAVDKANLEIMTKRSNY/TRPRFLELLKSECHFFNGTERVRFLERYFHNQEEFVRFDSDVGEYRAVTELGRPVAESWNSQKDLLEQKRGQVDTYCRHNYGVVESFTVQRRVH 2 4H25
3+
FLNKDLEVDGHFVTM GSHSMRYFFTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASQRMEPRAPWIEQEGPEYWDGETRKVKAHSQTHRVDLGTLRGYYNQSEAGSHTVQRMYGCDVGSDWRFLRGYHQYAYDGKDYIALKEDLRSWTAADMAAQTTKHKWEAAHVAEQLRAYLEGTCVEWLRRYLENGKETLQRTDFLNKDLEVDGHFVTM 1 4U6Y

run_PMGen.py

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
import argparse
22
import pandas as pd
3-
from run_utils import run_PMGen_wrapper, run_PMGen_modeling, protein_mpnn_wrapper, MultipleAnchors, get_best_structres, retrieve_anchors_and_fixed_positions, assert_iterative_mode, collect_generated_binders, create_new_input_and_fixed_positions
3+
from run_utils import run_PMGen_wrapper, run_PMGen_modeling, protein_mpnn_wrapper, bioemu_assertions, MultipleAnchors, get_best_structres, retrieve_anchors_and_fixed_positions, assert_iterative_mode, collect_generated_binders, create_new_input_and_fixed_positions
44
import shutil
55
from Bio import SeqIO
66
import warnings
7+
import subprocess
78
import os
89
from Bio import BiopythonDeprecationWarning
910
warnings.filterwarnings("ignore", category=BiopythonDeprecationWarning)
@@ -74,13 +75,25 @@ def main():
7475
parser.add_argument('--mhc_design', action='store_true', help='Enables whole mhc design. we recommend to use only_pseudo_sequence_design mode.')
7576
parser.add_argument('--num_sequences_peptide', type=int, default=10, help='Number of peptide sequences to be generated. Works only with --peptide_design')
7677
parser.add_argument('--num_sequences_mhc', type=int, default=10, help='Number of mhc sequences to be generated. Works only with --only_pseudo_sequence_design or --mhc_design')
77-
parser.add_argument('--sampling_temp', type=float, default=0.1, help='ProteinMPNN sampling temperature.')
78+
parser.add_argument('--sampling_temp', type=float, default=1.5, help='ProteinMPNN sampling temperature.')
7879
parser.add_argument('--batch_size', type=int, default=1, help='ProteinMPNN batch size.')
7980
parser.add_argument('--hot_spot_thr', type=float, default=6.0, help='Distance threshold to peptide, to define hot-spots on mhc.')
8081
parser.add_argument('--binder_pred', action='store_true', help='Enables binder prediction from ProteinMPNN generated peptides.')
8182
parser.add_argument("--fix_anchors", action='store_true', help='If set, does not design anchor positions in peptide generation')
8283
parser.add_argument("--peptide_random_fix_fraction", type=float, default=0., help="Disables design for a random fraction of amino acids in peptide")
8384

85+
# BioEmu Arguments
86+
parser.add_argument('--run_bioemu', action='store_true', help='Enables bioemu pMHC sampling.')
87+
parser.add_argument('--bioemu_num_samples', type=int, default=100, help='Sampling rounds in bioemu. You might get lower number of structures'
88+
'if --filter_samples is active')
89+
parser.add_argument('--bioemu_batch_size_100', type=int, default=10, help='Batch size you use for a sequence of length 100. The batch size '
90+
'will be calculated from this, assuming that the memory requirement to compute '
91+
'each sample scales quadratically with the sequence length.')
92+
parser.add_argument('--bioemu_filter_samples', action='store_true', help='Filter out unphysical samples with e.g. long bond distances or steric clashes.')
93+
parser.add_argument('--bioemu_run_on_iter', type=int, default=None, help='Optional, only works when iterative_peptide_gen > 0. Runs bioemu on the structure taken'
94+
'from the iteration number given by user. If not set, runs on the 0 iteration by default.')
95+
96+
8497
# Setting to Run only a part:
8598
parser.add_argument('--no_alphafold', action='store_false', help='does not run alphafold.')
8699
parser.add_argument('--only_protein_mpnn', action='store_true', help='Skips PANDORA and AF modeling, and runs ProteinMPNN for already available predictions.')
@@ -92,6 +105,7 @@ def main():
92105
parser.add_argument('--iterative_peptide_gen', type=int, default=0, help='If used, the iterative peptide generation is performed, defines the number of iterations.')
93106

94107
args = parser.parse_args()
108+
bioemu_assertions(args)
95109
for iteration in range(args.iterative_peptide_gen + 1):
96110
fixed_positions_path = None #only for iter > 0 in iteration mode
97111
# if we have entered the iterative generation mode
@@ -226,5 +240,44 @@ def main():
226240
print("Collecting the best binders")
227241
collected_generated_binders_path = collect_generated_binders(args, df, iteration)
228242

243+
if args.run_bioemu:
244+
print('**BioEmu runs initiating**')
245+
246+
output_dir = args.output_dir
247+
bioemu_input_df_path = args.df
248+
249+
if args.iterative_peptide_gen > 0: #iterative mode --> which iteration to run bioemu on? only one can be used.
250+
bioemu_run_on_iter = 0
251+
if args.bioemu_run_on_iter:
252+
assert args.bioemu_run_on_iter <= args.iterative_peptide_gen, f'Please make sure --iterative_peptide_gen is less or equal to --iterative_peptide_gen'
253+
bioemu_run_on_iter = args.bioemu_run_on_iter
254+
output_dir = os.path.join("/".join(args.output_dir.split("/")[:-1]), f"iter_{bioemu_run_on_iter}") # outputdir/
255+
bioemu_input_df_path = os.path.join(output_dir, f'input_df_{bioemu_run_on_iter}.tsv')
256+
257+
bioemu_output_dir = os.path.join(output_dir, 'bioemu')
258+
cache_embeds_dir = os.path.join(output_dir, 'alphafold')
259+
os.makedirs(bioemu_output_dir, exist_ok=True)
260+
assert os.path.isdir(cache_embeds_dir), f'{cache_embeds_dir} not found'
261+
assert os.path.isdir(bioemu_output_dir), f'{bioemu_output_dir} not found'
262+
assert os.path.isfile(bioemu_input_df_path), f'{bioemu_input_df_path} not found'
263+
print(f'Running on cache_embeds_dir: {cache_embeds_dir}')
264+
265+
bioemu_df = pd.read_csv(bioemu_input_df_path, sep='\t')
266+
for i, row in bioemu_df.iterrows():
267+
sequence = str(row['mhc_seq'].replace('/','') + row['peptide'])
268+
id = str(row['id'])
269+
bioemu_output_dir_id = os.path.join(bioemu_output_dir, id)
270+
cmd = ['./run_bioemu.sh', '--sequence', sequence, '--id', id, '--output_dir', bioemu_output_dir_id,
271+
'--cache_embeds_dir', cache_embeds_dir, '--bioemu_num_samples', args.bioemu_num_samples,
272+
'--bioemu_batch_size_100', args.bioemu_batch_size_100, '--bioemu_filter_samples', args.bioemu_filter_samples]
273+
print(f"🌀 Starting iteration {i}: {' '.join(cmd)}")
274+
try:
275+
result = subprocess.run(cmd, check=True)
276+
print(f"✅ Bioemurun-> Iteration {i}, id {id}, outpath {bioemu_output_dir_id} completed successfully\n")
277+
except subprocess.CalledProcessError as e:
278+
print(f"❌ Bioemurun-> Iteration {i}, id {id}, outpath {bioemu_output_dir_id} failed with error code {e.returncode}")
279+
# Optional: stop if a command fails
280+
281+
229282
if __name__ == "__main__":
230283
main()

run_bioemu.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Run on bioemu env
2+
from bioemu.sample import main as sample
3+
import argparse
4+
5+
6+
def _str2bool(value):
    """Convert a command-line token such as 'True', 'false' or '1' to a bool."""
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    if token in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'Expected a boolean value, got {value!r}')


def parse_args():
    """Parse the command-line options of the BioEmu sampling script.

    Returns:
        argparse.Namespace with the attributes ``sequence``, ``id``,
        ``output_dir``, ``cache_embeds_dir`` (all required strings),
        ``bioemu_num_samples`` (int, default 10), ``bioemu_batch_size_100``
        (int, default 10) and ``bioemu_filter_samples`` (bool, default False).

    Notes:
        ``--bioemu_filter_samples`` works as a bare flag (-> True) and also
        accepts an explicit value (``--bioemu_filter_samples False``), so a
        wrapper that forwards the boolean as a separate token does not abort
        with an "unrecognized arguments" error.
    """
    parser = argparse.ArgumentParser(description="BioEmu sampling script")

    # `type` must be the callable `str`, not the string 'str'; argparse
    # rejects non-callable types when the argument is registered.
    parser.add_argument('--sequence', type=str, required=True,
                        help='MHC+peptide sequence with no separator. For MHC-II, Alpha+Beta+peptide')
    parser.add_argument('--id', type=str, required=True,
                        help='PMGen id given to sequence input.')
    parser.add_argument('--output_dir', type=str, required=True,
                        help='Output directory to save the results')
    parser.add_argument('--cache_embeds_dir', type=str, required=True,
                        help='Path to alphafold folder containing ids/*_representations.pkl')
    parser.add_argument('--bioemu_num_samples', type=int, default=10,
                        help='Sampling rounds in BioEmu. You might get a lower number of structures '
                             'if --bioemu_filter_samples is active.')
    parser.add_argument('--bioemu_batch_size_100', type=int, default=10,
                        help='Batch size for a sequence of length 100. The actual batch size is derived '
                             'from this, assuming the memory needed per sample scales quadratically '
                             'with the sequence length.')
    parser.add_argument('--bioemu_filter_samples', nargs='?', const=True, default=False,
                        type=_str2bool,
                        help='Filter out unphysical samples (e.g., long bond distances or steric clashes).')
    return parser.parse_args()
17+
18+
if __name__ == "__main__":
    # Script entry point: read the CLI options and forward each one to the
    # BioEmu sampler under the keyword name it expects.
    args = parse_args()
    sample_kwargs = {
        'sequence': args.sequence,
        'id': args.id,
        'num_samples': args.bioemu_num_samples,
        'output_dir': args.output_dir,
        'batch_size_100': args.bioemu_batch_size_100,
        'cache_embeds_dir': args.cache_embeds_dir,
        'filter_samples': args.bioemu_filter_samples,
    }
    sample(**sample_kwargs)

run_bioemu.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/bin/bash
# Activate the BioEmu environment with whichever environment manager is
# available, then run run_bioemu.py with all arguments passed to this script.

# Name of the environment to activate
ENV_NAME="bioemu"

# Directory holding this script, so run_bioemu.py is found regardless of the
# caller's working directory (the working directory itself is left unchanged).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Detect available environment manager and load its shell hook so that
# `<tool> activate` works inside this non-interactive shell.
if command -v conda &> /dev/null; then
    TOOL="conda"
    eval "$(conda shell.bash hook)"
elif command -v mamba &> /dev/null; then
    TOOL="mamba"
    # Newer mamba uses `shell hook --shell bash`; fall back to the
    # conda-style spelling for older releases.
    eval "$(mamba shell hook --shell bash 2> /dev/null || mamba shell.bash hook)"
elif command -v micromamba &> /dev/null; then
    TOOL="micromamba"
    # micromamba has no `shell.bash hook` command; its hook is emitted by
    # `shell hook --shell bash`.
    eval "$(micromamba shell hook --shell bash)"
else
    echo "❌ Error: conda, mamba, or micromamba not found in PATH."
    exit 1
fi

echo "✅ Using $TOOL to activate environment '$ENV_NAME'..."
# Stop here if activation fails; otherwise python would silently run in the
# wrong environment.
if ! $TOOL activate "$ENV_NAME"; then
    echo "❌ Error: could not activate environment '$ENV_NAME' with $TOOL."
    exit 1
fi

# Now forward all arguments to the Python script
python "$SCRIPT_DIR/run_bioemu.py" "$@"

run_utils.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -585,7 +585,7 @@ def __init__(self, PMGen_pdb, output_dir,
585585
num_sequences_peptide=10, num_sequences_mhc=3,
586586
peptide_chain='P', mhc_design=True, peptide_design=True,
587587
only_pseudo_sequence_design=True, anchor_pred=True,
588-
sampling_temp=0.05, batch_size=1, hot_spot_thr=6.0,
588+
sampling_temp=1.5, batch_size=1, hot_spot_thr=6.0,
589589
save_hotspots=True, binder_pred=False, fix_anchors=False,
590590
anchor_and_peptide=None):
591591
'''
@@ -718,7 +718,6 @@ def __peptide_design(self):
718718
"--save_probs", "1",
719719
"--save_score", "1",
720720
"--omit_AAs", "X",
721-
"--sampling_temp", "1.5"
722721
]
723722
if self.fix_anchors:# to fix anchors, fixed_pdbs file and design_only_positions should be generated
724723
# we have anchors, we need to define designable positions which are non-anchor positions
@@ -1267,6 +1266,13 @@ def create_new_input_and_fixed_positions(args, best_generated_peptides_path, ite
12671266

12681267

12691268

1269+
def bioemu_assertions(args):
    """Validate the BioEmu-related command-line options.

    Does nothing unless ``args.run_bioemu`` is set. When it is set,
    ``args.return_all_outputs`` is forced to ``True`` and the remaining
    options are checked for consistency.

    Args:
        args: parsed argument namespace; reads ``run_bioemu``, ``mode``,
            ``iterative_peptide_gen`` and ``bioemu_run_on_iter``, and writes
            ``return_all_outputs``.

    Raises:
        AssertionError: if ``--mode`` is not ``wrapper``, or if, in iterative
            mode, ``--bioemu_run_on_iter`` is negative or larger than
            ``--iterative_peptide_gen``.
    """
    if not args.run_bioemu:
        return

    args.return_all_outputs = True  # make sure AF2 outputs are returned

    # Raised explicitly (not via `assert`) so the checks still run under
    # `python -O`; AssertionError is kept so existing callers see the same type.
    if args.mode != 'wrapper':
        raise AssertionError('Bioemu is available only in wrapper mode: use --mode wrapper')

    # Iterative mode: BioEmu runs on exactly one iteration, which must exist.
    # An unset (None) or zero value means "iteration 0" and needs no check.
    if args.iterative_peptide_gen > 0 and args.bioemu_run_on_iter:
        if not 0 <= args.bioemu_run_on_iter <= args.iterative_peptide_gen:
            raise AssertionError(
                'Please make sure --bioemu_run_on_iter is between 0 and '
                f'--iterative_peptide_gen ({args.iterative_peptide_gen}), '
                f'got {args.bioemu_run_on_iter}'
            )
12701276

12711277

12721278

0 commit comments

Comments
 (0)