Commit 1df42c3

mutation selection and speed boost applied
1 parent 711b032 commit 1df42c3

3 files changed: +243 −68 lines changed

run_PMGen.py

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ def main():
     parser.add_argument('--max_cores', type=int, default=4, help='Maximum number of CPU cores (only for parallel mode)')
     parser.add_argument('--dirty_mode', action='store_true')
     parser.add_argument('--initial_guess', action='store_true', help='Activates Faster and more accurate AF initial Guess mode instead of Homology modelling mode')
+    parser.add_argument('--verbose', action='store_true')
 
     # Wrapper mode argument
     parser.add_argument('--df', type=str, help='Recommended. Path to input TSV file (required for wrapper mode)'

run_utils.py

Lines changed: 55 additions & 30 deletions
@@ -889,10 +889,10 @@ def protein_mpnn_wrapper(output_pdbs_dict, args, max_jobs, anchor_and_peptide=No
     else:
         raise ValueError("Invalid mode! Choose 'parallel' or 'single'.")
 
-
+import time
 def run_and_parse_netmhcpan(peptide_fasta_file, mhc_type, output_dir, mhc_seq_list=[], mhc_allele=None,
-                            dirty_mode=False, verbose=False, outfilename='netmhcpan_out', return_match_allele=False,
-                            match_with_netmhcpan=True):
+                            dirty_mode=False, verbose=True, outfilename='netmhcpan_out', return_match_allele=False,
+                            match_with_netmhcpan=True, n_jobs=1, parallel=False):
     assert mhc_type in [1,2]
     if not mhc_allele and len(mhc_seq_list) == 0:
         raise ValueError(f'at least one of mhc_seq_list or mhc_allele should be provided')
@@ -923,8 +923,17 @@ def run_and_parse_netmhcpan(peptide_fasta_file, mhc_type, output_dir, mhc_seq_li
         matched_allele.append(a)
         if mhc_type == 1: break
     if verbose: print("Matched Alleles", matched_allele)
-    processing_functions.run_netmhcpan(peptide_fasta_file, matched_allele, outfile, mhc_type)
+    if parallel:
+        processing_functions.run_netmhcpan_parallel(peptide_fasta_file, matched_allele, outfile, mhc_type, n_jobs=n_jobs, verbose=verbose)
+    else:
+        processing_functions.run_netmhcpan(peptide_fasta_file, matched_allele, outfile, mhc_type)
+    if verbose:
+        s = time.time()
+        print('Parsing netmhcpan output on ', outfile)
     df = processing_functions.parse_netmhcpan_file(outfile)
+    if verbose:
+        e = time.time()
+        print('parsing finished', 'time:', e-s)
     df.to_csv(outfile_csv, index=False)
     if not dirty_mode:
         os.remove(outfile)
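
The parallel branch above delegates to processing_functions.run_netmhcpan_parallel, whose implementation is not part of this diff. As a rough idea of what a helper with that call signature could look like, here is a minimal sketch that reuses the existing sequential run_netmhcpan once per allele and merges the per-allele outputs; the part-file naming and the assumption that the per-allele text outputs can simply be concatenated are illustrative, not taken from the repository:

# Hypothetical sketch -- the real processing_functions.run_netmhcpan_parallel may differ.
import os
from functools import partial
from multiprocessing import Pool

def run_netmhcpan_parallel(peptide_fasta_file, matched_allele, outfile, mhc_type, n_jobs=1, verbose=False):
    # Run the existing sequential helper once per allele, each into its own part file.
    worker = partial(_netmhcpan_single_allele, peptide_fasta_file, outfile, mhc_type)
    with Pool(processes=max(1, min(n_jobs, len(matched_allele)))) as pool:
        part_files = pool.map(worker, matched_allele)
    # Merge the per-allele outputs into the single file the parser expects.
    with open(outfile, 'w') as out_fh:
        for pf in part_files:
            with open(pf) as in_fh:
                out_fh.write(in_fh.read())
            os.remove(pf)
    if verbose:
        print('merged', len(part_files), 'netmhcpan outputs into', outfile)

def _netmhcpan_single_allele(peptide_fasta_file, outfile, mhc_type, allele):
    part_file = f"{outfile}.{allele.replace('/', '_')}.part"
    run_netmhcpan(peptide_fasta_file, [allele], part_file, mhc_type)  # existing sequential runner, one allele
    return part_file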
@@ -965,7 +974,8 @@ def _process_row(self, row):
         elif mhc_type == 1:
             assert len(mhc_seq_list) == 1, (f'mhc_seq for mhc_type==1, should be string with no "/", '
                                             f'found: \n {str(row.mhc_seq)}')
-            netmhc_df = run_and_parse_netmhcpan(peptide_fasta_file, mhc_type, self.tmp, mhc_seq_list, verbose=False, outfilename=str(row.id))
+            parallel = True if self.args.run == 'parallel' else False
+            netmhc_df = run_and_parse_netmhcpan(peptide_fasta_file, mhc_type, self.tmp, mhc_seq_list, verbose=self.args.verbose, outfilename=str(row.id), n_jobs=self.args.max_cores, parallel=parallel)
         seen_cores = []
         results = {'anchors': [], 'mhc_seqs': [], 'ids': [], 'peptides': [], 'mhc_types': []}
         counter = 0
@@ -1352,23 +1362,32 @@ def generate_mutants(peptide, positions, fasta_out, id, one_indexed=True):
 
 def filter_peptides_from_netmhcpan_csv_output(df):
     """
+    Optimized version:
     1. Removes rows where 'Identity' starts with 'multichain'.
     2. For each remaining Identity, keeps the row with the longest peptide.
     3. Sorts the resulting rows by '%Rank_EL' in ascending order (lower = better).
     """
-    # Step 1: Remove 'multichain' rows
-    df = df[~df['Identity'].astype(str).str.startswith('multichain')].copy()
-    # Step 2: Compute peptide lengths
+    # Step 1: Remove 'multichain' rows using vectorized string operation
+    # Avoid .copy() if not modifying the original df elsewhere
+    mask = ~df['Identity'].str.startswith('multichain', na=False)
+    df = df[mask]
+
+    # Step 2: Compute peptide lengths (vectorized, no need for .copy())
     df['Peptide_length'] = df['Peptide'].str.len()
-    # Step 3: For each Identity, keep the row with the longest peptide
-    df = df.loc[df.groupby('Identity')['Peptide_length'].idxmax()]
-    # Step 4: Sort by '%Rank_EL' (ascending)
-    df = df.sort_values(by='%Rank_EL', ascending=True)
-    # Step 5: Drop helper column and reset index
-    df = df.drop(columns='Peptide_length').reset_index(drop=True)
-    df.drop_duplicates(subset=['Peptide'], inplace=True)
-    df.drop_duplicates(subset=['Identity'], inplace=True)
-    return df
+
+    # Step 3: Keep longest peptide per Identity (single operation)
+    # Use sort_values + drop_duplicates instead of groupby + idxmax (faster for large datasets)
+    df = df.sort_values(['Identity', 'Peptide_length'], ascending=[True, False])
+    df = df.drop_duplicates(subset='Identity', keep='first')
+
+    # Step 4: Drop duplicates by Peptide (moved before sorting for efficiency)
+    df = df.drop_duplicates(subset='Peptide', keep='first')
+
+    # Step 5: Sort by '%Rank_EL' (only once at the end)
+    df = df.sort_values('%Rank_EL', ascending=True)
+
+    # Step 6: Clean up and return
+    return df.drop(columns='Peptide_length').reset_index(drop=True)
 
 class mutation_screen():
     def __init__(self, args, df, **kwargs):
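
The rewritten filter_peptides_from_netmhcpan_csv_output replaces the groupby + idxmax lookup with sort_values followed by drop_duplicates(keep='first'), and defers the %Rank_EL sort until the end. A small self-contained check of that pattern, using the column names from the diff but invented values:

import pandas as pd

# Toy rows mimicking the parsed netMHCpan CSV; the values are made up for illustration.
df = pd.DataFrame({
    'Identity': ['pep1', 'pep1', 'pep2', 'multichain_x'],
    'Peptide':  ['ACDEF', 'ACDEFGH', 'KLMNP', 'QQQQQ'],
    '%Rank_EL': [1.2, 0.4, 3.5, 0.1],
})

df = df[~df['Identity'].str.startswith('multichain', na=False)]
df['Peptide_length'] = df['Peptide'].str.len()

# Sorting by length (descending) and keeping the first row per Identity matches the
# old groupby('Identity')['Peptide_length'].idxmax() result up to tie-breaking,
# without the per-group index lookup.
df = (df.sort_values(['Identity', 'Peptide_length'], ascending=[True, False])
        .drop_duplicates(subset='Identity', keep='first')
        .drop_duplicates(subset='Peptide', keep='first')
        .sort_values('%Rank_EL')
        .drop(columns='Peptide_length')
        .reset_index(drop=True))

print(df)  # pep1 keeps the longer 'ACDEFGH'; the multichain row is dropped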
@@ -1451,18 +1470,24 @@ def process_single_id(self, id):
         if self.args.benchmark:
             peptide_fasta_file = os.path.join(comb_path, 'peptide_design', 'binder_pred', 'mutants_bench.fa')
             generate_mutants(peptide, comb, peptide_fasta_file, id, one_indexed=True)
+            parallel = False
+            if self.args.run == 'parallel': parallel=True
             df_mut = run_and_parse_netmhcpan(
                 peptide_fasta_file,
                 mhc_type=mhc_type,
                 output_dir=os.path.join(comb_path, 'peptide_design', 'binder_pred'),
                 mhc_seq_list=[],
                 mhc_allele='/'.join(runner.matched_allele),
-                dirty_mode=False, verbose=False,
+                dirty_mode=False, verbose=self.args.verbose,
                 outfilename='netmhcpan_out_mutant_benchmark',
                 return_match_allele=False,
-                match_with_netmhcpan=False
+                match_with_netmhcpan=False,
+                n_jobs=self.args.max_cores,
+                parallel=parallel
             )
+            print('filter peptides from netmhcpan output')
             df_mut = filter_peptides_from_netmhcpan_csv_output(df_mut)
+            print('filter done')
             dataframe_mut = pd.DataFrame({
                 'peptide': df_mut.Peptide.tolist(),
                 'mhc_seq': [mhc_seq] * len(df_mut.Peptide),
@@ -1490,18 +1515,18 @@ def process_single_id(self, id):
         return pd.DataFrame()
 
     def run_mutation_screen(self):
-        if self.args.run == 'parallel':
-            with Pool(processes=self.args.max_cores) as pool:
-                results = pool.map(self.process_single_id, self.df.id.tolist())
-            ALL_DF = pd.concat([r for r in results if not r.empty])
-        else:
+        #if self.args.run == 'parallel':
+        #    with Pool(processes=self.args.max_cores) as pool:
+        #        results = pool.map(self.process_single_id, self.df.id.tolist())
+        #    ALL_DF = pd.concat([r for r in results if not r.empty])
+        #else:
         # sequential mode
-            ALL_DF = []
-            for id in self.df.id.tolist():
-                df_id = self.process_single_id(id)
-                if not df_id.empty:
-                    ALL_DF.append(df_id)
-            ALL_DF = pd.concat(ALL_DF)
+        ALL_DF = []
+        for id in self.df.id.tolist():
+            df_id = self.process_single_id(id)
+            if not df_id.empty:
+                ALL_DF.append(df_id)
+        ALL_DF = pd.concat(ALL_DF)
 
         ALL_DF.to_csv(os.path.join(self.args.output_dir, f'mutation_selection_{self.mt_num}.csv'), index=False)
 