@@ -889,10 +889,10 @@ def protein_mpnn_wrapper(output_pdbs_dict, args, max_jobs, anchor_and_peptide=None
     else:
         raise ValueError("Invalid mode! Choose 'parallel' or 'single'.")

-
+import time  # TODO(review): move up into the top-of-file import block (PEP 8)
 def run_and_parse_netmhcpan(peptide_fasta_file, mhc_type, output_dir, mhc_seq_list=[], mhc_allele=None,
-                            dirty_mode=False, verbose=False, outfilename='netmhcpan_out', return_match_allele=False,
-                            match_with_netmhcpan=True):
+                            dirty_mode=False, verbose=False, outfilename='netmhcpan_out', return_match_allele=False,
+                            match_with_netmhcpan=True, n_jobs=1, parallel=False):
     assert mhc_type in [1, 2]
     if not mhc_allele and len(mhc_seq_list) == 0:
         raise ValueError(f'at least one of mhc_seq_list or mhc_allele should be provided')
@@ -923,8 +923,18 @@ def run_and_parse_netmhcpan(peptide_fasta_file, mhc_type, output_dir, mhc_seq_li
                 matched_allele.append(a)
                 if mhc_type == 1: break
     if verbose: print("Matched Alleles", matched_allele)
-    processing_functions.run_netmhcpan(peptide_fasta_file, matched_allele, outfile, mhc_type)
+    if parallel:
+        processing_functions.run_netmhcpan_parallel(peptide_fasta_file, matched_allele, outfile, mhc_type, n_jobs=n_jobs, verbose=verbose)
+    else:
+        processing_functions.run_netmhcpan(peptide_fasta_file, matched_allele, outfile, mhc_type)
+    if verbose:
+        # perf_counter() is monotonic — the right clock for elapsed-time
+        # measurement (time.time() is wall-clock and can jump).
+        start = time.perf_counter()
+        print('Parsing netmhcpan output on', outfile)
     df = processing_functions.parse_netmhcpan_file(outfile)
+    if verbose:
+        print('parsing finished', 'time:', time.perf_counter() - start)
     df.to_csv(outfile_csv, index=False)
     if not dirty_mode:
         os.remove(outfile)
@@ -965,7 +974,9 @@ def _process_row(self, row):
         elif mhc_type == 1:
             assert len(mhc_seq_list) == 1, (f'mhc_seq for mhc_type==1, should be string with no "/", '
                                             f'found: \n{str(row.mhc_seq)}')
-        netmhc_df = run_and_parse_netmhcpan(peptide_fasta_file, mhc_type, self.tmp, mhc_seq_list, verbose=False, outfilename=str(row.id))
+        # The comparison already yields a bool; no `True if ... else False` needed.
+        parallel = self.args.run == 'parallel'
+        netmhc_df = run_and_parse_netmhcpan(peptide_fasta_file, mhc_type, self.tmp, mhc_seq_list, verbose=self.args.verbose, outfilename=str(row.id), n_jobs=self.args.max_cores, parallel=parallel)
         seen_cores = []
         results = {'anchors': [], 'mhc_seqs': [], 'ids': [], 'peptides': [], 'mhc_types': []}
         counter = 0
@@ -1352,23 +1362,33 @@ def generate_mutants(peptide, positions, fasta_out, id, one_indexed=True):

 def filter_peptides_from_netmhcpan_csv_output(df):
     """
-    1. Removes rows where 'Identity' starts with 'multichain'.
-    2. For each remaining Identity, keeps the row with the longest peptide.
-    3. Sorts the resulting rows by '%Rank_EL' in ascending order (lower = better).
+    1. Removes rows where 'Identity' starts with 'multichain'.
+    2. For each remaining Identity, keeps the row with the longest peptide.
+    3. Deduplicates peptides, keeping the best (lowest) '%Rank_EL' row.
+    4. Returns the rows sorted by '%Rank_EL' in ascending order (lower = better).
     """
-    # Step 1: Remove 'multichain' rows
-    df = df[~df['Identity'].astype(str).str.startswith('multichain')].copy()
-    # Step 2: Compute peptide lengths
-    df['Peptide_length'] = df['Peptide'].str.len()
-    # Step 3: For each Identity, keep the row with the longest peptide
-    df = df.loc[df.groupby('Identity')['Peptide_length'].idxmax()]
-    # Step 4: Sort by '%Rank_EL' (ascending)
-    df = df.sort_values(by='%Rank_EL', ascending=True)
-    # Step 5: Drop helper column and reset index
-    df = df.drop(columns='Peptide_length').reset_index(drop=True)
-    df.drop_duplicates(subset=['Peptide'], inplace=True)
-    df.drop_duplicates(subset=['Identity'], inplace=True)
-    return df
+    # Step 1: Remove 'multichain' rows. The .copy() is REQUIRED here:
+    # assigning a new column on a boolean-mask slice triggers
+    # SettingWithCopyWarning and may silently fail to write under
+    # pandas copy-on-write.
+    df = df[~df['Identity'].astype(str).str.startswith('multichain')].copy()
+
+    # Step 2: Compute peptide lengths once (vectorized).
+    df['Peptide_length'] = df['Peptide'].str.len()
+
+    # Step 3: Longest peptide per Identity via sort + drop_duplicates.
+    # kind='stable' keeps the first occurrence on length ties, matching
+    # the old groupby(...)['Peptide_length'].idxmax() tie-break.
+    df = df.sort_values(['Identity', 'Peptide_length'], ascending=[True, False], kind='stable')
+    df = df.drop_duplicates(subset='Identity', keep='first')
+
+    # Step 4: Sort by '%Rank_EL' BEFORE dropping duplicate peptides so the
+    # best-ranked row survives; dedup-then-sort keeps an arbitrary duplicate.
+    df = df.sort_values('%Rank_EL', ascending=True, kind='stable')
+    df = df.drop_duplicates(subset='Peptide', keep='first')
+
+    # Step 5: Drop the helper column and reset the index.
+    return df.drop(columns='Peptide_length').reset_index(drop=True)

 class mutation_screen():
     def __init__(self, args, df, **kwargs):
@@ -1451,18 +1470,26 @@ def process_single_id(self, id):
         if self.args.benchmark:
             peptide_fasta_file = os.path.join(comb_path, 'peptide_design', 'binder_pred', 'mutants_bench.fa')
             generate_mutants(peptide, comb, peptide_fasta_file, id, one_indexed=True)
+            # Comparison already yields a bool — no flag-then-flip needed.
+            parallel = self.args.run == 'parallel'
             df_mut = run_and_parse_netmhcpan(
                 peptide_fasta_file,
                 mhc_type=mhc_type,
                 output_dir=os.path.join(comb_path, 'peptide_design', 'binder_pred'),
                 mhc_seq_list=[],
                 mhc_allele='/'.join(runner.matched_allele),
-                dirty_mode=False, verbose=False,
+                dirty_mode=False, verbose=self.args.verbose,
                 outfilename='netmhcpan_out_mutant_benchmark',
                 return_match_allele=False,
-                match_with_netmhcpan=False
+                match_with_netmhcpan=False,
+                n_jobs=self.args.max_cores,
+                parallel=parallel
             )
+            # Progress prints gated on verbose so default runs stay quiet,
+            # consistent with the rest of this code path.
+            if self.args.verbose:
+                print('filter peptides from netmhcpan output')
             df_mut = filter_peptides_from_netmhcpan_csv_output(df_mut)
+            if self.args.verbose:
+                print('filter done')
             dataframe_mut = pd.DataFrame({
                 'peptide': df_mut.Peptide.tolist(),
                 'mhc_seq': [mhc_seq] * len(df_mut.Peptide),
@@ -1490,18 +1515,13 @@ def process_single_id(self, id):
         return pd.DataFrame()

     def run_mutation_screen(self):
-        if self.args.run == 'parallel':
-            with Pool(processes=self.args.max_cores) as pool:
-                results = pool.map(self.process_single_id, self.df.id.tolist())
-            ALL_DF = pd.concat([r for r in results if not r.empty])
-        else:
-            # sequential mode
-            ALL_DF = []
-            for id in self.df.id.tolist():
-                df_id = self.process_single_id(id)
-                if not df_id.empty:
-                    ALL_DF.append(df_id)
-            ALL_DF = pd.concat(ALL_DF)
+        # Always process ids sequentially: parallelism is now handled inside
+        # run_and_parse_netmhcpan via n_jobs, so the old Pool branch is dead.
+        # Deleted outright (version control keeps it) rather than left
+        # commented out.
+        ALL_DF = []
+        for id in self.df.id.tolist():
+            df_id = self.process_single_id(id)
+            if not df_id.empty:
+                ALL_DF.append(df_id)
+        ALL_DF = pd.concat(ALL_DF)

         ALL_DF.to_csv(os.path.join(self.args.output_dir, f'mutation_selection_{self.mt_num}.csv'), index=False)
0 commit comments