@@ -113,6 +113,23 @@ def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_ke
113113 print ('total number of proteins with AA mutation:' , df_mutAnno .shape [0 ])
114114 df_mutAnno .to_csv (file_mutAnno , sep = '\t ' , index = None )
115115
116+ # collect individual information for later use when adding unchanged proteins per individual
117+ all_individuals = []
118+ if isinstance (individual , str ):
119+ if individual not in ['' , 'None' ]:
120+ all_individuals = [i .strip () for i in individual .split (',' ) if i .strip ()]
121+ elif isinstance (individual , (list , tuple )):
122+ all_individuals = [i for i in individual if i ]
123+
124+ mutated_samples_by_protein = {}
125+ if 'individual' in df_mutAnno .columns :
126+ for _ , row in df_mutAnno [['protein_id' , 'individual' ]].dropna (subset = ['individual' ]).iterrows ():
127+ samples = [i .strip () for i in str (row ['individual' ]).split (',' ) if i .strip ()]
128+ if not samples :
129+ continue
130+ mutated_samples_by_protein .setdefault (row ['protein_id' ], set ()).update (samples )
131+ use_individual_specific = len (all_individuals ) > 0 and len (mutated_samples_by_protein ) > 0
132+
116133 # collect protein sequences
117134 files_proteins_changed = ['{}/{}.mutated_protein.fa' .format (tempfolder , chromosome ) for chromosome in chromosomes_mutated ]
118135 file_proteins_changed = outprefix + '.pergeno.protein_changed.fa'
@@ -128,15 +145,29 @@ def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_ke
128145 if s .description .endswith ('\t changed' ):
129146 proteins_changed_ids .append (s .id )
130147 fout_proteins_changed .write ('>{}\n {}\n ' .format (s .description , str (s .seq )))
131-
148+
132149 proteins_changed_ids = set ([i .split ('__' )[0 ] for i in proteins_changed_ids ])
133150 conn = buildSqlite .get_connection (file_sqlite )
134151 df_protein_description = pd .read_sql ("SELECT * FROM protein_description" , conn )
135- df_protein_description = df_protein_description [~ df_protein_description ['protein_id_fasta' ].isin (proteins_changed_ids )]
136- for _ , s in df_protein_description .iterrows ():
137- protein_id , protein_description , protein_id_fasta , AA_seq = s ['protein_id' ], s ['protein_description' ], s ['protein_id_fasta' ], s ['AA_seq' ]
138- fout_proteins_all .write ('>{}\t unchanged\n {}\n ' .format (protein_description , str (AA_seq )))
139-
152+
153+ if not use_individual_specific :
154+ df_protein_description = df_protein_description [~ df_protein_description ['protein_id_fasta' ].isin (proteins_changed_ids )]
155+ for _ , s in df_protein_description .iterrows ():
156+ protein_description , AA_seq = s ['protein_description' ], s ['AA_seq' ]
157+ fout_proteins_all .write ('>{}\t unchanged\n {}\n ' .format (protein_description , str (AA_seq )))
158+ else :
159+ for _ , s in df_protein_description .iterrows ():
160+ protein_description , protein_id_fasta , AA_seq = s ['protein_description' ], s ['protein_id_fasta' ], s ['AA_seq' ]
161+ mutated_samples = mutated_samples_by_protein .get (protein_id_fasta , set ())
162+ if len (mutated_samples ) == 0 :
163+ fout_proteins_all .write ('>{}\t unchanged\n {}\n ' .format (protein_description , str (AA_seq )))
164+ continue
165+ if len (mutated_samples ) >= len (all_individuals ):
166+ continue
167+ samples_to_write = [i for i in all_individuals if i not in mutated_samples ]
168+ header = '{}\t unchanged\t {}' .format (protein_id_fasta , ',' .join (samples_to_write ))
169+ fout_proteins_all .write ('>{}\n {}\n ' .format (header , str (AA_seq )))
170+
140171 conn .close ()
141172 fout_proteins_changed .close ()
142173 fout_proteins_all .close ()
@@ -402,4 +433,4 @@ def main():
402433 main_PrecsionProDB_Sqlite (file_genome , file_gtf , file_mutations , file_protein , threads , outprefix , datatype , protein_keyword , filter_PASS , individual , chromosome_only , keep_all , file_sqlite , info_field = f .info_field , info_field_thres = f .info_field_thres )
403434
404435if __name__ == '__main__' :
405- main ()
436+ main ()
0 commit comments