Skip to content

Commit cd75f8b

Browse files
committed
Add individual specific unchanged protein handling
1 parent 5001e2f commit cd75f8b

File tree

1 file changed

+38
-7
lines changed

1 file changed

+38
-7
lines changed

src/PrecisionProDB_Sqlite.py

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,23 @@ def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_ke
113113
print('total number of proteins with AA mutation:', df_mutAnno.shape[0])
114114
df_mutAnno.to_csv(file_mutAnno, sep='\t', index=None)
115115

116+
# collect individual information for later use when adding unchanged proteins per individual
117+
all_individuals = []
118+
if isinstance(individual, str):
119+
if individual not in ['', 'None']:
120+
all_individuals = [i.strip() for i in individual.split(',') if i.strip()]
121+
elif isinstance(individual, (list, tuple)):
122+
all_individuals = [i for i in individual if i]
123+
124+
mutated_samples_by_protein = {}
125+
if 'individual' in df_mutAnno.columns:
126+
for _, row in df_mutAnno[['protein_id', 'individual']].dropna(subset=['individual']).iterrows():
127+
samples = [i.strip() for i in str(row['individual']).split(',') if i.strip()]
128+
if not samples:
129+
continue
130+
mutated_samples_by_protein.setdefault(row['protein_id'], set()).update(samples)
131+
use_individual_specific = len(all_individuals) > 0 and len(mutated_samples_by_protein) > 0
132+
116133
# collect protein sequences
117134
files_proteins_changed = ['{}/{}.mutated_protein.fa'.format(tempfolder, chromosome) for chromosome in chromosomes_mutated]
118135
file_proteins_changed = outprefix + '.pergeno.protein_changed.fa'
@@ -128,15 +145,29 @@ def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_ke
128145
if s.description.endswith('\tchanged'):
129146
proteins_changed_ids.append(s.id)
130147
fout_proteins_changed.write('>{}\n{}\n'.format(s.description, str(s.seq)))
131-
148+
132149
proteins_changed_ids = set([i.split('__')[0] for i in proteins_changed_ids])
133150
conn = buildSqlite.get_connection(file_sqlite)
134151
df_protein_description = pd.read_sql("SELECT * FROM protein_description", conn)
135-
df_protein_description = df_protein_description[~ df_protein_description['protein_id_fasta'].isin(proteins_changed_ids)]
136-
for _, s in df_protein_description.iterrows():
137-
protein_id, protein_description, protein_id_fasta, AA_seq = s['protein_id'], s['protein_description'], s['protein_id_fasta'], s['AA_seq']
138-
fout_proteins_all.write('>{}\tunchanged\n{}\n'.format(protein_description, str(AA_seq)))
139-
152+
153+
if not use_individual_specific:
154+
df_protein_description = df_protein_description[~ df_protein_description['protein_id_fasta'].isin(proteins_changed_ids)]
155+
for _, s in df_protein_description.iterrows():
156+
protein_description, AA_seq = s['protein_description'], s['AA_seq']
157+
fout_proteins_all.write('>{}\tunchanged\n{}\n'.format(protein_description, str(AA_seq)))
158+
else:
159+
for _, s in df_protein_description.iterrows():
160+
protein_description, protein_id_fasta, AA_seq = s['protein_description'], s['protein_id_fasta'], s['AA_seq']
161+
mutated_samples = mutated_samples_by_protein.get(protein_id_fasta, set())
162+
if len(mutated_samples) == 0:
163+
fout_proteins_all.write('>{}\tunchanged\n{}\n'.format(protein_description, str(AA_seq)))
164+
continue
165+
if len(mutated_samples) >= len(all_individuals):
166+
continue
167+
samples_to_write = [i for i in all_individuals if i not in mutated_samples]
168+
header = '{}\tunchanged\t{}'.format(protein_id_fasta, ','.join(samples_to_write))
169+
fout_proteins_all.write('>{}\n{}\n'.format(header, str(AA_seq)))
170+
140171
conn.close()
141172
fout_proteins_changed.close()
142173
fout_proteins_all.close()
@@ -402,4 +433,4 @@ def main():
402433
main_PrecsionProDB_Sqlite(file_genome, file_gtf, file_mutations, file_protein, threads, outprefix, datatype, protein_keyword, filter_PASS, individual, chromosome_only, keep_all, file_sqlite, info_field=f.info_field, info_field_thres=f.info_field_thres)
403434

404435
if __name__ == '__main__':
405-
main()
436+
main()

0 commit comments

Comments
 (0)