Skip to content

Commit f0efd93

Browse files
authored
Add script to generate splice files from Ensembl proteomes
This script processes proteome data by parsing FASTA files, extracting transcripts, and writing splice files for species with multiple transcripts.
1 parent 32ac18b commit f0efd93

File tree

1 file changed

+133
-0
lines changed

1 file changed

+133
-0
lines changed
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
import os
2+
import sys
3+
from Bio import SeqIO
4+
5+
6+
7+
"""
8+
python generate_splice_file_Ensembl.py input_folder
9+
10+
"""
11+
12+
13+
14+
def parse_proteomes(folder=None, min_sequence_length=0): # list_oma_species
15+
"""
16+
17+
parsing fasta files of proteins located in /proteome/
18+
using Bio.SeqIO.parse
19+
Each fasta file is for one species. The file name is the species name.
20+
All proteomes should be with the same extension.
21+
output: prot_recs_lists: a dic with key as species name and its value list of Biopython record of species.
22+
"""
23+
# Adapted from https://github.com/DessimozLab/FastOMA/blob/main/FastOMA/_utils_roothog.py
24+
extension = ".fa"
25+
26+
project_files = os.listdir(folder)
27+
print(folder, " as proteome folder, found ", len(project_files), " files.")
28+
species_names = [] # query/input species name based on the file name
29+
for file in project_files:
30+
species_name = file.split(extension)[0]
31+
#print("%s: %s | %s", file, species_name, ext)
32+
#if ext in ("fa", "fasta"):
33+
species_names.append(species_name)
34+
#fasta_format_keep = ext # last one is stored either fa or fasta
35+
36+
# todo accept all fasta formats in the input prtoeome folder, fasta, fa, fna, ..
37+
prot_recs_lists = {} # key: species name, value is a dic of query protein Biopython records. # 'MYCGE': [SeqRecord(seq=Seq('MDFDK
38+
#smallprot_recs_lists ={}
39+
for species_name in species_names:
40+
prot_address = os.path.join(folder, species_name + extension)
41+
prots_record = list(rec for rec in SeqIO.parse(prot_address, "fasta") if len(rec) >= min_sequence_length)
42+
#prots_record_small = list(rec for rec in SeqIO.parse(prot_address, "fasta") if len(rec) < min_sequence_length)
43+
# logger.debug(prots_record)
44+
#print(f"{species_name} contains {len(prots_record)} that are at least {min_sequence_length} long.")
45+
prot_recs_lists[species_name] = prots_record
46+
#smallprot_recs_lists[species_name]=prots_record_small
47+
48+
print("There are ", len(species_names), "species in the proteome folder.")
49+
return species_names, prot_recs_lists#, fasta_format_keep #, smallprot_recs_lists
50+
51+
def extract_spliced_prots(gene2prot_transcript):
52+
genes_withmore1=[]
53+
diff_prot_transcript=[]
54+
prots_with_splice={}
55+
for species_name, gene2prot_transcript_sp in gene2prot_transcript.items():
56+
57+
for gene in gene2prot_transcript_sp.keys():
58+
prot_transcript_list=gene2prot_transcript_sp[gene]
59+
if len(prot_transcript_list)>1:
60+
genes_withmore1.append((species_name,gene))
61+
prots= [prot for (prot,transcript) in prot_transcript_list]
62+
transcripts= [transcript for (prot,transcript) in prot_transcript_list]
63+
if len(set(prots))!=len(set(transcripts)):
64+
print(species_name,gene)
65+
if set(prots)!=set(transcripts):
66+
diff_prot_transcript.append((species_name,gene))
67+
if species_name in prots_with_splice:
68+
prots_with_splice[species_name].append(prots)
69+
else:
70+
prots_with_splice[species_name]=[prots]
71+
print('There are ',len(genes_withmore1),' genes with more than one transcripts.')
72+
print('For these genes, there are ',len(diff_prot_transcript),' genes that the protein id is different than transcript id.')
73+
print('We found ',len(prots_with_splice),' species that have some genes with more than one transcripts.')
74+
return prots_with_splice
75+
76+
77+
78+
79+
def extract_transcripts(species_names, prot_recs_lists):
80+
total_splices=0
81+
gene2prot_transcript={}
82+
for species_name, prot_recs_list in prot_recs_lists.items():
83+
gene2prot_transcript[species_name]={} #
84+
85+
for prot_rec in prot_recs_list:
86+
prot_name= prot_rec.id # >PSK40689.1 pep primary_assembly:GCA003013735v1:PYFQ01000001:776887:779901:1 gene:C7M61_000337 transcript:C7M61_000337_t1 gene_biotype:protein_coding transcript_biotype:protein_coding description:RIC1 domain-containing protein [Source:UniProtKB/TrEMBL;Acc:A0A2P7YXL8]
87+
transcriptid = prot_rec.description.split("transcript:")[1].split(" ")[0]
88+
geneid = prot_rec.description.split("gene:")[1].split(" ")[0]
89+
protid= prot_rec.id
90+
if geneid in gene2prot_transcript[species_name]:
91+
total_splices+=1
92+
gene2prot_transcript[species_name][geneid].append((protid,transcriptid))
93+
94+
else:
95+
gene2prot_transcript[species_name][geneid] = [(protid,transcriptid)]
96+
97+
98+
print('Total number transcripts that are more than one transcript per gene is', total_splices)
99+
print('Total number of species',len(gene2prot_transcript),'. For species ',species_name,', the number of genes is ',len(gene2prot_transcript[species_name]))
100+
101+
return gene2prot_transcript
102+
103+
def write_splice_files(proteome_folder,prots_with_splice,output_folder):
104+
105+
print('Writing ', len(prots_with_splice), 'splice files')
106+
os.mkdir(output_folder)
107+
for species_name, prots_list in prots_with_splice.items():
108+
file_splice= open(output_folder+species_name+".splice",'w')
109+
for prots in prots_list:
110+
line= ";".join(prots)
111+
file_splice.write(line+"\n")
112+
file_splice.close()
113+
print('Splice files per species are ready in ', output_folder)
114+
return 1
115+
116+
117+
118+
if __name__ == "__main__":
119+
120+
121+
proteome_folder= sys.argv[1]
122+
123+
print("Parsing the proteome folder", proteome_folder)
124+
species_names, prot_recs_lists = parse_proteomes(proteome_folder)
125+
126+
gene2prot_transcript = extract_transcripts(species_names, prot_recs_lists)
127+
128+
prots_with_splice = extract_spliced_prots(gene2prot_transcript)
129+
output_folder= output_folder= proteome_folder+"/../splice_files/"
130+
write_splice_files(proteome_folder,prots_with_splice, output_folder)
131+
print("done!")
132+
133+

0 commit comments

Comments
 (0)