Skip to content

Commit edbd6c6

Browse files
authored
Create HOGcomposition.py
1 parent 12aa43c commit edbd6c6

File tree

1 file changed

+83
-0
lines changed

1 file changed

+83
-0
lines changed

utils/HOGcomposition.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
2+
import os
3+
import sys
4+
5+
USAGE = """
To run
    python HOGcomposition.py in_folder output

in_folder : folder that contains the "proteome" folder (FASTA files ending in .fa or .fasta)
output    : folder where RootHOGs.tsv is

Result:
    HOGcomposition.tsv, a species composition table (HOGs in rows, species in
    columns, and each cell holding the protein names of that HOG for that species)
"""

# File extensions (without the dot) that mark a proteome FASTA file.
FASTA_EXTENSIONS = ("fa", "fasta")

# The table is always written to this name in the current working directory.
OUTPUT_NAME = "HOGcomposition.tsv"


def list_species(proteome_dir):
    """Return ``(species_name, extension)`` for every FASTA file in *proteome_dir*.

    The species name is the file name without its last extension. Entries
    are returned in ``os.listdir`` order. Entries without a dot or with a
    different extension are skipped instead of raising.
    """
    species_files = []
    for file_name in os.listdir(proteome_dir):
        # rpartition never raises: with no dot the separator comes back empty.
        species_name, dot, ext = file_name.rpartition(".")
        if dot and ext in FASTA_EXTENSIONS:
            species_files.append((species_name, ext))
    return species_files


def map_proteins_to_species(proteome_dir, species_files):
    """Return a dict mapping each protein name to the species whose FASTA holds it.

    The protein name is the first space-separated token of a ``>`` header
    line, without the leading ``>``. Each file is opened with its own
    extension, so ``.fa`` and ``.fasta`` files can live side by side.
    """
    prot2species = {}
    for species_name, ext in species_files:
        prot_address = os.path.join(proteome_dir, species_name + "." + ext)
        with open(prot_address, "r") as file_prot:
            for line in file_prot:
                if line.startswith(">"):
                    prot_name = line.strip().split(" ")[0][1:]
                    prot2species[prot_name] = species_name
    return prot2species


def read_roothogs(roothog_path):
    """Parse RootHOGs.tsv into ``{roothog: [protein, ...]}``.

    Lines starting with ``RootHOG`` (the header) and blank lines are skipped.
    Only the first two tab-separated columns are used, so a row with an
    empty or missing third column is still accepted.

    Raises:
        ValueError: if a data line has fewer than two columns.
    """
    roothog_dic = {}
    with open(roothog_path, "r") as roothog_file:
        for line in roothog_file:
            if line.startswith("RootHOG") or not line.strip():
                continue
            # Strip only the line ending so an empty trailing column survives.
            fields = line.rstrip("\r\n").split("\t")
            if len(fields) < 2:
                raise ValueError(
                    "malformed line in {}: {!r}".format(roothog_path, line)
                )
            roothog = fields[0].strip()
            protein = fields[1].strip()
            roothog_dic.setdefault(roothog, []).append(protein)
    return roothog_dic


def composition_row(prots, prot2species, species_names):
    """Return one table cell per species for the proteins of a single HOG.

    Each cell lists that species' proteins as quoted names separated by
    ``", "`` (the same text as ``str(list)[1:-1]``); a species with no
    protein in the HOG gets an empty string.

    Raises:
        KeyError: if a protein is not present in *prot2species*.
    """
    prot_species_dic = {species_name: [] for species_name in species_names}
    for prot in prots:
        if prot not in prot2species:
            raise KeyError(
                "protein {!r} is not in any proteome FASTA file".format(prot)
            )
        prot_species_dic[prot2species[prot]].append(prot)
    # Joining an empty list yields '', so no separate empty-cell branch is needed.
    return [
        ", ".join(repr(prot) for prot in prot_species_dic[species_name])
        for species_name in species_names
    ]


def main(argv):
    """Build HOGcomposition.tsv from ``argv[1]`` (input folder) and ``argv[2]`` (FastOMA output)."""
    if len(argv) < 3:
        sys.exit(USAGE)
    in_folder = argv[1]
    fastoma_out_folder = argv[2]
    proteome_dir = in_folder + "/proteome/"

    species_files = list_species(proteome_dir)
    # query/input species name based on the file name
    species_names = [species_name for species_name, _ in species_files]
    print("number of species:", len(species_names))

    prot2species = map_proteins_to_species(proteome_dir, species_files)
    print("total number of proteins in the fasta files", len(prot2species))

    roothog_dic = read_roothogs(fastoma_out_folder + "/RootHOGs.tsv")
    print("number of HOGs", len(roothog_dic))

    with open(OUTPUT_NAME, "w") as file_out:
        file_out.write("RootHOG" + "\t" + "\t".join(species_names) + "\n")
        for roothog, prots in roothog_dic.items():
            prot_species = composition_row(prots, prot2species, species_names)
            file_out.write(roothog + "\t" + "\t".join(prot_species) + "\n")

    print("output is written as file ", OUTPUT_NAME)


if __name__ == "__main__":
    main(sys.argv)

0 commit comments

Comments
 (0)