Skip to content

Commit 00bed75

Browse files
committed
ENH update 08_Conserved_domain_annotation
1 parent ab2aa36 commit 00bed75

File tree

5 files changed

+203
-3
lines changed

5 files changed

+203
-3
lines changed
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
'''
2+
Map CDD annotation.
3+
'''
4+
def merge_habitat_motif(cdd,annotate,infile,outfile):
5+
import pandas as pd
6+
7+
motif = pd.read_csv(cdd,compression='gzip',sep='\t')
8+
motif['cdd'] = motif['cdd'].apply(lambda x:x.split('|')[2])
9+
motif['cdd'] = motif['cdd'].astype('int')
10+
habitat = pd.read_csv(infile,sep='\t',header=None,names=['smorf','habitat'])
11+
result = motif.merge(habitat,'right',on='smorf')
12+
13+
cdd = pd.read_csv(annotate,compression='gzip',sep='\t',header=None,names=['cdd','accession','short_name','description','PSSM_Length'])
14+
merged = result.merge(cdd,'left',on='cdd')
15+
merged.to_csv(outfile,index=None,sep='\t')
16+
17+
def merge_motif(cdd,annotate,infile,outfile):
18+
import pandas as pd
19+
20+
motif = pd.read_csv(cdd,compression='gzip',sep='\t')
21+
motif['cdd'] = motif['cdd'].apply(lambda x:x.split('|')[2])
22+
motif['cdd'] = motif['cdd'].astype('int')
23+
multi = pd.read_csv(infile,sep='\t',header=None,names=['smorf','number'])
24+
result = motif.merge(multi,'right',on='smorf')
25+
26+
cdd = pd.read_csv(annotate,compression='gzip',sep='\t',header=None,names=['cdd','accession','short_name','description','PSSM_Length'])
27+
merged = result.merge(cdd,'left',on='cdd')
28+
merged.to_csv(outfile,index=None,sep='\t')
29+
30+
cdd = '1_cdd_tcov_90AA.tsv.gz'
31+
annotate = 'cddid_all.tbl.gz'
32+
infile1 = '90AA_multi_habitat.tsv'
33+
infile2 = 'multi_genus_3.tsv'
34+
outfile1 = '90AA_multi_habitat_cdd.tsv'
35+
outfile2 = 'multi_genus_3_cdd.tsv'
36+
merge_habitat_motif(cdd,annotate,infile1,outfile1)
37+
merge_habitat_motif(cdd,annotate,infile2,outfile2)
File renamed without changes.

General_Scripts/08_Conserved_domain_annotation/03_multi-genus_enrichment/03_extract_count.py renamed to General_Scripts/08_Conserved_domain_annotation/03_multi-genus_enrichment/04_extract_count_cdd_habitat.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
'''
2-
2+
Calculate fraction of habitats and cdd annotation of families.
33
'''
44
def extract(infile_1,infile_2,outfile):
55
smorf_set = set()
@@ -70,5 +70,7 @@ def count_multi_habitat(infile):
7070
outfile2 = 'whole_3_selected_habitat.tsv'
7171
extract(infile1,infile3,outfile2)
7272

73+
count_multi_cdd(infile2)
7374
count_multi_cdd(outfile1)
75+
count_multi_habitat(infile3)
7476
count_multi_habitat(outfile2)
Lines changed: 159 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,159 @@
1+
'''
2+
Map Pfam clan.
3+
Count Pfam number.
4+
'''
5+
6+
def select_pfam(infile,outfile):
7+
with open(outfile,'wt') as out:
8+
with open(infile,'rt') as f:
9+
for line in f:
10+
if line.startswith('smorf'):
11+
out.write(line)
12+
else:
13+
linelist = line.strip().split('\t')
14+
if len(linelist) > 10 and linelist[10].startswith('pfam'):
15+
out.write(line)
16+
17+
# map Pfam clan A
18+
def pfam2clan(in_file1,in_file2,out_file):
19+
pfam_dict = {}
20+
21+
with open(in_file1,'rt') as f1:
22+
for line in f1:
23+
linelist = line.strip().split('\t')
24+
pfam = linelist[0].replace('PF','')
25+
pfam_dict[pfam] = f'{linelist[1]}\t{linelist[2]}\t{linelist[4]}'
26+
27+
with open(out_file,'wt') as out:
28+
out.write(f'smorf\tcdd\tquery_length\tscore\talign_length\tidentity\tevalue\ttarget_length\ttcov\tnumber\taccession\tshort_name\tdescription\tPSSM_Length\tclan_id\tclan\tshort_description\n')
29+
with open(in_file2,'rt') as f2:
30+
for line in f2:
31+
if line.startswith('smorf'):
32+
continue
33+
else:
34+
line = line.replace('\n','')
35+
linelist = line.split('\t')
36+
pf = linelist[10].replace('pfam','')
37+
if pf in pfam_dict.keys():
38+
out.write(f'{line}\t{pfam_dict[pf]}\n')
39+
else:
40+
out.write(f'{line}\n')
41+
42+
# map Pfam clan C
43+
def map_clan_c(in_file1,in_file2,out_file):
44+
pfam_dict = {}
45+
46+
with open(in_file1,'rt') as f1:
47+
for line in f1:
48+
linelist = line.strip().split('\t')
49+
pfam_dict[linelist[1]] = linelist[2]
50+
51+
with open(out_file,'wt') as out:
52+
with open(in_file2,'rt') as f2:
53+
for line in f2:
54+
line = line.strip()
55+
if line.startswith('smorf'):
56+
out.write(f'{line}\tclan_description\n')
57+
else:
58+
linelist = line.split('\t')
59+
if len(linelist) == 17 and linelist[-2] in pfam_dict.keys():
60+
out.write(f'{line}\t{pfam_dict[linelist[-2]]}\n')
61+
else:
62+
out.write(f'{line}\n')
63+
64+
# group ribosomal and unknown proteins
65+
def modify(infile,outfile):
66+
with open(outfile,'wt') as out:
67+
with open(infile,'rt') as f:
68+
for line in f:
69+
line = line.strip()
70+
if line.startswith('smorf'):
71+
out.write(f'group\t{line}\n')
72+
else:
73+
linelist = line.split('\t')
74+
if len(linelist) >16:
75+
if len(linelist) == 17:
76+
if 'ribosomal' in linelist[-1].casefold():
77+
out.write(f'Ribosomal protein\t{line}\n')
78+
elif 'uncharacterised' in linelist[-1].casefold() or 'unknown' in linelist[-1].casefold():
79+
out.write(f'Domain of unknown function\t{line}\n')
80+
else:
81+
group = linelist[-1].split(',')[0]
82+
out.write(f'{group}\t{line}\n')
83+
else:
84+
if 'ribosomal' in linelist[-2].casefold():
85+
out.write(f'Ribosomal protein\t{line}\n')
86+
else:
87+
out.write(f'{linelist[-1]}\t{line}\n')
88+
else:
89+
out.write(f'\t{line}\n')
90+
91+
def count_pfam(infile,outfile1,outfile2):
92+
group = {}
93+
pfam = {}
94+
all = set()
95+
96+
pfam_info = {}
97+
98+
out1 = open(outfile1,'wt')
99+
out2 = open(outfile2,'wt')
100+
101+
with open(infile,'rt') as f:
102+
for line in f:
103+
if line.startswith('group'):
104+
continue
105+
else:
106+
linelist = line.strip().split('\t')
107+
if linelist[0] not in group.keys():
108+
group[linelist[0]] = set()
109+
group[linelist[0]].add(linelist[1])
110+
if linelist[11] not in pfam.keys():
111+
pfam[linelist[11]] = set()
112+
pfam_info[linelist[11]] = f'{linelist[0]}\t{linelist[12]}\t{linelist[13]}\t{linelist[15]}\t{linelist[16]}\t{linelist[17]}'
113+
pfam[linelist[11]].add(linelist[1])
114+
115+
all.add(linelist[1])
116+
117+
out1.write(f'group\tcount\tall_count\n')
118+
for key,value in group.items():
119+
number = len(value)
120+
all_number = len(all)
121+
out1.write(f'{key}\t{number}\t{all_number}\n')
122+
123+
out2.write(f'pfam\tcount\tall_count\tgroup\tshort_name\tdescription\tclan_id\tclan\tshort_description\n')
124+
for key,value in pfam.items():
125+
number = len(value)
126+
all_number = len(all)
127+
out2.write(f'{key}\t{number}\t{all_number}\t{pfam_info[key]}\n')
128+
129+
out1.close()
130+
out2.close()
131+
132+
infile1 = 'multi_genus_3_cdd.tsv'
133+
infile2 = '90AA_multi_habitat_cdd.tsv'
134+
outfile1 = 'multi_genus_3_pfam.tsv'
135+
outfile2 = '90AA_multi_habitat_pfam.tsv'
136+
select_pfam(infile1,outfile1)
137+
select_pfam(infile2,outfile2)
138+
139+
pfama = 'Pfam-A.clans.tsv'
140+
outfile3 = 'multi_genus_3_pfam_clan_A.tsv'
141+
outfile4 = '90AA_multi_habitat_pfam_clan_A.tsv'
142+
pfam2clan(pfama,outfile1,outfile3)
143+
pfam2clan(pfama,outfile2,outfile4)
144+
145+
pfamc = 'pfam-c_format.txt'
146+
outfile5 = 'multi_genus_3_pfam_clan_A_C.tsv'
147+
outfile6 = '90AA_multi_habitat_pfam_clan_A_C.tsv'
148+
map_clan_c(pfamc,outfile5,outfile6)
149+
150+
outfile7 = 'multi_genus_3_pfam_clan_A_C_group.tsv'
151+
outfile8 = '90AA_multi_habitat_pfam_clan_A_C_group.tsv'
152+
modify(outfile7,outfile8)
153+
154+
outfile9 = 'multi_genus_3_pfam_group_count.tsv'
155+
outfile10 = 'multi_genus_3_pfam_count.tsv'
156+
outfile11 = '90AA_multi_habitat_pfam_group_count.tsv'
157+
outfile12 = '90AA_multi_habitat_pfam_count.tsv'
158+
count_pfam(outfile7,outfile9,outfile10)
159+
count_pfam(outfile8,outfile11,outfile12)

General_Scripts/08_Conserved_domain_annotation/Readme.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,5 +22,7 @@
2222
| **Code** | **Description** |
2323
| :---: | :---: | :---: | :---: |
2424
| 01_multi_genus.py | Extract multi-genus and specific-genus families. | 90AA_taxa_multi_specific.tsv | multi_genus_3.tsv specific_genus_3.tsv |
25-
| 02_keep_size.py | Keep the same size for selected clusers from multi-genus and the whole clusters. | 90AA_taxa_multi_specific.tsv multi_genus_3.tsv| whole_3_selected.tsv |
26-
| 03_extract_count.py | Calculate fraction of habitats and cdd annotation of families from the whole GMSCwith the same size distribution. | whole_3_selected.tsv 90AA_multi_habitat_cdd.tsv 90AA_multi_habitat.tsv | whole_3_selected_habitat_cdd.tsv whole_3_selected_habitat.tsv |
25+
| 02_map_cdd_3.py | Map CDD annotation | 1_cdd_tcov_90AA.tsv.gz cddid_all.tbl.gz 90AA_multi_habitat.tsv multi_genus_3.tsv | 90AA_multi_habitat_cdd.tsv multi_genus_3_cdd.tsv|
26+
| 03_keep_size.py | Keep the same size for selected clusers from multi-genus and the whole clusters. | 90AA_taxa_multi_specific.tsv multi_genus_3.tsv| whole_3_selected.tsv |
27+
| 04_extract_count_cdd_habitat.py | Calculate fraction of habitats and cdd annotation of families. | whole_3_selected.tsv 90AA_multi_habitat_cdd.tsv 90AA_multi_habitat.tsv | whole_3_selected_habitat_cdd.tsv whole_3_selected_habitat.tsv |
28+
| 05_count_pfam.py | Map Pfam clan and count Pfam number. | multi_genus_3_cdd.tsv 90AA_multi_habitat_cdd.tsv Pfam-A.clans.tsv pfam-c_format.txt | multi_genus_3_pfam_count.tsv 90AA_multi_habitat_pfam_count.tsv |

0 commit comments

Comments
 (0)